In [1]:
import requests
from bs4 import BeautifulSoup

In [2]:
# Fetch the page to audit. requests applies no timeout by default, so an
# explicit one keeps the notebook from hanging on an unresponsive server.
response = requests.get(
    'https://www.accenture.com/us-en',
    headers={'USER-AGENT': 'Lucile/Search Engine'},
    timeout=30,
)
# Stop here on an HTTP error instead of auditing an error page in later cells.
response.raise_for_status()

In [3]:
# Parse the raw response bytes with Python's built-in HTML parser.
soup = BeautifulSoup(markup=response.content, features='html.parser')

## Title

The page title should ideally be shorter than 60 characters, and 70 characters at most.

In [5]:
# Tag-name navigation: returns the first <title> tag, or None when absent.
title = soup.title
title

<title>Accenture | Let There Be Change</title>

## Description

The meta description should be shorter than 150 characters. We can also check for `itemprop` meta tags.

In [7]:
# The filter must be passed as a dict: a `name=` keyword would be taken as
# the tag name to search for rather than the HTML attribute called "name".
description_filter = {'name': 'description'}
description = soup.find('meta', description_filter)
description

<meta content="Accenture embraces the power of change to create 360° value &amp; shared success in the U.S. for our clients, people, shareholders, partners and communities. Read more." name="description"/>

In [13]:
# Look up the schema.org microdata <meta> tags via keyword attribute filters.
itemprop_name = soup.find('meta', itemprop='name')
itemprop_description = soup.find('meta', itemprop='description')

# One tag per line, name first.
print(itemprop_name, itemprop_description, sep='\n')


<meta content="Let There Be Change | Accenture" itemprop="name"/>
<meta content="Accenture embraces the power of change to create 360° value &amp; shared success in the U.S. for our clients, people, shareholders, partners and communities. Read more." itemprop="description"/>


## Structured data

The page should include structured data, i.e. `<script type="application/ld+json">` tags.

In [15]:
# Collect every JSON-LD <script> block on the page.
structured_data = soup.find_all('script', type='application/ld+json')


[<script type="application/ld+json">
 		{
   "@context" : "https://schema.org",
   "@type" : "Organization",
   "name" : "Let There Be Change | Accenture",
   "url" : "https://www.accenture.com/us-en/",
   "logo" : "/content/experience-fragments/acom/us-en/header/global-header/master/_jcr_content/root/globalheader/logo.coreimg.png/1678919291529/acc-logo-black-purple-rgb.png",
   "description" : " Accenture embraces the power of change to create 360° value \\x26 shared success in the U.S. for our clients, people, shareholders, partners and communities.",
   "contactPoint" : {
     "@type" : "ContactPoint",
     "telephone" : "+1 312 842 5012",
     "contactType" : "customer service"
   },
   "sameAs" : [ "https://www.linkedin.com/company/accenture", "https://twitter.com/Accenture_US", "https://www.facebook.com/AccentureUS", "https://www.instagram.com/accentureus/", "https://www.youtube.com/accenture", "https://www.youtube.com/accentureus" ]
 }
 	</script>]

In [96]:
# A URL iterator

from urllib.parse import urlparse, urljoin

# Every anchor tag on the fetched page.
links = soup.find_all(name='a')

# Parsed form of the start page; LinkIterator compares candidate URLs to it.
INITIAL_DOMAIN = urlparse('https://www.accenture.com/us-en')

# Module-level URL set; nothing in this cell adds to it.
URLS_TO_VISIT = set()

class LinkIterator:
    """Iterable over the unique same-site URLs found in a collection of links.

    Parameters
    ----------
    url : str
        URL of the page the links came from. It is stored on the instance but
        not consulted: filtering is done against the notebook-level
        ``INITIAL_DOMAIN``.
    items : iterable
        Link objects exposing ``has_attr('href')`` and ``attrs['href']``, such
        as the tags returned by ``soup.find_all('a')``.

    ``str()``, ``len()`` and iteration each re-scan ``items`` first, so they
    always reflect the current content of ``items``.
    """

    # Extensions (lower-case, without the dot) of paths that are skipped.
    skipped_extensions = ('xml',)

    def __init__(self, url, items):
        self.url = url
        self.items = items
        # Per-instance list: a class-level list would be shared (and grown)
        # by every LinkIterator instance.
        self.urls_to_visit = []

    def __str__(self):
        """Return the string form of the set of URLs to visit."""
        self.get_links()
        return str(set(self.urls_to_visit))

    def __iter__(self):
        """Yield each unique URL once, in the order it first appears in ``items``."""
        self.get_links()
        for link in self.urls_to_visit:
            yield link

    def __len__(self):
        """Return the number of unique URLs to visit."""
        self.get_links()
        return len(self.urls_to_visit)

    def get_links(self):
        """Rebuild ``self.urls_to_visit`` from ``self.items``.

        The list is recomputed from scratch on every call, so repeated calls
        never accumulate duplicates. A link is kept unless it:

        1. has no ``href`` attribute,
        2. is a pure anchor (``#...``),
        3. points to a different network location than ``INITIAL_DOMAIN``,
        4. is exactly ``INITIAL_DOMAIN`` itself, or
        5. has a path whose last segment ends in a skipped extension.
        """
        # A dict keeps first-seen order while dropping duplicates.
        unique_urls = {}

        for link in self.items:
            # 1. Skip links with no "href"
            if not link.has_attr('href'):
                continue

            url = link.attrs['href']

            # 2. Skip anchors
            if url.startswith('#'):
                continue

            # Rebuild an absolute URL when the href is a path
            if url.startswith('/'):
                url = urljoin(INITIAL_DOMAIN.geturl(), url)

            # 3. Skip if the URL has a different domain
            url_object = urlparse(url)
            if url_object.netloc != INITIAL_DOMAIN.netloc:
                continue

            # 4. Skip if the URL is the same as the initial one
            if url_object == INITIAL_DOMAIN:
                continue

            # 5. Skip unwanted extensions. Only the last path segment is
            # inspected, and rpartition tolerates any number of dots
            # (e.g. "report.v2.final").
            last_segment = url_object.path.rsplit('/', 1)[-1]
            _, dot, extension = last_segment.rpartition('.')
            if dot and extension.lower() in self.skipped_extensions:
                continue

            unique_urls[url_object.geturl()] = None

        self.urls_to_visit = list(unique_urls)


# Pass the URL of the page the links were extracted from, not an unrelated
# placeholder, so that `iterator.url` describes the crawled page.
iterator = LinkIterator(INITIAL_DOMAIN.geturl(), links)

{'https://www.accenture.com/us-en/insights/cloud/edge-computing-index', 'https://www.accenture.com/us-en/support/accessibility-statement', 'https://www.accenture.com/hu-en', 'https://www.accenture.com/za-en', 'https://www.accenture.com/us-en/careers/local/military-veterans', 'https://www.accenture.com/lu-en', 'https://www.accenture.com/ch-en', 'https://www.accenture.com/us-en/industries/public-service-index', 'https://www.accenture.com/it-it', 'https://www.accenture.com/us-en/case-studiesnew/song/signet-jewelers-virtual-shopping-experience', 'https://www.accenture.com/us-en/careers/explore-careers/area-of-interest/technology-careers', 'https://www.accenture.com/us-en/about/location-index', 'https://www.accenture.com/content/acom/us-en/about/contact-us.html', 'https://www.accenture.com/cl-es', 'https://www.accenture.com/us-en/insights/5g-index', 'https://www.accenture.com/us-en/insights/voices', 'https://www.accenture.com/us-en/careers/explore-careers/area-of-interest/journey-to-accentu