In [None]:
# download libraries
!pip install --upgrade pip
!pip install bs4
!pip install pandas
!pip install requests
!pip install numpy
!pip install urllib3
!pip install phonenumbers
!pip install validate_email

In [None]:
# scraping multiple timestamps of govt list of pcr providers (obtaining company name, website, number & email)
import re
from pathlib import Path
from urllib.parse import urlparse

from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import phonenumbers
import requests

LIVE_URL = 'https://www.find-travel-test-provider.service.gov.uk/test-type/amber'
WAYBACK_ROOT = 'https://web.archive.org/web/'

# wayback machine snapshot ids of the provider list, scraped in addition to the live page
snapshot_ids = [
    '20210519121145', '20210520140910', '20210526072031', '20210529103556',
    '20210531150239', '20210604100539', '20210618121419', '20210626230824',
    '20210629104343', '20210701142411', '20210708123655', '20210713165043',
    '20210729180818', '20210808132243', '20210811073050', '20210812113842',
    '20210817114915', '20210818174329', '20210823032256', '20210824065330',
]
urls = [LIVE_URL] + [WAYBACK_ROOT + snapshot_id + '/' + LIVE_URL for snapshot_id in snapshot_ids]

DETAIL_COLUMNS = ['company_name', 'company_link', 'company_number', 'company_email']
REQUEST_TIMEOUT = 30  # seconds; requests waits indefinitely when no timeout is given

# prefix the wayback machine puts in front of archived hrefs (absolute,
# protocol-relative or site-relative), e.g. https://web.archive.org/web/<id>/
WAYBACK_PREFIX = re.compile(r'^(?:(?:https?:)?//web\.archive\.org)?/web/\d+[a-z_]*/')


def clean_link(href):
    """Return the lower-cased host of a provider href, ignoring any wayback machine prefix.

    Falsy input (missing or empty href) is returned unchanged.
    """
    if not href:
        return href
    original_href = WAYBACK_PREFIX.sub('', href)
    return urlparse(original_href.lower()).netloc


def clean_number(raw):
    """Return the number in international format, or NaN when it is missing or unparseable.

    Numbers without a country code are interpreted as GB numbers.
    """
    if not raw or len(raw) < 10:
        return np.nan
    number = ' '.join(raw.split())
    try:
        parsed = phonenumbers.parse(number, 'GB')
    except phonenumbers.NumberParseException:
        # free text in the phone cell is treated like a missing number
        return np.nan
    return phonenumbers.format_number(parsed, phonenumbers.PhoneNumberFormat.INTERNATIONAL)


def parse_provider_row(row):
    """Build one provider record from a table row, or None when the row has no provider link."""
    # test provider saved in cell with id 'provider'
    provider_cell = row.find(id='provider')
    provider = provider_cell.find('a') if provider_cell else None
    if provider is None:
        return None

    # test provider number and email saved in only cell(s) with no id
    contact = row.find_all('td', id=None)
    # old format stores number and email in separate cells
    # new format stores number and email in same cell, each in its own link
    if len(contact) == 1:
        contact = contact[0].find_all('a')
    raw_number = str(contact[0].get_text()) if len(contact) > 0 else ''
    raw_email = str(contact[1].get_text()) if len(contact) > 1 else ''

    # apply standard format to numbers, emails and links
    return {
        'company_name': provider.get_text().rstrip().lower(),
        'company_link': clean_link(provider.get('href', '')),
        'company_number': clean_number(raw_number),
        'company_email': raw_email.strip().lower(),
    }


records = []
for url in urls:
    print(url)
    # scrape gvmt test provider site table
    try:
        page = requests.get(url, timeout=REQUEST_TIMEOUT)
        page.raise_for_status()
    except requests.RequestException as error:
        # one unreachable snapshot should not throw away the others
        print('skipping', url, '-', error)
        continue

    soup = BeautifulSoup(page.text, 'html.parser')
    # attrs must be a dict mapping attribute name to value
    table = soup.find('table', {'class': 'govuk-table'})
    table_body = table.find('tbody') if table else None
    if table_body is None:
        print('skipping', url, '- provider table not found')
        continue

    for row in table_body.find_all('tr'):
        record = parse_provider_row(row)
        if record is not None:
            records.append(record)

# the same provider shows up in most snapshots, so keep each distinct record once
provider_details = (
    pd.DataFrame(records, columns=DETAIL_COLUMNS)
    .drop_duplicates()
    .reset_index(drop=True)
)

Path('datasets').mkdir(exist_ok=True)
provider_details.to_csv('datasets/provider_details.csv')

In [64]:
# collapse each company's distinct numbers, emails and links into a single wide row
provider_details = pd.read_csv('datasets/provider_details.csv')
grouped_details = pd.DataFrame(columns=['company_name'])

for category in ('company_number', 'company_email', 'company_link'):
    distinct_pairs = provider_details[['company_name', category]].drop_duplicates()

    # running position of each value within its company; becomes the column suffix
    slot = distinct_pairs.groupby('company_name').cumcount()

    by_company_slot = distinct_pairs.set_index(['company_name', slot])[category]
    wide = by_company_slot.unstack(fill_value='')
    wide = wide.add_prefix(category + '_')
    wide = wide.reset_index()

    grouped_details = pd.merge(grouped_details, wide, how='outer', on=['company_name'])

grouped_details.to_csv('datasets/grouped_details.csv')

In [65]:
import phonenumbers

# load the grouped provider table and pick out its phone number columns for validation
grouped_details = pd.read_csv('datasets/grouped_details.csv')
phone_cols = [column for column in grouped_details.columns if 'company_number' in column]

def check_number(x):
    """Validate every phone number in one column of the grouped provider table.

    Args:
        x: iterable of cell values (a DataFrame column when called through
            ``DataFrame.apply``).

    Returns:
        A list aligned with ``x``: ``''`` for cells that are missing or
        shorter than 9 characters, ``False`` for text that cannot be parsed
        as a phone number, otherwise the result of
        ``phonenumbers.is_valid_number`` (numbers are parsed with ``'GB'`` as
        the default region).
    """
    results = []
    for cell in x:
        if pd.isna(cell) or len(str(cell)) < 9:
            results.append('')
            continue
        try:
            # cells read back from CSV are not guaranteed to be str
            number = phonenumbers.parse(str(cell), 'GB')
        except phonenumbers.NumberParseException:
            # unparseable text is an invalid number, not a reason to abort the run
            results.append(False)
            continue
        results.append(phonenumbers.is_valid_number(number))
    return results

# validate column by column, then label every row with its company before saving
phone_details = grouped_details.loc[:, phone_cols]
phone_validated = phone_details.apply(check_number, axis=0)
phone_validated['company_name'] = grouped_details['company_name']
phone_validated.to_csv('datasets/phone_validation.csv')

In [None]:
from validate_email import validate_email

# load the grouped provider table and pick out its email columns for validation
grouped_details = pd.read_csv('datasets/grouped_details.csv')
email_cols = [column for column in grouped_details.columns if 'company_email' in column]

def check_email(x):
    """Validate every email address in one column of the grouped provider table.

    Args:
        x: iterable of cell values (a DataFrame column when called through
            ``DataFrame.apply``).

    Returns:
        A list aligned with ``x``: ``''`` for missing or empty cells,
        otherwise whatever ``validate_email`` returns for the address.
    """
    results = []
    for cell in x:
        # NaN is truthy, so missing cells have to be tested for explicitly
        if pd.isna(cell) or not cell:
            results.append('')
        else:
            results.append(validate_email(email=str(cell)))
    return results

# validate column by column, then label every row with its company before saving
email_details = grouped_details.loc[:, email_cols]
email_validated = email_details.apply(check_email, axis=0)
email_validated['company_name'] = grouped_details['company_name']
email_validated.to_csv('datasets/email_validation.csv')


In [None]:
# obtaining trustpilot links: load the grouped provider table and pick out its website columns
grouped_details = pd.read_csv('datasets/grouped_details.csv')
link_cols = [column for column in grouped_details.columns if 'company_link' in column]

def check_trustpilot(x):
    """Fetch the Trustpilot score text for every website in one column.

    Args:
        x: iterable of cell values (a DataFrame column when called through
            ``DataFrame.apply``); each non-empty cell is appended to the
            Trustpilot review URL.

    Returns:
        A list aligned with ``x``: the text of the page's
        ``<p class="header_trustscore">`` element, or ``''`` when the cell is
        missing, the request fails, or the page has no such element.
    """
    results = []
    for cell in x:
        if pd.isna(cell) or not cell:
            results.append('')
            continue

        print(cell)
        try:
            page = requests.get("https://uk.trustpilot.com/review/" + str(cell), timeout=30)
        except requests.RequestException:
            # a lookup that cannot be completed is recorded as "no score"
            results.append('')
            continue

        soup = BeautifulSoup(page.content, "html.parser")
        # attrs must be a dict mapping attribute name to value
        score = soup.find('p', {'class': 'header_trustscore'})
        results.append(score.get_text() if score else '')

    return results


# look up scores column by column, then label every row with its company before saving
link_details = grouped_details.loc[:, link_cols]
trustpilot_scores = link_details.apply(check_trustpilot, axis=0)
trustpilot_scores['company_name'] = grouped_details['company_name']
trustpilot_scores.to_csv('datasets/trustpilot_scores.csv')

In [43]:
# match registered CH companies
import os

import requests
from bs4 import BeautifulSoup
import pandas as pd
from fuzzywuzzy.fuzz import partial_ratio

CH_SEARCH_URL = 'https://api.company-information.service.gov.uk/search'
MATCH_THRESHOLD = 80  # lowest partial_ratio score accepted as a match

# the key must not be committed with the notebook: read it from the environment
api_key = os.environ.get('COMPANIES_HOUSE_API_KEY')
if not api_key:
    raise RuntimeError('set the COMPANIES_HOUSE_API_KEY environment variable before running this cell')

grouped_details = pd.read_csv('datasets/grouped_details.csv')


def find_registered_name(company):
    """Return the lower-cased title of the top Companies House search hit for ``company``.

    Returns ``''`` when the name is missing, the request fails, the search has
    no results, or the top hit scores below ``MATCH_THRESHOLD`` against the
    scraped name.
    """
    if pd.isna(company):
        return ''

    company = str(company).lower()
    try:
        response = requests.get(
            CH_SEARCH_URL,
            # let requests escape the query instead of gluing the words together by hand
            params={'q': company, 'items_per_page': 1},
            auth=(api_key, ''),
            timeout=30,
        )
    except requests.RequestException as error:
        print('lookup failed for', company, '-', error)
        return ''

    if not response.ok:
        # report rejected requests so that an all-blank result is not mistaken for "no matches"
        print('lookup failed for', company, '-', response)
        return ''

    # a single hit is enough; only one item is requested per search
    items = response.json().get('items') or []
    if not items:
        return ''

    company_found = items[0]['title'].lower()
    if partial_ratio(company_found, company) >= MATCH_THRESHOLD:
        return company_found
    return ''


matched_companies = pd.DataFrame({
    'company_name': grouped_details['company_name'],
    'registered_name': [find_registered_name(company) for company in grouped_details['company_name']],
})
matched_companies.to_csv('./datasets/ch_matched.csv')


<Response [403]>
<Response [403]>
<Response [403]>
<Response [403]>
<Response [403]>
<Response [403]>
<Response [403]>
<Response [403]>
<Response [403]>
<Response [403]>
<Response [403]>
<Response [403]>
<Response [403]>
<Response [403]>
<Response [403]>
<Response [403]>
<Response [403]>
<Response [403]>
<Response [403]>
<Response [403]>
<Response [403]>
<Response [403]>
<Response [403]>
<Response [403]>
<Response [403]>
<Response [403]>
<Response [403]>
<Response [403]>
<Response [403]>
<Response [403]>
<Response [403]>
<Response [403]>
<Response [403]>
<Response [403]>
<Response [403]>
<Response [403]>
<Response [403]>
<Response [403]>
<Response [403]>
<Response [403]>
<Response [403]>
<Response [403]>
<Response [403]>
<Response [403]>
<Response [403]>
<Response [403]>
<Response [403]>
<Response [403]>
<Response [403]>
<Response [403]>
<Response [403]>
<Response [403]>
<Response [403]>
<Response [403]>
<Response [403]>
<Response [403]>
<Response [403]>
<Response [403]>
<Response [403

KeyboardInterrupt: 