In [40]:
# Scraping multiple timestamps of the government list of PCR test providers (obtaining company name, website, number & email)

from bs4 import BeautifulSoup
import pandas as pd
import requests
from urllib.parse import urlparse
import phonenumbers
import numpy as np

# The live provider listing; every archived snapshot below wraps this address.
_live_listing_url = 'https://www.find-travel-test-provider.service.gov.uk/test-type/amber'

# Wayback Machine capture timestamps (YYYYMMDDhhmmss) of the listing, oldest first.
_snapshot_timestamps = [
    '20210519121145',
    '20210520140910',
    '20210526072031',
    '20210529103556',
    '20210531150239',
    '20210604100539',
    '20210618121419',
    '20210626230824',
    '20210629104343',
    '20210701142411',
    '20210708123655',
    '20210713165043',
    '20210729180818',
    '20210808132243',
    '20210811073050',
    '20210812113842',
    '20210817114915',
    '20210818174329',
    '20210823032256',
    '20210824065330',
]

# Pages to scrape: the live listing first, then one archived copy per timestamp.
urls = [_live_listing_url]
urls.extend(
    'https://web.archive.org/web/' + stamp + '/' + _live_listing_url
    for stamp in _snapshot_timestamps
)

# Columns shared by the per-page frame and the accumulated result.
PROVIDER_COLUMNS = ['company_name', 'company_link', 'company_number', 'company_email']

# Length of 'https://web.archive.org/web/<14-digit timestamp>/', used as a
# fallback when the embedded original URL cannot be located by searching.
WAYBACK_PREFIX_LENGTH = 43

# Seconds to wait for a page before giving up instead of hanging forever.
REQUEST_TIMEOUT_SECONDS = 30


def _strip_wayback_prefix(link):
    """Return ``link`` with a leading Wayback Machine prefix removed, if present."""
    if 'web.archive.org' not in link:
        return link
    # The original URL is embedded after the snapshot timestamp; searching
    # from index 1 skips the scheme of the archive URL itself.
    embedded_start = link.find('http', 1)
    if embedded_start != -1:
        return link[embedded_start:]
    return link[WAYBACK_PREFIX_LENGTH:]


def _format_gb_number(number):
    """Return ``number`` in international format, or NaN if it is unusable.

    Text that is empty, shorter than 10 characters, or that ``phonenumbers``
    cannot parse as a GB number yields ``np.nan``.
    """
    if not number or len(number) < 10:
        return np.nan
    number = ' '.join(number.rstrip().split())
    try:
        parsed = phonenumbers.parse(number, 'GB')
    except phonenumbers.NumberParseException:
        # One malformed cell must not abort the whole scrape.
        return np.nan
    return phonenumbers.format_number(parsed, phonenumbers.PhoneNumberFormat.INTERNATIONAL)


def _parse_provider_row(row):
    """Extract name, website host, phone number and email from one table row."""
    provider = row.find(id='provider').find('a')
    name = provider.get_text().rstrip().lower()
    link = urlparse(_strip_wayback_prefix(provider['href'])).netloc.lower()

    cells_without_id = row.find_all('td', id=None)
    if len(cells_without_id) > 1:
        # Layout with separate cells for phone number and email.
        number = cells_without_id[0].get_text()
        email = cells_without_id[1].get_text()
    else:
        # Layout with both contact details as links inside a single cell.
        links_in_cell = cells_without_id[0].find_all('a')
        number = links_in_cell[0].get_text()
        email = links_in_cell[1].get_text()

    return {
        'company_name': name,
        'company_link': link,
        'company_number': _format_gb_number(number),
        # Strip surrounding whitespace so the same address always compares equal.
        'company_email': email.strip().lower(),
    }


provider_details = pd.DataFrame(columns=PROVIDER_COLUMNS)

for url in urls:
    print(url)
    page = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
    # Fail loudly on HTTP errors rather than parsing an error page.
    page.raise_for_status()
    soup = BeautifulSoup(page.text, 'html.parser')
    table = soup.find('table', class_='govuk-table')
    providers_table = table.find('tbody') if table is not None else None
    if providers_table is None:
        raise ValueError('No provider table found at ' + url)

    # Collect rows in a list and build the frame once: DataFrame.append was
    # removed in pandas 2.0 and copied the whole frame on every call.
    rows = [_parse_provider_row(row) for row in providers_table.find_all('tr')]

    # object dtype keeps the merge keys compatible with the (initially empty)
    # accumulator; dropping duplicates stops the all-column outer merge from
    # multiplying rows that occur more than once on a page.
    url_provider_details = (
        pd.DataFrame(rows, columns=PROVIDER_COLUMNS)
        .astype(object)
        .drop_duplicates()
    )

    provider_details = pd.merge(provider_details, url_provider_details, how="outer", on=PROVIDER_COLUMNS)
    

https://www.find-travel-test-provider.service.gov.uk/test-type/amber
https://web.archive.org/web/20210519121145/https://www.find-travel-test-provider.service.gov.uk/test-type/amber
https://web.archive.org/web/20210520140910/https://www.find-travel-test-provider.service.gov.uk/test-type/amber
https://web.archive.org/web/20210526072031/https://www.find-travel-test-provider.service.gov.uk/test-type/amber
https://web.archive.org/web/20210529103556/https://www.find-travel-test-provider.service.gov.uk/test-type/amber
https://web.archive.org/web/20210531150239/https://www.find-travel-test-provider.service.gov.uk/test-type/amber
https://web.archive.org/web/20210604100539/https://www.find-travel-test-provider.service.gov.uk/test-type/amber
https://web.archive.org/web/20210618121419/https://www.find-travel-test-provider.service.gov.uk/test-type/amber
https://web.archive.org/web/20210626230824/https://www.find-travel-test-provider.service.gov.uk/test-type/amber
https://web.archive.org/web/2021062

In [41]:
# Collapse the scraped rows into one row per company: for each contact field,
# the distinct values seen for a company are spread across numbered columns
# (<field>_0, <field>_1, ...), written to their own CSV and merged together.
overall_df = pd.DataFrame(columns=['company_name'])
for category in ['company_number', 'company_email', 'company_link']:
    # Distinct (company, value) pairs for this field.
    provider_numbers = provider_details[['company_name', category]].drop_duplicates()

    # Number each value within its company so it can become a column label.
    occurrence = provider_numbers.groupby('company_name').cumcount()
    indexed_values = provider_numbers.set_index(['company_name', occurrence])[category]

    # Pivot the per-company counter into columns; absent slots become ''.
    wide_values = indexed_values.unstack(fill_value='')
    df1 = wide_values.add_prefix(category + '_').reset_index()

    df1.to_csv('provider_details_' + category + '.csv')
    overall_df = pd.merge(overall_df, df1, how='outer', on=['company_name'])

overall_df.to_csv('overall_df.csv')



In [42]:
# validating phone numbers
from phonenumbers import carrier, timezone, geocoder


def _phone_validity(cell):
    """Return '' for a blank cell, otherwise whether it is a valid phone number.

    Cells that are NaN or shorter than 9 characters count as blank. Text that
    cannot be parsed as a phone number (default region GB) is reported as
    ``False`` instead of raising.
    """
    if pd.isna(cell) or len(str(cell)) < 9:
        return ''
    try:
        parsed = phonenumbers.parse(str(cell), 'GB')
    except phonenumbers.NumberParseException:
        return False
    return phonenumbers.is_valid_number(parsed)


# Match on the prefix, not a substring: the 'is_valid_company_number_*'
# columns added below would otherwise be picked up when this cell is re-run.
phone_cols = [x for x in overall_df if x.startswith('company_number')]
print(phone_cols)

# NOTE: no copy is taken, so validated_df and overall_df are the same object
# and the validity columns added here also appear in overall_df.
validated_df = overall_df
for column in phone_cols:
    results = [_phone_validity(cell) for cell in overall_df[column]]
    validated_df['is_valid_'+column] = results

validated_df.to_csv('validated_df.csv')


# overall_df['Number Validity'] = results

# print(results.count(True))
# print(results.count(False))
# print(results.count('N/A'))

['company_number_0', 'company_number_1', 'company_number_2', 'company_number_3']


In [62]:
# validating emails
from validate_email import validate_email

# Match on the prefix, not a substring: the 'is_valid_company_email_*'
# columns added below would otherwise be picked up when this cell is re-run.
email_cols = [x for x in overall_df if x.startswith('company_email')]

# NOTE: no copy is taken, so validated_df2 and overall_df are the same object
# and the validity columns added here also appear in overall_df.
validated_df2 = overall_df
for column in email_cols:
    results = []
    for cell in overall_df[column]:
        # Treat NaN like an empty string, matching the other validation
        # cells; NaN is truthy, so `not cell` alone would let it through.
        if pd.isna(cell) or not cell:
            results.append('')
        else:
            results.append(validate_email(email_address=cell))

    validated_df2['is_valid_'+column] = results

validated_df2.to_csv('validated_df2.csv')


In [None]:
# obtaining trustpilot links

# Seconds to wait for Trustpilot before giving up instead of hanging forever.
TRUSTPILOT_TIMEOUT_SECONDS = 30


def _trustpilot_score(domain):
    """Return the trust score text from ``domain``'s Trustpilot review page.

    Returns '' when the request fails or the page has no
    ``<p class="header_trustscore">`` element (for example because the
    company has no Trustpilot page).
    """
    try:
        page = requests.get(
            "https://uk.trustpilot.com/review/" + domain,
            timeout=TRUSTPILOT_TIMEOUT_SECONDS,
        )
    except requests.RequestException as error:
        # Report and carry on so one unreachable page does not lose the run.
        print('Trustpilot request failed for ' + domain + ': ' + str(error))
        return ''
    soup = BeautifulSoup(page.content, "html.parser")
    score_tag = soup.find('p', class_='header_trustscore')
    if score_tag is None:
        return ''
    return score_tag.get_text()


# Match on the prefix, not a substring: the 'trustpilot_company_link_*'
# columns added below would otherwise be scraped as domains on a re-run.
link_cols = [x for x in overall_df if x.startswith('company_link')]

# NOTE: no copy is taken, so trustpilot_df and overall_df are the same object
# and the score columns added here also appear in overall_df.
trustpilot_df = overall_df
for column in link_cols:
    results = [
        '' if pd.isna(cell) or not cell else _trustpilot_score(cell)
        for cell in overall_df[column]
    ]
    trustpilot_df['trustpilot_'+column] = results

trustpilot_df.to_csv('trustpilot_df.csv')

# Shows the scores gathered for the last link column only.
print(results)