In [18]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import jellyfish
import random
import re

In [11]:
# Relative path (resolved against the kernel's working directory) of the CSV
# file that lists the companies to look up.
companies_path = 'assignment/companies-house-web-scraper/companies.csv'

In [3]:
# import csv
# companies = []
# with open(companies_path) as companies_file:
#     companies_reader = csv.DictReader(companies_file)
#     for row in companies_reader:
#         companies.append(dict(row))

In [None]:
# Read the company list into a DataFrame.
companies = pd.read_csv(companies_path)
# Leftover exploration: select the rows whose 'city' and 'officer' are both empty strings.
#companies.loc[(companies['city'] == '') & (companies['officer'] == '')]
# Last expression of the cell, so the notebook renders the first row as a preview.
companies.iloc[0]

In [12]:
def chooseHeaderRandomly():
    """Build HTTP request headers with a randomly picked browser User-Agent.

    Returns:
        dict: a one-entry mapping ``{'User-Agent': <string>}`` whose value is
        selected with ``random.choice`` from the fixed pool defined below.
    """
    # Chrome on Windows / Linux.
    chrome_agents = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
        'Mozilla/5.0 (Windows NT 5.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
    ]
    # Internet Explorer (MSIE / Trident) strings.
    explorer_agents = [
        'Mozilla/4.0 (compatible; MSIE 9.0; Windows NT 6.1)',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
        'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)',
        'Mozilla/5.0 (Windows NT 6.1; Trident/7.0; rv:11.0) like Gecko',
        'Mozilla/5.0 (Windows NT 6.2; WOW64; Trident/7.0; rv:11.0) like Gecko',
        'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko',
        'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0; Trident/5.0)',
        'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
        'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)',
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64; Trident/7.0; rv:11.0) like Gecko',
        'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)',
        'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)',
        'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729)',
    ]
    # Same ordering as one flat list: Chrome entries first, then the IE ones.
    agent_pool = chrome_agents + explorer_agents
    return {'User-Agent': random.choice(agent_pool)}

In [None]:
def search(company, name_threshold=2.0 / 3, officer_threshold=0.75, timeout=30):
    """Search Companies House for ``company`` and score every company-type hit.

    Each search result is compared with the given record on three criteria:
    company name (Jaro similarity), city (found in the result's address) and
    officer (Jaro similarity against the names returned by ``get_officers``).

    Args:
        company: Mapping-like record (dict or pandas row) with the keys
            'name', 'city' (alternatives separated by ' / ') and 'officer'.
        name_threshold: Similarity above which the result name counts as matched.
        officer_threshold: Similarity above which an officer counts as matched.
        timeout: Seconds to wait for the search request before giving up.

    Returns:
        list[dict]: One entry per search result, in page order, with the keys
        'company_number', 'name', 'address', 'officer', 'name_matched',
        'city_matched', 'officer_matched' and 'score'. 'officer' holds the
        matching officer, or all officers joined by '||' when none matched.
        The list is empty when the record has no usable name or the page
        contains no results list.

    Raises:
        requests.HTTPError: If the search page answers with an error status.
    """
    def as_text(value):
        # pandas delivers empty CSV cells as float NaN; treat anything that is
        # not a string as blank instead of failing on .split() / .lower().
        return value.strip() if isinstance(value, str) else ''

    def squash(text):
        # Collapse the newlines/indentation that surround text inside HTML tags.
        return ' '.join(text.split())

    # Newer jellyfish releases expose this metric as jaro_similarity; fall back
    # to the older jaro_distance name so either version works.
    jaro = getattr(jellyfish, 'jaro_similarity', None) or jellyfish.jaro_distance

    target_name = as_text(company['name'])
    if not target_name:
        return []
    target_officer = as_text(company['officer']).lower()
    # Blank entries are dropped: an empty pattern would match every address.
    target_cities = [city.strip()
                     for city in as_text(company['city']).split(' / ')
                     if city.strip()]

    # Step 1: Search for companies using companies house website
    # https://beta.companieshouse.gov.uk/search?q=
    # `params` URL-encodes the name, so characters such as '&' stay in the query.
    response = requests.get(
        'https://beta.companieshouse.gov.uk/search/companies',
        params={'q': target_name},
        headers=chooseHeaderRandomly(),
        timeout=timeout,
    )
    response.raise_for_status()
    html = BeautifulSoup(response.text, "lxml")

    results_list = html.find('ul', id="results")
    if results_list is None:
        # No results list on the page: nothing to score.
        return []

    candidates = []
    # Step 2: Check how well each search result matches the company information in the given file.
    for search_result in results_list.find_all('li', 'type-company'):
        link = search_result.find('a')
        if link is None:
            continue

        # Criterion 1 (company name):
        # Jaro similarity measures how close the result name is to the name
        # searched for; the higher the value, the more similar the strings.
        # Source: https://rosettacode.org/wiki/Jaro_distance
        # Compared case-insensitively, like the officer names below.
        result_name = squash(link.text)
        name_similarity = jaro(result_name.lower(), target_name.lower())
        name_matched = name_similarity > name_threshold
        score = name_similarity

        p_list = search_result.find_all('p')
        # Get Company Number
        number_tag = p_list[0].find('strong') if p_list else None
        company_number = number_tag.text.strip() if number_tag is not None else None

        # Criterion 2 (city):
        result_address = p_list[1].text if len(p_list) > 1 else ''
        # re.escape keeps characters such as '.' or '(' in a city name literal.
        city_matched = any(
            re.search(re.escape(city), result_address, re.IGNORECASE)
            for city in target_cities
        )
        if city_matched:
            score += 1

        # Criterion 3 (officer):
        officers = get_officers(company_number) if company_number else []
        officer_matched = False
        result_officer = '||'.join(officers)
        if target_officer:
            for officer in officers:
                # Drop apostrophes/hyphens and reverse the ', '-separated
                # parts (e.g. "SURNAME, Forename" -> "Forename SURNAME").
                no_hyphen_officer = ' '.join(
                    reversed(officer.replace("'", '').replace('-', '').split(', ')))
                officer_name_similarity = jaro(no_hyphen_officer.lower(), target_officer)
                if officer_name_similarity > officer_threshold:
                    officer_matched = True
                    result_officer = officer
                    score += officer_name_similarity
                    break

        candidates.append({
            'company_number': company_number,
            'name': result_name,
            'address': result_address,
            'officer': result_officer,
            'name_matched': name_matched,
            'city_matched': city_matched,
            'officer_matched': officer_matched,
            'score': score,
        })

    return candidates

def get_officers(company_number, timeout=30):
    """Fetch the names listed on a company's Companies House officers page.

    Args:
        company_number: Company number inserted into the officers page URL.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        list[str]: Stripped text of every link inside the page's
        'appointments-list' block (assumed to be the officer names). Empty
        when no company number is given or the block is not on the page.
    """
    if not company_number:
        # Without a number the URL would be '/company/None/officers'.
        return []

    response = requests.get(
        'https://beta.companieshouse.gov.uk/company/%s/officers' % company_number,
        headers=chooseHeaderRandomly(),
        timeout=timeout,
    )
    # Best effort on purpose: an error page simply lacks the appointments
    # block and therefore yields an empty list rather than an exception.
    html = BeautifulSoup(response.text, 'lxml')
    appointments = html.find('div', 'appointments-list')
    if appointments is None:
        return []

    # Strip the whitespace surrounding link text and skip empty links.
    names = (link.text.strip() for link in appointments.find_all('a'))
    return [name for name in names if name]
    
# Try the search on the first company in the table.
search(companies.iloc[0])