# Purpose

Upload.

# Import dependencies

In [2]:
import pandas as pd
import numpy as np
import os
from fuzzywuzzy import fuzz, process
from rapidfuzz import process as faster_process
from datetime import datetime, timedelta
from geopy.geocoders import Nominatim
from geopy.distance import geodesic
import requests
import chwrapper
import json
import ast



# Load data

In [19]:
def load_grid_data(cleaning=False):
    
    os.chdir(os.getcwd() + '\\grid')
    
    institutes_df = pd.read_csv('grid.csv').rename(columns={'ID': 'grid_id'})
    type_df = pd.read_csv('types.csv')
    address_df = pd.read_csv('addresses.csv', usecols=['grid_id', 'lat', 'lng', 'city'])
    #acronyms_df = pd.read_csv('acronyms.csv')
    
    # Filter for only UK companies
    uk_institutes_df = institutes_df[institutes_df['Country'] == 'United Kingdom']
    
    # Join data frames
    merged_df = pd.merge(uk_institutes_df, type_df, on='grid_id', how='left')
    merged_df = pd.merge(merged_df, address_df, on='grid_id', how='left')
    
    if cleaning:
        print(merged_df.shape[0])
        merged_df = pd.merge(merged_df, pd.read_csv('institutes.csv'), on='grid_id', how='left')
        print(merged_df.shape[0])
        return merged_df
    
    return merged_df

In [3]:
def load_scopus_affils():
    os.chdir('..')
    df = pd.read_csv('scopus_uk_affils.csv').drop(columns=['Unnamed: 0'])
    return df

In [4]:
grid_df = load_grid_data()
scopus_affils_df = load_scopus_affils()

# Prepare Scopus and GRID data for matching

In [7]:
# If GRID name includes (United Kingdom), remove it
uk = '(United Kingdom)'
grid_df['formatted_name'] = grid_df['Name'].apply(lambda x: x.replace(uk, '') if uk in x else x)

# fuzzywuzzy

Fuzzywuzzy is the pure Python implementation whereas rapidfuzz is implemented in C++ for significant speed improvements. Fuzzywuzzy has four main functionalities: `partial_ratio`, `ratio`, `token_sort_ratio` and `token_set_ratio`. Rapidfuzz has this functionality and more!

In [8]:
str_1 = 'Howdie partner, how are you today?'
str_2 = 'how are you today partner?'
print(fuzz.partial_ratio(str_1, str_2))
print(fuzz.ratio(str_1, str_2))

# Token methods pre-process the strings by converting to lower case, removing punctuation
# and tokenising. Next, the tokens are sorted alphabetically and then joined together.
# fuzz.ratio() is then called on the resulting string.
print(fuzz.token_sort_ratio(str_1, str_2))

# Takes a set of the intersection and then makes pairwise comparisons using fuzz.ratio()
# The logic here is that ____
print(fuzz.token_set_ratio(str_1, str_2))

82
60
88
100


In [9]:
str_3 = 'Are you ok today partner?'
str_4 = 'Good morning, how are you?'
print(f'Original phrase: {str_1}')
print(process.extract(str_1, [str_2, str_3, str_4]))

Original phrase: Howdie partner, how are you today?
[('how are you today partner?', 95), ('Are you ok today partner?', 88), ('Good morning, how are you?', 60)]


# Match Scopus against GRID

__Scopus data:__

There are 48060 affiliations. 45132 of those do not have an affiliation type associated with it.

Of the 45132 affiliations without an affiliation type: 

* 11117 of them do not have an address or post code associated with them from Scopus
* 5426 of them have an address but not a post code
* 2652 of them have a post code but no address

__What data do we have to match?__

* Scopus `affil_name` against GRID `formatted_name`
* Scopus `affil_name` agaisnt Companies House `CompanyName` and `PreviousName_<number>.CompanyName`
* Scopus `post_code` against GRID 
* Scopus `post_code` against Companies House `RegAddress.PostCode`

In [13]:
# Add the method for finding affil_type
# Confidence not used anymore but I don't want to misalign any columns so keeping it
add_type_source = lambda x: 'scopus' if pd.notnull(x) else np.nan
scopus_confidence = lambda x: 10 if x == 'scopus' else 0
scopus_affils_df['type_source'] = scopus_affils_df['affil_type'].apply(add_type_source)
scopus_affils_df['type_confidence'] = scopus_affils_df['type_source'].apply(scopus_confidence)

In [16]:
# How long will fuzzy matching take?
grid_l = grid_df.formatted_name.to_list()
fuzzy_match = lambda x: faster_process.extract(x, grid_l)
% timeit fuzzy_match('Newcastle University')

70.2 ms ± 6.19 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [18]:
0.07 * 48000 / 60

56.00000000000001

In [19]:
# Fuzzy match Scopus affil_name against GRID formatted_name - ROUGHLY ONE HOUR PROCESSING TIME
grid_l = grid_df.formatted_name.to_list()
fuzzy_match = lambda x: faster_process.extract(x, grid_l)
scopus_affils_df['grid_name_matches'] = scopus_affils_df.affil_name.apply(fuzzy_match)

In [48]:
# Find best scores and matches from fuzzy matching
find_best_score = lambda x: x[0][1]
find_best_match = lambda x: x[0][0]
scopus_affils_df['grid_name_best_score'] = scopus_affils_df['grid_name_matches'].apply(find_best_score)
scopus_affils_df['grid_name_best_match'] = scopus_affils_df['grid_name_matches'].apply(find_best_match)

In [23]:
scopus_affils_df[scopus_affils_df['grid_name_best_score'] > 90].shape[0]

4397

# Geocoding with geopy

In [25]:
geolocator = Nominatim(user_agent='dissertation_data_prep')

In [26]:
bath_location = geolocator.geocode('University of Bath')

In [27]:
print('Example information that can be found using geopy:')
for k, v in bath_location.raw.items():
    print(f'{k}:')
    print(v)

Example information that can be found using geopy:
place_id:
172212413
licence:
Data © OpenStreetMap contributors, ODbL 1.0. https://osm.org/copyright
osm_type:
way
osm_id:
345133857
boundingbox:
['51.3711087', '51.3822835', '-2.3356602', '-2.3168895']
lat:
51.37658545
lon:
-2.323423021360605
display_name:
University of Bath, All Saints Place, Claverton Down, Bath, Bath and North East Somerset, South West England, England, BA2 6DU, United Kingdom
class:
amenity
type:
university
importance:
0.7921407251596808
icon:
https://nominatim.openstreetmap.org/ui/mapicons//education_university.p.20.png


In [28]:
bath_ll = (bath_location.longitude, bath_location.latitude)
spa_location = geolocator.geocode('Bath Spa University')
spa_ll = (spa_location.longitude, spa_location.latitude)

In [29]:
print('You can even calculate the distance between two (lng, lat) tuples.')
geodesic(bath_ll, spa_ll).km

You can even calculate the distance between two (lng, lat) tuples.


12.749361491893458

# Geocoding with a database

This means that we don't have to worry about the rate limit on the geopy.

* 16543 post code is null
* 31517 post code is not null

In [30]:
def load_post_code_data():
    os.chdir(os.getcwd() + '\\uk_post_codes')
    df = pd.read_csv('ukpostcodes.csv')
    return df

post_code_df = load_post_code_data()

In [39]:
def geocode(x):
    
    # If there is a postcode in scopus df
    if pd.notnull(x):
        # Search for post code in post code df
        row = post_code_df[post_code_df['postcode'] == x]
        # If the post code is not in the post code df, return nan
        if row.shape[0] == 0:
            return np.nan
        else:
            return (row['latitude'].values[0], row['longitude'].values[0])
    # If no post code in scopus df > return nan
    else:
        return np.nan
    

scopus_affils_df['lat_lng'] = scopus_affils_df['post_code'].apply(geocode)

In [51]:
# Re-shuffle data frame to get lat_lng after post_code
columns = list(scopus_affils_df.columns)
columns.remove('lat_lng')
columns.insert(5, 'lat_lng')
scopus_affils_df = scopus_affils_df.reindex(columns=columns)
scopus_affils_df.head(1)

Unnamed: 0,af_id,affil_name,affil_city,affil_address,post_code,lat_lng,affil_type,type_source,type_confidence,grid_name_matches,grid_name_best_score,grid_name_best_match,grid_best_match_distance_km
0,60030480,University of Bath,Bath,Claverton Down,BA2 7AY,"(51.38044, -2.330673)",univ,scopus,10,"[(University of Bath, 100.0, 48), (Bath Spa Un...",100.0,University of Bath,


In [58]:
def find_best_grid_match_distance(df):
    
    df['grid_best_match_distance_km'] = np.nan
    
    for i, row in df.iterrows():
        
        #if row['type_source'] != 'scopus':
        if pd.notnull(row['lat_lng']):

            grid_match = row['grid_name_best_match']
            grid_row = grid_df[grid_df['formatted_name'] == f'{grid_match}']
            grid_lat_lng = (grid_row['lat'].values[0], grid_row['lng'].values[0])

            if pd.notnull(grid_lat_lng):
                try:
                    df.iloc[i, 12] = geodesic(row['lat_lng'], grid_lat_lng).km
                except:
                    print('Error found on index:', i)
                    print('Scopus row:\n', row['lat_lng'])
                    print('GRID row:\n', grid_lat_lng)
            else:
                print(grid_row)
                print('Scopus lat lng:', row['lat_lng'])
                print('GRID lat lng:', grid_lat_lng)
                df.iloc[i, 11] = np.nan

        else:
            df.iloc[i, 12] = np.nan
            
    return df


scopus_affils_df = find_best_grid_match_distance(scopus_affils_df)

Error found on index: 405
Scopus row:
 (54.530374, -1.563733)
GRID row:
 (nan, nan)
Error found on index: 479
Scopus row:
 (53.408494, -2.9675689999999997)
GRID row:
 (nan, nan)
Error found on index: 543
Scopus row:
 (52.231660999999995, 0.709176)
GRID row:
 (nan, nan)
Error found on index: 791
Scopus row:
 (51.403328, 0.02412)
GRID row:
 (nan, nan)
Error found on index: 818
Scopus row:
 (51.458611, -2.595196)
GRID row:
 (nan, nan)


  return cls(*args)


Error found on index: 974
Scopus row:
 (99.999999, 0.0)
GRID row:
 (53.588108, -0.666981)
Error found on index: 1124
Scopus row:
 (99.999999, 0.0)
GRID row:
 (53.992582999999996, -1.542993)
Error found on index: 1148
Scopus row:
 (99.999999, 0.0)
GRID row:
 (51.503360014460704, -0.0871449708938599)
Error found on index: 1228
Scopus row:
 (99.999999, 0.0)
GRID row:
 (50.931689, -1.784703)
Error found on index: 1369
Scopus row:
 (99.999999, 0.0)
GRID row:
 (53.324496999999994, -2.694013)
Error found on index: 1430
Scopus row:
 (51.494417999999996, -3.2264180000000002)
GRID row:
 (nan, nan)
Error found on index: 1953
Scopus row:
 (51.540636, -0.665563)
GRID row:
 (nan, nan)
Error found on index: 1999
Scopus row:
 (53.374345999999996, -1.5173219999999998)
GRID row:
 (nan, nan)
Error found on index: 2456
Scopus row:
 (51.369069, -0.781)
GRID row:
 (nan, nan)
Error found on index: 2568
Scopus row:
 (99.999999, 0.0)
GRID row:
 (55.944894999999995, -3.1892840000000002)
Error found on index: 27

Error found on index: 23680
Scopus row:
 (99.999999, 0.0)
GRID row:
 (51.442108000000005, -0.154569)
Error found on index: 23758
Scopus row:
 (51.280638, 0.49673999999999996)
GRID row:
 (nan, nan)
Error found on index: 24157
Scopus row:
 (51.532369, -0.09919)
GRID row:
 (nan, nan)
Error found on index: 24316
Scopus row:
 (51.435761, -0.223584)
GRID row:
 (nan, nan)
Error found on index: 24337
Scopus row:
 (53.399621999999994, -2.974211)
GRID row:
 (nan, nan)
Error found on index: 24342
Scopus row:
 (99.999999, 0.0)
GRID row:
 (51.495712, -0.126351)
Error found on index: 24531
Scopus row:
 (99.999999, 0.0)
GRID row:
 (53.324496999999994, -2.694013)
Error found on index: 24994
Scopus row:
 (51.685194, -2.301473)
GRID row:
 (nan, nan)
Error found on index: 25344
Scopus row:
 (51.940166, -3.254311)
GRID row:
 (nan, nan)
Error found on index: 25404
Scopus row:
 (51.521673, -3.5826300000000004)
GRID row:
 (nan, nan)
Error found on index: 25852
Scopus row:
 (52.487588, -1.911994)
GRID row:
 (

Error found on index: 44268
Scopus row:
 (51.071202, -1.350085)
GRID row:
 (nan, nan)
Error found on index: 44520
Scopus row:
 (54.49006800000001, -2.3370509999999998)
GRID row:
 (nan, nan)
Error found on index: 44725
Scopus row:
 (51.025058, -0.340317)
GRID row:
 (nan, nan)
Error found on index: 44812
Scopus row:
 (51.803247999999996, -1.920667)
GRID row:
 (nan, nan)
Error found on index: 45039
Scopus row:
 (53.649283999999994, -1.7804650000000002)
GRID row:
 (nan, nan)
Error found on index: 45338
Scopus row:
 (54.807044999999995, -1.777852)
GRID row:
 (nan, nan)
Error found on index: 45586
Scopus row:
 (99.999999, 0.0)
GRID row:
 (52.190121999999995, 0.128301)
Error found on index: 45667
Scopus row:
 (57.144351, -2.107627)
GRID row:
 (nan, nan)
Error found on index: 45964
Scopus row:
 (99.999999, 0.0)
GRID row:
 (51.223108, -2.3272049999999997)
Error found on index: 46154
Scopus row:
 (99.999999, 0.0)
GRID row:
 (51.505289000000005, -0.10880799999999999)
Error found on index: 46377
S

In [59]:
# There seem to be some rows that have a post code but no lat_lng
# Why is this happening?
# Some of the GRID institutes do not have lat and lng!
scopus_affils_df

Unnamed: 0,af_id,affil_name,affil_city,affil_address,post_code,lat_lng,affil_type,type_source,type_confidence,grid_name_matches,grid_name_best_score,grid_name_best_match,grid_best_match_distance_km
0,60030480,University of Bath,Bath,Claverton Down,BA2 7AY,"(51.38044, -2.330673)",univ,scopus,10,"[(University of Bath, 100.0, 48), (Bath Spa Un...",100.000000,University of Bath,0.208249
1,60022871,St Mary's Hospital,London,Praed Street,W2 1NY,"(51.516969, -0.173569)",hosp,scopus,10,"[(St Mary's Hospital, 100.0, 570), (St Mary's ...",100.000000,St Mary's Hospital,257.163038
2,60022148,University College London,London,Gower Street,WC1E 6BT,"(51.523569, -0.13242400000000001)",univ,scopus,10,"[(University College London, 100.0, 218), (Uni...",100.000000,University College London,0.147594
3,60011520,King's College London,London,Strand,WC2R 2LS,"(51.511612, -0.116253)",univ,scopus,10,"[(King's College London, 100.0, 100), (Royal H...",100.000000,King's College London,0.039293
4,60003771,The University of Manchester,Manchester,Oxford Road,M13 9PL,"(53.467925, -2.2332240000000003)",univ,scopus,10,"[(University of Manchester, 95.0, 31), (Univer...",95.000000,University of Manchester,0.106995
...,...,...,...,...,...,...,...,...,...,...,...,...,...
48055,108616326,Lntervet Innovation GmbH,,Schwabenheim,,,,,0,"[(Kite Innovation , 85.5, 1905), (Nonwovens In...",85.500000,Kite Innovation,
48056,107918540,Conder Brow Observatory,Lancaster,"Fell Acre, Conder Brow, Little Fell Lane, Scot...",LAI 1XD,,,,0,"[(Infectious Diseases Data Observatory, 85.5, ...",85.500000,Infectious Diseases Data Observatory,
48057,106599190,Starlight Xpress,Berkshire,"Foxley Green Farm, Ascot Road, Holyport",SL6 3LA,"(51.483702, -0.733949)",,,0,"[(Esys , 77.14285714285714, 1717), (Start, 72....",77.142857,Esys,28.117923
48058,106598998,British Astronomical Association,Reading,Lower Earley,RG6 4AZ,"(51.420429, -0.94514)",,,0,"[(British Astronomical Association, 100.0, 714...",100.000000,British Astronomical Association,56.849778


In [72]:
# Add fuzzy GRID matches affil_type and type_source
def nearest_inst_type(scopus_row, grid_rows):
    '''
    Input:
    - scopus row: one row of the scopus
    - grid_rows: one or more rows of the grid_df
    Returns:
    - dictionary containing closest distance, type and grid id
    Notes: 
    - scopus['grid_name_best_match'] == grid['formatted_name']
    - function will return np.nan for distance and type if not suitable
    '''
    
    closest = {'distance': np.inf,
               'type': np.nan,
               'grid_id': np.nan}
    
    # Only one proposal > return proposal irrespective of distance
    if grid_rows.shape[0] == 1:
        closest['type'] = grid_rows['type'].values[0]
        closest['grid_id'] = grid_rows['grid_id'].values[0]
        return closest
    # More than one proposal > check which is closest in distance
    # If Scopus row does not have lat_lng > return np.nan for type and grid_id
    elif pd.isnull(scopus_row['lat_lng']):
        return closest
    
    # Multiple proposals to consider
    for i, row in grid_rows.iterrows():
        
        grid_lat_lng = (row['lat'], row['lng'])
        
        # If the grid proposal does not have lng, lat data, skip to next proposal
        if pd.isnull(grid_lat_lng[0]):
            continue
        distance = geodesic(grid_lat_lng, scopus_row['lat_lng'])
        if distance < closest['distance']:
            closest['distance'] = distance
            closest['type'] = row['type']
            closest['grid_id'] = row['grid_id']
    
    return closest


def add_fuzzy_grid_info(df):
    
    df['grid_id_match'] = None
    
    for i, row in df.iterrows():
        
        # If the affiliation does not have a type yet
        #if pd.isnull(row['type_source']):
            
        proposed_name = row['grid_name_best_match']
        grid_data = grid_df[grid_df['formatted_name'] == f'{proposed_name}']

        # Exact matching
        if row['grid_name_best_score'] == 100:
            closest = nearest_inst_type(row, grid_data)
            df.iloc[i, 6] = closest['type'] # nan or type
            df.iloc[i, 13] = closest['grid_id'] # nan or distance
            # Only update type_source if there is a match
            if pd.notnull(closest['type']):
                df.iloc[i, 7] = 'exact_grid'
        # Fuzzy matching
        elif row['grid_name_best_score'] > 91:
            closest = nearest_inst_type(row, grid_data)
            df.iloc[i, 6] = closest['type']
            df.iloc[i, 13] = closest['grid_id']
            if pd.notnull(closest['type']):
                df.iloc[i, 7] = 'fuzzy_grid'
        #else:
            #continue
    
    return df


scopus_affils_df = add_fuzzy_grid_info(scopus_affils_df)

In [87]:
def add_more_fuzzy_grid_info(df):
    '''This function is used to add fuzzy match info for affiliations that:
    - Have a fuzzy score >= 90 AND
    - Are located within 2 km of their match
    '''
    
    for i, row in df.iterrows():
        
        # If the affiliation does not have a type yet
        # For some reason, this didn't work for grid_id so had to use the commented out row
        if pd.isnull(row['type_source']):
        #if row['type_source'] == 'fuzzy_loc_grid':
            
            proposed_name = row['grid_name_best_match']
            grid_data = grid_df[grid_df['formatted_name'] == f'{proposed_name}']

            if (row['grid_name_best_score'] >= 90) and (row['grid_best_match_distance_km'] < 2):
                closest = nearest_inst_type(row, grid_data)
                df.iloc[i, 6] = closest['type'] # nan or type
                #print(closest['grid_id'])
                df.iloc[i, 13] = closest['grid_id'] # nan or grid_id
                df.iloc[i, 7] = 'fuzzy_loc_grid'
            else:
                continue
    
    return df


scopus_affils_df = add_more_fuzzy_grid_info(scopus_affils_df)

__What have we matched?__

There are 48060 affiliations.

Of the 6214 that have an affiliation type:

* 4587 are linked to a GRID ID and therefore have established year
* 1627 are not linked to a GRID ID

__What have we got left?__

41846 affiliations do not have an affiliation type.

* 1113 have hospital
* 1364 have school
* 684 have university
* 5388 have ltd
* 1307 have limited
* 185 have foundation
* 1246 have trust

In [107]:
s = 'ltd'
scopus_affils_df[scopus_affils_df['affil_type'].isnull()]['affil_name'].str.contains(s, case=False).value_counts()

False    36458
True      5388
Name: affil_name, dtype: int64

In [108]:
# MORE HEURISTICS!

# Match Scopus against Companies House

__API KEY IN CODE!__

Companies House has a 600 request limit within a 5 minute period.

SIC codes found [here](https://assets.publishing.service.gov.uk/government/uploads/system/uploads/attachment_data/file/527619/SIC07_CH_condensed_list_en.csv/preview).

API information found [here](https://developer-specs.company-information.service.gov.uk/companies-house-public-data-api/reference).

In [109]:
CH_API_KEY = '2f182cf1-73c8-42f5-ba7f-b7b8878078fa'

In [110]:
# Options for using CH:
### INSTEAD OF FILTERING FOR UNMATCHED, FILTER FOR NO GRID ID
# 1) Filter unmatched Scopus affiliations and then fuzzy match against all of CH
# 2) Filter unmatched Scopus affiliations, filter CH and then fuzzy match
# 3) Filter unmatched Scopus affiliations and then use CH API to fuzzy matchtime

In [111]:
% timeit faster_process.extract('Biontech limited', grid_l)
% timeit faster_process.extract('Biontech limited', grid_l * 2)

54.1 ms ± 955 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
109 ms ± 1.86 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [113]:
speed_per_match_item = 0.0541 / len(grid_l)
one_scopus_comparison = speed_per_match_item * 5500000
print(one_scopus_comparison)
print(one_scopus_comparison * 1000)
(one_scopus_comparison * 1000) / (60**2)

40.911590815344425
40911.590815344425


11.364330782040119

In [177]:
scopus_affils_df[scopus_affils_df['grid_id_match'].isnull()].sample(n=10)

Unnamed: 0,af_id,affil_name,affil_city,affil_address,post_code,lat_lng,affil_type,type_source,type_confidence,grid_name_matches,grid_name_best_score,grid_name_best_match,grid_best_match_distance_km,grid_id_match
34403,113218646,Borax Europe Limited,Guildford,1A Guildford Business Park,GU2 8XG,"(51.246686, -0.586932)",,,0,"[(Environmental Design Solutions Limited , 85....",85.5,Environmental Design Solutions Limited,91.895392,
22395,100925518,Pharmorphix Ltd.,Cambridge (ex Galt),Milton Road,CB1 0WE,,,,0,"[(ARM , 90.0, 162), (Format International Ltd,...",90.0,ARM,,
27082,101210170,Peoplefirst,London,78 Montacute Road,SE6 4XQ,"(51.444964, -0.033963)",,,0,"[(GeneFirst , 70.0, 3147), (JEOL , 67.5, 761),...",70.0,GeneFirst,86.186075,
9887,117307307,Centre for Advanced Built Environment Research,Glasgow,,,,,,0,"[(Environment Agency, 85.5, 9), (Transport Res...",85.5,Environment Agency,,
16853,101499148,Institute of Development Stds. IDS,,,,,,,0,"[(Institute of Development Studies, 90.9090909...",90.909091,Institute of Development Studies,,
43413,105675615,Denehurst Chemical Safety Ltd,,"Denehurst, Station Road, Burley in Wharfedale",,,,,0,"[(Thomas Keating Ltd, 85.5, 898), (British Ass...",85.5,Thomas Keating Ltd,,
24888,119580635,Avlar BioVentures Limited,Cambridge,"Compass House, Vision Park, Chivers Way, Histon",CB4 9ZR,,,,0,"[(Bioven , 90.0, 4292), (Environmental Design ...",90.0,Bioven,,
11763,116061847,Northern Ireland Prison Service,Belfast,Dundonald House,BT4 3SU,"(54.595406999999994, -5.826217)",,,0,"[(Mines Rescue Service, 85.5, 1235), (HM Priso...",85.5,Mines Rescue Service,344.249254,
18941,101295932,Strategem Ltd,Manchester,Hough End Hall,M21 7AZ,,,,0,"[(Scimar Engineering Ltd, 85.5, 204), (Format ...",85.5,Scimar Engineering Ltd,,
41267,105995618,A2SP Ltd.,Chatham,117 Pagitt Street,ME4 6RD,"(51.374828, 0.517728)",,,0,"[(Scimar Engineering Ltd, 85.5, 204), (Format ...",85.5,Scimar Engineering Ltd,293.507525,


In [150]:
search = chwrapper.Search(access_token=CH_API_KEY)

In [193]:
def handle_ltd(string):
    string = string.lower()
    if 'ltd' in string.lower():
        return string.replace('ltd', 'limited')
    else:
        return string

handle_ltd('strategem Ltd')

'strategem limited'

In [242]:
def search_CH(query):

    query = handle_ltd(query)

    resp = search.search_companies(query)
    rate_limit = resp.headers['X-Ratelimit-Limit']
    resp = resp.json()

    # Index of 0 means this is the first result - there may be more
    if not resp['items']:
        return resp['items']
    
    name = handle_ltd(resp['items'][0]['title'])
    
    if 'date_of_creation' in resp['items'][0]:
        incorp_date = resp['items'][0]['date_of_creation']
    else:
        incorp_date = ''
    
    registered_address = resp['items'][0]['address_snippet']
    company_num = resp['items'][0]['company_number']

    resp_2 = search.profile(company_num)
    rate_limit = resp_2.headers['X-Ratelimit-Limit']
    resp_2 = resp_2.json()
    
    if 'sic_codes' in resp_2:
        sic_codes = resp_2['sic_codes']
    else:
        sic_codes = []
    if 'previous_company_names' in resp_2:
        prev_names = resp_2['previous_company_names']  # list of dictionaries
    else:
        prev_names = [{'name': ''}]

    names = [handle_ltd(d['name']) for d in prev_names]
    names.insert(0, name)
    fuzzy_scores = [faster_process.extract(query, [name])[0][1] for name in names]
    
    return {'names': names,
           'incorp_date': incorp_date,
           'registered_address': registered_address,
           'sic_codes': sic_codes,
           'fuzzy_scores': fuzzy_scores}


# Testing
query = 'Roy. U. Hosp.'
results = search_CH(query)
print('Query:\n', query)
print('Different names:\n',results['names'])
print('Incorporated date:\n', results['incorp_date'])
print('Registered address:\n', results['registered_address'])
print('SIC codes:\n', results['sic_codes'])
print('Fuzzy match scores:\n', results['fuzzy_scores'])
if max(results['fuzzy_scores']) >= 90:
    print('We have a match! The highest score was {}'.format(max(results['fuzzy_scores'])))
else:
    print('No match :(')

Query:
 Roy. U. Hosp.


TypeError: list indices must be integers or slices, not str

In [218]:
test = scopus_affils_df.copy()

In [255]:
def find_CH_match(x):
    
    query = x['affil_name']
    
    if pd.notnull(x['CH_match_data']):
        return x['CH_match_data']
    
    try:
        results = search_CH(query)
        if not results:  # empty list
            return 'no data'
        else:
            return results
    except:
        return np.nan

In [256]:
# 13 hours - not sure if due to terrible internet connection
tick = datetime.now()
test['CH_match_data'] = test.apply(lambda x: find_CH_match(x), axis=1)
tock = datetime.now()
print('Total time:', tock - tick)

Total time: 13:14:48.390494


In [263]:
test.iloc[0]['CH_match_data']

{'names': ['university of bath (the)', ''],
 'incorp_date': '',
 'registered_address': None,
 'sic_codes': [],
 'fuzzy_scores': [95.0, 0.0]}

In [264]:
test['CH_match_data'].apply(pd.Series).drop(columns=[0])

Unnamed: 0,fuzzy_scores,incorp_date,names,registered_address,sic_codes
0,"[95.0, 0.0]",,"[university of bath (the), ]",,[]
1,"[90.0, 0.0]",,"[st mary's hospital paddington, ]",,[]
2,"[100.0, 0.0]",,"[university college london, ]",,[]
3,"[100.0, 0.0]",,"[king's college london, ]",,[]
4,"[100.0, 0.0]",,"[the university of manchester, ]",,[]
...,...,...,...,...,...
48055,"[85.5, 0.0]",2013-04-10,[innovationszentrum fur mobilitat und geselles...,"8th Floor Tower Three Houghton Street, London...",[]
48056,"[44.107142857142854, 0.0]",2018-07-31,"[casson conder partnership limited, ]","470a Green Lane, Palmers Green, London, United...","[71111, 71112]"
48057,"[90.0, 33.096774193548384]",1966-05-24,"[starlight xpress limited, functional design &...","Unit 3 Brooklands Farm Business Park, Bottle L...",[27900]
48058,"[95.0, 0.0]",1911-09-07,"[british astronomical association(the), ]","Burlington House, Piccadilly, London, W1J 0DU","[85310, 94120]"


In [267]:
# Laptop is being really slow so saving just in case something happens
os.chdir('..')
test.to_csv('scopus_affils_w_CH.csv')

In [2]:
test = pd.read_csv('scopus_affils_w_CH.csv')

In [34]:
# Loading the data back in makes it string, not dict
def to_dict(x):
    if isinstance(x, float):
        return np.nan
    elif len(x) > 20:
        return ast.literal_eval(x)
    else:
        return 'no data'

In [38]:
test['CH_match_data_t'] = test['CH_match_data'].apply(to_dict)

In [41]:
CH_data = test['CH_match_data_t'].apply(pd.Series).drop(columns=[0])
test_1 = pd.concat([test, CH_data], axis=1)

# Find CH matches

In [49]:
# Find the best CH match name and corresponding score
best_CH_score = lambda x: max(x) if isinstance(x, list) else np.nan

def best_CH_name(x):
    fuzzy_scores = x['fuzzy_scores']
    if isinstance(fuzzy_scores, list):
        max_value = max(fuzzy_scores)
        max_index = fuzzy_scores.index(max_value)
        return x['names'][max_index]
    else:
        return np.nan


test_1['best_score'] = test_1['fuzzy_scores'].apply(best_CH_score)
test_1['best_name'] = test_1.apply(lambda x: best_CH_name(x), axis=1)

In [56]:
test_1[(test_1['type_source'].isnull()) & (test_1['best_score'] >= 88)].shape

(16421, 24)

In [61]:
# Check unique value counts to ensure that I do not change anything else unexpectedly
test_1.type_source.value_counts()

exact_grid        2255
fuzzy_grid        2006
scopus            1629
fuzzy_loc_grid     326
Name: type_source, dtype: int64

In [62]:
# Add type_source as CH_fuzzy
def add_CH_fuzzy(x):
    source = x['type_source']
    best_score = x['best_score']
    if (pd.isnull(source)) and (best_score >= 88):
        return 'CH_fuzzy'
    else:
        return source

In [65]:
test_1['type_source'] = test_1.apply(lambda x: add_CH_fuzzy(x), axis=1)
test_1.type_source.value_counts()

CH_fuzzy          16421
exact_grid         2255
fuzzy_grid         2006
scopus             1629
fuzzy_loc_grid      326
Name: type_source, dtype: int64

In [71]:
#test_1[test_1['type_source'] == 'CH_fuzzy'].sample(n=20)

There are sections A-U for SIC codes.

* Section O: Public administration and defence; compulsory social security
* Section P: Education
* Section Q: Human health and social work activities
* All other sections can fall under company

In [74]:
# Read in SIC codes from CH
sic_df = pd.read_csv('ch_sic_codes.csv')

In [75]:
sic_df.head(1)

Unnamed: 0,SIC Code,Description,Section
0,1110,"Growing of cereals (except rice), leguminous c...",A


In [85]:
public = sic_df[sic_df['Section'] == 'O']['SIC Code'].unique()
education = sic_df[sic_df['Section'] == 'P']['SIC Code'].unique()
health = sic_df[sic_df['Section'] == 'Q']['SIC Code'].unique()

In [121]:
# Add affil_type for CH matches
def add_CH_type(x):
    #print(x)
    type_source = x['type_source']
    if type_source == 'CH_fuzzy':
        # Find SIC code
        sic_codes = x['sic_codes']
        affil_type = 'company'
        # Loop through each code
        for code in sic_codes:
            code = int(code)
            # Check non-company groups
            if code in public:
                affil_type = 'public'
            elif code in education:
                affil_type = 'education'           
            elif code in health:
                affil_type = 'health'
        return affil_type
    else:
        return x['affil_type']

In [125]:
# Test out function on some examples
add_CH_type(test_1.iloc[47607])

'education'

In [130]:
test_1['affil_type'] = test_1.apply(lambda x: add_CH_type(x), axis=1)

In [132]:
test_1.to_csv('scopus_affils.csv')

# Clean for analysis

This was done in Excel to save time. The following steps were taken to clean the data:

* Standardise affiliation types across different sources
    * comp|hosp, comp, comp|ngov, comp|lawf (all Scopus matches) changed to company
    * Education from GRID, coll and univ from Scopus changed to education
    * resi|edu > education
    * HP labs, microsoft research Cambridge, BT research lab, Schlumberger Cambridge Research, Advanced Technologies (Cambridge) Limited, GEC Research Lab, Tube Investments Research Laboratories, Saffron Walden, The Arable Group > company
    * HR Walligford, HRWallingford, HR Wallirgford, HRWallingford, Glass Technology Services Ltd > company
    * Facility (GRID) > resi
    * govt, Government > government
    * meds (medical schools) > education
    * Healthcare, health, hosp > healthcare
    * lawf (law companies) > company
    * library, museum > archive
    * milo (military) > other
    * ngov > government
    * ngov|resi > resi
    * Nonprofit seems to be a mixture of resi and government - kept as is for now as there are 573 entries for this
    * From Other: BioRegional MiniMills UK Ltd (BRMM), J.P. Morgan and others
    * poli (policy?) to resi
    * public from CH looks like a bit of a shambles (only 51 though)
    * Other > other - checked that no companies were involved. Otherwise other is still a bit of a mess
    
__Unmatched entries:__

* 1003 contain school > education
* 848 contain institut > resi
* 389 surgery > healthcare
* 588 medical but not biomedical > healthcare
* 1079 contains ltd or limited > company - __there could be start ups in here - they are matched under string for now__
* 776 contain univ > education
* 354 contain college > education

Not done:

* 131 contain plc - a lot of them seemt to be good matches so I have highlighted the ones which don't seem like good matches. I haven't matched these yet because some of the registered dates are past the period of the data set. For example, Lasmo plc could be matched with Lasmo llp but Lasmo llp was registered in 2014. In this case, they are two separate companies. 
* 633 contains department or dept - hard to classify

# GRID Establsihed Date

In [21]:
os.chdir('..')

In [22]:
# Load data
affils = pd.read_csv('scopus_affils_clean.csv')
grid = load_grid_data(cleaning=True)

7273
7273


In [101]:
# Define a function to add established date for grid matches only
def add_grid_date(x, dtype=int):
    try:
        result = grid[grid['grid_id'] == x]
        if result.shape[0] == 0:
            return np.nan
        else:
            return result.established.to_numpy(dtype=dtype)
    except:
        print('bruh', x)

In [61]:
add_grid_date(affils.iloc[0]['grid_id_match'])

array([1966])

In [62]:
# Apply function
affils['established_date'] = affils['grid_id_match'].apply(add_grid_date)

In [72]:
# Investigate start-ups
grid_start_ups = affils[(affils['affil_type'] == 'company') & (affils['established_date'] >= 1992)].copy()
grid_start_ups.loc[:, ['affil_name', 'established_date']]

Unnamed: 0,affil_name,established_date
74,AstraZeneca,[1999]
149,GlaxoSmithKline plc.,[2000]
351,Oxford BioMedica Plc,[1995]
625,BAE Systems plc,[1999]
641,PricewaterhouseCoopers,[1998]
...,...,...
47206,Faculty Development Lead,[2014]
47679,Linguamatics Ltd,[2001]
47738,Camfridge Ltd.,[2005]
47946,United BioSource Corporation,[2003]


# Established / Incorporated Date Variable

In [93]:
def consoliDATE(x):
    if x['affil_type'] == 'company':
        match_type = x['type_source']
        if match_type in ['exact_grid', 'fuzzy_grid', 'fuzzy_loc_grid']:
            return x['established_date'][0]
        elif match_type == 'CH_fuzzy':
            return pd.to_datetime(x['incorp_date']).year
        else:
            return np.nan
    else:
        return np.nan
    
    
consoliDATE(affils.iloc[33])

2016

In [94]:
affils['est_inc_date'] = affils.apply(lambda x: consoliDATE(x), axis=1)

# Companies Matched Through Scopus

Because there is no established date for companies that were matched through Scopus, I need to check these for their Companies House and GRID matches.

In [106]:
affils[(affils['affil_type'] == 'company') & (affils['type_source'] == 'scopus')]

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,af_id,affil_name,affil_city,affil_address,post_code,lat_lng,affil_type,type_source,type_confidence,grid_name_matches,grid_name_best_score,grid_name_best_match,grid_best_match_distance_km,grid_id_match,CH_match_data,CH_match_data_t,fuzzy_scores,incorp_date,names,registered_address,sic_codes,best_score,best_name,established_date,est_inc_date
25,25,25,60006769,RSK ADAS Ltd.,Helsby,"Spring Lodge, 172 Chester Road",WA6 0AR,"(53.274328000000004, -2.768207)",company,scopus,10,"[('Scimar Engineering Ltd', 85.5, 204), ('Form...",85.5,Scimar Engineering Ltd,290.531361,,"{'names': ['rsk adas limited', ''], 'incorp_da...","{'names': ['rsk adas limited', ''], 'incorp_da...","[100.0, 0.0]",18/11/2016,"['rsk adas limited', '']","Spring Lodge, 172 Chester Road, Helsby, Cheshi...",['74901'],100.000000,rsk adas limited,,
101,101,101,60005889,LGC Ltd.,Teddington,Queens Road,TW11 0LY,"(51.423929, -0.341471)",company,scopus,10,"[('Scimar Engineering Ltd', 85.5, 204), ('Form...",85.5,Scimar Engineering Ltd,237.398027,,"{'names': ['lgc limited', 'lgc (teddington) li...","{'names': ['lgc limited', 'lgc (teddington) li...","[100.0, 85.5, 85.5, 85.5]",18/11/1994,"['lgc limited', 'lgc (teddington) limited', 'l...","Queens Road, Teddington, Middlesex, TW11 0LY",['74909'],100.000000,lgc limited,,
128,128,128,60015695,Kodak Limited,Hemel Hempstead,Station Road,HP11JU,,company,scopus,10,"[('Kodak ', 90.0, 778), ('Beta Technology Limi...",90.0,Kodak,,,"{'names': ['kodak limited', ''], 'incorp_date'...","{'names': ['kodak limited', ''], 'incorp_date'...","[100.0, 0.0]",1898-11-15,"['kodak limited', '']","Building 8 Croxley Green Business Park, Hatte...",['58190'],100.000000,kodak limited,,
136,136,136,60010098,GV Instruments Ltd,Manchester,Crewe Road,M23 9BE,"(53.404701, -2.29745)",company,scopus,10,"[('Catalyst Health Economics Consultants Ltd',...",85.5,Catalyst Health Economics Consultants Ltd,237.380673,,"{'names': ['g v instruments limited', 'level s...","{'names': ['g v instruments limited', 'level s...","[97.77777777777777, 63.63636363636363]",21/05/2002,"['g v instruments limited', 'level services li...","3rd Floor, 1 Ashley Road, Altrincham, Cheshire...",['26511'],97.777778,g v instruments limited,,
210,210,210,60110398,"Hewlett Packard Laboratories, Bristol",Bristol,Long Down Avenue Stoke Gifford,BS348QZ,,company,scopus,10,"[('Hewlett-Packard ', 90.0, 155), ('University...",90.0,Hewlett-Packard,,,"{'names': ['hewlett - packard limited', ''], '...","{'names': ['hewlett - packard limited', ''], '...","[75.0, 0.0]",24/04/1961,"['hewlett - packard limited', '']","Ground Floor, 210 Wharfedale Road, Winnersh Tr...",['62090'],75.000000,hewlett - packard limited,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46851,46851,46851,60100629,ARTVPS Limited,Cambridge,"St John's Innovation Centre, Cowley Road",CB40WS,,company,scopus,10,"[('Beta Technology Limited', 85.5, 196), ('Env...",85.5,Beta Technology Limited,,,"{'names': ['art vps limited', 'legislator 1584...","{'names': ['art vps limited', 'legislator 1584...","[96.55172413793103, 85.5]",14/06/2002,"['art vps limited', 'legislator 1584 limited']","St John's Innovation Centre, Cowley Road, Camb...",['62012'],96.551724,art vps limited,,
47055,47055,47055,60121174,Blue Bear Systems Research Ltd,Oakley,"4 Highfield Parc, Highfield Road",MK43 7TA,"(52.175458, -0.519696)",company,scopus,10,"[('Blue Bear ', 90.0, 3109), ('BAE Systems ', ...",90.0,Blue Bear,3.121020,,{'names': ['blue bear systems research limited...,{'names': ['blue bear systems research limited...,"[100.0, 0.0]",27/10/1999,"['blue bear systems research limited', '']","Richmond House, Walkern Road, Stevenage, Hertf...",['82990'],100.000000,blue bear systems research limited,,
47248,47248,47248,60112368,KWS UK Ltd.,Royston,"56 Church Street, Thriplow",SG8 7RE,"(52.099726000000004, 0.104675)",company,scopus,10,"[('KWS ', 90.0, 782), ('Cancer Research UK', 8...",90.0,KWS,0.287248,,"{'names': ['kws uk limited', 'cpb twyford limi...","{'names': ['kws uk limited', 'cpb twyford limi...","[100.0, 63.33333333333332, 85.5, 63.3333333333...",10/08/1981,"['kws uk limited', 'cpb twyford limited', 'cam...","56 Church Street, Thriplow, Royston, Hertford...",['01110'],100.000000,kws uk limited,,
47466,47466,47466,60158095,David Marlin Consulting Ltd,Bury St Edmunds,Unit 2A Chase Road,IP32 6NT,"(52.268046999999996, 0.699194)",company,scopus,10,"[('Catalyst Health Economics Consultants Ltd',...",85.5,Catalyst Health Economics Consultants Ltd,106.361296,,"{'names': ['david marlin consulting limited', ...","{'names': ['david marlin consulting limited', ...","[100.0, 0.0]",08/12/2006,"['david marlin consulting limited', '']","Streets Whitmarsh Sterland, 62 Hills Road, Cam...",['82990'],100.000000,david marlin consulting limited,,


In [103]:
uk = '(United Kingdom)'
grid['Name'] = grid['Name'].apply(lambda x: x.replace(uk, '') if uk in x else x)

In [134]:
def consoliDATE_scopus(x):
    
    if (x['affil_type'] == 'company') and (x['type_source'] == 'scopus'):
        # Check GRID match score
        if x['grid_name_best_score'] >= 90:
            # Get GRID established date
            result = grid[grid['Name'] == x['grid_name_best_match']]
            print(result)
            if result.shape[0] > 0:
                if result.shape[0] != 1:
                    print(result)
                    print(result.established.to_numpy()[0])
                return result.established.to_numpy()[0]
        # Check CH match score
        elif x['best_score'] >= 90:
            return pd.to_datetime(x['incorp_date']).year
        else:
            #print('not good enough')
            return np.nan
    else:
        return x['est_inc_date']
        

consoliDATE_scopus(affils.iloc[46267])

nan

In [125]:
affils['est_inc_date'] = affils.apply(lambda x: consoliDATE_scopus(x), axis=1)

In [139]:
print('Potential companies that do not have an incorporation date (ignoring string matches):')
print(affils[(affils['affil_type'] == 'company') & (affils['est_inc_date'].isnull()) & 
      (affils['type_source'] != 'string')].shape[0])
print('Potential companies that do not have an est/inc date and were matched through string matching:')
print(affils[(affils['affil_type'] == 'company') & (affils['est_inc_date'].isnull()) & 
      (affils['type_source'] == 'string')].shape[0])

Potential companies that do not have an incorporation date (ignoring string matches):
452
Potential companies that do not have an est/inc date and were matched through string matching:
1079


In [148]:
columns = ['affil_name', 'grid_name_best_score', 'grid_name_best_match', 'best_score', 'best_name']
affils[(affils['affil_type'] == 'company') & (affils['est_inc_date'].isnull()) & 
      (affils['type_source'] == 'string')].loc[:, columns].shape[0]

1079

# Additional Matches

There are probably more matches in string that could be made by removing ltd, limited, and plc. However, I will stick with what I've got for now.

# Latitude and Longitude

Are there are any more institutions that can have lat_lng included that aren't already?

Not the most important part at the moment!

In [170]:
print('Number of institutions without affiliation type:')
print(affils[affils['affil_type'].isnull()].shape[0])

print('Number of institutions WITH affiliation type but no lat, lng:')
print(affils[(affils['lat_lng'].isnull()) & (affils['affil_type'].notnull())].shape[0])

Number of institutions without affiliation type:
20388
Number of institutions WITH affiliation type but no lat, lng:
12843


In [171]:
affils[(affils['lat_lng'].isnull()) & (affils['affil_type'].notnull())].type_source.value_counts()

CH_fuzzy      7752
string        2953
fuzzy_grid     870
exact_grid     676
scopus         592
Name: type_source, dtype: int64

In [199]:
# How many have CH registered address?
print(affils[(affils['lat_lng'].isnull()) & (affils['type_source'] == 'CH_fuzzy') & 
       (affils['registered_address'].notnull())].shape[0])
affils[(affils['lat_lng'].isnull()) & (affils['type_source'] == 'CH_fuzzy') & 
       (affils['registered_address'].notnull())].sample(n=1)

7560


Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,af_id,affil_name,affil_city,affil_address,post_code,lat_lng,affil_type,type_source,type_confidence,grid_name_matches,grid_name_best_score,grid_name_best_match,grid_best_match_distance_km,grid_id_match,CH_match_data,CH_match_data_t,fuzzy_scores,incorp_date,names,registered_address,sic_codes,best_score,best_name,established_date,est_inc_date
4326,4326,4326,115984584,Enfield,,,,,company,CH_fuzzy,0,"[('Glenfield Hospital', 90.0, 308), ('Barnet E...",90.0,Glenfield Hospital,,,"{'names': ['enfield limited', ''], 'incorp_dat...","{'names': ['enfield limited', ''], 'incorp_dat...","[90.0, 0.0]",26/03/2001,"['enfield limited', '']","58 Brookside Crescent, Cuffley, Hertfordshire,...","['68209', '69202']",90.0,enfield limited,,2001.0


In [None]:
def locate(x):
    if ()

In [208]:
from geopy.geocoders import Nominatim

In [209]:
geolocator = Nominatim(user_agent='aidan')

In [210]:
location = geolocator.geocode(affils.iloc[4326]['registered_address'])

In [214]:
location.latitude, location.longitude

(51.7154379, -0.1121931)

# Save Data

In [215]:
affils.to_csv('scopus_affils.csv')

# Save Clean Version

In [216]:
columns = ['af_id', 'affil_name', 'affil_city', 'post_code', 'lat_lng', 'affil_type', 'type_source', 
           'grid_id_match', 'est_inc_date']
clean = affils.loc[:, columns].copy()
clean.to_csv('scopus_affils_clean.csv')

In [217]:
clean

Unnamed: 0,af_id,affil_name,affil_city,post_code,lat_lng,affil_type,type_source,grid_id_match,est_inc_date
0,60030480,University of Bath,Bath,BA2 7AY,"(51.38044, -2.330673)",education,exact_grid,grid.7340.0,
1,60022871,St Mary's Hospital,London,W2 1NY,"(51.516969, -0.173569)",healthcare,exact_grid,grid.426467.5,
2,60022148,University College London,London,WC1E 6BT,"(51.523569, -0.13242400000000001)",education,exact_grid,grid.83440.3b,
3,60011520,King's College London,London,WC2R 2LS,"(51.511612, -0.116253)",education,exact_grid,grid.13097.3c,
4,60003771,The University of Manchester,Manchester,M13 9PL,"(53.467925, -2.2332240000000003)",education,fuzzy_grid,grid.5379.8,
...,...,...,...,...,...,...,...,...,...
48055,108616326,Lntervet Innovation GmbH,,,,,,,
48056,107918540,Conder Brow Observatory,Lancaster,LAI 1XD,,,,,
48057,106599190,Starlight Xpress,Berkshire,SL6 3LA,"(51.483702, -0.733949)",company,CH_fuzzy,,1966.0
48058,106598998,British Astronomical Association,Reading,RG6 4AZ,"(51.420429, -0.94514)",Nonprofit,exact_grid,grid.508416.e,
