# Purpose

Categorise Scopus affiliations into types of institutions. This does not include defining start-ups. This will be done in another notebook.

+ Match Scopus affiliations to the [Global Research Identifier Database](https://www.grid.ac/)
+ Using location data to increase the number of matches with the above database - calculate [geodesic distance](https://en.wikipedia.org/wiki/Geodesic) using [geopy](https://pypi.org/project/geopy/)
+ Match more Scopus affiliations to [Companies House](https://developer.company-information.service.gov.uk/)

# Import Dependencies

In [134]:
import pandas as pd
import numpy as np
import os
from rapidfuzz import process  # Levenshtein distance for string matching
from datetime import datetime, timedelta
from geopy.geocoders import Nominatim
from geopy.distance import geodesic  # geo distance
import chwrapper  # companies house API wrapper
from tqdm import tqdm  # progress bar
import constants

# Load Data

In [5]:
def load_grid(cleaning=False):
    
    grid_df = pd.read_csv(os.getcwd() + r'\data\grid\grid.csv').rename(columns={'ID': 'grid_id'})
    type_df = pd.read_csv(os.getcwd() + r'\data\grid\types.csv')
    address_df = pd.read_csv(os.getcwd() + r'\data\grid\addresses.csv', usecols=['grid_id', 'lat', 'lng', 'city'])
    
    # Filter for UK institutions
    uk_grid_df = grid_df[grid_df['Country'] == 'United Kingdom']
    
    # Join dataframes
    merged_df = pd.merge(uk_grid_df, type_df, how='left', on='grid_id')
    merged_df = pd.merge(merged_df, address_df, how='left', on='grid_id')
    
    return merged_df

In [7]:
def load_scopus():
    return pd.read_csv(os.getcwd() + r'\data\uk_affils.csv')

In [8]:
grid_df = load_grid()
scopus_df = load_scopus()

# Quick Clean and Pre-Process

In [10]:
# Remove United Kingdom from names of GRID institutions
uk = '(United Kingdom)'
grid_df['Name'] = grid_df['Name'].apply(lambda x: x.replace(uk, '') if uk in x else x)

In [14]:
# Add source of data
add_source = lambda x: 'scopus' if pd.notnull(x) else np.nan
scopus_df['source'] = scopus_df['type'].apply(add_source)

__Geocode postal code__

Data originally from Ordnance Survey but can be found [here](https://www.freemaptools.com/download-uk-postcode-lat-lng.htm).

In [22]:
geo_df = pd.read_csv(os.getcwd() + r'\data\geocode\ukpostcodes.csv')

In [56]:
# Geocode Scopus post code data
def geocode(x):
    """Function to convert postcode to latitude and longitude."""
    
    # Skip affiliations without postal code
    if pd.notnull(x):
        # Search postcode df 
        row = geo_df[geo_df['postcode'] == x]
        # If postcode not there, return nan
        if row.shape[0] == 0:
            return np.nan
        else:
            return (row['latitude'].values[0], row['longitude'].values[0])
    else:
        return np.nan
    
    
scopus_df['lat_lng'] = scopus_df['postal_code'].apply(geocode)

In [35]:
# Move lat_lng next to postal_code
columns = list(scopus_df.columns)
columns.remove('lat_lng')
columns.insert(5, 'lat_lng')
scopus_df = scopus_df.reindex(columns=columns)

# Match Scopus to GRID

This uses the [Levenshtein algorithm](https://en.wikipedia.org/wiki/Levenshtein_distance) from [rapidfuzz](https://github.com/maxbachmann/RapidFuzz). Rapidfuzz was chosen over fuzzywuzzy due its faster implementation in C++.

In [20]:
# Create a list of potential grid matches
grid_l = grid_df['Name'].to_list()

# Match each scopus affiliation against grid
match = lambda x: process.extract(x, grid_l)
scopus_df['grid_match'] = scopus_df['affil_name'].apply(match)

In [21]:
# Find best scores and matches
scopus_df['best_name'] = scopus_df['grid_match'].apply(lambda x: x[0][0])
scopus_df['best_score'] = scopus_df['grid_match'].apply(lambda x: x[0][1])

__Use location data to increase the number of matches__

In [62]:
def find_grid_distance(df):
    '''Find the geodesic disance between Scopus and best GRID match.'''
    
    df['best_dist'] = np.nan
    num_errors = 0
    
    for row in df.itertuples():
        
        if pd.notnull(row.lat_lng):
            
            grid_row = grid_df[grid_df['Name'] == f'{row.best_name}']
            grid_lat_lng = (grid_row['lat'].values[0], grid_row['lng'].values[0])
            
            if pd.notnull(grid_lat_lng):
                try:
                    df.iat[row.Index, 11] = geodesic(row.lat_lng, grid_lat_lng).km
                except:
                    num_errors += 1
        else:
            df.iat[row.Index, 11] = np.nan
            
    print(num_errors)
            
    return df


scopus_df = find_grid_distance(scopus_df)

  return cls(*args)


32


__Add GRID match information__

In [222]:
def find_GRID_data(scopus_row, grid_rows):
    '''Find GRID institution data.'''
    
    closest = {'distance': np.inf,
               'type': np.nan,
               'grid_id': np.nan}
    
    # If only one proposal > return proposal irrespective of distance
    if grid_rows.shape[0] == 1:
        closest['type'] = grid_rows['type'].values[0]
        closest['grid_id'] = grid_rows['grid_id'].values[0]
        return closest
    
    # If more than one proposal > check which is closest in distance
    # If Scopus row does not have lat_lng > return np.nan for type and grid_id
    elif pd.isnull(scopus_row.lat_lng):
        return closest
    
    # Multiple proposals to consider
    for row in grid_rows.itertuples():
        
        grid_lat_lng = (row.lat, row.lng)
        
        # If the grid proposal does not have lng, lat data, skip to next proposal
        if pd.isnull(grid_lat_lng[0]):
            continue
        distance = geodesic(grid_lat_lng, scopus_row.lat_lng)
        if distance < closest['distance']:
            closest['distance'] = distance
            closest['type'] = row.type
            closest['grid_id'] = row.grid_id
    
    return closest


def add_grid_info(df):
    """This function finds the GRID data for the fuzzy matches. It also
    writes the matching type into the data frame."""
    
    df['grid_id_match'] = None
    
    for scopus_row in df.itertuples():
        
        # If the affiliation does not have a type yet
        if pd.isnull(scopus_row.source):
            
            grid_data = grid_df[grid_df['Name'] == f'{scopus_row.best_name}']

            # Exact matching
            if scopus_row.best_score == 100:
                closest = find_GRID_data(scopus_row, grid_data)
                df.iat[scopus_row.Index, 6] = closest['type'] # nan or type
                df.iat[scopus_row.Index, 12] = closest['grid_id'] # nan or distance
                # Only update type_source if there is a match
                if pd.notnull(closest['type']):
                    df.iat[scopus_row.Index, 7] = 'exact_grid'
            
            # Fuzzy matching
            elif scopus_row.best_score > 91:
                closest = find_GRID_data(scopus_row, grid_data)
                df.iat[scopus_row.Index, 6] = closest['type']
                df.iat[scopus_row.Index, 12] = closest['grid_id']
                if pd.notnull(closest['type']):
                    df.iat[scopus_row.Index, 7] = 'fuzzy_grid'
            
            # Fuzzy and location matching
            elif (scopus_row.best_score >= 90) and scopus_row.best_dist < 2:
                closest = find_GRID_data(scopus_row, grid_data)
                df.iat[scopus_row.Index, 6] = closest['type']
                df.iat[scopus_row.Index, 12] = closest['grid_id']
                if pd.notnull(closest['type']):
                    df.iat[scopus_row.Index, 7] = 'fuzzy_loc_grid'
            
        else:
            continue
    
    return df


scopus_df = add_grid_info(scopus_df)

In [87]:
# Show some sample matches found using location
scopus_df[scopus_df['source'] == 'fuzzy_loc_grid'].sample(n=5)

Unnamed: 0,af_id,affil_name,affil_city,address,postal_code,lat_lng,type,source,grid_match,best_name,best_score,best_dist,grid_id_match
4975,109537970,East Yorkshire,Hull,6 Wright Street,HU2 8HU,"(53.747962, -0.34320900000000004)",Healthcare,fuzzy_loc_grid,"[(Hull and East Yorkshire Hospitals NHS Trust,...",Hull and East Yorkshire Hospitals NHS Trust,90.0,0.901295,grid.417700.5
4836,101786554,Cow and Gate Nutricia UK Ltd.,Trowbridge,White Horse Business Park,BA14 0XQ,"(51.299165, -2.199637)",Company,fuzzy_loc_grid,"[(Nutricia , 90.0, 4754), (UK Coal, 85.5, 10),...",Nutricia,90.0,0.048845,grid.487299.9
3990,109616374,Newbold College,Bracknell,"St Mark's Road, Binfield",RG42 4AN,"(51.424813, -0.785803)",Education,fuzzy_loc_grid,"[(Newbold College of Higher Education, 90.0, 3...",Newbold College of Higher Education,90.0,0.023599,grid.462616.1
4443,113103809,London Mathematical Society De Morgan House,London,57-58 Russell Square,WC1B 4HP,"(51.521105, -0.124596)",Nonprofit,fuzzy_loc_grid,"[(London Mathematical Society, 90.0, 3318), (U...",London Mathematical Society,90.0,0.01757,grid.453177.3
7740,112683436,Royal Free,London,Pond Street,NW3 2QG,"(51.553222999999996, -0.165324)",Healthcare,fuzzy_loc_grid,"[(The Royal Free Hospital, 90.0, 1267), (Royal...",The Royal Free Hospital,90.0,0.012837,grid.426108.9


# Match Scopus to Companies House

In [88]:
# Instantiate CH search object
search = chwrapper.Search(access_token=constants.CH_API_KEY)

In [116]:
def remove_ltd(string):
    string = string.lower()
    if 'ltd' in string:
        return string.replace('ltd', '')
    elif 'limited' in string:
        return string.replace('limited', '')
    else:
        return string

In [120]:
def search_CH(query):
    """Takes a Scopus affiliation and searches Companies House
    for relevant data: names, number, incorporation date, address, SIC.
    Function also does fuzzy matching of names against query."""
    
    # Remove limited from query
    query = remove_ltd(query)

    resp = search.search_companies(query)
    rate_limit = resp.headers['X-Ratelimit-Limit']
    resp = resp.json()

    # Index of 0 means this is the first result - there may be more
    if not resp['items']:
        return resp['items']
    
    name = remove_ltd(resp['items'][0]['title'])
    
    # Add date if available
    if 'date_of_creation' in resp['items'][0]:
        incorp_date = resp['items'][0]['date_of_creation']
    else:
        incorp_date = ''
    
    # Add address if available
    registered_address = resp['items'][0]['address_snippet']
    company_num = resp['items'][0]['company_number']

    resp_2 = search.profile(company_num)
    rate_limit = resp_2.headers['X-Ratelimit-Limit']
    resp_2 = resp_2.json()
    
    # Add SIC codes if available
    if 'sic_codes' in resp_2:
        sic_codes = resp_2['sic_codes']
    else:
        sic_codes = []
    
    # Add previous names if available
    if 'previous_company_names' in resp_2:
        prev_names = resp_2['previous_company_names']  # list of dictionaries
    else:
        prev_names = [{'name': ''}]
    
    # Find fuzzy scores for all previous and current names
    names = [remove_ltd(d['name']) for d in prev_names]
    names.insert(0, name)
    fuzzy_scores = [process.extract(query, [name])[0][1] for name in names]

    return {'names': names,
            'number': company_num,
            'incorp_date': incorp_date,
            'registered_address': registered_address,
            'sic_codes': sic_codes,
            'fuzzy_scores': fuzzy_scores}


# Testing
query = 'oxford nanopore technologies'
results = search_CH(query)
print('Query:\n', query)
print('Company number:\n', results['number'])
print('Different names:\n',results['names'])
print('Incorporated date:\n', results['incorp_date'])
print('Registered address:\n', results['registered_address'])
print('SIC codes:\n', results['sic_codes'])
print('Fuzzy match scores:\n', results['fuzzy_scores'])
if max(results['fuzzy_scores']) >= 90:
    print('We have a match! The highest score was {}'.format(max(results['fuzzy_scores'])))
else:
    print('No match :(')

Query:
 oxford nanopore technologies
Company number:
 05386273
Different names:
 ['oxford nanopore technologies ', 'oxford nanolabs ']
Incorporated date:
 2005-03-09
Registered address:
 Gosling Building Edmund Halley Road, Oxford Science Park, Oxford, Oxfordshire, United Kingdom, OX4 4DQ
SIC codes:
 ['72190']
Fuzzy match scores:
 [100.0, 85.5]
We have a match! The highest score was 100.0


In [125]:
scopus_df.head(1)

Unnamed: 0,af_id,affil_name,affil_city,address,postal_code,lat_lng,type,source,grid_match,best_name,best_score,best_dist,grid_id_match,CH_data
0,60030480,University of Bath,Bath,Claverton Down,BA2 7AY,"(51.38044, -2.330673)",univ,scopus,"[(University of Bath, 100.0, 48), (Bath Spa Un...",University of Bath,100.0,0.208249,,


In [132]:
def get_CH_data(df):
    """Loops through Scopus affiliations and retrieves CH data.
    Updates the CH_data column"""
    
    for row in tqdm(df.itertuples(), total=df.shape[0]):
        
        if pd.isnull(row.CH_data):
            
            try:
                results = search_CH(row.affil_name)
                if not results:
                    df.iat[row.Index, 13] = 'no data'
                else:
                    df.iat[row.Index, 13] = results
            except:
                df.iat[row.Index, 13] = np.nan
    
    return df

In [124]:
scopus_df['CH_data'] = None

In [135]:
scopus_df = get_CH_data(scopus_df)

100%|████████████████████████████████████████████████████████████████████████████| 8368/8368 [1:49:55<00:00,  1.27it/s]


In [139]:
scopus.head(1)

Unnamed: 0,af_id,affil_name,affil_city,address,postal_code,lat_lng,type,source,grid_match,best_name,best_score,best_dist,grid_id_match,CH_data
0,60030480,University of Bath,Bath,Claverton Down,BA2 7AY,"(51.38044, -2.330673)",univ,scopus,"[(University of Bath, 100.0, 48), (Bath Spa Un...",University of Bath,100.0,0.208249,,"{'names': ['university of bath (the)', ''], 'n..."


__Format CH data__

In [149]:
scopus_df.iloc[0]['CH_data']

{'names': ['university of bath (the)', ''],
 'number': 'RC000644',
 'incorp_date': '',
 'registered_address': None,
 'sic_codes': [],
 'fuzzy_scores': [95.0, 0.0]}

In [165]:
def format_CH(df):
    """."""
    
    df['CH_best_name'] = ''
    df['CH_best_score'] = ''
    df['CH_number'] = ''
    df['incorp_date'] = ''
    df['CH_SICs'] = ''
    df['CH_address'] = ''
    
    for row in df.itertuples():
        try:
            if row.CH_data != 'no data' and pd.notnull(row.CH_data):

                # Best name
                max_idx = np.argmax(row.CH_data['fuzzy_scores'])
                df.iat[row.Index, 14] = row.CH_data['names'][max_idx]
                # Best score
                df.iat[row.Index, 15] = max(row.CH_data['fuzzy_scores'])
                # Companies house registered number
                df.iat[row.Index, 16] = row.CH_data['number']
                # Incorporation date
                df.iat[row.Index, 17] = row.CH_data['incorp_date']
                # SICs
                df.iat[row.Index, 18] = row.CH_data['sic_codes']
                # Registered address
                df.iat[row.Index, 19] = row.CH_data['registered_address']
            else:
                continue
        except:
            print(row.Index)
            break
    return df

In [166]:
scopus_df = format_CH(scopus_df)

In [168]:
scopus_df.sample(n=5)

Unnamed: 0,af_id,affil_name,affil_city,address,postal_code,lat_lng,type,source,grid_match,best_name,best_score,best_dist,grid_id_match,CH_data,CH_best_name,CH_best_score,CH_number,incorp_date,CH_SICs,CH_address
1520,100626907,Galway-Mayo Institute of Technology,Galway,Galway,,,,,"[(University of Edinburgh, 85.5, 11), (Univers...",University of Edinburgh,85.5,,,{'names': ['birmingham institute of education ...,birmingham institute of education training & t...,85.5,4314444.0,2001-10-31,"[85310, 85320, 85422, 85600]","Alma Park, Woodwy Lane, Claybrooke Parva, Leic..."
363,60211435,St Michael's Hospital,Warwick,St Michael's Road,CV34 5QW,"(52.28853, -1.594092)",hosp,scopus,"[(St Michael's Hospital, 100.0, 572), (St Mich...",St Michael's Hospital,100.0,115.368603,,"{'names': ['st michael's hospitality ', 'st mi...",st michael's hospitality,93.3333,8568764.0,2013-06-13,[55100],"53 Fore Street, Ivybridge, Devon, PL21 9AE"
4588,101629387,Asmara Resources,London,,,,,,"[(Natural Resources Institute, 85.5, 197), (En...",Natural Resources Institute,85.5,,,"{'names': ['asmara resources ', ''], 'number':...",asmara resources,100.0,9378693.0,2015-01-08,[86210],"Ramsay Brown Llp The Brentano Suite, Solar Hou..."
7082,105870159,University Hospital,Utrecht,Utrecht,,,Healthcare,fuzzy_grid,"[(University Hospital of Wales, 95.0, 244), (A...",University Hospital of Wales,95.0,,,,,,,,,
83,60029336,Bournemouth University,Bournemouth,"Fern Barrow, Talbot Campus",BH12 5BB,"(50.742978, -1.897168)",univ,scopus,"[(Bournemouth University, 100.0, 125), (Arts U...",Bournemouth University,100.0,0.031059,,"{'names': ['bournemouth university ', 'bournem...",bournemouth university,100.0,4041028.0,2000-07-26,[82990],"2nd Floor Melbury House, 1-3 Oxford Road, Bou..."


__Add Companies House match information__

In [178]:
def add_CH_info(row):
    try:
        if (pd.isnull(row['source'])) and (row['CH_best_score'] != '') and (row['CH_best_score'] >= 88):
            return 'CH'
        else:
            return row['source']
    except:
        print(row)

In [179]:
scopus_df['source'] = scopus_df.apply(lambda row: add_CH_info(row), axis=1)

In [182]:
scopus_df[scopus_df['source'] == 'CH'].sample(n=5)

Unnamed: 0,af_id,affil_name,affil_city,address,postal_code,lat_lng,type,source,grid_match,best_name,best_score,best_dist,grid_id_match,CH_data,CH_best_name,CH_best_score,CH_number,incorp_date,CH_SICs,CH_address
3773,100424938,Thistle Veterinary Health Centre,Edinburgh,1 Alcorn Rigg,EH14 3BF,"(55.910724, -3.278093)",,CH,"[(Centre for Environment, Fisheries and Aquacu...","Centre for Environment, Fisheries and Aquacult...",85.5,505.680814,,"{'names': ['the veterinary health centre ', ''...",the veterinary health centre,93.3333,05507480,2005-07-13,[75000],"4 Greenways, St Annes On Sea, Lancashire, FY8 3LY"
2711,100940561,Textile Conservation Centre,East Molesey,Hampton Court Palace,,,,CH,"[(Centre for Environment, Fisheries and Aquacu...","Centre for Environment, Fisheries and Aquacult...",85.5,,,{'names': ['the textile conservation foundatio...,textile conservation centre,100.0,01208092,1975-04-17,[85422],"THE TRUST PARTNERSHIP, 6 Trull Farm Buildings,..."
6383,100559267,Goddard Veterinary Group,London,84 New Wanstead,E11 5SY,,,CH,"[(BT Group , 85.5, 5), (BMT Group , 85.5, 105)...",BT Group,85.5,,,"{'names': ['goddard veterinary group ', 'godda...",goddard veterinary group,100.0,01971231,1985-12-13,[75000],"Claygate House Manor Road, (The Old Police Sta..."
2440,118919038,Internet Designers Limited,Histon,"Compass House, Vision Park, Chivers Way",CB4 9AD,,,CH,"[(Tern , 90.0, 5007), (Freshseal Limited , 85....",Tern,90.0,,,"{'names': ['internet designers ', ''], 'number...",internet designers,100.0,NF003653,2002-05-14,[],"Pp 9. 20 Riverside Tower, 5 Lanyon Place, Belf..."
3927,101568553,Vysis,London,,,,,CH,"[(Institute of Physics, 72.0, 1354), (Scottish...",Institute of Physics,72.0,,,"{'names': ['vysis (uk) ', ''], 'number': '0315...",vysis (uk),90.0,03154044,1996-02-01,[5190],"Abbott House Vanwall Business Park, Vanwall Ro..."


__Categorise Companies House Matches__

Information on SIC codes can be found [here](https://www.gov.uk/government/publications/standard-industrial-classification-of-economic-activities-sic).

There are sections A-U for SIC codes.

* Section O: Public administration and defence; compulsory social security
* Section P: Education
* Section Q: Human health and social work activities
* All other sections can fall under company

In [183]:
sic_df = pd.read_csv(os.getcwd() + r'\data\SIC_codes\ch_sic_codes.csv')

In [184]:
public = sic_df[sic_df['Section'] == 'O']['SIC Code'].unique()
education = sic_df[sic_df['Section'] == 'P']['SIC Code'].unique()
health = sic_df[sic_df['Section'] == 'Q']['SIC Code'].unique()

In [190]:
def add_CH_type(x):
    """."""
    
    source = x['source']
    if source == 'CH':
        # Find SIC codes
        sic_codes = x['CH_SICs']
        affil_type = 'company'
        # Loop through each code
        for code in sic_codes:
            code = int(code)
            # Check non-company groups
            if code in public:
                affil_type = 'public'
            elif code in education:
                affil_type = 'education'           
            elif code in health:
                affil_type = 'health'
        return affil_type
    else:
        return x['type']

In [191]:
scopus_df['type'] = scopus_df.apply(lambda x: add_CH_type(x), axis=1)

In [193]:
scopus_df[scopus_df['source'] == 'CH'].sample(n=5)

Unnamed: 0,af_id,affil_name,affil_city,address,postal_code,lat_lng,type,source,grid_match,best_name,best_score,best_dist,grid_id_match,CH_data,CH_best_name,CH_best_score,CH_number,incorp_date,CH_SICs,CH_address
322,102051640,Creating Insights,Bristol,,,,company,CH,"[(Eaton , 72.0, 4123), (RAFT, 67.5, 4989), (Br...",Eaton,72.0,,,"{'names': ['creating insights ', 'people conce...",creating insights,100.0,10922131,2017-08-18,[70229],"Beech Grove, Harple Lane, Detling, Maidstone, ..."
6217,100509077,Trident Feeds,Peterborough,Oundle Road,PE2 9QX,"(52.561886, -0.258562)",company,CH,"[(DSV , 72.0, 5327), (Strix , 68.4, 4560), (Ti...",DSV,72.0,43.5293,,"{'names': ['trident feeds ', 'j.meredith & son...",trident feeds,100.0,194336,1923-12-11,[74990],"Weston Centre, 10 Grosvenor Street, London, W1..."
3768,100352055,Bradford Particle Design Ltd.,Bradford,"69 Listerhills Sci. Pk., Campus Road",BD7 1HR,"(53.792883999999994, -1.7698049999999999)",company,CH,[(Bradford Teaching Hospitals NHS Foundation T...,Bradford Teaching Hospitals NHS Foundation Trust,85.5,2.1487,,"{'names': ['nektar therapeutics uk ', 'bradfor...",bradford particle design,100.0,2998064,1994-12-05,"[20130, 20140, 21100, 72190]","Ground Floor, 32 Park Cross Street, Leeds, Wes..."
6382,101313237,Protasis (UK) Ltd,Swansea,,,,company,CH,"[(UK Coal, 85.5, 10), (UK Health and Environme...",UK Coal,85.5,,,"{'names': ['protasis u.k. ', ''], 'number': '0...",protasis u.k.,91.6667,3815912,1999-07-29,[26511],"Cannon Place, 78 Cannon Street, London, EC4N 6AF"
736,113101006,Bradford Particle Design,Bradford,69 Listerhills Science Park,BD7 1HR,"(53.792883999999994, -1.7698049999999999)",company,CH,[(Bradford Teaching Hospitals NHS Foundation T...,Bradford Teaching Hospitals NHS Foundation Trust,85.5,2.1487,,"{'names': ['nektar therapeutics uk ', 'bradfor...",bradford particle design,100.0,2998064,1994-12-05,"[20130, 20140, 21100, 72190]","Ground Floor, 32 Park Cross Street, Leeds, Wes..."


# Cleaning

__Add GRID established date__

In [196]:
date_df = pd.read_csv(os.getcwd() + r'\data\grid\institutes.csv', usecols=['grid_id', 'established'])
grid_df = pd.merge(grid_df, date_df, how='left', on='grid_id')

In [235]:
def add_GRID_date(df):
    """."""
    
    df['grid_est_date'] = None
    
    for row in df.itertuples():
        if row.source in ['exact_grid', 'fuzzy_grid', 'fuzzy_loc_grid']:
            # Find GRID established date
            grid_row = grid_df[grid_df['grid_id'] == row.grid_id_match]
            df.iat[row.Index, 20] = grid_row.established.to_numpy(dtype=int)
        else:
            continue
    
    return df

In [236]:
scopus_df = add_GRID_date(scopus_df)

__Standardise affiliations__

This was carried out in Excel to see the data more easily.

* comp|hosp, comp, comp|ngov, comp|lawf (all Scopus matches) changed to company
* Education from GRID, coll and univ from Scopus changed to education
* resi|edu > education
* HP labs, microsoft research Cambridge, BT research lab, Schlumberger Cambridge Research, Advanced Technologies (Cambridge) Limited, GEC Research Lab, Tube Investments Research Laboratories, Saffron Walden, The Arable Group > company
* HR Walligford, HRWallingford, HR Wallirgford, HRWallingford, Glass Technology Services Ltd > company
* Facility (GRID) > resi
* govt, Government > government
* meds (medical schools) > education
* Healthcare, health, hosp > healthcare
* lawf (law companies) > company
* library, museum > archive
* milo (military) > other
* ngov > government
* ngov|resi > resi
* Nonprofit seems to be a mixture of resi and government - kept as is for now as there are 573 entries for this
* From Other: BioRegional MiniMills UK Ltd (BRMM), J.P. Morgan and others
* poli (policy?) to resi
* public from CH looks like a bit of a shambles (only 51 though)
* Other > other - checked that no companies were involved. Otherwise other is still a bit of a mess

__Heuristic matching__

* 1003 contain school > education
* 848 contain institut > resi
* 389 surgery > healthcare
* 588 medical but not biomedical > healthcare
* 1079 contains ltd or limited > company - __there could be start ups in here - they are matched under string for now__
* 776 contain univ > education
* 354 contain college > education

In [241]:
scopus_df.to_csv(os.getcwd() + r'\data\uk_affils.csv', index=False)

In [248]:
scopus_df = pd.read_csv(os.getcwd() + r'\data\uk_affils.csv')

In [249]:
scopus_df.type.unique()

array(['education', 'healthcare', 'government', 'resi', 'company', nan,
       'non-profit', 'archive', 'other'], dtype=object)

__Consolidate CH and GRID dates__

+ GRID > established date - none missing
+ CH > incorporated date - 29 missing
+ Scopus (278) > GRID (166) > CH (77)

In [256]:
scopus_df.head(1)

Unnamed: 0,af_id,affil_name,affil_city,address,postal_code,lat_lng,type,source,grid_match,best_name,best_score,best_dist,grid_id_match,CH_data,CH_best_name,CH_best_score,CH_number,incorp_date,CH_SICs,CH_address,grid_est_date
0,60030480,University of Bath,Bath,Claverton Down,BA2 7AY,"(51.38044, -2.330673)",education,scopus,"[('University of Bath', 100.0, 48), ('Bath Spa...",University of Bath,100.0,0.208249,,"{'names': ['university of bath (the)', ''], 'n...",university of bath (the),95.0,RC000644,,[],,


In [296]:
def consoliDATE(df):
    """."""
    
    df['date'] = None
    
    for row in df.itertuples():
        
        # We only care about companies
        if row.type == 'company':
            # GRID matches
            if row.source in ['exact_grid', 'fuzzy_grid', 'fuzzy_loc_grid']:
                df.iat[row.Index, 21] = row.grid_est_date[0]
            # CH matches
            elif row.source == 'CH':
                df.iat[row.Index, 21] = row.incorp_date
            # Scopus matches - GRID > CH > np.nan
            elif row.source == 'scopus':
                # GRID
                if row.best_score >= 90 and pd.notnull(row.grid_est_date):
                    df.iat[row.Index, 21] = row.grid_est_date
                # Companies 
                elif row.CH_best_score >= 90:
                    df.iat[row.Index, 21] = row.incorp_date
            else:
                df.iat[row.Index, 21] = np.nan
        else:
            continue
    
    return df

In [288]:
scopus_df = consoliDATE(scopus_df)

# Save to csv

In [292]:
columns = ['af_id', 'affil_name', 'affil_city', 'postal_code', 'type', 'source', 'grid_id_match', 'CH_number', 
           'CH_address', 'date']
scopus_df = scopus_df[columns]

In [295]:
scopus_df.to_csv(os.getcwd() + r'\data\uk_affils.csv', index=False)