# Purpose


+ Clean data from Scopus into a more friendly format
+ Get data on affiliations from Scopus
+ Identify UK affiliations

# Import Dependencies

In [1]:
import pandas as pd
import numpy as np
import glob
import os
from pybliometrics.scopus import AffiliationRetrieval

# Load Data

In [2]:
data = pd.read_csv(os.getcwd() + r'\data\data.csv')

# Cleaning

Separate rows into individual affiliations and keep only UK based affiliations.

In [3]:
def string_to_list(df):
    """Takes in raw Scopus dataframe and splits affiliation data
    for each paper into lists of affiliations."""

    # Remove special characters
    df['affilname'] = df.affilname.str.replace('&amp;', '')
    df['affilname'] = df.affilname.str.replace('acute;', '')

    # Split the fields by semi-colon
    df['afid'] = df['afid'].str.split(pat=';')
    df['affilname'] = df['affilname'].str.split(pat=';')
    df['affiliation_city'] = df['affiliation_city'].str.split(pat=';')
    df['affiliation_country'] = df['affiliation_country'].str.split(pat=';')

    # Create empty list if nan - deal with bug finding indices
    bug_catcher = lambda x: x if isinstance(x, list) else []
    df['affilname'] = df['affilname'].apply(bug_catcher)
    df['affiliation_country'] = df['affiliation_country'].apply(bug_catcher)
    
    return df

In [4]:
def find_indices(df):
    """Updates a data frame to add a column indicating whether
    the list of affiliations for each paper are based in the UK."""
    
    get_uk_idx = lambda x: [1 if i == 'United Kingdom' else 0 for i in x]
    df['encoding'] = df['affiliation_country'].apply(get_uk_idx)
    
    return df

In [5]:
def add_UK_affil_data(df):
    """Find all UK affiliations for each paper using the
    boolean array entitled 'encoding'. Create new colums
    with only UK affiliation data."""
    
    # Add columns for UK only to df
    df['uk_afid'] = None
    df['uk_affil_name'] = None
    df['uk_affil_city'] = None
    
    id_errors = 0
    name_errors = 0
    city_errors = 0
    
    # Loop through df and create an array of only uk affiliations
    for row in df.itertuples():
        
        # Create boolean array
        idx = row.Index
        bool_list = list(map(bool, row.encoding))
        encoding_array = np.array(bool_list) 
        
        # Create arrays to be indexed by boolean array
        affil_array = np.array(row.affilname)
        afid_array = np.array(row.afid)
        affil_city_array = np.array(row.affiliation_city)
        
        # Try setting UK affiliation id array
        try:
            uk_afid = afid_array[encoding_array]
            df.iat[idx, 11] = uk_afid
        except:
            df.iat[idx, 11] = np.nan
            id_errors += 1
        
        # Try setting UK affiliation name array
        try:
            uk_affil_name = affil_array[encoding_array]
            df.iat[idx, 12] = uk_affil_name
        except:
            df.iat[idx, 12] = np.nan
            name_errors += 1
        
        # Try setting UK affiliation city array
        try:
            uk_affil_city = affil_city_array[encoding_array]
            df.iat[idx, 13] = uk_affil_city
        except:
            df.iat[idx, 13] = np.nan
            city_errors += 1

    print('ID errors:', id_errors)
    print('Name errors:', name_errors)
    print('City errors:', city_errors)

    return df

__Execute__

In [6]:
# Split string of affiliation data into lists
data = string_to_list(data)

# Remove papers that do not have affiliations
before = data.shape[0]
data.dropna(subset=['afid'], inplace=True)
print(f'Removed {before - data.shape[0]} out of {data.shape[0]} due to no affiliation data provided.')

# Reset index
data.reset_index(drop=True, inplace=True)

# Add boolean array to locate UK affiliations
data = find_indices(data)

# Locate UK affiliations
data = add_UK_affil_data(data)

Removed 1032 out of 79170 due to no affiliation data provided.
ID errors: 28
Name errors: 33
City errors: 535


In [7]:
data.head(1)

Unnamed: 0.1,Unnamed: 0,eid,title,coverDate,source_id,publicationName,afid,affilname,affiliation_city,affiliation_country,encoding,uk_afid,uk_affil_name,uk_affil_city
0,0,2-s2.0-0034739787,Regulation of steroid sulphatase expression an...,2000-12-31,14102,Journal of Steroid Biochemistry and Molecular ...,"[60030480, 60022871]","[University of Bath, St Mary's Hospital]","[Bath, London]","[United Kingdom, United Kingdom]","[1, 1]","[60030480, 60022871]","[University of Bath, St Mary's Hospital]","[Bath, London]"


### Error Analysis

__Papers that had ID errors__

These were errors because they do not have country data and therefore could not produce an encoding array for country. It seems like the majority of them are just on institution and are UK related (due to the UK search). Therefore, I'm going to keep these in the data set.

__Papers that had name errors__

These include the same errors as the above BUT there are more. The additional errors come from poor formatting of the institute names. I think it comes from the fact that some institute names have ';' within them. This means that they are split into different institutions during the extraction of data. Since these are relatively few and far between, I don't think it is unreasonable to drop these. For example, 2000 has 5, 2001 has 6, 2002 has 12.

__Papers that had city errors__

These cause errors because the affiliation does not have a city associated with it. This could have happened because a non-UK institution didn't have a city with it though. I would like to keep the papers that only have affilname.

I would like to look at the entries that have length more than 1 for affilname - none of them are! Therefore, I am happy to keep them all. To keep these records, the uk_afid needs to be complete.

__Add extra affiliations__

If there was only one affiliation but no city was reported, add this affiliations to the UK affiliations.

In [8]:
def add_extra_afid(x):
    if (isinstance(x['uk_affil_city'], float)) & (len(x['affilname']) == 1):
        return x['afid']
    else:
        return x['uk_afid']
    
def add_extra_affilname(x):
    if (isinstance(x['uk_affil_city'], float)) & (len(x['affilname']) == 1):
        return x['affilname']
    else:
        return x['uk_affil_name']

data['uk_afid'] = data.apply(lambda x: add_extra_afid(x), axis=1)
data['uk_affil_name'] = data.apply(lambda x: add_extra_affilname(x), axis=1)

# Create a dataframe of UK affiliations

In [9]:
def create_uk_df(df):
    
    af_ids = []
    affil_names = []
    affil_cities = []

    for row in df.itertuples():

        names = row.uk_affil_name
        # Skip papers where UK num_ids != num_names (due to name splitting error)
        if isinstance(names, float):
            if pd.isnull(names):
                continue
        # Skip papers with no UK institutions that somehow sneaked in
        elif isinstance(row.encoding, np.ndarray):
            if names.all() == 0:
                continue

        # Loop through each institution associated with this paper
        for j, afid in enumerate(row.uk_afid):

            # If institution not already in list, add to list
            if afid not in af_ids:
                af_ids.append(afid)
                affil_names.append(row.uk_affil_name[j])
                # Some affiliations do not have city data
                try:
                    affil_cities.append(row.uk_affil_city[j])
                except:
                    affil_cities.append(np.nan)
    
    d = {'af_id': af_ids, 'affil_name': affil_names, 'affil_city': affil_cities}
    formatted_df = pd.DataFrame(data=d)
    
    return formatted_df


uk_only_df = create_uk_df(data)

In [10]:
uk_only_df.head()

Unnamed: 0,af_id,affil_name,affil_city
0,60030480,University of Bath,Bath
1,60022871,St Mary's Hospital,London
2,60022148,University College London,London
3,60011520,King's College London,London
4,60003771,The University of Manchester,Manchester


# Get affiliation data from Scopus

In [11]:
def get_affil_data(df):
    """Loop through affiliations and try to retrieve
    corresponding data from Scopus."""
    
    # Create columns to hold data
    df['address'] = ''
    df['postal_code'] = ''
    df['type'] = ''
    
    # Scopus search result does not work
    not_working = [4423, 5413]
    
    # Loop over affiliations
    for row in df.itertuples():
        
        if row.Index in not_working:
            continue
        
        if row.postal_code == '':
            try:
                resp = AffiliationRetrieval(row.af_id)
                df.iat[row.Index, 3] = resp.address
                df.iat[row.Index, 4] = resp.postal_code
                df.iat[row.Index, 5] = resp.org_type
            except Exception as e:
                print(row.Index, e)
                return df
        else:
            continue
            
    return df


uk_only_df = get_affil_data(uk_only_df)

In [12]:
uk_only_df.head()

Unnamed: 0,af_id,affil_name,affil_city,address,postal_code,type
0,60030480,University of Bath,Bath,Claverton Down,BA2 7AY,univ
1,60022871,St Mary's Hospital,London,Praed Street,W2 1NY,hosp
2,60022148,University College London,London,Gower Street,WC1E 6BT,univ
3,60011520,King's College London,London,Strand,WC2R 2LS,univ
4,60003771,The University of Manchester,Manchester,Oxford Road,M13 9PL,univ


# Summary of Scopus data

Of the 48060 affiliations:

* 16537 do not have post code
* 45132 do not have affil type
* 16422 do not have post code or affil type

Of the 2928 that have affil type:

* 887 are companies
* 134 are government
* 807 are hospitals
* 370 are universities
* 297 are residencies
* 238 are national governments

# Save UK affiliations

In [13]:
uk_only_df.to_csv(os.getcwd() + r'\data\uk_affils.csv', index=False)