# Data pre-processing

- publications and fellows related data are stored in separate csv files  
- all downloaded article texts are in separate text files  

- here we clean and combine all data into a new csv for analysis

# 0: Set up

### Required packages

In [None]:
#data manipulation and organisation
import pandas as pd
import numpy as np

#data pre-processing
import string
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from difflib import SequenceMatcher
from collections import Counter

#plotting
import matplotlib.pyplot as plt

#web scraping
from bs4 import BeautifulSoup
from urllib.request import urlopen

#other
import os, random, json, re, unicodedata

### Count Turing fellows listed on website

In [None]:
def get_page(url):
    """
    download a web page and parse it with BeautifulSoup ('html.parser')

    url: address of the page to fetch
    returns the parsed page as a BeautifulSoup object
    """
    #close the HTTP response once it has been parsed instead of leaving it open
    with urlopen(url) as page:
        soup = BeautifulSoup(page, 'html.parser')
    return soup

#fetch the fellows page and collect one card per listed fellow
url = 'https://www.turing.ac.uk/turing-fellows/'
soup = get_page(url)
all_fellows = soup.find_all('div', attrs={'class':'fellow-card'})

num_web_fellows = len(all_fellows)

#affiliation is saved in the class names - it is the second class of each card
uni_counts = Counter(card.get('class')[1] for card in all_fellows)

### Define paths to data files

publications_eng.csv : contains a list of articles (one per row) and associated information including researcher's name (version returned by AK API), researcher ID, paper ID, title, abstract. These are the articles that Academic Knowledge API returned for each fellow - we do not have a full version of the article for each of these    

turing_AK_IDs.csv : links each 'author_id' to version of fellow name we accept (e.g. AK API might return both 't lyons' and 'terry lyons', we want to store both as 'terry lyons', not as separate researchers)  

papers_final : directory that contains all found article .pdf and converted .txt files (each saved in a directory named after the paper ID)   

All information can be matched using author (researcher) and paper IDs

In [None]:
#directory with one sub-directory per paper ID (holding the .pdf and converted .txt files)
rootdir = 'papers_final'
#directory that holds the csv files
data_dir = 'data_files/'

#csv linking each AK author ID to the accepted version of the fellow's name
fellows = 'turing_AK_IDs.csv'
#csv listing the articles returned by the AK API (one per row)
article_info = 'publications_eng.csv'

# 1: Upload, clean and finalise dataset

### Load, combine and check files - general article and Turing fellows information

In [None]:
#read both csv files; columns that are completely empty are dropped from the publications table
publications = pd.read_csv(data_dir + article_info, encoding = "ISO-8859-1")
publications = publications.dropna(axis=1, how='all')
fellow_ids = pd.read_csv(data_dir + fellows)

#map every spelling of a university name onto one short label
unis = {
    'Cambridge University': 'Cambridge',
    'University of Cambridge': 'Cambridge',
    'Edinburgh University': 'Edinburgh',
    'University of Edinburgh': 'Edinburgh',
    'Oxford University':'Oxford',
    'University of Oxford':'Oxford',
    'University College London':'UCL',
    'Warwick University': 'Warwick',
    'University of Warwick':'Warwick',
}

#map each recorded author ID (columns id_1 to id_3) onto the accepted fellow name
ids = {}
for _, fellow in fellow_ids.iterrows():
    for author_id in fellow['id_1':'id_3']:
        if pd.notnull(author_id):
            ids[author_id] = fellow['name']

#combine website info (webpage full name and affiliated university) with publications info
publications['full_name'] = publications['author_id'].apply(lambda author: ids[author])

def _current_uni(full_name):
    #university recorded on the first fellow row with this name, translated to its short label
    recorded = fellow_ids.loc[fellow_ids['name'] == full_name, 'uni']
    return unis[recorded.values[0]]

publications['current_uni'] = publications['full_name'].apply(_current_uni)

#concatenate AK keywords into single column, skipping the empty keyword cells
keyword_rows = publications.loc[:, 'keyword_0':'keyword_26'].values.tolist()
publications['ak_keywords'] = pd.Series(keyword_rows).apply(
    lambda keywords: '; '.join(k for k in keywords if not pd.isnull(k)))

In [None]:
#check how many fellows are associated with each university - compare against how many are on website
uni_names = publications['current_uni'].unique()
uni_fellows = publications.groupby('current_uni')['full_name'].unique()

#look the fellows up by university label, not by position: groupby sorts its keys while
#unique() keeps order of appearance, so position i can belong to a different university
for uni_name in uni_names:
    print(uni_name + ": " + str(len(uni_fellows[uni_name])) + ' of ' + str(uni_counts[uni_name]) + " fellows")

num_fellows = len(publications['full_name'].unique())
print('\nExpect {0} fellows overall, have: '.format(num_web_fellows) + str(num_fellows))

### Note which articles have a .txt file

In [None]:
#note all unique paper ids - every paper starts as 'no text file found'
paper_ids = list(publications['paper_id'].unique())
found_pdf = dict.fromkeys(paper_ids, 0)

#get information on which papers have a text file
#count is the number of paper directories with at least one .txt file, so a directory
#holding several .txt files is counted as one article
count = 0
root_path = os.path.normpath(rootdir)

for subdir, dirs, files in os.walk(rootdir):
    #the directory name is the paper ID; os.path copes with the path separator of any platform
    subdir_path = os.path.normpath(subdir)
    dir_name = os.path.basename(subdir_path)

    if any(file.endswith(".txt") for file in files):
        try:
            paper_id = int(dir_name)
        except ValueError:
            #e.g. a stray .txt file directly in rootdir - report it rather than crash
            print('Skipping the {0} directory: its name is not a paper ID'.format(dir_name))
            continue
        found_pdf[paper_id] = 1
        count += 1

    #if no .txt file was found (pdf convert failed) - print directory
    #only directories directly below rootdir are reported
    elif os.path.dirname(subdir_path) == root_path:
        print('No text file in the {0} directory'.format(dir_name))

publications['found_pdf'] = publications['paper_id'].apply(lambda x: found_pdf[x])

print('\nHave {0} full articles in text format'.format(count))

# 2: Check for multiple instances of same paper (same/similar paper title, same fellow but saved under 2 unique IDs) - remove duplicates

For each fellow, check all the found paper titles against each other for similarity  

If the titles are the same or sufficiently similar AND PDFs for more than 1 have been found then choose 1 (randomly) and delete the others  

In [None]:
def get_same_titles(publications, titles, name):
    """
    split a list of titles into titles that occur more than once and titles that occur once
    expects publications df with 'paper_id', 'title' and 'full_name' columns

    publications: df that is searched for the paper IDs
    titles: list of titles to check
    name: value of 'full_name' whose papers are looked up

    returns (to_remove, to_check):
    to_remove - one list of paper IDs per repeated title (all rows of this fellow with that title) - can choose one from each list to keep
    to_check - the titles that occur exactly once, in case want to further check these for similarity
    """
    occurrences = Counter(titles)
    by_fellow = publications['full_name'] == name

    #titles recorded once are handed back for the similarity check
    to_check = [title for title, times in occurrences.items() if times == 1]

    #titles recorded multiple times --> collect the IDs of every matching paper of this fellow
    to_remove = [
        list(publications.loc[(publications['title'] == title) & by_fellow]['paper_id'].values)
        for title, times in occurrences.items()
        if times != 1
    ]

    return to_remove, to_check
            
def get_similar_titles(publications, titles, name, threshold):
    """
    function that checks for similar titles in a list of titles
    expects publications df with 'paper_id', 'title' and 'full_name' columns

    publications: df that is searched for the paper IDs
    titles: list of titles to compare against each other
    name: value of 'full_name' whose papers the titles belong to
    threshold: similarity ratio (SequenceMatcher) above which two titles count as the same paper

    returns a list of lists of paper IDs, each inner list identifying one paper -
    can choose one from each list to keep; no paper ID appears in more than one list
    """
    #only look IDs up among this fellow's papers, so that a paper with the same title
    #by another fellow is never picked
    if 'full_name' in publications.columns:
        own_papers = publications.loc[publications['full_name'] == name]
    else:
        own_papers = publications

    def _paper_id(title):
        matches = own_papers.loc[own_papers['title'] == title, 'paper_id']
        if matches.empty:
            #title not recorded under this name - fall back to searching the whole table
            matches = publications.loc[publications['title'] == title, 'paper_id']
        return matches.values[0]

    #look every ID up once instead of once per compared pair
    paper_ids = [_paper_id(title) for title in titles]

    to_remove = []
    marked = set()

    #loop through titles and check for similarity
    for i, title in enumerate(titles):

        #a paper that already belongs to a set must not start a second, overlapping set
        if paper_ids[i] in marked:
            continue

        similar = [paper_ids[i]]

        #only search through papers that come after current
        for j in range(i + 1, len(titles)):

            #check paper has not already been included in a to_remove set
            if paper_ids[j] in marked:
                continue

            score = SequenceMatcher(None, title, titles[j]).ratio()
            if score > threshold:
                similar.append(paper_ids[j])

        if len(similar) > 1:
            to_remove.append(similar)
            marked.update(similar)

    return to_remove

In [None]:
to_keep = {}

def _keep_one_per_set(id_sets):
    #for every set of IDs describing the same paper: shuffle, keep the first ID, drop the rest
    for id_set in id_sets:
        random.shuffle(id_set)
        to_keep[id_set[0]] = 1
        for dropped_id in id_set[1:]:
            to_keep[dropped_id] = 0

#check each fellow, one at a time
for name in publications['full_name'].unique():

    #titles of this fellow's papers that have a text file (found_pdf == 1)
    has_text = (publications['full_name'] == name) & (publications['found_pdf']==1)
    titles = publications.loc[has_text]['title'].values

    #identical titles first; the titles that occur only once are then checked for similarity
    same_papers, to_check = get_same_titles(publications, titles, name)
    _keep_one_per_set(same_papers)

    similar_papers = get_similar_titles(publications, to_check, name, .85)
    _keep_one_per_set(similar_papers)

#every paper that was not part of a duplicate set is kept
for paper_id in publications['paper_id'].unique():
    to_keep.setdefault(paper_id, 1)

In [None]:
#flag every row with the keep (1) / remove (0) decision made for its paper ID
publications['to_keep'] = publications['paper_id'].apply(lambda x: to_keep[x])

print('Excluding {0} repeats'.format((publications['to_keep'] == 0).sum()))

#take an explicit copy: columns are added to this frame later on, and assigning to a
#filtered slice raises pandas' SettingWithCopyWarning
publications = publications.loc[publications['to_keep'] == 1].copy()

# 3: Count number of abstracts and pdfs obtained per researcher - note researchers that do not meet criteria

These are researchers for whom we either found fewer than 5 PDFs OR we found less than 20% of the originally identified papers  
This will be shown in the visualisation   
Save in csv file

In [None]:
#per fellow: number of listed articles and number of articles with a text file
per_fellow = publications.groupby('full_name')
num_pdfs = pd.concat([per_fellow['paper_id'].count(), per_fellow['found_pdf'].sum()], axis=1)
num_pdfs = num_pdfs.rename(columns={'paper_id':'article_count', 'found_pdf':'full_article_count'})

#share of the listed articles for which a text file was found
num_pdfs['proportion_found'] = num_pdfs['full_article_count']/num_pdfs['article_count']
num_pdfs['name'] = num_pdfs.index

In [None]:
to_exclude = {}
author_info = {}

for fellow_key, stats in num_pdfs.iterrows():

    fellow = stats['name']
    uni = publications.loc[publications['full_name']==fellow]['current_uni'].values[0]

    #exclude when fewer than 5 full articles were found, or when under 20% of the listed articles were found
    too_few = stats['full_article_count'] < 5
    too_small_share = stats['proportion_found'] < .2
    exclude = 1 if (too_few or too_small_share) else 0

    to_exclude[fellow_key] = exclude
    #title-cased name, university and exclusion flag - saved for the visualisation
    author_info[fellow] = [fellow.title(), uni, exclude]

num_pdfs['to_exclude'] = num_pdfs['name'].apply(lambda x: to_exclude[x])

#display the fellows that are marked for exclusion
num_pdfs.loc[num_pdfs['to_exclude']==1]

### Save author information to use in visualisation

In [None]:
#write the per-author details (display name, university, exclusion flag) to disk as json
with open('visualisation/author_info.json', 'w') as out_file:
    json.dump(author_info, out_file)

# 5: Clean text data (and extract keywords)

### Set-up for pre-processing

In [None]:
#define stop words - usual + mention of researcher names and affiliations
stop_words = set(stopwords.words('english'))|set(publications['current_uni'].unique())|set(publications['name'].unique())

#text is filtered one word at a time, so a multi-word name can only ever match through its
#individual words - add those (lower-cased) next to the original entries
stop_words |= {word.lower()
               for entry in list(stop_words) if isinstance(entry, str)
               for word in entry.split()}

#keep only letters and spaces i.e. remove digits, special characters, punctuation
#raw string: '\s' in a normal string literal is an invalid escape sequence
pattern = re.compile(r"[^A-Za-z\s]+")

#set up stemmer
stemmer = PorterStemmer()

def clean_text(line):
    """
    clean one line of text: keep letters only, drop stop words and over-long tokens, stem the rest

    line: string to clean
    returns the remaining stemmed words joined by single spaces
    relies on the module-level pattern, stop_words and stemmer
    """
    #replace '/' and '-' with spaces rather than remove
    line = line.replace("/", " ").replace("-", " ")

    #remove digits and special characters
    line = pattern.sub('', line)

    #split() already discards repeated, leading and trailing white-space
    #remove stop words - the NLTK stop words are lower case, so also compare the lower-cased
    #word; otherwise capitalised stop words such as 'The' stay in the text
    #longest word in major dictionary has 45 letters - remove anything longer
    kept = [word for word in line.split()
            if word not in stop_words
            and word.lower() not in stop_words
            and len(word) <= 45]

    return ' '.join(stemmer.stem(word) for word in kept)

### Load and clean all retrieved .txt files 

In [None]:
papers = {}

#build the set of paper IDs still in the dataset once, instead of once per file
wanted_ids = set(publications['paper_id'].unique())

for subdir, dirs, files in os.walk(rootdir):
    for file in files:
        if file.endswith(".txt"):
            #the directory name is the paper ID; os.path copes with the path separator of any platform
            paper_id = os.path.basename(os.path.normpath(subdir))

            #skip text files in directories that are not named after a paper ID
            if not paper_id.isdigit():
                continue

            if int(paper_id) in wanted_ids:
                path = os.path.join(subdir, file)

                #the with-block closes the file even if cleaning a line fails
                with open(path, 'r') as my_file:
                    #every cleaned line is followed by a single space
                    text = "".join(clean_text(line).lower() + " " for line in my_file)

                #keyed by the ID as a string (the directory name)
                papers[paper_id] = text

### Save full text + document lengths alongside other article information in new df 

In [None]:
#attach the cleaned text to each paper - empty string when no text was loaded for it
publications['full_text'] = publications['paper_id'].apply(lambda x: papers.get(str(x), ""))

#explicit copy, so that adding a column below does not write to a slice of publications
to_analyse = publications.loc[publications['full_text'] != ""].copy()

#count words with split(): cleaning leaves repeated and trailing spaces in the text,
#and split(" ") would count each of the resulting empty strings as a word
to_analyse['doc_length'] = to_analyse['full_text'].apply(lambda x: len(x.split()))

to_analyse = to_analyse.reset_index(drop = True)

# 6: Check lengths of documents and remove outliers (texts that are too short/long)

NOTE: these word counts are after stop words have been removed (but before low and high frequency words have been removed)  

At the moment we are removing texts that are shorter than 500 words and texts that are longer than 20 000 words  
These thresholds are fairly arbitrary, but they are very lenient and based on what we would expect from an article

In [None]:
#summary statistics of the document lengths
to_analyse['doc_length'].describe()

In [None]:
#histogram of the document lengths; bins='auto' leaves the choice of bins to the plotting library
doc_lengths = to_analyse['doc_length']
plt.hist(doc_lengths, bins='auto')
plt.show()

In [None]:
#count the matching rows directly: DataFrame.count() gives non-null values per column,
#so reading the first column's count under-reports whenever that column has missing values
print('number of articles that have fewer than 500 words: ', (to_analyse['doc_length'] < 500).sum())
print('number of articles that are more than 20 000 words long: ', (to_analyse['doc_length'] > 20000).sum())

In [None]:
#keep documents of 500 to 20 000 words (both limits included) and renumber the rows
within_limits = to_analyse['doc_length'].between(500, 20000)
to_analyse = to_analyse.loc[within_limits].reset_index(drop=True)

print('{0[0]} articles remain for analysis'.format(to_analyse.shape))

In [None]:
#cut the data at the midpoint so that it can also be saved as two smaller files
split = to_analyse.shape[0] // 2

first_half, second_half = to_analyse.iloc[:split], to_analyse.iloc[split:]

# 7: Save data in a new csv file

In [None]:
#select columns of interest
cols = (['author_id', 'full_name', 'current_uni']
        + list(to_analyse.loc[:, 'paper_id':'title'].columns)
        + list(to_analyse.loc[:, 'full_text':'doc_length'].columns)
        + ['ak_keywords'])

#save file - full dataset but also split into halfs (i.e. smaller file size)
#the halves get the same column selection as the full file, so the three files agree;
#files go to data_dir, the directory the input csv files are read from
final_dataset = to_analyse[cols]
final_dataset.to_csv(data_dir + 'final_dataset_full.csv', index = False)
first_half[cols].to_csv(data_dir + 'final_dataset_1.csv', index=False)
second_half[cols].to_csv(data_dir + 'final_dataset_2.csv', index=False)