# Topic Modelling (LDA) of Turing Institute publications 

# 0: Set up

### Required packages

In [None]:
#data manipulation and organisation
import pandas as pd
import numpy as np

#topic modelling
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

#visualisations
import pyLDAvis
from pyLDAvis import sklearn
import seaborn as sns
import matplotlib.pyplot as plt

#other
import random, pkg_resources, os, json

In [None]:
# report the installed scikit-learn release next to the one this notebook was written for
installed_version = pkg_resources.get_distribution("scikit-learn").version
print('Require sklearn version 0.19, have: ' + installed_version)
print("If have lower version, need to change 'n_components' to 'n_topics' when calling LatentDirichletAllocation")

In [None]:
##might need to download nltk corpora and packages
#import nltk
#nltk.download()

# 1: Load data and check

In [None]:
# prefer the single combined file; otherwise stitch the dataset together from its two parts
full_path = 'data_files/final_dataset_full.csv'
if not os.path.isfile(full_path):
    part_paths = ['data_files/final_dataset_1.csv', 'data_files/final_dataset_2.csv']
    publications = pd.concat([pd.read_csv(path) for path in part_paths])
else:
    publications = pd.read_csv(full_path)

# shorter column names, which are the ones the later cells refer to
publications = publications.rename(columns={'full_name': 'name', 'current_uni': 'uni'})
publications.head()

In [None]:
#check how many fellows are associated with each university
uni_names = publications['uni'].unique()
uni_fellows = publications.groupby('uni')['name'].unique()

# iterate over the grouped result itself so every count is printed next to its own
# university: unique() keeps order of appearance while groupby() sorts its keys, so
# pairing the two by position would attach counts to the wrong names
for uni, fellows in uni_fellows.items():
    print(uni + ": " + str(len(fellows)) + " fellows")

num_fellows = len(publications['name'].unique())
print('\nExcpect 138 fellows overall, have: ' + str(num_fellows))

# 2: Note instances of multiple Turing fellows associated with 1 paper - remove duplicates based on paper ID (but keep note of all authors to attribute later)

`paper_id_to_authors` dictionary: the keys are all the unique paper IDs  
Each value is the list of authors associated with that paper (most papers have 1 author, but some have multiple)

In [None]:
# map every paper id to the list of fellows recorded against it
paper_id_to_authors = {}
for _, paper in publications.iterrows():
    paper_id_to_authors.setdefault(paper['paper_id'], []).append(paper['name'])

# keep one row per paper and renumber the rows 0..n-1
publications_data = (
    publications
    .drop_duplicates(subset='paper_id')
    .reset_index(drop=True)
)

print('Dataset contains {0[0]} unique articles'.format(publications_data.shape))

# 3: LDA

### Create document-term matrix from text data

In [None]:
#1: create vector representation of vocabulary
vectorizer = CountVectorizer(max_df=0.95, min_df=2)

#create document_term_matrix (the text column is cast to unicode strings first)
dtm = vectorizer.fit_transform(publications_data['full_text'].values.astype('U'))

#retrieve word names at each vocabulary position
# newer scikit-learn releases replaced get_feature_names() with get_feature_names_out();
# use whichever the installed version provides so the cell runs on both
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = list(vectorizer.get_feature_names_out())
else:
    feature_names = vectorizer.get_feature_names()

### Train model

In [None]:
# number of topics the model is asked to find
n_topics = 25

# model settings collected in one place; random_state is fixed at 0
lda_settings = dict(
    n_components=n_topics,
    max_iter=50,
    learning_method='online',
    random_state=0,
)
lda = LatentDirichletAllocation(**lda_settings)

lda.fit(dtm)

### Visualise topics (LDAvis)

In [None]:
# enable_notebook must actually be called; the bare attribute reference did nothing
pyLDAvis.enable_notebook()

# build the LDAvis data from the fitted model, the document-term matrix and the vectorizer
prepared_data = pyLDAvis.sklearn.prepare(lda, dtm, vectorizer)
pyLDAvis.display(prepared_data)

## 4: Organise LDA outputs

### Retrieve topic_term and document_topic distributions

In [None]:
def df_with_names(data, index_name, columns_name):
    """
    wrap data in a DataFrame whose index and columns axes carry the given names

    data: a DataFrame (or subclass) or anything the DataFrame constructor accepts
    index_name: name given to the row axis
    columns_name: name given to the column axis
    returns: a new DataFrame; when data is already a DataFrame only its values are
    kept, so rows and columns are renumbered from 0
    """
    # isinstance (rather than an exact type comparison) so DataFrame subclasses are
    # renumbered too instead of keeping their index and having it renamed
    if isinstance(data, pd.DataFrame):
        #we want our index to be numbered
        df = pd.DataFrame(data.values)
    else:
        df = pd.DataFrame(data)
    df.index.name = index_name
    df.columns.name = columns_name
    return df

def series_with_name(data, name):
    """
    return data as a Series called name with a numeric 0..n-1 index

    data: a Series (or subclass) or anything the Series constructor accepts
    name: name for the returned Series
    returns: a new Series; a Series passed in is left untouched
    """
    if isinstance(data, pd.Series):
        #ensures a numeric index; reset_index returns a new object, so setting the
        #name here does not rename the caller's Series
        renamed = data.reset_index(drop=True)
        renamed.name = name
        return renamed
    return pd.Series(data, name=name)

In [None]:
# divide every topic's row of term weights by its row total
topic_term_weights = lda.components_
row_totals = topic_term_weights.sum(axis=1)[:, np.newaxis]
topic_term_dists = df_with_names(topic_term_weights / row_totals, 'topic', 'term')

# topic distribution of every document in the document-term matrix
doc_topic_dists = df_with_names(lda.transform(dtm), 'doc', 'topic')

## 5:  Format data for visualisation

### Transform document_topic distribution to fellow_topic and institute_topic information

publications_data df:  
each row of the df corresponds to row of same index in doc_topic_dists   
this allows us to get paper_id for the doc_topic_distribution of any paper   
we can then match the paper_id against known Turing fellows associated with that paper  

For each fellow, we take the average of the topic distributions of their documents and multiply it by 100 to turn the proportions into percentages -- this reflects the average percentage of each topic assigned to their articles (which is based on how many words in each document have been assigned to each topic)  

Overall topic importance is the average of above topic value assignment across all fellows  - this means each fellow contributes equally to the topic importance/size evaluation (even if they overall contributed fewer papers)   

### Extract topic proportions for each author

In [None]:
author_names = publications_data['name'].unique()
author_topic_dists = {}
author_counts = {}

for index, row in publications_data.iterrows():
    # publications_data was renumbered 0..n-1 after de-duplication, so its row label
    # is also the row position in doc_topic_dists
    paper_topics = doc_topic_dists.iloc[index]

    #most papers have 1 author but some have multiple so this assures paper topics are assigned to all relevant authors
    for name in paper_id_to_authors[row['paper_id']]:
        if name in author_topic_dists:
            # build a new Series instead of using += : an in-place add on a row taken
            # from doc_topic_dists can write through to that frame and to the totals
            # of co-authors that started from the same row
            author_topic_dists[name] = author_topic_dists[name] + paper_topics
            author_counts[name] += 1
        else:
            # independent copy so later updates never touch doc_topic_dists
            author_topic_dists[name] = paper_topics.copy()
            author_counts[name] = 1

# percentage of topics per author: mean over the author's papers, times 100
for name in author_topic_dists:
    if author_topic_dists[name].sum() != 0:
        author_topic_dists[name] = author_topic_dists[name] / author_counts[name] * 100

### Extract overall institute topic "importance" (size) information

In [None]:
# importance of a topic = its per-author percentages summed, divided by num_fellows;
# total accumulates the importance of all topics
topic_importance, total = {}, 0
for topic in range(n_topics):
    running = 0
    for dist in author_topic_dists.values():
        running += dist[topic]
    topic_importance[topic] = running / num_fellows
    total += topic_importance[topic]

### Combine all topic information in one df - save topic order

In [None]:
# one column per author (rows are topics) plus a single column of overall topic importance
author_topic_info = pd.DataFrame(author_topic_dists)
topic_info = pd.DataFrame.from_dict(topic_importance, orient='index')

##order topics by topicVal (overall topic importance), largest first
merged = (
    pd.concat([author_topic_info, topic_info], axis=1)
    .rename(columns={0: 'topicVal'})
    .sort_values(by='topicVal', ascending=False)
)

# topic numbers in order of decreasing importance
topic_order = merged.index.values

### Combine topics that have assigned topic value < 1.5 (out of 100) into 1 topic that will be labeled 'other'

In [None]:
# split the topics at a topicVal of 1.5
to_combine = merged.loc[merged['topicVal'] < 1.5]
key_topics = merged.loc[merged['topicVal'] >= 1.5]

# add the column-wise sum of the small topics as one extra row;
# pd.concat is used because DataFrame.append is no longer available in pandas 2.x
# NOTE(review): ignore_index renumbers the rows, so the original topic numbers are
# not kept in key_topics - confirm that is what the visualisation expects
other_row = to_combine.sum().to_frame().T
key_topics = pd.concat([key_topics, other_row], ignore_index=True)

### Save to csv

In [None]:
# write the topic table to csv; the row index is stored under the header 'topicNum'
key_topics.to_csv("visualisation/data_final.csv", index_label='topicNum')

## 6: Determine order in which to display researchers - ordered by university, by most prevalent topic

### Order researchers by university + extract prevalent topic info

In [None]:
# researchers_dict[uni][topic_num] -> list of [name, topic_val] for the researchers at
# that university whose most prevalent topic is topic_num
researchers_dict = {}

#use original df to search for uni in case some researchers were removed during duplicates deletion
# (first university listed for each researcher, looked up once instead of per column)
name_to_uni = publications.drop_duplicates(subset='name').set_index('name')['uni']

for column in merged:

    # 'topicVal' holds the overall topic importance, not a researcher
    if column == 'topicVal':
        continue

    name = column

    # most prevalent topic for this researcher and its value; idxmax finds it without
    # sorting the whole frame (on an exact tie the first topic in merged's order wins)
    topic_num = merged[column].idxmax()
    topic_val = merged.at[topic_num, column]

    uni = name_to_uni.loc[name]
    researchers_dict.setdefault(uni, {}).setdefault(topic_num, []).append([name, topic_val])

### Order researchers in descending order by topic (within each university)

In [None]:
# universities alphabetically; within each, topics in topic_order; within each topic,
# researchers by descending topic value
researchers_order = []
for uni in sorted(researchers_dict):
    by_topic = researchers_dict[uni]
    for topic in topic_order:
        if topic not in by_topic:
            continue
        group = by_topic[topic]
        group.sort(key=lambda entry: entry[1], reverse=True)
        researchers_order.extend(entry[0] for entry in group)

### Save author order to use in visualisation

In [None]:
# write the ordered list of researcher names as JSON for the visualisation
with open('visualisation/author_order_final.json', 'w') as fp:
    json.dump(researchers_order, fp)

## 7: Topic interpretation

### Extract top documents for each topic (with associated top words and titles and AK API keywords)

For each topic we extract top N words associated with that topic (this is very standard and is the same as displayed in the LDAvis visualisation)  

Further, for each topic we get the titles of key articles associated with that topic - this makes topics easier to interpret   

Each document has been assigned some proportion of each topic (with sum of all topic proportions within document = 1). For each topic, we ordered documents based on how much of that topic they were assigned. Then we looped through the documents extracting top 10 key article titles with the condition that if a researcher already contributed 2 papers to the top 10 key titles then we skipped their subsequent papers. This ensured that the top 10 key titles contained contributions of at least 5 researchers (as compared to trying to evaluate the meaning of a topic based on articles written by only one Turing fellow). This seemed reasonable given most topics were assigned to a number of researchers.  

### Get N top words for different values of lambda (1.0, 0.6, 0.2) -- from N most frequent to N most unique

In [None]:
def calc_topic_freq(dtm, doc_topic_dists):
    """
    same as LDAvis, define topic frequency by proportion of words assigned to topic

    dtm: document-term matrix (documents in rows); sparse or dense
    doc_topic_dists: DataFrame of documents x topics, rows in the same order as dtm
    returns: Series indexed by topic, the document-length-weighted sum of each topic's share
    """
    # np.asarray(...).ravel() flattens the row sums whether dtm.sum returns a matrix
    # (scipy sparse matrices) or a plain array (dense input, sparse arrays)
    doc_lengths = np.asarray(dtm.sum(axis=1)).ravel()
    topic_freq = (doc_topic_dists.T * doc_lengths).T.sum()

    return topic_freq
    
def get_relevance(topic_freq, topic_term_dists, lambda_):
    """
    relevance of every term to every topic, as a mix of log probability and log lift

    lambda_ = 1: ranking is the same as the top N most frequent words of the topic
    lambda_ = 0: ranking favours words unique to the topic (not shared by other topics)
    values in between blend the two

    topic_freq: Series of topic frequencies, indexed by topic
    topic_term_dists: DataFrame of topics x terms
    returns: DataFrame of topics x terms holding the relevance scores
    """
    # overall share of each term: topic rows weighted by topic frequency, then summed
    weighted = (topic_term_dists.T * topic_freq).T
    term_totals = np.sum(weighted, axis=0)
    term_share = term_totals / term_totals.sum()

    log_prob = np.log(topic_term_dists)
    log_lift = np.log(topic_term_dists / term_share)

    return lambda_ * log_prob + (1 - lambda_) * log_lift

def get_top_words(relevance, feature_names, n_top):
    """
    function to extract n_top words per topic using relevance (which determines what words are ordered by)

    relevance: topics x terms scores (DataFrame or 2-D array)
    feature_names: sequence giving the word at each term position
    n_top: number of words to return per topic
    returns: dict mapping topic position -> list of n_top words, highest score first
    """
    # np.asarray replaces DataFrame.as_matrix(), which pandas no longer provides
    scores = np.asarray(relevance)
    top_by_topic = {}

    for idx, row in enumerate(scores):
        # term positions from highest to lowest score, cut to n_top
        ranked = row.argsort()[::-1][:n_top]
        top_by_topic[idx] = [feature_names[i] for i in ranked]

    return top_by_topic

def topic_top_words(topic_term_dists, topic_num, feature_names, n_top):
    """
    similar to get_top_words but returns topic info for single topic only
    returns most frequent words for that topic

    topic_term_dists: DataFrame of topics x terms
    topic_num: index label of the topic to look up
    feature_names: sequence giving the word at each term position
    n_top: number of words to return
    returns: list of n_top words, most frequent first
    """
    topic_term_dist = topic_term_dists.loc[topic_num]
    # honour the n_top argument (the global n_words was used here before)
    ranked = np.asarray(topic_term_dist).argsort()[::-1][:n_top]
    top_words = [feature_names[i] for i in ranked]

    return top_words

In [None]:
# number of words kept per topic, and the lambda values to rank them with
n_top = 10
lambdas = [1, .6, .2]
topic_freq = calc_topic_freq(dtm, doc_topic_dists)

# top_words_dict[lambda][topic] -> list of the n_top highest-relevance words
top_words_dict = {
    lambda_: get_top_words(
        get_relevance(topic_freq, topic_term_dists, lambda_), feature_names, n_top
    )
    for lambda_ in lambdas
}

### Collate all topic info

In [None]:
#create dictionary with all information of interest for each topic
#topic_doc_dict[topic] = [[doc indexes ordered by topic share], {lambda: top words},
#                         key titles, AK keywords, researcher names]
topic_doc_dict = {}
all_names = []

num_papers = 10
n_words = 15

#loop through topics
for column in doc_topic_dists:

    #order docs by how much topic assigned + save indexes of top docs
    #the sorted frame gets its own name so doc_topic_dists keeps its original row order
    #for every other cell that reads it
    ranked_docs = doc_topic_dists.sort_values([column], ascending=False)
    indexes = ranked_docs.index.values
    topic_doc_dict[column] = [[indexes]]

    #get top words for all 3 lambda values
    topic_doc_dict[column].append({
        1: top_words_dict[1][column],
        .6: top_words_dict[.6][column],
        .2: top_words_dict[.2][column],
    })

    #loop through indexes and retrieve document information
    titles, keywords, names = [], [], []
    topic_names = {}
    # NOTE(review): top_n is filled below but never stored in topic_doc_dict - confirm
    # whether it is still needed
    top_n = []

    for i in indexes:

        #stop as soon as the requested number of papers has been collected
        if len(titles) >= num_papers:
            break

        #get paper ID and use that to retrieve remaining information
        #(.at replaces DataFrame.get_value, which pandas no longer provides)
        paper_id = publications_data.at[i, 'paper_id']
        title = publications_data.at[i, 'title']
        topic_share = ranked_docs.at[i, column]
        authors = paper_id_to_authors[paper_id]

        #retrieve top N article titles + associated authors
        if len(top_n) < num_papers:
            top_n.append([title, topic_share] + list(authors))

        #count this paper against each of its authors; the paper is kept if at least
        #one author is on their first or second paper for this topic
        to_add = False
        for name in authors:
            if name in topic_names:
                topic_names[name] += 1
            else:
                topic_names[name] = 1
                names.append(name)

            if topic_names[name] <= 2:
                to_add = True

        if to_add:
            titles.append([title, topic_share])
            #row i is the row holding this paper_id (paper ids are unique after
            #de-duplication), so the keywords can be read directly by label
            ak_keywords = publications_data.at[i, 'ak_keywords']
            #only split real keyword strings; anything else (e.g. NaN) is skipped
            if isinstance(ak_keywords, str):
                keywords += ak_keywords.split('; ')

    #save all extracted information
    topic_doc_dict[column].append(titles)
    topic_doc_dict[column].append(list(set(keywords)))
    topic_doc_dict[column].append(set(names))

    all_names.extend(names)

print('Number of researchers that have been listed in at least one set of top 10 papers: ', len(set(all_names)))

### Inspect results

In [None]:
#print results of top 10 key article titles
n = 1
for topic in topic_order:
    data = topic_doc_dict[topic]
    print('Topic: ', n, "\n")

    for key in data[1]:
        print('Top words lambda ' + str(key) + ': ', *data[1][key], '\n')

    print("Article titles:")
    for i in data[2]:
        print(i[0]) 
        print(i[1],'\n')

    print("AK keywords: ", *data[3], '\n')

    print("Researchers: ", *data[4], '\n')
    
    n += 1

## OTHER

### Re-order merged, doc_topic and topic_term distributions by overall topic size

In [None]:
def get_topic_order(dtm, doc_topic_dists):
    """
    function which returns same topic order as LDAvis

    dtm: document-term matrix (documents in rows); sparse or dense
    doc_topic_dists: DataFrame of documents x topics with a 0..n-1 row index
    returns: Index of topic labels, largest share of words first
    """
    # np.asarray(...).ravel() flattens the row sums whether dtm.sum returns a matrix
    # or a plain array; the Series index 0..n-1 is matched against the document labels
    doc_lengths = pd.Series(np.asarray(dtm.sum(axis=1)).ravel(), name='doc_length')
    topic_freq = (doc_topic_dists.T * doc_lengths).T.sum()
    topic_proportion = (topic_freq / topic_freq.sum()).sort_values(ascending=False)

    return topic_proportion.index

def set_topic_order(topic_term_dists, doc_topic_dists, topic_order):
    """
    reorder both distributions so topics follow topic_order

    topic_term_dists: DataFrame of topics x terms (topics in the index)
    doc_topic_dists: DataFrame of documents x topics (topics in the columns)
    topic_order: sequence of topic labels in the wanted order
    returns: (topic_term_dists, doc_topic_dists) with rows / columns reordered
    """
    # label-based .loc replaces .ix, which pandas no longer provides
    topic_term_dists = topic_term_dists.loc[topic_order]
    doc_topic_dists = doc_topic_dists[topic_order]

    return topic_term_dists, doc_topic_dists

def rename_topic_order(topic_term_dists, doc_topic_dists):
    """
    renumber topics 0..n-1 in their current order

    topic_term_dists: DataFrame of topics x terms (topics in the index)
    doc_topic_dists: DataFrame of documents x topics (topics in the columns)
    returns: (topic_term_dists, doc_topic_dists) as new frames whose topic axis is
    numbered from 0 and named 'topic'; the frames passed in are not modified
    """
    topic_term_dists = topic_term_dists.reset_index(drop=True)
    topic_term_dists.index.name = 'topic'

    # work on a copy so the caller's frame keeps its labels, and number the columns
    # from the frame's own width rather than assuming 25 topics
    doc_topic_dists = doc_topic_dists.copy()
    doc_topic_dists.columns = list(range(doc_topic_dists.shape[1]))
    doc_topic_dists.columns.name = 'topic'

    return topic_term_dists, doc_topic_dists

In [None]:
# topic labels in the row order of merged
topic_order = merged.index.values

# reorder both distributions by topic_order, then renumber the topics from 0
topic_term_dists, doc_topic_dists = rename_topic_order(
    *set_topic_order(topic_term_dists, doc_topic_dists, topic_order)
)

# give merged the same 0..n-1 numbering
merged = merged.reset_index(drop=True)