# Topic Modelling (LDA) of Turing Institute publications 

# 0: Set up

### Required packages

In [None]:
#data manipulation and organisation
import pandas as pd
import numpy as np

#topic modelling
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

#visualisations
import pyLDAvis
from pyLDAvis import sklearn
import seaborn as sns
import matplotlib.pyplot as plt

#other
import random, pkg_resources, os

In [None]:
# Show the installed scikit-learn version next to the version this notebook was written for.
installed_sklearn = pkg_resources.get_distribution("scikit-learn").version
print('Require sklearn version 0.19, have: {}'.format(installed_sklearn))
print("If have lower version, need to change 'n_components' to 'n_topics' when calling LatentDirichletAllocation")

In [None]:
##might need to download nltk corpora and packages
#import nltk
#nltk.download()

# 1: Load data and check

In [None]:
# Load the publication records: use the single combined file when it exists,
# otherwise read the two-part export and stack the parts.
full_dataset_path = 'data_files/final_dataset_full.csv'
if not os.path.isfile(full_dataset_path):
    publications_1 = pd.read_csv('data_files/final_dataset_1.csv')
    publications_2 = pd.read_csv('data_files/final_dataset_2.csv')
    publications_data = pd.concat([publications_1, publications_2])
else:
    publications_data = pd.read_csv(full_dataset_path)

# shorter column names used throughout the rest of the notebook
column_renames = {'full_name': 'name', 'current_uni': 'uni'}
publications_data = publications_data.rename(columns=column_renames)

publications_data.head()

In [None]:
# One row per (fellow, article) record at this point - duplicates are removed later.
print('Dataset contains {} article records'.format(len(publications_data)))

In [None]:
# Check how many distinct fellows are associated with each university.
uni_names = publications_data['uni'].unique()
uni_fellows = publications_data.groupby('uni')['name'].unique()

# Iterate over the grouped result itself so every count is paired with its own
# university. (unique() returns universities in order of appearance, whereas
# groupby() sorts its keys, so pairing the two by position mislabels the counts.)
for uni, fellows in zip(uni_fellows.index, uni_fellows.values):
    print(str(uni) + ": " + str(len(fellows)) + " fellows")

# Overall number of distinct fellows; reused later to average topic importance.
num_fellows = len(publications_data['name'].unique())
print('\nExcpect 108 fellows overall, have: ' + str(num_fellows))

# 2: Note instances of multiple Turing fellows associated with 1 paper - remove duplicates based on paper ID (but keep note of all authors to attribute later)

paper_id_to_authors dictionary keys correspond to all unique paper ids  
dictionary value is a list of authors associated with that paper (most instances have 1 but in some cases have multiple)

In [None]:
# Map every paper_id to the list of fellows recorded against it (one entry per
# input row, so a paper shared by several fellows lists all of them).
paper_id_to_authors = {}
for _, record in publications_data.iterrows():
    paper_id_to_authors.setdefault(record['paper_id'], []).append(record['name'])

# keep only the first row seen for each paper, then renumber rows from 0
publications_data = (publications_data
                     .drop_duplicates(subset='paper_id')
                     .reset_index(drop=True))

print('Dataset contains {0[0]} unique articles'.format(publications_data.shape))

# 3: LDA

we are looking for 25 topics with both priors set to .01  
-- these values are consistent with the LDA parameter exploration results (although other values could be also used)

### Create document-term matrix from text data

In [None]:
# 1: create vector representation of vocabulary
# (max_df / min_df bound how common / rare a term may be - see the CountVectorizer docs)
vectorizer = CountVectorizer(max_df=0.95, min_df=2)

# create document_term_matrix; the text column is cast to unicode strings first
# (presumably so missing values do not break tokenisation - verify against the data)
dtm = vectorizer.fit_transform(publications_data['full_text'].values.astype('U'))

# retrieve word names at each vocabulary position
# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement when
# it exists and fall back to the old name on the older releases this notebook targets
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

### Train model

In [None]:
# Number of topics to model; reused later to size the per-author topic vectors.
n_topics = 25

# LDA model with both priors set to .01 and a fixed random_state.
lda = LatentDirichletAllocation(n_components = n_topics, 
                                max_iter = 50, #default is 10 - if threshold condition is met earlier, updates stop
                                learning_method = 'online',
                                random_state = 0,
                                doc_topic_prior = .01, 
                                topic_word_prior = .01) 

# Fit the model on the document-term matrix built above.
lda.fit(dtm)

### Visualise topics (LDAvis)

In [None]:
# enable_notebook must actually be called; the bare attribute reference was a no-op
pyLDAvis.enable_notebook()
# build the LDAvis data structure from the fitted model, the document-term matrix and the vectorizer
prepared_data = pyLDAvis.sklearn.prepare(lda, dtm, vectorizer)
pyLDAvis.display(prepared_data)

## 4: Organise LDA outputs

### Retrieve topic_term and document_topic distributions

In [None]:
def df_with_names(data, index_name, columns_name):
    """
    Wrap `data` in a DataFrame whose index and columns axes are named.

    data: a DataFrame (or subclass) or anything pd.DataFrame() accepts
    index_name: name given to the row axis
    columns_name: name given to the column axis

    A DataFrame input is rebuilt from its raw values, so the result has
    default 0..n-1 row and column labels. Returns the new DataFrame.
    """
    # isinstance (not type ==) so DataFrame subclasses are renumbered too
    if isinstance(data, pd.DataFrame):
        #we want our index to be numbered
        df = pd.DataFrame(data.values)
    else:
        df = pd.DataFrame(data)
    df.index.name = index_name
    df.columns.name = columns_name
    return df

def series_with_name(data, name):
    """
    Return `data` as a Series called `name` with a default 0..n-1 index.

    data: a Series or anything pd.Series() accepts
    name: name given to the returned Series

    The input is never modified: a Series argument keeps its own name and index.
    """
    if isinstance(data, pd.Series):
        #ensures a numeric index; reset_index returns a new object, so naming
        #it does not rename the caller's Series
        renamed = data.reset_index(drop=True)
        renamed.name = name
        return renamed
    return pd.Series(data, name=name)

In [None]:
# topic_term: normalise each row of the fitted components so it sums to 1
component_totals = lda.components_.sum(axis=1, keepdims=True)
topic_term_dists = df_with_names(lda.components_ / component_totals, 'topic', 'term')

# doc_topic: topic proportions the fitted model assigns to every document
doc_topic_dists = df_with_names(lda.transform(dtm), 'doc', 'topic')

### Order topics in both distributions to match LDAvis output
this is ordering topics by prevalence of topic in the entire corpus  
it makes exploring results easier as we can easily compare to above visualisation

In [None]:
def get_topic_order(dtm, doc_topic_dists):
    """
    Rank topics by their share of all words in the corpus, largest first
    (intended to reproduce the topic order used by LDAvis).

    dtm: document-term count matrix (rows are documents)
    doc_topic_dists: DataFrame of per-document topic proportions
    returns: index of topic labels, ordered by descending corpus share
    """
    words_per_doc = series_with_name(dtm.sum(axis=1).getA1(), 'doc_length')

    # weight every document's topic proportions by that document's length
    weighted = doc_topic_dists.T * words_per_doc
    topic_totals = weighted.T.sum()

    shares = topic_totals / topic_totals.sum()
    return shares.sort_values(ascending=False).index

In [None]:
# Put the rows of topic_term_dists and the columns of doc_topic_dists into the
# same (LDAvis) topic order. Labels are preserved; only the ordering changes.
topic_order      = get_topic_order(dtm, doc_topic_dists)

# .loc replaces the removed .ix indexer; with this integer index both select by label
topic_term_dists = topic_term_dists.loc[topic_order]
doc_topic_dists  = doc_topic_dists[topic_order]

# element-wise comparison of the two label sequences; expect the set {True}
print('topic_term and doc_topic distributions have same topic order: ', set(topic_term_dists.index.values == doc_topic_dists.columns.values))

## 5:  Format data for visualisation

### Transform document_topic distribution to fellow_topic and institute_topic information

publications_data df:  
each row of the df corresponds to row of same index in doc_topic_dists   
this allows us to get paper_id for the doc_topic_distribution of any paper   
we can then match the paper_id against known Turing fellows associated with that paper  

For each fellow, we get the average of distributions over topics for their documents and multiply by 100 -- we can imagine that each author has 100 'points' and these are divided between the modelled 25 topics based on the average proportion that their articles have been assigned each topic (this is based on how many words in each document have been assigned to each topic)      

Overall topic importance is the average of above topic value assignment across all fellows  - this means each fellow contributes equally to the topic importance/size evaluation (even if they overall contributed fewer papers)   

### Extract topic proportions for each author

In [None]:
# One accumulator (topic vector) and one paper counter per fellow.
author_names = publications_data['name'].unique()
author_topic_dists = {name: np.zeros(n_topics) for name in author_names}
author_counts = {name: 0 for name in author_names}

for index, row in publications_data.iterrows():
    paper_topics = doc_topic_dists.iloc[index]
    #most papers have 1 author but some have multiple so this assures paper topics are assigned to all relevant authors
    for name in paper_id_to_authors[row['paper_id']]:
        # NOTE(review): the in-place += is kept on purpose - presumably it keeps the
        # accumulator a NumPy array filled by position; confirm before rewriting as a = a + b
        author_topic_dists[name] += paper_topics
        author_counts[name] += 1

# average over each author's papers and scale by 100 (authors whose vector is all zero are left as is)
for name, summed in author_topic_dists.items():
    if summed.sum() != 0:
        author_topic_dists[name] = summed / author_counts[name] * 100

### Extract overall institute topic "importance" (size) information

In [None]:
# For each topic position, average the per-author topic values over num_fellows;
# `total` accumulates the sum of those averages.
topic_importance, total = {}, 0
for topic_pos in range(n_topics):
    running = 0
    for author_dist in author_topic_dists.values():
        running += author_dist[topic_pos]
    topic_importance[topic_pos] = running / num_fellows
    total += topic_importance[topic_pos]

### Combine all topic information in one df

In [None]:
# one column per author plus one 'topicVal' column holding the overall topic importance
author_topic_info = pd.DataFrame.from_dict(author_topic_dists, orient='columns')
topic_info = pd.DataFrame.from_dict(topic_importance, orient='index')
merged = pd.concat([author_topic_info, topic_info], axis=1).rename(columns={0: 'topicVal'})

##order topics by topicVal (which depends on researchers) rather than importance as defined by LDAvis
merged = merged.sort_values(by='topicVal', ascending=False)
#save the new order as will use this information later
order = list(merged.index.values)
#reset index
merged = merged.reset_index(drop=True)

column_sums = merged.sum(axis=0).values
print('To check whether each column sums to 100, check set of all column sums (rounded to 5 dp)')
print('Expect: {100.0}, get:', set(np.around(column_sums, decimals=5)))

### Combine topics that have assigned topic value < 1.5 (out of 100) into 1 topic that will be labeled 'other'

In [None]:
# Split topics on the 1.5 (out of 100) threshold: rows below it are folded into
# a single summed row, the rest are kept as they are.
to_combine = merged.loc[merged['topicVal'] < 1.5]
key_topics = merged.loc[merged['topicVal'] >= 1.5]

#make note of how many topics will need to name
num_topics_to_name = key_topics.shape[0]

# DataFrame.append was removed in pandas 2.0; concatenating a one-row frame built
# from the column sums gives the same result on old and new pandas
combined_row = to_combine.sum().to_frame().T
key_topics = pd.concat([key_topics, combined_row], ignore_index=True)

### Last checks and save to csv

In [None]:
# Validate the frame that is about to be saved: after the small topics have been
# folded into one row, every column of key_topics should still sum to 100.
# (Checking `merged` here would only repeat the earlier check.)
print('Check topic assignments')
print('Expect: {100.0}, get:', set(np.around(key_topics.sum(axis = 0).values, decimals=5)))

In [None]:
# Write key_topics to data.csv; the row index is written under the column label 'topicNum'.
key_topics.to_csv("data.csv", index_label='topicNum')
# Show the topic order that was saved when `merged` was sorted by topicVal.
print(order)

## 6: Determine order in which to display researchers - ordered by university, by most prevalent topic

In [None]:
# Group researchers as {university: {strongest topic row: [[name, value], ...]}}.
can_shuffle = merged
researchers_dict = {}
num = 0
for column in can_shuffle:
    num += 1
    name = column

    # the first row after a descending sort is this column's strongest topic
    topics = can_shuffle.sort_values([column], ascending=False)
    index = topics.index.values[0]
    val = topics[column][index]

    # 'topicVal' holds the overall topic importance, not a researcher
    if name == 'topicVal':
        continue

    uni = publications_data.loc[publications_data['name'] == name]['uni'].values[0]
    researchers_dict.setdefault(uni, {}).setdefault(index, []).append([name, val])

In [None]:
#loop through universities
researchers_order = []
for i in sorted(researchers_dict.keys()):
    topics = researchers_dict[i]
    for j in sorted(topics.keys()):
        researchers = topics[j]
        
        if len(researchers) == 1:
            researchers_order.append(researchers[0][0])
        else:
            researchers.sort(key=lambda x: x[1], reverse=True)
            for k in researchers:
                researchers_order.append(k[0])

print(researchers_order)

## 7: Topic interpretation

### Extract top documents for each topic (with associated top words and titles and AK API keywords)

For each topic we extract top N words associated with that topic (this is very standard and is the same as displayed in the LDAvis visualisation)  

Further, for each topic we get the titles of key articles associated with that topic - this makes topics easier to interpret   

Each document has been assigned some proportion of each topic (with sum of all topic proportions within document = 1). For each topic, we ordered documents based on how much of that topic they were assigned. Then we looped through the documents extracting top 10 key article titles with the condition that if a researcher already contributed 2 papers to the top 10 key titles then we skipped their subsequent papers. This ensured that the top 10 key titles contained contributions of at least 5 researchers (as compared to trying to evaluate the meaning of a topic based on articles written by only one Turing fellow). This seemed reasonable given most topics were assigned to a number of researchers.  

For interpreting small topics (which might only be associated with 2 or 3 researchers), we also kept track of the 10 top papers (with no constraints attached).

### Get N top words for different values of lambda (1.0, 0.6, 0.2) -- from N most frequent to N most unique

In [None]:
def calc_topic_freq(dtm, doc_topic_dists):
    """
    Topic frequency as in LDAvis: each document's topic proportions are weighted
    by that document's word count and summed over all documents.

    dtm: document-term count matrix (rows are documents)
    doc_topic_dists: DataFrame of per-document topic proportions
    returns: Series with one summed value per topic column
    """
    words_per_doc = dtm.sum(axis=1).A1
    return doc_topic_dists.mul(words_per_doc, axis=0).sum()
    
def get_relevance(topic_freq, topic_term_dists, lambda_):
    """
    Relevance of every term to every topic:
    lambda_ * log(p(term|topic)) + (1 - lambda_) * log(p(term|topic) / p(term))

    if lambda_ = 1 the ranking is the same as the top N most frequent words
    if lambda_ = 0 the ranking favours words unique to a topic
    (unique = not shared by other topics)

    topic_freq: per-topic frequency, aligned with the rows of topic_term_dists
    topic_term_dists: topic x term probabilities
    returns: relevance values with the same shape as topic_term_dists
    """
    # overall share of each term across the corpus
    weighted_terms = (topic_term_dists.T * topic_freq).T
    term_totals = np.sum(weighted_terms, axis=0)
    term_share = term_totals / term_totals.sum()

    log_prob = np.log(topic_term_dists)
    log_lift = np.log(topic_term_dists / term_share)
    return lambda_ * log_prob + (1 - lambda_) * log_lift

def get_top_words(relevance, feature_names, n_top):
    """
    Extract the n_top highest-scoring words for every topic.

    relevance: topic x term scores (DataFrame or 2-D array); determines the ordering
    feature_names: word at each vocabulary (column) position
    n_top: number of words to keep per topic
    returns: dict {row position of the topic in `relevance`: [words, best first]}
    """
    # np.asarray replaces the removed DataFrame.as_matrix() and also accepts plain arrays
    scores = np.asarray(relevance)

    # argsort is ascending, so take the last n_top positions in reverse
    return {
        idx: [feature_names[i] for i in row.argsort()[:-n_top - 1:-1]]
        for idx, row in enumerate(scores)
    }

def topic_top_words(topic_term_dists, topic_num, feature_names, n_top):
    """
    Return the n_top most probable words of a single topic.

    topic_term_dists: DataFrame of topic x term probabilities
    topic_num: row label of the topic of interest
    feature_names: word at each vocabulary (column) position
    n_top: number of words to return
    returns: list of words, most probable first
    """
    topic_term_dist = topic_term_dists.loc[topic_num]
    # use the n_top argument (not a notebook-level global) and sort the raw values
    # so that the result is a plain array of column positions
    ranked_positions = topic_term_dist.values.argsort()[:-n_top - 1:-1]
    top_words = [feature_names[i] for i in ranked_positions]

    return top_words

In [None]:
# top words of every topic for each lambda (from most frequent to most unique)
lambdas = [1, .6, .2]
topic_freq = calc_topic_freq(dtm, doc_topic_dists)
n_top = 10

top_words_dict = {}
for lambda_ in lambdas:
    top_words_dict[lambda_] = get_top_words(
        get_relevance(topic_freq, topic_term_dists, lambda_),
        feature_names,
        n_top,
    )

### Collate all topic info

In [None]:
#create dictionary with all information of interest for each topic
#topic_doc_dict[topic_num] is a list laid out as:
#  [0] [document indexes ordered by descending weight of the topic]
#  [1] {lambda: top words} for lambda 1, .6 and .2
#  [2] [title, topic weight] of the key papers (each researcher can only trigger 2 inclusions)
#  [3] de-duplicated AK keywords of those key papers
#  [4] set of researchers associated with those key papers
#  [5] unconstrained top papers, each [title, topic weight, author, ...]
topic_doc_dict = {}
all_names = []

num_papers = 10
n_words = 15

for ind, column in enumerate(doc_topic_dists.columns):

    #`order` stores topic positions, so the position (not the column label) gives the display number
    topic_num = order.index(ind) + 1

    #sort rows (documents) in descending order of probability of topic
    #(sorted into a local frame so the shared doc_topic_dists keeps its row order)
    ranked_docs = doc_topic_dists.sort_values([column], ascending=False)
    #get indexes of rows (correspond to documents) that have highest topic probability
    indexes = ranked_docs.index.values

    #save ordered indexes of the top documents for the given topic
    topic_doc_dict[topic_num] = [[indexes]]

    #get top words for all 3 lambda values
    #get_top_words keys its result by row position in the reordered topic_term_dists,
    #so it has to be read with the position `ind`, not the column label
    top_words_10 = top_words_dict[1][ind]
    top_words_06 = top_words_dict[.6][ind]
    top_words_02 = top_words_dict[.2][ind]
    topic_doc_dict[topic_num].append({1: top_words_10, .6: top_words_06, .2: top_words_02})

    #now loop through indexes and retrieve document information
    titles, keywords, names = [], [], []
    topic_names = {}   #researcher -> number of this topic's papers seen so far

    top_n = []

    for i in indexes:

        #top_n fills at least as fast as titles, so nothing is left to do once titles is full
        if len(titles) >= num_papers:
            break

        #row label i is shared by publications_data and doc_topic_dists
        #(.at replaces the removed DataFrame.get_value)
        paper_id = publications_data.at[i, 'paper_id']
        title = publications_data.at[i, 'title']
        weight = ranked_docs.at[i, column]
        authors = paper_id_to_authors[paper_id]

        #retrieve top N article titles regardless of who wrote them
        if len(top_n) < num_papers:
            top_n.append([title, weight] + list(authors))

        #a paper is a key paper if at least one of its researchers has been seen at most twice
        to_add = False
        for name in authors:
            if name in topic_names:
                topic_names[name] += 1
            else:
                topic_names[name] = 1
                names.append(name)

            if topic_names[name] <= 2:
                to_add = True

        if to_add:
            titles.append([title, weight])
            #paper ids are unique after de-duplication, so row i is the row for this paper
            ak_keywords = publications_data.at[i, 'ak_keywords']
            #missing keywords are read as NaN (a float); only split real strings
            if isinstance(ak_keywords, str):
                keywords += ak_keywords.split('; ')

    #save all extracted information
    topic_doc_dict[topic_num].append(titles)
    topic_doc_dict[topic_num].append(list(set(keywords)))
    topic_doc_dict[topic_num].append(set(names))

    #also add top N articles (not constrained by what researchers they were associated with - this might be more appropriate for interpreting smaller topics)
    #although it does not seem to make a great deal of difference
    topic_doc_dict[topic_num].append(top_n)

    all_names.extend(names)

print('Number of researchers that have been listed in at least one set of top 10 papers: ', len(set(all_names)))

### Inspect results

In [None]:
#print results of top 10 key article titles, for the topics that need a name only
for topic_num in sorted(topic_doc_dict):
    if topic_num > num_topics_to_name:
        continue

    data = topic_doc_dict[topic_num]
    print('Topic: ', topic_num, "\n")

    for lambda_value, words in data[1].items():
        print('Top words lambda ' + str(lambda_value) + ': ', *words, '\n')

    print("Article titles:")
    for entry in data[2]:
        print(entry[0])
        print(entry[1], '\n')

    print("AK keywords: ", *data[3], '\n')

    print("Researchers: ", *data[4], '\n')