# Topic network graph for Computation and Language
1. Load the trained LDA model
2. Load the tokenized test dataset
3. Assign topics to all entries in the test dataset
4. Save the assigned topics to a CSV file

In [7]:
import pandas as pd
import pickle
import os

# Relative paths of the directories that hold the data files and the trained models
DATA_PATH = '../data'
MODELS_PATH = '../models'

Load the trained LDA model

In [9]:
# Unpickle the ensemble LDA model trained for Computation and Language
ensemble_cscl_path = os.path.join(MODELS_PATH, 'ensemble_cscl.pickle')
with open(ensemble_cscl_path, 'rb') as model_file:
    ensemble_cscl = pickle.load(model_file)

Load the tokenized test dataset

In [31]:
# Read the test-set metadata; the first CSV column is used as the dataframe index
cscl_test_csv_path = os.path.join(DATA_PATH, 'cscl_test.csv.zip')
cscl_test_df = pd.read_csv(cscl_test_csv_path, index_col=0)

# Unpickle the tokenized test corpus
corpus_test_cscl_path = os.path.join(DATA_PATH, 'corpus_test_cscl.pickle')
with open(corpus_test_cscl_path, 'rb') as corpus_file:
    corpus_test_cscl = pickle.load(corpus_file)


## Assign topics to the data

Aggregate topic information in a dataframe

In [18]:
def get_topic_details(ldamodel, corpus):
    """Summarize the topic distribution of every document in ``corpus``.

    Args:
        ldamodel: Object that can be indexed with a corpus
            (``ldamodel[corpus]``) and then yields, per document, an iterable
            of ``(topic_id, probability)`` pairs.
        corpus: The documents to score, in whatever form ``ldamodel`` accepts.

    Returns:
        A ``pandas.DataFrame`` with one row per document and the columns
        ``'Dominant_Topic'`` (id of the most probable topic), ``'% Score'``
        (probability of that topic) and ``'Topics'`` (all pairs of the
        document, sorted by descending probability).  A document without any
        topic pair gets ``None``, ``0.0`` and ``[]`` so that row positions
        keep matching document positions in ``corpus``.
    """
    topic_details_list = []
    for doc_topics in ldamodel[corpus]:
        # Most probable topic first; sorted() is stable, so ties keep model order.
        ranked = sorted(doc_topics, key=lambda pair: pair[1], reverse=True)
        if ranked:
            dominant_topic, dominant_prob = ranked[0]
        else:
            # Keep a placeholder row instead of silently dropping the document.
            dominant_topic, dominant_prob = None, 0.0
        topic_details_list.append([dominant_topic, dominant_prob, ranked])

    # Passing ``columns`` to the constructor also works for an empty corpus,
    # where assigning ``.columns`` afterwards would raise a length mismatch.
    return pd.DataFrame(
        topic_details_list,
        columns=['Dominant_Topic', '% Score', 'Topics'],
    )

In [80]:
def assign_topics(ldamodel, corpus, df):
    """Build a per-document table of topic probabilities.

    Args:
        ldamodel: Topic model used both to score ``corpus`` (through
            ``get_topic_details``) and to enumerate the topic ids (through
            ``ldamodel.show_topics(num_topics=-1, formatted=False)``).
        corpus: The documents to score, in whatever form ``ldamodel`` accepts.
        df: Dataframe with an ``'id'`` column holding one identifier per
            document.

    Returns:
        A ``pandas.DataFrame`` with the columns ``'id'``, ``'Topics'`` (the
        ``(topic_id, probability)`` pairs of each document) and one column per
        topic id containing that topic's probability for the document, or
        ``0`` when the topic is not among the document's pairs.
    """
    # Assign topics to each document
    topics_df = pd.DataFrame()
    topics_df['id'] = list(df['id'])
    topic_details = get_topic_details(ldamodel, corpus)
    topics_df['Topics'] = topic_details['Topics']

    # One {topic_id: probability} lookup per document, built once, so every
    # topic column below is a plain dictionary lookup.
    doc_lookups = [dict(probs) for probs in topics_df['Topics']]

    # Take the topic ids from the model that was passed in, not from a
    # notebook-level global, so the function works for any model.
    topic_ids = [
        topic[0]
        for topic in ldamodel.show_topics(num_topics=-1, formatted=False)
    ]
    for topic_id in topic_ids:
        # Topics missing from a document's pairs are recorded as 0.
        topics_df[topic_id] = [lookup.get(topic_id, 0) for lookup in doc_lookups]
    return topics_df

In [81]:
# Score the test corpus with the gensim representation of the ensemble model
# and tabulate the topic probabilities for every entry of the test dataframe
topics_cscl = assign_topics(ensemble_cscl.generate_gensim_representation(), corpus_test_cscl, cscl_test_df)

In [82]:
# Display the resulting table of per-document topic probabilities
topics_cscl

Unnamed: 0,id,Topics,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,2005.11458,"[(8, 0.53143597), (13, 0.38946804), (9, 0.0715...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.531436,0.07154,0.0,0.0,0.0,0.389468,0
1,1601.03348,"[(2, 0.36424133), (13, 0.31752044), (10, 0.233...",0.0,0.080181,0.364241,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.233084,0.0,0.0,0.31752,0
2,2310.02071,"[(2, 0.7228803), (10, 0.27109733)]",0.0,0.0,0.72288,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.271097,0.0,0.0,0.0,0
3,2306.0213,"[(4, 0.4293362), (9, 0.24680696), (7, 0.218965...",0.0,0.0,0.0,0.0,0.429336,0.0,0.0,0.218966,0.0,0.246807,0.0,0.09615,0.0,0.0,0
4,1808.06738,"[(12, 0.52148545), (3, 0.21140073), (11, 0.110...",0.0,0.0,0.0,0.211401,0.102818,0.0,0.0,0.0,0.0,0.047569,0.0,0.110543,0.521485,0.0,0
5,2104.0861,"[(5, 0.44130826), (1, 0.3848251), (6, 0.137723...",0.0,0.384825,0.0,0.0,0.0,0.441308,0.137724,0.0,0.0,0.0,0.0,0.0,0.030364,0.0,0
6,1504.07071,"[(12, 0.3405917), (13, 0.30935565), (8, 0.2301...",0.0,0.067504,0.0,0.0,0.0,0.0,0.0,0.0,0.230109,0.0,0.0,0.042608,0.340592,0.309356,0
7,2106.13858,"[(11, 0.8891568), (0, 0.099277906)]",0.099278,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.889157,0.0,0.0,0
8,2210.0999,"[(3, 0.50689435), (4, 0.40156892), (11, 0.0823...",0.0,0.0,0.0,0.506894,0.401569,0.0,0.0,0.0,0.0,0.0,0.0,0.082323,0.0,0.0,0
9,2303.17006,"[(7, 0.8611985), (5, 0.13069755)]",0.0,0.0,0.0,0.0,0.0,0.130698,0.0,0.861198,0.0,0.0,0.0,0.0,0.0,0.0,0


In [41]:
# Inspect the (topic_id, probability) pairs stored for the row labelled 10
topics_cscl.Topics[10]

[(4, 0.4191592), (11, 0.3041339), (12, 0.13958803), (7, 0.12770718)]

In [46]:
# Collect the id (first element) of every topic reported by the gensim representation of the ensemble
topic_ids = [topic[0] for topic in ensemble_cscl.generate_gensim_representation().show_topics(num_topics=-1, formatted=False)]

In [47]:
# Display the collected topic ids
topic_ids

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]

In [55]:
# Copy the (topic_id, probability) pairs of the row labelled 0 into a new list
list(topics_cscl['Topics'][0])

[(8, 0.53140974), (13, 0.38948494), (9, 0.071549125)]