# Assign topics to each article
* Load the train/validate/test tokenized article abstracts
* Load the model
* Use the model to assign topic probabilities to all articles
* Save the topic assignments

In [1]:
import pandas as pd
import pickle
import os

In [2]:
# Directory the pickled dictionary and tokenized corpora are read from, and
# the topic CSV files are written to.
DATA_PATH = '../data'
# Directory the pickled topic model is read from.
MODELS_PATH = '../models'

## Load the tokenized data
Load the dictionary and the tokenized data for the train/validate/test datasets

In [4]:
# Load the pickled dictionary from DATA_PATH.
# NOTE: pickle can execute arbitrary code while loading, so only open files
# from a trusted source.
with open(os.path.join(DATA_PATH, 'dictionary.pickle'), 'rb') as handle:
    dictionary = pickle.load(handle)

In [5]:
def load_tokenized_dataset(file_name):
    """
    Unpickle and return the object stored in the file `file_name` under DATA_PATH.
    The file is opened in binary mode and closed again before returning.
    """
    with open(os.path.join(DATA_PATH, file_name), 'rb') as pickle_file:
        return pickle.load(pickle_file)

def load_tokenized_datasets():
    """
    Load the three pickled tokenized corpora.
    Returns a tuple (train, validate, test), in that order.
    """
    return (
        load_tokenized_dataset("corpus_train.pickle"),
        load_tokenized_dataset("corpus_validate.pickle"),
        load_tokenized_dataset("corpus_test.pickle"),
    )

In [6]:
# Load the tokenized train, validate and test corpora (in that order).
corpus_train, corpus_validate, corpus_test = load_tokenized_datasets()

## Load the topic model
The model is the topic model fitted in 02_fit_topic_model.

In [7]:
# Unpickle the fitted topic model from MODELS_PATH.
# NOTE: pickle can execute arbitrary code while loading, so only open model
# files from a trusted source.
with open(os.path.join(MODELS_PATH, 'topic_model.pickle'), 'rb') as handle:
    topic_model = pickle.load(handle)

## Assign topics
Use the model to assign topic probabilities to all articles.

In [8]:
def get_topic_details(topic_model, corpus):
    """
    Returns a list with one item per entry produced by `topic_model[corpus]`.
    Each item is expected to be a sequence of (topic number, topic probability)
    tuples for the corresponding entry in the corpus.
    Example:
        [[(0, 0.22764261), (4, 0.14444388), (5, 0.62411755)],
         [(1, 0.024827635), (2, 0.3290665), (3, 0.6061594), (5, 0.03431195)],
         [(0, 0.06239689), (3, 0.03924617), (5, 0.8926314)],
         [(3, 0.09784623), (5, 0.89414346)],...
        ...]
    A topic that the model does not report for an entry is simply absent from
    that entry's item.
    """
    # Materialize the model output so callers get a plain, reusable list.
    return list(topic_model[corpus])

def get_topic_dataframe(topic_model, corpus):
    """
    Returns a data frame with a column for each topic in the topic model.
    Each row stands for an entry in the corpus, each value for the probability of the topic for this entry.
    If the model reports no probability for a topic for a given entry, the value in the
    entry's column corresponding to the topic is 0.0.
    Example:
             	0 	1 	2 	3 	4 	5
        0 	0.227641 	0.000000 	0.000000 	0.000000 	0.144445 	0.624118
        1 	0.000000 	0.024817 	0.329062 	0.606161 	0.000000 	0.034325
        2 	0.062392 	0.000000 	0.000000 	0.039281 	0.000000 	0.892601
        3 	0.000000 	0.000000 	0.000000 	0.097728 	0.000000 	0.894262

    Parameters:
        topic_model: model that supports `topic_model[corpus]` and `get_topics()`.
        corpus: the tokenized corpus to assign topics to.
    """
    topic_details = get_topic_details(topic_model, corpus)
    # Assumes get_topics() returns one item per topic in the model.
    num_topics = len(topic_model.get_topics())
    topics_entries = []  # one dense row of probabilities per corpus entry
    for row in topic_details:
        # Fill with float zeros (not int 0) so that a topic which is never
        # assigned in this corpus still yields a float column, like the others.
        topics_entry = [0.0] * num_topics
        for topic_num, topic_prob in row:
            topics_entry[topic_num] = topic_prob
        topics_entries.append(topics_entry)
    return pd.DataFrame(topics_entries, columns=range(0, num_topics))

In [9]:
# Build one topic-probability data frame per dataset
# (one row per corpus entry, one column per topic).
topics_train_df = get_topic_dataframe(topic_model, corpus_train)
topics_validate_df = get_topic_dataframe(topic_model, corpus_validate)
topics_test_df = get_topic_dataframe(topic_model, corpus_test)

### Save the topics
For each of the train/validate/test datasets, save the assigned topic probabilities to a separate CSV file.

In [10]:
# Write each data frame to its own CSV file under DATA_PATH.
# to_csv is called with its defaults, so the row index is written as the
# first column of each file.
topics_train_df.to_csv(os.path.join(DATA_PATH, 'topics_train.csv'))
topics_validate_df.to_csv(os.path.join(DATA_PATH, 'topics_validate.csv'))
topics_test_df.to_csv(os.path.join(DATA_PATH, 'topics_test.csv'))