# LDA Prediction

## Importing of Libraries
Run this as a separate cell in order to reduce latency

In [30]:
from gensim.models import CoherenceModel

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import datetime
import gensim
import nltk
import re
import os

In [31]:
# Location of the external MALLET executable used by the LDA wrapper.
# Set the MALLET_PATH environment variable to override the default below on
# machines where MALLET is installed elsewhere.
mallet_path = os.environ.get("MALLET_PATH", r'/Library/NLTK/mallet/mallet-2.0.8/bin/mallet')

In [3]:
# Porter stemmer and English stop-word list used by the preprocessing helpers
stemmer = nltk.stem.porter.PorterStemmer()
stop_list = nltk.corpus.stopwords.words("english")

In [4]:
# Converts a corpus of text files into preprocessed token lists
def corpus2docs(corpus):
    """Tokenise and normalise every file of ``corpus``.

    Each file's raw text is tokenised with ``nltk.word_tokenize``. Tokens are
    lower-cased, kept only when they consist solely of the letters a-z,
    dropped when they appear in the module-level ``stop_list`` and finally
    stemmed with the module-level ``stemmer``.

    Parameters
    ----------
    corpus
        Object exposing ``fileids()`` and ``raw(fileid)``.

    Returns
    -------
    tuple
        ``(docs, fids)`` where ``docs`` holds one list of stemmed tokens per
        file and ``fids`` is the value returned by ``corpus.fileids()``;
        ``docs[i]`` corresponds to ``fids[i]``.
    """
    fids = corpus.fileids()
    alpha_only = re.compile('^[a-z]+$')
    # A set gives constant-time membership tests; the list would be scanned
    # linearly for every single token.
    stop_words = set(stop_list)

    docs = []
    for fid in fids:
        tokens = (w.lower() for w in nltk.word_tokenize(corpus.raw(fid)))
        docs.append([
            stemmer.stem(w)
            for w in tokens
            if alpha_only.search(w) and w not in stop_words
        ])
    return docs, fids

# Encodes tokenised documents as bag-of-words vectors
def docs2vecs(docs, dictionary):
    """Return ``dictionary.doc2bow(doc)`` for every document in ``docs``, in order."""
    vectors = []
    for tokens in docs:
        vectors.append(dictionary.doc2bow(tokens))
    return vectors

## Loading of Train Dataset
Load and preprocess the 1872 training transcripts to build the dictionary

In [5]:
# Build the dictionary from the preprocessed training transcripts.
# The file-id pattern is a raw string so that "\." reaches the regex engine
# as written instead of being an invalid string escape.
train_corpus = nltk.corpus.PlaintextCorpusReader("./TrainTest_Transcripts/Train/", r".+\.txt")
train_docs, train_fids = corpus2docs(train_corpus)
dictionary = gensim.corpora.Dictionary(train_docs)

## Loading of Test Dataset
Load and preprocess the 468 testing transcripts for prediction

In [6]:
# Create the test vectors for prediction, encoded with the training dictionary.
# The file-id pattern is a raw string so that "\." reaches the regex engine
# as written instead of being an invalid string escape.
test_corpus = nltk.corpus.PlaintextCorpusReader("./TrainTest_Transcripts/Test/", r".+\.txt")
test_docs, test_fids = corpus2docs(test_corpus)
test_vecs = docs2vecs(test_docs, dictionary)

## Loading of LDA Model
Load LDA model with N topics

In [33]:
# Topic count of the saved model to load. Models are in increments of 2,
# from 2 to 120, so choose an even number in that range.
no_of_topics = 48

lda_model = gensim.models.wrappers.LdaMallet.load(
    "./LDA_Models/train_LDA_" + str(no_of_topics)
)

In [38]:
# Point the loaded model at the MALLET executable configured in this notebook
# (presumably the path stored with the saved model may differ per machine -
# TODO confirm).
lda_model.mallet_path = mallet_path

In [39]:
# Display the path now set on the model to confirm the assignment
lda_model.mallet_path

'/Library/NLTK/mallet/mallet-2.0.8/bin/mallet'

## Prediction of Topics
The prediction of the distribution of topics for the testing data

In [40]:
def format_topics_sentences(ldamodel, corpus, data):
    """Build a table with the dominant topic of every document in ``corpus``.

    Parameters
    ----------
    ldamodel
        Topic model supporting ``ldamodel[corpus]`` (yielding, per document,
        an iterable of ``(topic_num, proportion)`` pairs) and
        ``ldamodel.show_topic(topic_num)`` (yielding ``(word, weight)`` pairs).
    corpus
        Documents to score, in the form ``ldamodel`` accepts.
    data
        Original content of each document, in the same order as ``corpus``;
        it is appended as the last column of the result.

    Returns
    -------
    pandas.DataFrame
        One row per document with the columns ``Dominant_Topic`` (topic id),
        ``Perc_Contribution`` (its proportion rounded to 4 decimals) and
        ``Topic_Keywords`` (comma-separated topic words), followed by one
        unnamed column holding ``data``.

    Notes
    -----
    Rows are collected in a list and the frame is built once, because
    ``DataFrame.append`` no longer exists in pandas 2.x and growing a frame
    row by row is quadratic. ``Dominant_Topic`` therefore keeps the integer
    values produced by ``int(topic_num)``. An empty ``corpus`` yields an
    empty frame with the same columns.
    """
    column_names = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']
    keywords_by_topic = {}  # topic id -> keyword string, rendered once per topic
    records = []

    # Get main topic in each document
    for row in ldamodel[corpus]:
        if not row:
            # No topic assigned: keep a placeholder so that later rows stay
            # aligned with ``data``.
            records.append((float("nan"), float("nan"), ""))
            continue

        # max() returns the first pair with the highest proportion, the same
        # pair a stable descending sort would put first.
        topic_num, prop_topic = max(row, key=lambda pair: pair[1])
        if topic_num not in keywords_by_topic:
            wp = ldamodel.show_topic(topic_num)
            keywords_by_topic[topic_num] = ", ".join([word for word, prop in wp])
        records.append((int(topic_num), round(prop_topic, 4), keywords_by_topic[topic_num]))

    sent_topics_df = pd.DataFrame(records, columns=column_names)

    # Add original text to the end of the output
    contents = pd.Series(data)
    return pd.concat([sent_topics_df, contents], axis=1)

In [41]:
# Dominant topic, its contribution and its keywords for every test transcript.
# NOTE(review): the recorded run of this cell failed inside MALLET's
# import-file step; the command references a "--use-pipe-from" temporary
# "..._corpus.mallet" file - presumably one written at training time that is
# no longer present. Verify before relying on this cell.
df_docs_topics_distribution = format_topics_sentences(
    ldamodel=lda_model,
    corpus=test_vecs,
    data=test_docs,
)

CalledProcessError: Command '/Library/NLTK/mallet/mallet-2.0.8/bin/mallet import-file --preserve-case --keep-sequence --remove-stopwords --token-regex "\S+" --input /var/folders/gf/h7z28hjn1bsfqnvmxnhjt5y80000gn/T/a3ba6f_corpus.txt --output /var/folders/gf/h7z28hjn1bsfqnvmxnhjt5y80000gn/T/a3ba6f_corpus.mallet.infer --use-pipe-from /var/folders/gf/h7z28hjn1bsfqnvmxnhjt5y80000gn/T/a3ba6f_corpus.mallet' returned non-zero exit status 1.

In [36]:
# Format: expose the positional index as a column and give every column a name
df_docs_dominant_topic = df_docs_topics_distribution.reset_index()
df_docs_dominant_topic.columns = ['Transcript_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Transcript']

# Find the true transcript number from the entire corpus: take the part of the
# file id before the first ".", then its second "_"-separated field.
# The column is assigned directly; a chained `df[col].replace(..., inplace=True)`
# acts on a temporary Series and does not update the frame when pandas
# Copy-on-Write is active.
df_docs_dominant_topic["Transcript_No"] = [
    test_fids[index].split(".")[0].split("_")[1]
    for index in df_docs_dominant_topic["Transcript_No"]
]

# Show
df_docs_dominant_topic

NameError: name 'df_docs_topics_distribution' is not defined

In [52]:
# Transcripts whose dominant topic is topic 0
df_docs_dominant_topic[df_docs_dominant_topic["Dominant_Topic"].eq(0.0)]

Unnamed: 0,Transcript_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,Transcript
0,0,0.0,0.1132,"school, kid, learn, student, children, educ, t...","[good, morn, laughter, great, blown, away, who..."
74,1317,0.0,0.1678,"school, kid, learn, student, children, educ, t...","[want, afternoon, someth, littl, differ, sched..."
86,1390,0.0,0.2485,"school, kid, learn, student, children, educ, t...","[everyon, need, coach, matter, whether, basket..."
87,1391,0.0,0.1687,"school, kid, learn, student, children, educ, t...","[teach, chemistri, explos, right, right, explo..."
88,1394,0.0,0.1934,"school, kid, learn, student, children, educ, t...","[littl, nervou, wife, yvonn, said, said, geoff..."
131,163,0.0,0.1159,"school, kid, learn, student, children, educ, t...","[welcom, five, danger, thing, let, children, c..."
141,1691,0.0,0.1582,"school, kid, learn, student, children, educ, t...","[first, children, book, publish, return, old, ..."
177,1842,0.0,0.1361,"school, kid, learn, student, children, educ, t...","[kindergarten, design, made, kindergarten, cir..."
198,193,0.0,0.1354,"school, kid, learn, student, children, educ, t...","[thank, much, everyon, ted, chri, ami, particu..."
227,2083,0.0,0.1545,"school, kid, learn, student, children, educ, t...","[today, go, show, tablet, headset, wear, go, c..."
