In [17]:
import os
import pandas as pd
import numpy as np
from pandas import DataFrame
np.random.seed(2018)

from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from gensim.corpora import Dictionary
from gensim.models import LdaMulticore

In [18]:
# Working directory: the assignment folder located under the user's HOME.
pwd: str = ''.join((os.environ['HOME'], '/work/assignment/assignment-8'))

#### Define functions

In [19]:
# Shared Porter stemmer instance; lemmatize_stemming() calls its stem() method.
stemmer: PorterStemmer = PorterStemmer()
# Shared WordNet lemmatizer instance; lemmatize_stemming() calls its lemmatize() method.
lemmatizer: WordNetLemmatizer = WordNetLemmatizer()

def lemmatize_stemming(text):
    """Lemmatize ``text`` as a verb, then return the Porter stem of that lemma.

    Relies on the module-level ``lemmatizer`` and ``stemmer`` instances.
    """
    verb_lemma = lemmatizer.lemmatize(text, pos='v')
    return stemmer.stem(verb_lemma)

def preprocess(text):
    """Tokenize ``text`` and return the list of normalized tokens to keep.

    Tokens are produced by ``simple_preprocess``. A token is discarded when it
    is in ``STOPWORDS`` or has 3 or fewer characters; every remaining token is
    passed through ``lemmatize_stemming``. The original token order is kept.
    """
    return [
        lemmatize_stemming(word)
        for word in simple_preprocess(text)
        if len(word) > 3 and word not in STOPWORDS
    ]

#### Read all the text files in the directory
- get the filenames list
- loop through the filenames & read each file
- add the filename & the file content into a list of tuples

In [20]:
# Folder that holds one text file per movie review.
path: str = pwd + '/MovieReviews'

# os.listdir() returns entries in arbitrary order, so sort them to keep the
# RowNum <-> FileName pairing stable across runs and machines. Entries that are
# not regular files (e.g. a sub-directory) are skipped because open() would
# raise on them.
file_names = sorted(
    name for name in os.listdir(path)
    if os.path.isfile(os.path.join(path, name))
)

# One (row number, file name, file content) tuple per review file.
doc_contents: list = []
for i, file_name in enumerate(file_names):
    # errors='ignore' drops bytes that are not valid UTF-8 instead of raising.
    with open(os.path.join(path, file_name), encoding="utf8", errors='ignore') as file:
        doc_contents.append((i, file_name, file.read()))

#### Load the document contents into a dataframe
- load the list of tuples into a dataframe
- remove any NA values in the 'FileContent' column

In [21]:
# Build the frame from the (row number, file name, content) tuples and drop any
# row whose 'FileContent' is NA, in a single chained expression.
data: DataFrame = (
    pd.DataFrame(doc_contents, columns=['RowNum', 'FileName', 'FileContent'])
    .dropna(subset=['FileContent'])
)
print(data.head(5))

   RowNum   FileName                                        FileContent
0       0  16748.txt  DENNIS SCHWARTZ "Movie Reviews and Poetry"\nUN...
1       1  17108.txt  A brilliant, witty mock documentary of Jean Se...
2       2  17109.txt  NOSTALGHIA (director: Andrei Tarkovsky; cast: ...
3       3  17110.txt  PAYBACK (director: Brian Helgeland; cast:(Port...
4       4  17111.txt  WAKING NED DEVINE (director: Kirk Jones (III);...


#### Build dictionary out of the document contents
- execute lemmatization & stemming actions for each item in the dataframe
- build dictionary object containing the number of times a word appears in the document set.
- Filter the dictionary:
    - drop tokens that appear in fewer than 15 documents (absolute number)
    - drop tokens that appear in more than 50% of the documents (a fraction of the total corpus size, not an absolute number)
    - of the remaining tokens, keep only the 100000 most frequent

In [22]:
# Run preprocess() on every review; each entry becomes a list of tokens.
processed_docs = data['FileContent'].map(preprocess)
print(processed_docs.head(5))

# Build the token dictionary from the processed reviews, then prune it.
dictionary: Dictionary = Dictionary(processed_docs)
dictionary.filter_extremes(
    no_below=15,    # minimum number of documents a token must appear in
    no_above=0.5,   # maximum fraction of documents a token may appear in
    keep_n=100000,  # cap on the number of most frequent tokens kept
)

0    [denni, schwartz, movi, review, poetri, unmak,...
1    [brilliant, witti, mock, documentari, jean, se...
2    [nostalghia, director, andrei, tarkovski, cast...
3    [payback, director, brian, helgeland, cast, po...
4    [wake, devin, director, kirk, jone, cast, bann...
Name: FileContent, dtype: object


#### Build BagOfWords corpus from the document content
- for each document, create a bag of words: a list of (token id, count) pairs reporting which dictionary words it contains and how many times each one appears.
- the token ids come from the filtered dictionary built in the previous step.

In [23]:
# Convert every processed review into its bag-of-words form via the dictionary.
bow_corpus = list(map(dictionary.doc2bow, processed_docs))
print(len(bow_corpus))

180


#### Test out one of the bag of words

In [24]:
test_corpus = bow_corpus[0]
# Preview only the first 5 (token id, count) entries of the first document.
for token_id, count in test_corpus[:5]:
    print("Word {} (\"{}\") appears {} times.".format(token_id, dictionary[token_id], count))

Word 0 ("accomplish") appears 1 times.
Word 1 ("actor") appears 1 times.
Word 2 ("america") appears 1 times.
Word 3 ("apart") appears 1 times.
Word 4 ("attract") appears 3 times.


#### Build LDA model
- num_topics is set to 2, i.e. the model is fitted with k = 2 topics.
- for each topic, explore the words occurring in that topic and its relative weight

In [25]:
# Fit a 2-topic LDA model on the bag-of-words corpus.
# random_state is pinned on the model itself (same value as the notebook's
# np.random.seed call) so that re-running only this cell gives the same
# initialisation instead of depending on how much of the global NumPy random
# stream earlier cells have consumed.
lda_model = LdaMulticore(
    bow_corpus,
    num_topics=2,
    id2word=dictionary,
    passes=2,
    workers=2,
    random_state=2018,
)

# print_topics(-1) lists every topic together with its weighted words.
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} -> Words: {}'.format(idx, topic))

Topic: 0 -> Words: 0.008*"littl" + 0.006*"money" + 0.005*"audienc" + 0.005*"father" + 0.005*"role" + 0.005*"show" + 0.005*"perform" + 0.005*"feel" + 0.005*"releas" + 0.005*"real"
Topic: 1 -> Words: 0.008*"world" + 0.007*"feel" + 0.007*"see" + 0.006*"need" + 0.006*"tri" + 0.005*"question" + 0.005*"turn" + 0.005*"wife" + 0.005*"say" + 0.005*"real"


#### Top 10 words for each of the topics
- identify the top 10 words for each topic

In [26]:
# Each entry pairs a topic's top-10 (weight, word) list with a coherence score.
top_topics = lda_model.top_topics(corpus=bow_corpus, topn=10)

# NOTE(review): top_topics() appears to return topics ordered by coherence, so
# the counter printed here may be a rank rather than the model's own topic id
# -- confirm against the gensim documentation.
for i, (words, coherence) in enumerate(top_topics):
    print('Topic: {} -> Top Words: {}'.format(i, words))

Topic: 0 -> Top Words: [(0.008109855, 'littl'), (0.0057991627, 'money'), (0.0053513288, 'audienc'), (0.0053070267, 'father'), (0.0052273963, 'role'), (0.005220431, 'show'), (0.005144959, 'perform'), (0.005120052, 'feel'), (0.0050592427, 'releas'), (0.004968191, 'real')]
Topic: 1 -> Top Words: [(0.00778954, 'world'), (0.007067585, 'feel'), (0.006778942, 'see'), (0.006349574, 'need'), (0.0060187713, 'tri'), (0.0051947045, 'question'), (0.0049245534, 'turn'), (0.0049069873, 'wife'), (0.004820435, 'say'), (0.00475309, 'real')]


#### Document -> Topic probabilities for all the documents in the corpus

In [27]:
# Print the topic distribution of every document.
# The frame columns and bow_corpus are walked in lockstep by position, so the
# pairing stays correct even when the frame's index labels are not 0..n-1
# (label lookups such as data['RowNum'][i] would misalign or raise in that case).
for row_num, file_name, bow in zip(data['RowNum'], data['FileName'], bow_corpus):
    print(row_num, '-', file_name, '->', lda_model[bow])


0 - 16748.txt -> [(0, 0.13139327), (1, 0.8686067)]
1 - 17108.txt -> [(0, 0.71595013), (1, 0.2840499)]
2 - 17109.txt -> [(1, 0.99555296)]
3 - 17110.txt -> [(0, 0.9443173), (1, 0.055682763)]
4 - 17111.txt -> [(0, 0.5102283), (1, 0.48977178)]
5 - 17116.txt -> [(0, 0.09412225), (1, 0.90587777)]
6 - 17117.txt -> [(0, 0.20857136), (1, 0.7914286)]
7 - 17118.txt -> [(1, 0.99045455)]
8 - 17119.txt -> [(0, 0.06693634), (1, 0.9330637)]
9 - 17139.txt -> [(1, 0.99297714)]
10 - 17144.txt -> [(0, 0.58954734), (1, 0.41045266)]
11 - 17145.txt -> [(0, 0.010423786), (1, 0.9895762)]
12 - 17146.txt -> [(0, 0.34672955), (1, 0.6532704)]
13 - 17147.txt -> [(0, 0.29252928), (1, 0.7074707)]
14 - 17150.txt -> [(0, 0.014667761), (1, 0.98533225)]
15 - 17185.txt -> [(0, 0.9744871), (1, 0.025512876)]
16 - 17192.txt -> [(0, 0.25499335), (1, 0.7450067)]
17 - 17219.txt -> [(0, 0.9094785), (1, 0.09052149)]
18 - 17239.txt -> [(0, 0.032951564), (1, 0.96704847)]
19 - 17243.txt -> [(0, 0.80417323), (1, 0.19582674)]
20 - 172