In [16]:
# imports 
import os
import sys
sys.path.append("../")

import nltk
import numpy as np
import pandas as pd
import pyLDAvis.gensim
pyLDAvis.enable_notebook()

from gensim.corpora import Dictionary
from gensim.models import LdaModel, Phrases
from nltk.corpus import wordnet as wn
from nltk.tag.util import tuple2str
from nltk.tokenize import RegexpTokenizer

from datahandler import DataHandler

nltk.download("stopwords") # set of stopwords

  and should_run_async(code)
[nltk_data] Downloading package stopwords to C:\Users\Anna
[nltk_data]     Konvicka\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [17]:
# functions

def tokenize(text, stop_words="../stopwords.txt"):
    with open (stop_words, "r") as f:
        add_stop_words = f.read().splitlines()
        add_stop_words = set(add_stop_words)
    
    stopwords = set(nltk.corpus.stopwords.words("english"))
    stopwords = stopwords.union(add_stop_words)
    
    lemmatizer = nltk.stem.WordNetLemmatizer()
    tokenizer = nltk.tokenize.RegexpTokenizer(r"\w+")
    
    for doc in range(len(text)):
        text[doc] = text[doc].lower()
        text[doc] = text[doc].replace(r"\n", " ")
        text[doc] = tokenizer.tokenize(text[doc])
        
    text = [[token for token in doc if (not token.isdigit, token not in stopwords, len(token) > 3)] for doc in text]
    text = [[lemmatizer.lemmatize(token) for token in doc] for doc in text]
    
    return text

  and should_run_async(code)


In [18]:
# set random seed
# seed = np.random.randint(0, 2**32)
seed=123

# supply data directory
data_dir = os.path.join("D:/preproc_all")
# load corpus
corpus = DataHandler(data_dir, seed)

# print some various information from the corpus
print("Total Word Count: {}".format(corpus.total_words))
print("Number of Docs in the Corpus: {}".format(corpus.total_docs))

# summarize statistics from all institutions in the corpus
print(corpus.stats)

  and should_run_async(code)


Total Word Count: 2221727
Number of Docs in the Corpus: 5404
[{'inst': '930006', 'n_docs': 1, 'wc': 314}, {'inst': '930007', 'n_docs': 1, 'wc': 184}, {'inst': '930011', 'n_docs': 1, 'wc': 488}, {'inst': '930032', 'n_docs': 1, 'wc': 847}, {'inst': '930047', 'n_docs': 1, 'wc': 613}, {'inst': '930048', 'n_docs': 1, 'wc': 341}, {'inst': '930050', 'n_docs': 1, 'wc': 453}, {'inst': '930053', 'n_docs': 1, 'wc': 379}, {'inst': '930054', 'n_docs': 1, 'wc': 502}, {'inst': '930059', 'n_docs': 1, 'wc': 319}, {'inst': '930069', 'n_docs': 1, 'wc': 396}, {'inst': '930072', 'n_docs': 1, 'wc': 112}, {'inst': '930075', 'n_docs': 1, 'wc': 421}, {'inst': '930076', 'n_docs': 1, 'wc': 143}, {'inst': '930089', 'n_docs': 1, 'wc': 310}, {'inst': '930093', 'n_docs': 1, 'wc': 416}, {'inst': '930095', 'n_docs': 1, 'wc': 243}, {'inst': '930101', 'n_docs': 1, 'wc': 374}, {'inst': '930102', 'n_docs': 1, 'wc': 276}, {'inst': '930105', 'n_docs': 1, 'wc': 249}, {'inst': '930106', 'n_docs': 1, 'wc': 392}, {'inst': '9301

In [19]:
# create dictionary for filename and text

files_text = {}
for files in corpus.data.keys():
    with open (files, "r", encoding="utf-8") as f:
        files_text[files] = f.read()

df = pd.DataFrame.from_dict(files_text, orient="index").reset_index().rename(index=str, columns={"index":"filename", 0:"text"})
text = np.array(df["text"])      

  and should_run_async(code)


In [20]:
# tokenize corpus
corpus = tokenize(text, "../stopwords.txt")

  and should_run_async(code)


In [21]:
# Add bigrams to docs (only ones that appear 10 times or more).

count = 5
bigram = Phrases(corpus, min_count=count)

for i in range(len(corpus)):
    for token in bigram[corpus[i]]:
        if "_" in token:
            corpus[i].append(token)
            

dictionary = Dictionary(corpus)
print(len(dictionary))

# remove rare and uncommon words
dictionary.filter_extremes(no_below=count, no_above=0.30)
print(len(dictionary))

  and should_run_async(code)


46295
15163


In [22]:
# create bag of words
corpus_bow = [dictionary.doc2bow(doc) for doc in corpus]

  and should_run_async(code)


In [23]:
# LDA

# set training parameters.
num_topics = 3
chunk_size = 50 # size of the doc looked at every pass (number of documents per pass)
passes = 20 # number of passes through documents
iterations = 400
eval_every = 1  # don't evaluate model perplexity, takes too much time.

# make a index to word dictionary.

temp = dictionary[0]
id2word = dictionary.id2token

# train LDA model
model = LdaModel(corpus=corpus_bow, id2word=id2word, chunksize=chunk_size, alpha="auto", eta="auto",
                               iterations=iterations, passes=passes, eval_every= eval_every, num_topics=num_topics)

  and should_run_async(code)


In [24]:
pyLDAvis.enable_notebook()
print(model)

pyLDAvis.gensim.prepare(model, corpus_bow, dictionary)
# p = pyLDAvis.gensim.prepare(model, corpus_bow, dictionary)
# pyLDAvis.save_html(p, '../results/lda.html')

  and should_run_async(code)


LdaModel(num_terms=15163, num_topics=3, decay=0.5, chunksize=50)
