In [1]:
# imports 
import os
import sys
sys.path.append("../")

import nltk
import numpy as np
import pandas as pd
import pyLDAvis.gensim
pyLDAvis.enable_notebook()

from gensim.corpora import Dictionary
from gensim.models import LdaModel, Phrases
from nltk.corpus import wordnet as wn
from nltk.tag.util import tuple2str
from nltk.tokenize import RegexpTokenizer

from datahandler import DataHandler

In [None]:
# functions

def tokenize(text, stop_words="../stopwords.txt"):
    with open (stop_words, "r") as f:
        add_stop_words = f.read().splitlines()
        add_stop_words = set(add_stop_words)
    
    stopwords = set(nltk.corpus.stopwords.words("english"))
    stopwords = stopwords.union(add_stop_words)
    
    lemmatizer = nltk.stem.WordNetLemmatizer()
    tokenizer = nltk.tokenize.RegexpTokenizer(r"\w+")
    
    for doc in range(len(text)):
        text[doc] = text[doc].lower()
        text[doc] = text[doc].replace(r"\n", " ")
        text[doc] = tokenizer.tokenize(text[doc])
        
    text = [[token for token in doc if (not token.isdigit, token not in stopwords, len(token) > 3)] for doc in text]
    text = [[lemmatizer.lemmatize(token) for token in doc] for doc in text]
    
    return text

In [2]:
# set random seed
# seed = np.random.randint(0, 2**32)
seed=123

# supply data directory
data_dir = os.path.join(os.pardir, "data", "preproc")
# load corpus
corpus = DataHandler(data_dir, seed)

# print some various information from the corpus
print("Total Word Count: {}".format(corpus.total_words))
print("Number of Docs in the Corpus: {}".format(corpus.total_docs))

# summarize statistics from all institutions in the corpus
print(corpus.stats)

Total Word Count: 2996922
Number of Docs in the Corpus: 283
[{'inst': 'AA', 'n_docs': 137, 'wc': 1532532}, {'inst': 'CU', 'n_docs': 42, 'wc': 449772}, {'inst': 'GSD', 'n_docs': 37, 'wc': 408691}, {'inst': 'Know', 'n_docs': 32, 'wc': 293989}, {'inst': 'MIT', 'n_docs': 18, 'wc': 135822}, {'inst': 'RISD', 'n_docs': 17, 'wc': 176116}]


In [4]:
# create dictionary for filename and text

files_text = {}
for files in corpus.data.keys():
    with open (files, "r") as f:
        files_text[files] = f.read()

df = pd.DataFrame.from_dict(files_text, orient="index").reset_index().rename(index=str, columns={"index":"filename", 0:"text"})
text = np.array(df["text"])      

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [10]:
# tokenize corpus
corpus = tokenize(text, "../stopwords.txt")

In [12]:
# Add bigrams to docs (only ones that appear 10 times or more).

count = 10
bigram = gensim.models.Phrases(corpus, min_count=count)

for i in range(len(corpus)):
    for token in bigram[corpus[i]]:
        if "_" in token:
            corpus[i].append(token)
            

dictionary = gensim.corpora.Dictionary(corpus)
print(len(dictionary))

# remove rare and uncommon words
dictionary.filter_extremes(no_below=count, no_above=0.20)
print(len(dictionary))

45336
7941


In [13]:
# create bag of words
corpus_bow = [dictionary.doc2bow(doc) for doc in corpus]

In [25]:
# LDA

# set training parameters.
num_topics = 6
chunksize = 50 # size of the doc looked at every pass (number of documents per pass)
passes = 20 # number of passes through documents
iterations = 400
eval_every = 1  # don't evaluate model perplexity, takes too much time.

# make a index to word dictionary.

temp = dictionary[0]
id2word = dictionary.id2token

# train LDA model
model = gensim.models.LdaModel(corpus=corpus_bow, id2word=id2word, chunksize=chunk_size, alpha="auto", eta="auto",
                               iterations=iterations, passes=passes, eval_every= eval_every, num_topics=num_topics)

  and should_run_async(code)


In [26]:
pyLDAvis.enable_notebook()
print(model)

pyLDAvis.gensim.prepare(model, corpus_bow, dictionary)
# p = pyLDAvis.gensim.prepare(model, corpus_bow, dictionary)
# pyLDAvis.save_html(p, '../results/lda.html')

  and should_run_async(code)


LdaModel(num_terms=7941, num_topics=8, decay=0.5, chunksize=50)
