In [None]:
import logging
logging.captureWarnings(True)
import os
import re
import gensim
import pyLDAvis
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS

In [None]:
# Root folder of the training reviews; the loading cell reads its `pos/` and
# `neg/` subdirectories.
DATA_DIR = "hw3_data/train"
# Number of topics; passed to the LDA model and to `lda.show_topics`.
NUM_TOPICS = 50

### Load data

In [None]:
# Read and tokenize texts
def _load_tokenized_reviews(directory):
    """Return one whitespace-split token list per file found in `directory`.

    Files are visited in sorted name order because `os.listdir` returns
    entries in arbitrary order, which would make the document order (and
    therefore `review_texts[0]` and the trained model) vary between runs.
    Each file is opened in a `with` block so its handle is closed promptly,
    and decoded explicitly as UTF-8 rather than with the platform default.
    """
    tokenized = []
    for filename in sorted(os.listdir(directory)):
        with open(f"{directory}/{filename}", encoding="utf-8") as handle:
            tokenized.append(handle.read().split())
    return tokenized


# Positive reviews first, then negative ones.
review_texts = _load_tokenized_reviews(f"{DATA_DIR}/pos") \
               + _load_tokenized_reviews(f"{DATA_DIR}/neg")

In [None]:
# Strip punctuation: delete every non-word character (regex `\W`) from each
# token. Tokens made up solely of such characters collapse to "".
review_texts = [
    [re.sub(r"\W", "", token) for token in review]
    for review in review_texts
]

In [None]:
# Remove empty tokens and case-normalize: `filter(None, ...)` discards the
# empty strings, and the surviving tokens are lower-cased.
review_texts = [
    [token.lower() for token in filter(None, review)]
    for review in review_texts
]

In [None]:
# Remove stopwords
# "br" is added on top of scikit-learn's English stopword list.
stopwords = ["br"]
stopwords.extend(list(ENGLISH_STOP_WORDS))
# Membership tests on a list scan it entry by entry, and this filter runs once
# per token in the corpus. A frozenset gives the same answers with hash lookups.
stopword_lookup = frozenset(stopwords)
review_texts = [
    [token for token in review if token not in stopword_lookup]
    for review in review_texts
]

In [None]:
# Sanity check: number of reviews loaded (positive + negative).
len(review_texts)

In [None]:
# Spot-check the first review's tokens after all preprocessing steps.
print(review_texts[0])

### Train LDA model

In [None]:
# Create dictionary
# Build the token <-> integer id mapping from the tokenized reviews.
dictionary = gensim.corpora.Dictionary(review_texts)
# Prune the vocabulary in place. Per gensim's `filter_extremes` documentation:
# drop tokens appearing in fewer than 3 documents or in more than 30% of
# documents, then keep at most the 20000 most frequent remaining tokens.
dictionary.filter_extremes(no_below=3, no_above=0.3, keep_n=20000)

In [None]:
# Use to preprocess text representations
# Convert each tokenized review to gensim's bag-of-words form using the pruned
# dictionary. This list is the corpus passed to both LDA training and pyLDAvis.
gs_preprocessed_texts = [dictionary.doc2bow(text) for text in review_texts]

In [None]:
# Train model
# LDA training is stochastic; fixing the seed makes the learned topics
# reproducible across "Restart Kernel -> Run All".
LDA_RANDOM_STATE = 42

lda = gensim.models.LdaModel(
    gs_preprocessed_texts,
    id2word=dictionary,            # lets the model report topics as words, not ids
    num_topics=NUM_TOPICS,
    alpha=0.1,                     # document-topic prior passed to gensim
    eval_every=5,
    gamma_threshold=1e-6,
    random_state=LDA_RANDOM_STATE,
)

In [None]:
# Display the learned topics; NUM_TOPICS is passed so that all of them are
# requested rather than relying on show_topics' default count.
lda.show_topics(NUM_TOPICS)

### Visualize topic model

In [None]:
# Switch pyLDAvis to notebook mode so prepared visualizations can be displayed
# inline as cell output.
pyLDAvis.enable_notebook()

In [None]:
import pyLDAvis.gensim_models

In [None]:
# Build the pyLDAvis visualization data from the trained model, the
# bag-of-words corpus it was trained on, and the matching dictionary.
viz_data = pyLDAvis.gensim_models.prepare(lda, gs_preprocessed_texts, dictionary)

In [None]:
# Last expression in the cell: shows the prepared visualization as the output.
viz_data