In [43]:
# Run in python console
import nltk; nltk.download('stopwords')
import re
import numpy as np
import pandas as pd
from pprint import pprint
from nltk.tokenize import RegexpTokenizer
from nltk.stem.wordnet import WordNetLemmatizer

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.models import Phrases
from gensim.models import LdaModel
from gensim.corpora import Dictionary

from glob import glob

# NLTK Stop words
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\wenji\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [44]:
filename='hw3-data/wiki'
#contents = open(filename, encoding='utf-8').read()
search_path = "%s/*.txt" % filename
files = glob(search_path)
docs=[]
for ii in files:
    docs.append(open(ii, encoding='utf-8').read())

## Pre-process and vectorize the documents

In [45]:
# Tokenize the documents.
# Split the documents into tokens.
tokenizer = RegexpTokenizer(r'\w+')
for idx in range(len(docs)):
    docs[idx] = docs[idx].lower()  # Convert to lowercase.
    docs[idx] = tokenizer.tokenize(docs[idx])  # Split into words.

# Remove numbers, but not words that contain numbers.
docs = [[token for token in doc if not token.isnumeric()] for doc in docs]

# Remove words that are only one character.
docs = [[token for token in doc if len(token) > 1] for doc in docs]

# Remove words that belong to stopwords
docs = [[token for token in doc if token not in stop_words] for doc in docs]

In [46]:
# Lemmatize the documents.
lemmatizer = WordNetLemmatizer()
docs = [[lemmatizer.lemmatize(token) for token in doc] for doc in docs]

In [47]:
# Bag-of-words representation of the documents.
dictionary = Dictionary(docs)
corpus = [dictionary.doc2bow(doc) for doc in docs]

In [48]:
print('Number of unique tokens: %d' % len(dictionary))
print('Number of documents: %d' % len(corpus))

Number of unique tokens: 14391
Number of documents: 399


In [52]:
import time
# Train LDA model.
# Set training parameters.
num_topics = 5
chunksize = 2000
passes = 20
iterations = 1000
eval_every = None  # Don't evaluate model perplexity, takes too much time.

# Make a index to word dictionary.
temp = dictionary[0]  # This is only to "load" the dictionary.
id2word = dictionary.id2token

start = time.time()
model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    chunksize=chunksize,
    alpha='auto',
    eta='auto',
    iterations=iterations,
    num_topics=num_topics,
    passes=passes,
    eval_every=eval_every
)
total = time.time() - start
print(total)
print("Total usage of time: %0.5f seconds" % total)

9.780372381210327
Total usage of time: 9.78037 seconds


In [50]:
top_topics = model.top_topics(corpus, topn=50)#, num_words=20)

# Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
avg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics
print('Average topic coherence: %.4f.' % avg_topic_coherence)

from pprint import pprint
pprint(top_topics)

Average topic coherence: -5.8481.
[([(0.0064119156, 'game'),
   (0.005138796, 'new'),
   (0.004483857, 'year'),
   (0.004380243, 'also'),
   (0.00404396, 'state'),
   (0.0036535512, 'people'),
   (0.0035654153, 'team'),
   (0.0035473516, 'school'),
   (0.003416879, 'monk'),
   (0.0033195973, 'one'),
   (0.0030732115, 'city'),
   (0.003037021, 'first'),
   (0.0029875853, 'two'),
   (0.0027544051, 'national'),
   (0.002618008, 'military'),
   (0.002610266, 'season'),
   (0.0026097333, 'navy'),
   (0.0024629773, 'government'),
   (0.002385366, 'yangon'),
   (0.002340557, 'time'),
   (0.0023199574, 'said'),
   (0.0022911683, 'army'),
   (0.0022660228, 'football'),
   (0.0021490494, 'many'),
   (0.0021177197, 'united'),
   (0.0020773944, 'high'),
   (0.0020630583, 'played'),
   (0.002023459, 'report'),
   (0.0019917106, 'would'),
   (0.0019376725, 'water'),
   (0.0018478555, 'temple'),
   (0.0018472754, 'junta'),
   (0.0018429286, 'station'),
   (0.0018353564, 'october'),
   (0.0018099904, 

In [51]:
topicsfile = open("gensim.txt", 'w')
for t in range(len(top_topics)):
    i,j = top_topics[t]
    topicsfile.write("------------\nTopic %i (%0.5f)\n------------\n" % (t, j))
    for k,l in i:
        topicsfile.write('%0.5f\t%s\n' % (k,l))
topicsfile.close()