In [None]:
class ReturnTokens(object):
    """Iterable over the contents of every ``.txt`` file in a directory.

    Each iteration pass lists ``dir_path`` afresh and yields the full
    UTF-8 decoded text of every entry whose name ends in ``.txt``
    (one string per file, in ``os.listdir`` order).
    """

    def __init__(self, dir_path):
        # Directory that is scanned each time the object is iterated.
        self.dir_path = dir_path

    def __iter__(self):
        for entry in os.listdir(self.dir_path):
            # Skip anything that is not a .txt file.
            if not entry.endswith(".txt"):
                continue
            full_path = os.path.join(self.dir_path, entry)
            with open(full_path, encoding='utf-8') as handle:
                contents = handle.read()
                yield contents

# Set up dependencies and load the data
Note: I set up two docs to play with different ways to tokenize/limit the tokens.
Also, there is a path to the Vowpal Wabbit binary/executable that will have to be changed (I don't know if we will use it, but it might be useful for comparison).

In [1]:
from gensim.models.wrappers import LdaVowpalWabbit
from gensim.parsing.preprocessing import preprocess_string
from gensim import corpora
from gensim.models import CoherenceModel
from gensim.parsing.preprocessing import preprocess_string, strip_multiple_whitespaces, strip_numeric, remove_stopwords,strip_short, stem_text, strip_punctuation
import gensim
from pprint import pprint

from smart_open import smart_open
import os
import pandas as pd

unable to import 'smart_open.gcs', disabling that module


In [24]:
import pymysql

print("Connecting to SQL...")
# Select the body of every document that actually has text.
query = """
    SELECT
        body as text
    FROM clean_docs
    WHERE
        body IS NOT NULL
        AND body != ''
    ;
    """
# NOTE(review): the credentials below are hard-coded in the notebook; consider
# reading them from the environment before sharing this file.
con = pymysql.connect(
    host="127.0.0.1",
    port=3306,
    user='dbuser',
    password='dbuserdbuser',
    db='PDB',
    charset='utf8mb4',
    cursorclass=pymysql.cursors.DictCursor
)
try:
    # The cursor is closed when the with-block exits; the connection is
    # closed in the finally clause even if the query fails.
    with con.cursor() as cur:
        cur.execute(query)
        result = cur.fetchall()
finally:
    con.close()

# DictCursor returns one dict per row; keep only the document text.
docs = [item['text'] for item in result]

Connecting to SQL...


# Tokenizing data
## Once using NLTK
## And once with gensim built in features
Note: With the gensim built in features, I ignore stemming for now

In [None]:
# Tokenize the documents with NLTK.
from nltk.tokenize import RegexpTokenizer

# A token is any maximal run of word characters.
tokenizer = RegexpTokenizer(r'\w+')

# Lowercase each document and split it into words, replacing the raw
# string in `docs` with its token list.
for idx, raw_text in enumerate(docs):
    docs[idx] = tokenizer.tokenize(raw_text.lower())

# Drop purely numeric tokens (words that merely contain digits are kept)
# and tokens that are only one character long.
docs = [
    [token for token in doc if not token.isnumeric() and len(token) > 1]
    for doc in docs
]

In [25]:
# Tokenize documents with gensim's preprocessing helpers (no stemming).
from gensim.parsing.preprocessing import (
    preprocess_string,
    strip_multiple_whitespaces,
    strip_numeric,
    remove_stopwords,
    strip_short,
    stem_text,
    strip_punctuation,
)

# Filters are applied to each document in the order listed here.
CUSTOM_FILTERS = [
    lambda x: x.lower(),
    strip_punctuation,
    strip_multiple_whitespaces,
    strip_numeric,
    remove_stopwords,
    strip_short,
]
docs = [preprocess_string(raw_doc, CUSTOM_FILTERS) for raw_doc in docs]

In [None]:
# Code to only keep NOUNS, ADJ, VERB, ADV
from spacy.lang.en import English

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Lemmatize tokenized documents, keeping only tokens with an allowed POS tag.

    Tag set reference: https://spacy.io/api/annotation

    Args:
        texts: iterable of documents, each an iterable of string tokens.
        allowed_postags: coarse part-of-speech tags (compared against
            ``token.pos_``) that should be kept. The list is only read,
            never mutated.

    Returns:
        A list with one list of lemmas per input document.
    """
    # NOTE(review): English() is built here with no components visibly added.
    # If that pipeline does not assign part-of-speech tags, token.pos_ is empty
    # and every token is filtered out -- confirm whether a trained pipeline
    # (with a tagger) should be loaded instead.
    nlp = English()
    texts_out = []
    for sent in texts:
        # Re-join the tokens so spaCy can process the document as one string.
        doc = nlp(" ".join(sent))
        # Keep tokens whose tag IS in the allowed set (the previous
        # `not in` test discarded exactly the tags that were requested).
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [None]:
# Replace each document with the lemmas returned by lemmatization() for the
# listed part-of-speech tags.
docs = lemmatization(
    docs,
    allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'],
)

In [29]:
from nltk.corpus import wordnet
def get_wordnet_pos(word):
    """Return the WordNet POS constant for ``word``.

    The word is tagged on its own with ``nltk.pos_tag`` and the first
    letter of the resulting tag selects the WordNet category; any tag
    that does not start with J, N, V or R falls back to ``wordnet.NOUN``.
    """
    tagged_word = nltk.pos_tag([word])[0]
    initial = tagged_word[1][0].upper()

    if initial == "J":
        return wordnet.ADJ
    if initial == "N":
        return wordnet.NOUN
    if initial == "V":
        return wordnet.VERB
    if initial == "R":
        return wordnet.ADV
    return wordnet.NOUN



In [30]:
# Lemmatize every token, using the POS returned by get_wordnet_pos.
import nltk
nltk.download('wordnet')
from nltk.stem.wordnet import WordNetLemmatizer

# NOTE(review): get_wordnet_pos calls nltk.pos_tag, which presumably needs a
# tagger resource that is not downloaded in this cell -- confirm it is installed.
lemmatizer = WordNetLemmatizer()
lemmatized_docs = []
for doc in docs:
    lemmatized_docs.append(
        [lemmatizer.lemmatize(token, get_wordnet_pos(token)) for token in doc]
    )
docs = lemmatized_docs




[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/carriehaykellar/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# POS-tag each document. nltk.pos_tag expects a list of tokens; passing a
# single token string (as before) makes it tag the individual characters.
tagged = [nltk.pos_tag(doc) for doc in docs]
# Show only the first few documents instead of dumping the whole corpus.
tagged[:5]

In [31]:
# Compute bigrams.
from gensim.models import Phrases

# Detect bigrams that appear 20 times or more and append them to the
# document they were found in (the original unigrams are kept).
bigram = Phrases(docs, min_count=20)
for doc in docs:
    # Materialise the matches first, then extend the document in place.
    # Joined phrases are recognised by the '_' in the token.
    found = [token for token in bigram[doc] if '_' in token]
    doc.extend(found)

In [41]:
# Remove rare and common tokens.
from gensim.corpora import Dictionary

# Build the token <-> id mapping from the tokenized documents.
dictionary = Dictionary(docs)

# Drop tokens that occur in fewer than 50 documents or in more than 40% of
# the documents.
dictionary.filter_extremes(
    no_below=50,
    no_above=0.4,
)


In [42]:
# Convert every tokenized document to its bag-of-words representation.
corpus = list(map(dictionary.doc2bow, docs))

In [43]:
# Sanity check: vocabulary size and number of documents.
n_unique_tokens = len(dictionary)
n_documents = len(corpus)
print('Number of unique tokens: %d' % n_unique_tokens)
print('Number of documents: %d' % n_documents)

Number of unique tokens: 5016
Number of documents: 5010


# Topic models


### Running with Gensim LDA

In [None]:
print("Running LDA Gensium...")
# Train a 20-topic LDA model on the bag-of-words corpus.
# NOTE(review): no random_state is passed, so repeated runs may give
# different topics -- consider fixing a seed.
lda_model = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    num_topics=20,
    id2word=dictionary,
    eta='auto',
    alpha='asymmetric',
    passes=20,
    per_word_topics=True,
    eval_every=None,
)

Running LDA Gensium...


In [None]:
# Report the model's log_perplexity value on the training corpus.
# NOTE(review): gensim documents log_perplexity as a per-word likelihood
# bound rather than the perplexity itself, so "lower is better" does not
# apply to this number as printed -- confirm against the gensim docs.
log_perplexity_value = lda_model.log_perplexity(corpus)
print('\nPerplexity: ', log_perplexity_value)

# Compute the c_v coherence score from the tokenized documents.
coherence_model_lda = CoherenceModel(
    model=lda_model,
    texts=docs,
    dictionary=dictionary,
    coherence='c_v',
)
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

In [11]:
# Topic distribution of every document in the corpus (no per-word topics).
topics_per_document = lda_model.get_document_topics(
    corpus,
    per_word_topics=False,
)

In [18]:
# Flatten each document's entries into one flat list per document, e.g.
# [(0, 0.08), (1, 0.13)] -> [0, 0.08, 1, 0.13].
list_topics_per_document = []
for doc_topics in topics_per_document:
    flattened = []
    for pair in doc_topics:
        flattened.extend(pair)
    list_topics_per_document.append(flattened)

list_topics_per_document

[[0,
  0.08503742,
  1,
  0.13792183,
  3,
  0.121501274,
  11,
  0.40228868,
  12,
  0.027713677,
  17,
  0.023186143,
  19,
  0.19552797],
 [1, 0.18826362, 5, 0.24246693, 6, 0.1655537, 12, 0.09523292, 17, 0.30722255],
 [0,
  0.14216551,
  1,
  0.1839883,
  7,
  0.12598701,
  8,
  0.06554198,
  9,
  0.03377416,
  15,
  0.43150476,
  18,
  0.0146841565],
 [4,
  0.014154937,
  5,
  0.041651137,
  6,
  0.05408709,
  7,
  0.11152435,
  15,
  0.6071328,
  19,
  0.16882367],
 [0, 0.12096093, 5, 0.24782778, 6, 0.079094395, 14, 0.5504145],
 [0,
  0.18513483,
  1,
  0.19038874,
  5,
  0.2369386,
  6,
  0.069118544,
  15,
  0.20108762,
  17,
  0.032801226,
  18,
  0.0828413],
 [0,
  0.23942313,
  1,
  0.5091017,
  6,
  0.01602311,
  8,
  0.023663163,
  9,
  0.02510353,
  12,
  0.17570099],
 [0, 0.04595394, 3, 0.09334614, 15, 0.85514086],
 [1, 0.26248816, 15, 0.35836932, 17, 0.18316837, 19, 0.18998526],
 [0,
  0.059366066,
  2,
  0.1124396,
  6,
  0.4215561,
  8,
  0.019293608,
  10,
  0.0111860

In [19]:
# One row per document; shorter rows are padded with NaN by pandas.
df_topics_per_document = pd.DataFrame(list_topics_per_document)

In [20]:
# Display the per-document topic table.
df_topics_per_document

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,30,31,32,33,34,35,36,37,38,39
0,0,0.085037,1.0,0.137922,3.0,0.121501,11.0,0.402289,12.0,0.027714,...,,,,,,,,,,
1,1,0.188264,5.0,0.242467,6.0,0.165554,12.0,0.095233,17.0,0.307223,...,,,,,,,,,,
2,0,0.142166,1.0,0.183988,7.0,0.125987,8.0,0.065542,9.0,0.033774,...,,,,,,,,,,
3,4,0.014155,5.0,0.041651,6.0,0.054087,7.0,0.111524,15.0,0.607133,...,,,,,,,,,,
4,0,0.120961,5.0,0.247828,6.0,0.079094,14.0,0.550415,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5005,5,0.081483,6.0,0.366744,8.0,0.027917,11.0,0.190710,12.0,0.021429,...,,,,,,,,,,
5006,6,0.048306,7.0,0.663529,11.0,0.223745,17.0,0.060682,,,...,,,,,,,,,,
5007,0,0.077557,7.0,0.737540,8.0,0.062596,12.0,0.037025,17.0,0.082232,...,,,,,,,,,,
5008,6,0.066857,7.0,0.702879,15.0,0.075581,17.0,0.152214,,,...,,,,,,,,,,


In [21]:
from sqlalchemy import create_engine

# SQLAlchemy engine for the local PDB MySQL database.
# NOTE(review): the connection URL embeds the user name and password;
# consider supplying them from the environment instead.
engine = create_engine(
    'mysql://dbuser:dbuserdbuser@localhost:3306/PDB',
    echo=False,
)

In [23]:
# Write the per-document topic table to the `topics5_per_doc` SQL table.
df_topics_per_document.to_sql(
    name='topics5_per_doc',
    con=engine,
)

In [None]:
# Keep the model's topics (show_topics with its default arguments).
topics = lda_model.show_topics()

In [37]:
# Show the top 8 words of each of the 20 topics.
lda_model.print_topics(num_topics=20, num_words=8)

[(0,
  '0.011*"tha" + 0.010*"eee" + 0.008*"tan" + 0.008*"test" + 0.008*"far" + 0.007*"algeria" + 0.005*"guinea" + 0.005*"ben"'),
 (1,
  '0.017*"lebanon" + 0.014*"continued" + 0.013*"continued_president" + 0.012*"syrian" + 0.009*"angola" + 0.009*"palestinian" + 0.009*"christian" + 0.008*"page"'),
 (2,
  '0.028*"pakistan" + 0.024*"india" + 0.017*"indian" + 0.014*"pakistani" + 0.011*"page" + 0.009*"east" + 0.006*"bangladesh" + 0.005*"delhi"'),
 (3,
  '0.022*"vietnamese" + 0.013*"north_vietnamese" + 0.012*"enemy" + 0.011*"page" + 0.010*"south_vietnam" + 0.009*"unit" + 0.009*"province" + 0.009*"lao"'),
 (4,
  '0.009*"east" + 0.008*"corp" + 0.008*"major" + 0.007*"problem" + 0.007*"europe" + 0.006*"important" + 0.006*"ninh" + 0.006*"affair"'),
 (5,
  '0.042*"israeli" + 0.032*"arab" + 0.031*"israel" + 0.018*"egyptian" + 0.016*"egypt" + 0.015*"fedayeen" + 0.014*"jordan" + 0.013*"syria"'),
 (6,
  '0.014*"cuba" + 0.010*"cont" + 0.008*"president_secret" + 0.007*"cuban" + 0.005*"castro" + 0.005*"so

In [None]:
# Vary num of topics
import time

# Train one 2-pass model per topic count, timing each fit.
models2a = []
for n_topics in [10, 20, 40, 60]:
    t1 = time.time()
    model = gensim.models.ldamodel.LdaModel(
        corpus=corpus,
        num_topics=n_topics,
        id2word=dictionary,
        eta='auto',
        alpha='asymmetric',
        passes=2,
        per_word_topics=True,
    )
    models2a.append(model)
    print(n_topics)
    t2 = time.time()
    print("Model time: ", t2-t1)

# Score every model: topic count, log_perplexity value, u_mass and c_v coherence.
for m in models2a:
    t3 = time.time()
    coherence_model_lda = CoherenceModel(model=m, corpus=corpus, coherence='u_mass')
    cm = CoherenceModel(model=m, texts=docs, dictionary=dictionary, coherence='c_v')
    log_perp = m.log_perplexity(corpus)
    u_mass_score = coherence_model_lda.get_coherence()
    c_v_score = cm.get_coherence()
    print('%3d %10.3f %10.3f %10.3f'%(m.num_topics, log_perp, u_mass_score, c_v_score))
    t4 = time.time()
    print("Coherence Score Time: ", t4-t3)

In [None]:
# Re-score the models from the topic-count sweep without retraining them.
# The first column identifies each model by its number of topics: these
# models are all trained with passes=2, so printing m.passes (as before)
# produced the same value on every row.
for m in models2a:
    t3 = time.time()
    coherence_model_lda = CoherenceModel(model=m, corpus=corpus, coherence='u_mass')
    cm = CoherenceModel(model=m, texts=docs, dictionary=dictionary, coherence='c_v')
    print('%3d %10.3f %10.3f %10.3f'%(m.num_topics, 
              m.log_perplexity(corpus),
              coherence_model_lda.get_coherence(),
              cm.get_coherence()))
    t4 = time.time()
    print("Coherence Score Time: ", t4-t3)

In [None]:
# Vary num of passes
import time

# Train one 20-topic model per pass count, timing each fit.
models2a = []
for n_passes in [5, 10, 15, 20, 25,30]:
    t1 = time.time()
    model = gensim.models.ldamodel.LdaModel(
        corpus=corpus,
        num_topics=20,
        id2word=dictionary,
        eta='auto',
        alpha='asymmetric',
        passes=n_passes,
        per_word_topics=True,
    )
    models2a.append(model)
    print(n_passes)
    t2 = time.time()
    print("Model time: ", t2-t1)

# Score every model: pass count, log_perplexity value, u_mass and c_v coherence.
for m in models2a:
    t3 = time.time()
    coherence_model_lda = CoherenceModel(model=m, corpus=corpus, coherence='u_mass')
    cm = CoherenceModel(model=m, texts=docs, dictionary=dictionary, coherence='c_v')
    log_perp = m.log_perplexity(corpus)
    u_mass_score = coherence_model_lda.get_coherence()
    c_v_score = cm.get_coherence()
    print('%3d %10.3f %10.3f %10.3f'%(m.passes, log_perp, u_mass_score, c_v_score))
    t4 = time.time()
    print("Coherence Score Time: ", t4-t3)

In [None]:
# Re-score the models currently held in models2a without retraining them:
# pass count, log_perplexity value, u_mass coherence and c_v coherence.
for m in models2a:
    t3 = time.time()
    coherence_model_lda = CoherenceModel(model=m, corpus=corpus, coherence='u_mass')
    cm = CoherenceModel(model=m, texts=docs, dictionary=dictionary, coherence='c_v')
    log_perp = m.log_perplexity(corpus)
    u_mass_score = coherence_model_lda.get_coherence()
    c_v_score = cm.get_coherence()
    print('%3d %10.3f %10.3f %10.3f'%(m.passes, log_perp, u_mass_score, c_v_score))
    t4 = time.time()
    print("Coherence Score Time: ", t4-t3)

### Running with Mallet Wrapper

In [None]:
print("Running LDA Mallet...")
# NOTE(review): machine-specific absolute path to the Mallet launcher;
# it has to be changed on any other machine.
mallet_path = '/Users/carriehaykellar/Downloads/mallet-2.0.8/bin/mallet'
# Fit a 5-topic model through gensim's Mallet wrapper.
ldamallet = gensim.models.wrappers.LdaMallet(
    mallet_path,
    corpus=corpus,
    num_topics=5,
    id2word=dictionary,
)


In [None]:
# Display the Mallet model's topics (print_topics with default arguments).
ldamallet.print_topics()

In [None]:
def format_topics_sentences(ldamodel, corpus, texts):
    """Build a table with the dominant topic of every document.

    Args:
        ldamodel: topic model; ``ldamodel[corpus]`` must yield, per document,
            an iterable of ``(topic_id, probability)`` pairs, and
            ``ldamodel.show_topic(topic_id)`` must return ``(word, weight)``
            pairs.
        corpus: the corpus to pass to ``ldamodel[...]``.
        texts: the original documents, appended as the last column.

    Returns:
        pandas.DataFrame with columns ``Dominant_Topic`` (int),
        ``Perc_Contribution`` (probability rounded to 4 decimals),
        ``Topic_Keywords`` (comma-separated words of the dominant topic) and
        a final column (named ``0``) holding ``texts``. Documents whose topic
        list is empty contribute no row.
    """
    column_names = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Collect plain tuples and build the frame once at the end.
    # DataFrame.append was removed in pandas 2.0, and growing a frame row by
    # row is quadratic in the number of documents.
    records = []
    for row in ldamodel[corpus]:
        ranked = sorted(row, key=lambda x: (x[1]), reverse=True)
        if not ranked:
            continue
        # The highest-probability entry is the dominant topic.
        topic_num, prop_topic = ranked[0]
        wp = ldamodel.show_topic(topic_num)
        topic_keywords = ", ".join([word for word, prop in wp])
        records.append((int(topic_num), round(prop_topic, 4), topic_keywords))

    # Passing `columns` keeps the three named columns even when there are no
    # rows (the old post-hoc column assignment raised on an empty corpus).
    sent_topics_df = pd.DataFrame(records, columns=column_names)

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)

    return sent_topics_df

In [None]:
# Dominant topic, its contribution and its keywords for every document.
df_topic_sents_keywords = format_topics_sentences(ldamallet, corpus, docs)

# Turn the index into a column and give every column a readable name.
dominant_topic_columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = dominant_topic_columns

# Show the first ten rows.
df_dominant_topic.head(10)



In [None]:
# Convert each document's token list to a single string so the column can
# be written to SQL. Series.map applies str to every row once; the previous
# form passed each token list as map's second argument (na_action) and
# built a whole mapped Series per row.
df_dominant_topic['Text'] = df_dominant_topic['Text'].map(str)


In [None]:
from sqlalchemy import create_engine

# SQLAlchemy engine for the local PDB MySQL database.
# NOTE(review): the connection URL embeds the user name and password;
# consider supplying them from the environment instead.
engine = create_engine(
    'mysql://dbuser:dbuserdbuser@localhost:3306/PDB',
    echo=False,
)

In [None]:
# Write the dominant-topic table to the `topic_per_doc` SQL table.
df_dominant_topic.to_sql(
    name='topic_per_doc',
    con=engine,
)

In [None]:
import matplotlib.pyplot as plt

# Hand-entered coherence values, one line per number of training passes.
# NOTE(review): the x values look like the topic counts from the topic
# sweep -- confirm and consider adding an x-axis label.
x_values = [10, 20, 40, 60]
line_20_passes, = plt.plot(x_values, [0.537, 0.578, 0.515, 0.478], 'b', label='20 passes')
line_1_pass, = plt.plot(x_values, [0.314, 0.382, 0.381, 0.377], 'r', label='1 pass')
line_2_passes, = plt.plot(x_values, [0.451, 0.497, 0.467, 0.403], 'g', label='2 passes')
plt.ylabel('Coherence')
plt.legend(handles=[line_20_passes, line_1_pass, line_2_passes])
plt.show()




In [None]:
import plotly.graph_objects as go

# Hand-entered coherence values: (trace name, x values, y values).
coherence_series = [
    ('20 Passes', [10, 20, 40, 60], [0.537, 0.578, 0.515, 0.478]),
    ('1 Pass', [10, 20, 40, 60], [0.314, 0.382, 0.381, 0.377]),
    ('2 Passes', [10, 20, 40, 60], [0.451, 0.497, 0.467, 0.403]),
]

fig = go.Figure()
# One line-and-marker trace per series, added in the order listed above.
for trace_name, xs, ys in coherence_series:
    fig.add_trace(go.Scatter(x=xs, y=ys,
                    mode='lines+markers',
                    name=trace_name))

fig.show()
# NOTE(review): machine-specific absolute output path.
fig.write_image("/Users/carriehaykellar/Desktop/Pass_Coherence.png")

In [None]:
import plotly.graph_objects as go

# Hand-entered values for a single unnamed trace.
# NOTE(review): the x values match the pass counts used in the passes
# sweep, while the output file is named Topic_Coherence -- confirm which
# quantity is plotted.
x_values = [5, 10, 15, 20, 25, 30]
y_values = [0.555, 0.571, 0.565, 0.598, 0.551, 0.583]

fig = go.Figure()
single_trace = go.Scatter(x=x_values, y=y_values,
                    mode='lines+markers',
                    name='')
fig.add_trace(single_trace)

fig.show()
# NOTE(review): machine-specific absolute output path.
fig.write_image("/Users/carriehaykellar/Desktop/Topic_Coherence.png")

In [None]:
# Hand-entered c_v coherence values plotted against the x values below.
x_values = [20, 40, 60, 80, 100]
c_v_values = [0.549, 0.501, 0.443, 0.413, 0.310]
plt.plot(x_values, c_v_values)
plt.ylabel('c_v coherence')
plt.show()

## Now varying number of passes/iterations

In [None]:
# Train one 20-topic model per pass count.
models2a = []
for k in [5, 10, 15, 20, 25,30]:
    models2a.append(gensim.models.ldamodel.LdaModel(corpus=corpus, num_topics=20, id2word=dictionary, eta='auto', alpha='asymmetric', passes=k, per_word_topics=True))
    print(k)

# Score every model: pass count, log_perplexity value, u_mass and c_v coherence.
for m in models2a:
    coherence_model_lda = CoherenceModel(model=m, corpus=corpus, coherence='u_mass')
    cm = CoherenceModel(model=m, texts=docs, dictionary=dictionary, coherence='c_v')
    # Evaluate on `corpus`, the corpus the models were trained on; the
    # previous `corpus2` is not defined anywhere in this notebook.
    print('%3d %10.3f %10.3f %10.3f'%(m.passes, 
              m.log_perplexity(corpus),
              coherence_model_lda.get_coherence(),
              cm.get_coherence()))

In [None]:
# Re-score the models currently held in models2a without retraining them:
# pass count, log_perplexity value, u_mass coherence and c_v coherence.
for m in models2a:
    coherence_model_lda = CoherenceModel(model=m, corpus=corpus, coherence='u_mass')
    cm = CoherenceModel(model=m, texts=docs, dictionary=dictionary, coherence='c_v')
    log_perp = m.log_perplexity(corpus)
    u_mass_score = coherence_model_lda.get_coherence()
    c_v_score = cm.get_coherence()
    print('%3d %10.3f %10.3f %10.3f'%(m.passes, log_perp, u_mass_score, c_v_score))

## We can also see the coherence per topic

In [None]:
# Select the model and print the topics
optimal_model = models2a[2]
model_topics = optimal_model.show_topics(formatted=False)

# Per-topic c_v coherence, computed on the corpus and tokenized documents
# the models were built from. The previous `corpus2` / `docs2` are not
# defined anywhere in this notebook.
top_topics = optimal_model.top_topics(corpus, texts=docs, coherence='c_v')

# Second element of each entry is that topic's coherence score.
a = [t[1] for t in top_topics]
pprint(top_topics)
print(a)

In [None]:
# Another way to print the top tokens by topic (10 words per topic).
topic_strings = optimal_model.print_topics(num_words=10)
pprint(topic_strings)

## Determining the average top topic score (i.e., for each document the topic probability of the top topic)

In [None]:
# For every document, record the probability of its most likely topic (o)
# and of its second most likely topic (o2), then print the averages.
# Uses `corpus` (the previous `corpus2` is not defined in this notebook) and
# runs the model once per document instead of once per list.
o = []
o2 = []
for bow in corpus:
    # The models in models2a are built with per_word_topics=True; as in the
    # original code, element 0 of the result is the (topic, probability) list.
    doc_topics = optimal_model[bow][0]
    ranked = sorted(doc_topics, key=lambda x: (x[1]), reverse=True)
    o.append(ranked[0][1])
    # A document may have only one reported topic; count the missing
    # runner-up as 0.0 instead of raising IndexError.
    o2.append(ranked[1][1] if len(ranked) > 1 else 0.0)

print(sum(o)/len(o))
print(sum(o2)/len(o2))

## Now let's do the same with 80 topics

In [None]:
# Select the model and print the topics
# NOTE(review): `models2`, `corpus2` and `docs2` are not defined anywhere in
# the visible notebook (only `models2a`, `corpus` and `docs` are), and no
# 80-topic model is trained in the visible cells -- this cell presumably
# relies on state from an earlier version of the notebook; confirm.
optimal_model80 = models2[3]
model_topics80 = optimal_model80.show_topics(formatted=False)
#pprint(optimal_model.print_topics(num_words=7))
# Per-topic c_v coherence for the selected model.
top_topics80 = optimal_model80.top_topics(corpus2, texts=docs2, coherence='c_v') 

# Second element of each entry is that topic's coherence score.
a80 = [t[1] for t in top_topics80]
pprint(top_topics80)
print(a80)
# Probability of the most likely (o80) and second most likely (o280) topic
# of every document; the transformed corpus is rebuilt for each index.
o80 = [sorted(optimal_model80[corpus2][i][0],key=lambda x: (x[1]), reverse=True)[0][1]
    for i in range(len(corpus2)) ]
o280 = [sorted(optimal_model80[corpus2][i][0],key=lambda x: (x[1]), reverse=True)[1][1]
    for i in range(len(corpus2)) ]

# Averages over all documents.
print(sum(o80)/len(o80))
print(sum(o280)/len(o280))

## Visualizing the topics and tokens
This is super-cool but I haven't compared models or anything

In [None]:
import pyLDAvis.gensim
import pickle 
import pyLDAvis
# Visualize the topics
pyLDAvis.enable_notebook()
# Use the corpus and dictionary the models in models2a were trained with;
# the previous `corpus2` / `dict2` are not defined anywhere in this notebook.
LDAvis_prepared = pyLDAvis.gensim.prepare(models2a[2], corpus, dictionary)
LDAvis_prepared