### Sources: 

- Tutorial: https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/
- Another tutorial: https://markroxor.github.io/gensim/static/notebooks/lda_training_tips.html
    - This tutorial has a few useful links inside
- LDA model reference: https://radimrehurek.com/gensim/models/ldamulticore.html
- LDA coherence model reference: https://radimrehurek.com/gensim/models/coherencemodel.html
- Coherence metrics research paper: http://svn.aksw.org/papers/2015/WSDM_Topic_Evaluation/public.pdf

In [1]:
# increase cell width
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
import re
import os
import gc
import time
import pickle
import numpy as np
import pandas as pd
from pprint import pprint
from importlib import reload

import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
#import nltk
# nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

In [4]:
# Install gensim if necessary
#!pip install --upgrade gensim 
#!pip install google-compute-engine

In [5]:
import gensim
from gensim.utils import simple_preprocess
from gensim.models.phrases import Phrases, Phraser
from gensim.corpora import Dictionary
from gensim.models import LdaMulticore
from gensim.models import CoherenceModel

In [6]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

In [7]:
from helpers import load_data
from helpers import explore_data
from helpers import preprocess_data
from helpers import train_model

  from ._conv import register_converters as _register_converters


## Load the data

In [8]:
pwd

'/home/fbm221/W266FinalProject/Code'

In [10]:
main_dir = '/home/fbm221/W266FinalProject'

In [27]:
indata_path = main_dir + '/Data/Party/'

In [28]:
outdata_path = main_dir + '/saved_files/LDA/Party/'

In [14]:
with open(os.path.join(indata_path, 'val_list'), 'rb') as fp:
    main_data = pickle.load(fp)
with open(os.path.join(indata_path, 'val_ids'), 'rb') as fp:
    main_ids = pickle.load(fp)
with open(os.path.join(indata_path, 'val_target'), 'rb') as fp:
    main_target = pickle.load(fp)

In [15]:
len(main_data)

101153

## Preprocess speech

In [16]:
stop_words = stopwords.words('english')
stop_words.extend(['mr', 'senator', 'united', 'states', 'president', 'would', 'speaker', 'senate'])

In [17]:
def lemmatize_speech(speech):
    processed_speech = []
    for word in simple_preprocess(speech) :
        if word not in stop_words:
            processed_speech.append(WordNetLemmatizer().lemmatize(word, pos='v'))

    return processed_speech

In [18]:
def preprocess_speech(speeches):
    
    speeches_processed = [lemmatize_speech(speech) for speech in speeches]
    
    bigram = Phrases(sentences=speeches_processed, 
                     scoring='npmi',
                     min_count=30, 
                     threshold=0.5)
    
    trigram = Phrases(sentences=bigram[speeches_processed], 
                      scoring='npmi',
                      min_count=30, 
                      threshold=0.5)  
    
    bigram_mod = Phraser(bigram)
    trigram_mod = Phraser(trigram)
    
    return [trigram_mod[bigram_mod[speech]] for speech in speeches_processed]

In [19]:
# Only run this the first time. Otherwise import data_preprocessed below
start_time = time.time()
data_preprocessed = preprocess_speech(main_data)
print("\nIt took {:.1f} seconds to process the data".format(time.time()-start_time))


It took 2817.3 seconds to process the data


In [20]:
# Only run this the first time. Otherwise import data_preprocessed below
# Save data_preprocessed

pickle.dump(data_preprocessed, open(os.path.join(indata_path, 'data_preprocessed'), 'wb'))

In [12]:
# import data_preprocessed
with open(os.path.join(indata_path, 'data_preprocessed'), 'rb') as fp:
    data_preprocessed = pickle.load(fp)

In [21]:
# Only run this the first time. Otherwise import word_index below
word_index = Dictionary(data_preprocessed)
word_index_check = Dictionary(data_preprocessed)
word_index.filter_extremes(no_below=10, no_above=0.3)
print("Number of tokens removed: {}. {} tokens will be used".format(len(word_index_check) - len(word_index), len(word_index)))

Number of tokens removed: 244500. 30538 tokens will be used


In [23]:
# Only run this the first time. Otherwise import word_index below
word_index.save_as_text(fname=os.path.join(indata_path, 'word_index'))

In [13]:
# Import word_index
word_index = Dictionary.load_from_text(fname=os.path.join(indata_path, 'word_index'))

In [22]:
most_frequent = sorted(word_index.dfs.items(), key=lambda x: -x[1])[:20]
print("Most frequent words:")
for i in most_frequent:
    print(word_index[i[0]], i[1])

Most frequent words:
support 30191
also 30038
people 29525
many 28897
years 28771
know 28037
today 28016
need 27764
us 27277
want 27001
think 25832
like 25637
come 25597
congress 25234
provide 25135
committee 24651
state 24478
get 23534
first 23272
act 23144


In [23]:
bow_corpus = [word_index.doc2bow(speech) for speech in data_preprocessed]

In [24]:
print("Individual speech check:")
check = sorted(bow_corpus[100], key=lambda x: -x[1])
for i in range(min(len(check),10)):
    print("Word {} (\"{}\") appears {} time(s).".format(check[i][0], 
                                                     word_index[check[i][0]], 
                                                     check[i][1]))

Individual speech check:
Word 419 ("follow") appears 2 time(s).
Word 753 ("military") appears 2 time(s).
Word 1146 ("fiscal_year") appears 2 time(s).
Word 1834 ("activities") appears 2 time(s).
Word 1 ("appropriations") appears 1 time(s).
Word 108 ("add") appears 1 time(s).
Word 147 ("defense") appears 1 time(s).
Word 193 ("order") appears 1 time(s).
Word 400 ("authorize") appears 1 time(s).
Word 501 ("propose") appears 1 time(s).


## LDA model

### Picking the number of topics

In [25]:
def compute_coherence_values(dictionary, corpus, speeches, 
                            topics_range = range(1,10,1),
                            chunksize=100,
                            passes=10,
                            iterations=10,
                            eval_every=None,
                            workers=8,
                            random_state=100,
                            per_word_topics=False,
                            coherence='c_v'):
    """
    Compute c_v coherence for various number of topics

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts
    limit : Max num of topics

    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    """
    model_list, coherence_values = [], []

    for num_topics in topics_range:
        model = LdaMulticore(corpus=corpus,
                             id2word=dictionary,
                             num_topics=num_topics, 
                             chunksize=chunksize,
                             passes=passes,
                             iterations=iterations,
                             eval_every=eval_every,
                             workers=workers,
                             random_state=random_state,
                             per_word_topics=per_word_topics)
        
        model_list.append(model)
        coherence_model = CoherenceModel(model=model, 
                                         texts=speeches, 
                                         dictionary=dictionary, 
                                         coherence=coherence)
        
        cv = coherence_model.get_coherence()
        coherence_values.append(cv)
        print("Coherence for {} topics is {:.2f}".format(num_topics, cv))

    return model_list, coherence_values

In [26]:
# # pre-process speech paramters:
#     bigram = Phrases(sentences=speeches_processed, 
#                      scoring='npmi',
#                      min_count=30, 
#                      threshold=0.5)
    
#     trigram = Phrases(sentences=bigram[speeches_processed], 
#                       scoring='npmi',
#                       min_count=30, 
#                       threshold=0.5) 

### Running LDA on lda_params1 ###
print("LDA Params 1")
lda_params1 = {
    'topics_range': range(1,41,5),
    'chunksize': 100,
    'passes': 5,
    'iterations': 5,
    'eval_every': None,
    'workers': 8,
    'random_state': 100,
    'per_word_topics': False,
    'coherence': 'c_v'
}
%time model_list1, coherence_values1 = compute_coherence_values(dictionary=word_index, corpus=bow_corpus, speeches=data_preprocessed, **lda_params1)


### Running LDA on lda_params2 ###
print("LDA Params 2")
lda_params2 = {
    'topics_range': range(1,41,5),
    'chunksize': 100,
    'passes': 10,
    'iterations': 10,
    'eval_every': None,
    'workers': 8,
    'random_state': 100,
    'per_word_topics': False,
    'coherence': 'c_v'
}
%time model_list2, coherence_values2 = compute_coherence_values(dictionary=word_index, corpus=bow_corpus, speeches=data_preprocessed, **lda_params2)

### Running LDA on lda_params3 ###
print("LDA Params 3")
lda_params3 = {
    'topics_range': range(1,41,5),
    'chunksize': 100,
    'passes': 15,
    'iterations': 15,
    'eval_every': None,
    'workers': 8,
    'random_state': 100,
    'per_word_topics': False,
    'coherence': 'c_v'
}
%time model_list3, coherence_values3 = compute_coherence_values(dictionary=word_index, corpus=bow_corpus, speeches=data_preprocessed, **lda_params3)

### Running LDA on lda_params4 ###
print("LDA Params 4")
lda_params4 = {
    'topics_range': range(1,41,5),
    'chunksize': 100,
    'passes': 20,
    'iterations': 20,
    'eval_every': None,
    'workers': 8,
    'random_state': 100,
    'per_word_topics': False,
    'coherence': 'c_v'
}
%time model_list4, coherence_values4 = compute_coherence_values(dictionary=word_index, corpus=bow_corpus, speeches=data_preprocessed, **lda_params4)


LDA Params 1
Coherence for 1 topics is 0.28
Coherence for 6 topics is 0.42
Coherence for 11 topics is 0.49
Coherence for 16 topics is 0.49
Coherence for 21 topics is 0.49
Coherence for 26 topics is 0.53
Coherence for 31 topics is 0.51
Coherence for 36 topics is 0.54
CPU times: user 1h 30min 17s, sys: 7min, total: 1h 37min 18s
Wall time: 3h 14min 17s
LDA Params 2
Coherence for 1 topics is 0.28
Coherence for 6 topics is 0.44
Coherence for 11 topics is 0.48
Coherence for 16 topics is 0.50
Coherence for 21 topics is 0.51
Coherence for 26 topics is 0.54
Coherence for 31 topics is 0.53
Coherence for 36 topics is 0.54
CPU times: user 3h 31min 16s, sys: 13min 32s, total: 3h 44min 48s
Wall time: 2h 14min 9s
LDA Params 3
Coherence for 1 topics is 0.28
Coherence for 6 topics is 0.43
Coherence for 11 topics is 0.48
Coherence for 16 topics is 0.53
Coherence for 21 topics is 0.50
Coherence for 26 topics is 0.52
Coherence for 31 topics is 0.53
Coherence for 36 topics is 0.54
CPU times: user 5h 15min 

In [29]:
topics_range = [1, 6, 11, 16, 21, 26, 31, 36]

for model_list in [model_list1, model_list2, model_list3, model_list4]:
    for model in model_list:
        for num_topics in topics_range:
            if model_list == model_list1:
                model.save(outdata_path + 'lda_model_list1_' + str(num_topics))
            if model_list == model_list2:
                model.save(outdata_path + 'lda_model_list2_' + str(num_topics))
            if model_list == model_list3:
                model.save(outdata_path + 'lda_model_list3_' + str(num_topics))
            if model_list == model_list4:
                model.save(outdata_path + 'lda_model_list4_' + str(num_topics))

### Pick the model based on the highest coherence value

In [45]:
# Find the number of topics with the highest coherence value in each model

# Model 1
num_topics1 = list(lda_params1['topics_range'])[np.argmax(coherence_values1)]
optimal_model1 = model_list1[np.argmax(coherence_values1)]
cv1 = [np.argmax(coherence_values1)]
passes1 = optimal_model1['passes']
iterations1 = optimal_model1['iterations']

# Model 2
num_topics2 = list(lda_params2['topics_range'])[np.argmax(coherence_values2)]
optimal_model2 = model_list2[np.argmax(coherence_values2)]
cv2 = [np.argmax(coherence_values2)]
passes2 = optimal_model2['passes']
iterations2 = optimal_model2['iterations']

# Model 3
num_topics3 = list(lda_params3['topics_range'])[np.argmax(coherence_values3)]
optimal_model3 = model_list3[np.argmax(coherence_values3)]
cv3 = [np.argmax(coherence_values3)]
passes3 = optimal_model3['passes']
iterations3 = optimal_model3['iterations']

# Model 4
num_topics4 = list(lda_params4['topics_range'])[np.argmax(coherence_values4)]
optimal_model4 = model_list4[np.argmax(coherence_values4)]
cv4 = [np.argmax(coherence_values4)]
passes4 = optimal_model4['passes']
iterations4 = optimal_model4['iterations']

# Lists of data
num_topics = [num_topics1, num_topics2, num_topics3, num_topics4]
optimal_models = [optimal_model1, optimal_model2, optimal_model3, optimal_model4]
cvs = [cv1, cv2, cv3, cv4]
passes = [passes1, passes2, passes3, passes4]
iterations = [iterations1, iterations2, iterations3, iterations4]

2

In [129]:
# number of words per topic to display (can be any number within vocabulary size)
num_words = 10

# Model 1
top_topics1 = optimal_model1.top_topics(corpus=bow_corpus, 
                                      texts=data_preprocessed,
                                      coherence=lda_params1['coherence'], 
                                      topn=num_words)

# Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
avg_topic_coherence1 = sum([t[1] for t in top_topics1]) / num_topics1

# Model 2
top_topics2 = optimal_model2.top_topics(corpus=bow_corpus, 
                                      texts=data_preprocessed,
                                      coherence=lda_params2['coherence'], 
                                      topn=num_words)

# Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
avg_topic_coherence2 = sum([t[1] for t in top_topics2]) / num_topics2

# Model 3
top_topics3 = optimal_model3.top_topics(corpus=bow_corpus, 
                                      texts=data_preprocessed,
                                      coherence=lda_params3['coherence'], 
                                      topn=num_words)

# Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
avg_topic_coherence3 = sum([t[1] for t in top_topics3]) / num_topics3

# Model 4
top_topics4 = optimal_model4.top_topics(corpus=bow_corpus, 
                                      texts=data_preprocessed,
                                      coherence=lda_params4['coherence'], 
                                      topn=num_words)

# Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
avg_topic_coherence4 = sum([t[1] for t in top_topics4]) / num_topics4

# Lists of data
models = [model1, model2, model3, model4]
top_topics = [top_topics1, top_topics2, top_topics3, top_topics4]
avg_topic_coherence = [avg_topic_coherence1, avg_topic_coherence2, avg_topic_coherence3, avg_topic_coherence4]


Average topic coherence: 0.5512.
[([(0.043567196, 'defense'),
   (0.025677465, 'military'),
   (0.024918953, 'arm'),
   (0.014024991, 'nuclear'),
   (0.012702031, 'force'),
   (0.010622187, 'security'),
   (0.008375348, 'aircraft'),
   (0.0071074055, 'weapons'),
   (0.0070584887, 'strategic'),
   (0.006676709, 'must')],
  0.7334587837592823),
 ([(0.035327453, 'budget'),
   (0.028354025, 'tax'),
   (0.019736348, 'billion'),
   (0.01870228, 'cut'),
   (0.01565298, 'percent'),
   (0.014587059, 'increase'),
   (0.014322598, 'spend'),
   (0.012709231, 'year'),
   (0.012372468, 'program'),
   (0.010558332, 'federal')],
  0.6686538163011398),
 ([(0.047873456, 'benefit'),
   (0.032219175, 'social_security'),
   (0.030723514, 'veterans'),
   (0.025986213, 'service'),
   (0.025632817, 'pay'),
   (0.017189408, 'elderly'),
   (0.016738096, 'employees'),
   (0.01629803, 'retire'),
   (0.016189791, 'receive'),
   (0.012604101, 'health')],
  0.6436194626358426),
 ([(0.027453482, 'go'),
   (0.02645531

In [None]:
model_comparisons = pd.DataFrame(
    {'Model Number': models,
     'Number of Topics': num_topics,
     'Coherence Values': cvs,
     'Average Topic Coherence': avg_topic_coherence,
     'Top Topics': top_topics
    })

In [127]:
def get_topics_keywords_dict(ldamodel, num_topics):
    topic_dict = {}
    for i in range(num_topics):
        topic_dict[i] = [word[0] for word in ldamodel.show_topic(i)]
    return topic_dict
keywords_dict = get_topics_keywords_dict(optimal_model, num_topics)

### Append primary and secondary topics to the speech file

In [136]:
def append_topic(ldamodel, corpus, speeches, ids, kw_dict):
    # Init output
    speech_topics_df = pd.DataFrame()
    
    # Get main topic in each document
    for i, row in enumerate(ldamodel[corpus]):
        sorted_topics = sorted(row, key=lambda x: -x[1])
        topic_count = len(sorted_topics)
        
        topic1_num, topic1_contrib = sorted_topics[0]
        topic1_keywords = ','.join(kw_dict[topic1_num])
        
        if topic_count > 1:
            topic2_num, topic2_contrib = sorted_topics[1]
        else:
            topic2_num, topic2_contrib = -1, 0
        
        new_row = [topic_count, int(topic1_num), round(topic1_contrib,2), topic1_keywords, int(topic2_num), round(topic2_contrib,2)]
        speech_topics_df = speech_topics_df.append(pd.Series(new_row), ignore_index=True)

    speech_topics_df = pd.concat([speech_topics_df, pd.Series(ids)], axis=1)
    speech_topics_df.columns = ['Topic_Count', 'Prim_Topic', 'Prim_Topic_Contrib', 'Prim_Topic_Keywords', 'Sec_Topic', 'Sec_Topic_Contrib', 'Speech_id']
    
    return speech_topics_df


all_speeches_topics_df = append_topic(ldamodel=optimal_model, 
                                      corpus=bow_corpus, 
                                      speeches=data_preprocessed, 
                                      ids=main_ids,
                                      kw_dict=keywords_dict)
all_speeches_topics_df.shape

(3244, 7)

In [138]:
all_speeches_topics_df.head()

Unnamed: 0,Topic_Count,Prim_Topic,Prim_Topic_Contrib,Prim_Topic_Keywords,Sec_Topic,Sec_Topic_Contrib,Speech_id
0,3.0,2.0,0.84,"go,say,think,amendment,get,want,gentleman,know...",9.0,0.13,970228387
1,6.0,11.0,0.51,"years,us,great,live,work,many,day,know,world,t...",7.0,0.3,970020112
2,2.0,2.0,0.65,"go,say,think,amendment,get,want,gentleman,know...",4.0,0.32,970222405
3,3.0,2.0,0.74,"go,say,think,amendment,get,want,gentleman,know...",6.0,0.18,970192743
4,5.0,10.0,0.34,"program,provide,federal,support,bill,need,fund...",8.0,0.32,970005171


### Pick one speech per document with the highest score

In [141]:
# Selecting n top speeches per topic
top_speeches = 1
top_speeches_df = pd.DataFrame()

all_speeches_topics_df_grpd = all_speeches_topics_df.groupby('Prim_Topic')

for i, grp in all_speeches_topics_df_grpd:
    top_speeches_df = pd.concat([top_speeches_df, grp.sort_values(by='Prim_Topic_Contrib', ascending=False).head(top_speeches)], axis=0)

top_speeches_df.reset_index(drop=True, inplace=True)

In [142]:
for i in range(top_speeches_df.shape[0]):
    print("Topic number: ", top_speeches_df.Prim_Topic[i])
    print("Topic contribution: {:.2f}".format(top_speeches_df.Prim_Topic_Contrib[i]))
    print("Keywords: \n", top_speeches_df.Prim_Topic_Keywords[i])
    print("Speech: \n", main_data[main_ids.index(top_speeches_df.Speech_id[i])])
    print("-"*50)

Topic number:  0.0
Topic contribution: 0.80
Keywords: 
 price,market,company,percent,industry,interest,american,cost,farm,trade
Speech: 
 Mr. Chairman. I C rise in opposition to the amendment. 0 Mr. Chairman. I support H.R. 5133. t the Fair Practices Automotive Products Act of 1982. Simply stated. what this bill is about Is helping the U.S. auto industry. For the past 4 years. the industry has been sinking deeper and deeper into depression. Total sales are less than half their 1978 level and almost 1 million workers have lost their jobs. including 280.000 auto workers and another 670.000 workers in auto supply industries. At the same time. sales of imported cars have been rising. Imports from Japan have increased by over 37 percent. and more than 1 of every 5 cars sold in the United States is Japanese built. Overall. imports make up 27 percent of the U.S. car market. We cant continue to allow imports to take a larger share of our market. with the result of lost American Jobs and an- ev

### Primary and secondary topic distribution in the speech file

In [145]:
topics_df = pd.DataFrame(index=range(num_topics))
# Number of Documents for Each Topic
prim_topic_counts = all_speeches_topics_df['Prim_Topic'].value_counts().sort_index()
sec_topic_counts = all_speeches_topics_df['Sec_Topic'].value_counts().sort_index()
# Percentage of Documents for Each Topic
prim_topic_share = round(prim_topic_counts/len(main_data), 2)
prim_topic_share.name = 'Prim_Topic_Contr'
sec_topic_share = round(sec_topic_counts/len(main_data), 2)
sec_topic_share.name = 'Sec_Topic_Contr'

In [146]:
topics_df= topics_df.join(prim_topic_counts)\
                    .join(prim_topic_share)\
                    .join(sec_topic_counts)\
                    .join(sec_topic_share)\
                    .join(pd.DataFrame.from_dict(keywords_dict, orient='index'))
topics_df.reset_index(inplace=True)
topics_df.columns = ['Topic_Num', "Prim_Cnt", "Prim_Share", "Sec_Cnt", "Sec_Share"] + ['kw'+str(i) for i in range(num_words)]
topics_df.Prim_Cnt.sum(), topics_df.Prim_Share.sum(), topics_df.Sec_Cnt.sum(), topics_df.Sec_Share.sum()

(3244, 1.0, 3193, 0.9900000000000001)

In [147]:
topics_df.sort_values(by='Prim_Cnt', ascending=False)

Unnamed: 0,Topic_Num,Prim_Cnt,Prim_Share,Sec_Cnt,Sec_Share,kw0,kw1,kw2,kw3,kw4,kw5,kw6,kw7,kw8,kw9
2,2,852,0.26,494,0.15,go,say,think,amendment,get,want,gentleman,know,us,come
9,9,570,0.18,477,0.15,committee,bill,amendment,house,act,provision,congress,legislation,vote,report
11,11,430,0.13,327,0.1,years,us,great,live,work,many,day,know,world,today
10,10,349,0.11,352,0.11,program,provide,federal,support,bill,need,fund,legislation,act,work
6,6,280,0.09,269,0.08,budget,tax,billion,cut,percent,increase,spend,year,program,federal
5,5,185,0.06,252,0.08,school,serve,service,national,community,public,education,state,article,years
7,7,176,0.05,332,0.1,international,government,world,nations,resolution,american,policy,support,must,concern
0,0,126,0.04,149,0.05,price,market,company,percent,industry,interest,american,cost,farm,trade
4,4,82,0.03,151,0.05,energy,project,cost,million,construction,job,need,study,percent,build
3,3,81,0.02,140,0.04,defense,military,arm,nuclear,force,security,aircraft,weapons,strategic,must
