### Sources: 

- Tutorial: https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/
- Another tutorial: https://markroxor.github.io/gensim/static/notebooks/lda_training_tips.html
    - This tutorial has a few useful links inside
- LDA model reference: https://radimrehurek.com/gensim/models/ldamulticore.html
- LDA coherence model reference: https://radimrehurek.com/gensim/models/coherencemodel.html
- Coherence metrics research paper: http://svn.aksw.org/papers/2015/WSDM_Topic_Evaluation/public.pdf

In [34]:
# increase cell width
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [1]:
import sys
import re
import os
import gc
import time
import pickle
import numpy as np
import pandas as pd
from pprint import pprint
from importlib import reload

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
#import nltk
# nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

In [3]:
# Install gensim if necessary
#!pip install --upgrade gensim 
#!pip install google-compute-engine

In [4]:
import gensim
from gensim.utils import simple_preprocess
from gensim.models.phrases import Phrases, Phraser
from gensim.corpora import Dictionary
from gensim.models import LdaMulticore
from gensim.models import CoherenceModel

In [5]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

In [6]:
from helpers import load_data
from helpers import explore_data
from helpers import preprocess_data
from helpers import train_model

  from ._conv import register_converters as _register_converters


## Load the data

In [7]:
pwd

'/home/fbm221/W266FinalProject/Code'

In [8]:
main_dir = '/home/fbm221/W266FinalProject'

In [9]:
indata_path = main_dir + '/Data/Ethnicity/'

In [10]:
outdata_path = main_dir + '/saved_files/LDA/Ethnicity/'

In [11]:
with open(os.path.join(indata_path, 'val_list'), 'rb') as fp:
    main_data = pickle.load(fp)
with open(os.path.join(indata_path, 'val_ids'), 'rb') as fp:
    main_ids = pickle.load(fp)
with open(os.path.join(indata_path, 'val_target'), 'rb') as fp:
    main_target = pickle.load(fp)

In [12]:
len(main_data)

62840

## Preprocess speech

In [11]:
stop_words = stopwords.words('english')
stop_words.extend(['mr', 'senator', 'united', 'states', 'president', 'would', 'speaker', 'senate'])

In [12]:
def lemmatize_speech(speech):
    processed_speech = []
    for word in simple_preprocess(speech) :
        if word not in stop_words:
            processed_speech.append(WordNetLemmatizer().lemmatize(word, pos='v'))

    return processed_speech

In [13]:
def preprocess_speech(speeches):
    
    speeches_processed = [lemmatize_speech(speech) for speech in speeches]
    
    bigram = Phrases(sentences=speeches_processed, 
                     scoring='npmi',
                     min_count=30, 
                     threshold=0.5)
    
    trigram = Phrases(sentences=bigram[speeches_processed], 
                      scoring='npmi',
                      min_count=30, 
                      threshold=0.5)  
    
    bigram_mod = Phraser(bigram)
    trigram_mod = Phraser(trigram)
    
    return [trigram_mod[bigram_mod[speech]] for speech in speeches_processed]

In [22]:
# Only run this the first time. Otherwise import data_preprocessed below
start_time = time.time()
data_preprocessed = preprocess_speech(main_data)
print("\nIt took {:.1f} seconds to process the data".format(time.time()-start_time))


It took 429.0 seconds to process the data


In [11]:
# Only run this the first time. Otherwise import data_preprocessed below
# Save data_preprocessed

pickle.dump(data_preprocessed, open(os.path.join(indata_path, 'data_preprocessed'), 'wb'))

NameError: name 'data_preprocessed' is not defined

In [12]:
# import data_preprocessed
with open(os.path.join(indata_path, 'data_preprocessed'), 'rb') as fp:
    data_preprocessed = pickle.load(fp)

In [19]:
# Only run this the first time. Otherwise import word_index below
word_index = Dictionary(data_preprocessed)
word_index_check = Dictionary(data_preprocessed)
word_index.filter_extremes(no_below=10, no_above=0.3)
print("Number of tokens removed: {}. {} tokens will be used".format(len(word_index_check) - len(word_index), len(word_index)))

Number of tokens removed: 174881. 27061 tokens will be used


In [13]:
# Only run this the first time. Otherwise import word_index below
word_index.save_as_text(fname=os.path.join(indata_path, 'word_index'))

NameError: name 'word_index' is not defined

In [13]:
# Import word_index
word_index = Dictionary.load_from_text(fname=os.path.join(indata_path, 'word_index'))

In [16]:
most_frequent = sorted(word_index.dfs.items(), key=lambda x: -x[1])[:20]
print("Most frequent words:")
for i in most_frequent:
    print(word_index[i[0]], i[1])

Most frequent words:
many 18657
need 17682
today 17528
know 17369
us 17216
want 16753
provide 16296
like 16188
congress 15965
come 15792
state 15165
house 15157
chairman 15148
act 14824
first 14575
committee 14572
give 14452
think 14327
country 14093
well 13867


In [14]:
bow_corpus = [word_index.doc2bow(speech) for speech in data_preprocessed]

In [18]:
print("Individual speech check:")
check = sorted(bow_corpus[100], key=lambda x: -x[1])
for i in range(min(len(check),10)):
    print("Word {} (\"{}\") appears {} time(s).".format(check[i][0], 
                                                     word_index[check[i][0]], 
                                                     check[i][1]))

Individual speech check:
Word 553 ("chairman") appears 3 time(s).
Word 4207 ("gentleman_illinois") appears 2 time(s).
Word 86 ("personnel") appears 1 time(s).
Word 265 ("agencies") appears 1 time(s).
Word 380 ("give") appears 1 time(s).
Word 499 ("similar") appears 1 time(s).
Word 539 ("various") appears 1 time(s).
Word 554 ("distinguish") appears 1 time(s).
Word 560 ("yield_time_may_consume") appears 1 time(s).
Word 576 ("committee") appears 1 time(s).


## LDA model

In [16]:
def LDA_model(dictionary, corpus, speeches, 
                            topics_range = range(1,10,1),
                            chunksize=100,
                            passes=10,
                            iterations=10,
                            eval_every=None,
                            workers=8,
                            random_state=100,
                            per_word_topics=False,
                            coherence='c_v'):
    """
    Compute c_v coherence for various number of topics

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts
    limit : Max num of topics

    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    """
    model_list = []

    for num_topics in topics_range:
        model = LdaMulticore(corpus=corpus,
                             id2word=dictionary,
                             num_topics=num_topics, 
                             chunksize=chunksize,
                             passes=passes,
                             iterations=iterations,
                             eval_every=eval_every,
                             workers=workers,
                             random_state=random_state,
                             per_word_topics=per_word_topics)
        
        model_list.append(model)


    return model_list

In [17]:
def compute_coherence_values(model):
    """
    Compute c_v coherence for various number of topics

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts
    limit : Max num of topics

    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    """
    coherence_model = CoherenceModel(model=model, 
                                     texts=data_preprocessed, 
                                     dictionary=word_index, 
                                     coherence='c_v')

    cv = coherence_model.get_coherence()

    return cv

In [47]:
# Only run this the first time:
# # pre-process speech paramters:
#     bigram = Phrases(sentences=speeches_processed, 
#                      scoring='npmi',
#                      min_count=30, 
#                      threshold=0.5)
    
#     trigram = Phrases(sentences=bigram[speeches_processed], 
#                       scoring='npmi',
#                       min_count=30, 
#                       threshold=0.5) 

### Running LDA on lda_params1 ###
print("LDA Params 1")
lda_params1 = {
    'topics_range': range(1,41,5),
    'chunksize': 100,
    'passes': 5,
    'iterations': 5,
    'eval_every': None,
    'workers': 8,
    'random_state': 100,
    'per_word_topics': False,
    'coherence': 'c_v'
}
%time model_list1, coherence_values1 = LDA_model(dictionary=word_index, corpus=bow_corpus, speeches=data_preprocessed, **lda_params1)


### Running LDA on lda_params2 ###
print("LDA Params 2")
lda_params2 = {
    'topics_range': range(1,41,5),
    'chunksize': 100,
    'passes': 10,
    'iterations': 10,
    'eval_every': None,
    'workers': 8,
    'random_state': 100,
    'per_word_topics': False,
    'coherence': 'c_v'
}
%time model_list2, coherence_values2 = LDA_model(dictionary=word_index, corpus=bow_corpus, speeches=data_preprocessed, **lda_params2)

### Running LDA on lda_params3 ###
print("LDA Params 3")
lda_params3 = {
    'topics_range': range(1,41,5),
    'chunksize': 100,
    'passes': 15,
    'iterations': 15,
    'eval_every': None,
    'workers': 8,
    'random_state': 100,
    'per_word_topics': False,
    'coherence': 'c_v'
}
%time model_list3, coherence_values3 = LDA_model(dictionary=word_index, corpus=bow_corpus, speeches=data_preprocessed, **lda_params3)

### Running LDA on lda_params4 ###
print("LDA Params 4")
lda_params4 = {
    'topics_range': range(1,41,5),
    'chunksize': 100,
    'passes': 20,
    'iterations': 20,
    'eval_every': None,
    'workers': 8,
    'random_state': 100,
    'per_word_topics': False,
    'coherence': 'c_v'
}
%time model_list4, coherence_values4 = LDA_model(dictionary=word_index, corpus=bow_corpus, speeches=data_preprocessed, **lda_params4)


LDA Params 1
Coherence for 1 topics is 0.29
Coherence for 6 topics is 0.42
Coherence for 11 topics is 0.46
Coherence for 16 topics is 0.47
Coherence for 21 topics is 0.52
Coherence for 26 topics is 0.52
Coherence for 31 topics is 0.52
Coherence for 36 topics is 0.53
CPU times: user 52min 47s, sys: 2min 46s, total: 55min 34s
Wall time: 42min 25s
LDA Params 2
Coherence for 1 topics is 0.29
Coherence for 6 topics is 0.44
Coherence for 11 topics is 0.50
Coherence for 16 topics is 0.50
Coherence for 21 topics is 0.52
Coherence for 26 topics is 0.53
Coherence for 31 topics is 0.54
Coherence for 36 topics is 0.54
CPU times: user 1h 44min 25s, sys: 5min 16s, total: 1h 49min 41s
Wall time: 1h 7min 52s
LDA Params 3
Coherence for 1 topics is 0.29
Coherence for 6 topics is 0.45
Coherence for 11 topics is 0.51
Coherence for 16 topics is 0.52
Coherence for 21 topics is 0.52
Coherence for 26 topics is 0.53
Coherence for 31 topics is 0.54
Coherence for 36 topics is 0.55
CPU times: user 2h 36min 23s, s

In [24]:
cvs = [0.53, 0.54, 0.55, 0.56]

In [66]:
# save models

topics_range = [1, 6, 11, 16, 21, 26, 31, 36]

for model_list in [model_list1, model_list2, model_list3, model_list4]:
    for model in model_list:
        for num_topics in topics_range:
            if model_list == model_list1:
                model.save(outdata_path + 'lda_model_list1_' + str(num_topics))
            if model_list == model_list2:
                model.save(outdata_path + 'lda_model_list2_' + str(num_topics))
            if model_list == model_list3:
                model.save(outdata_path + 'lda_model_list3_' + str(num_topics))
            if model_list == model_list4:
                model.save(outdata_path + 'lda_model_list4_' + str(num_topics))

In [17]:
## Load back models and save models in dict
topics_range = ['1', '6', '11', '16', '21', '26', '31', '36']
model_lists = ['model_list1_', 'model_list2_', 'model_list3_', 'model_list4_']
lda = 'lda_'
models_dict = {}

for model_list in model_lists:
    for num_topics in topics_range:
        filename = model_list + num_topics
        model = LdaMulticore.load(os.path.join(outdata_path,lda+filename))
        models_dict[filename] = model

In [18]:
models_dict

{'model_list1_1': <gensim.models.ldamulticore.LdaMulticore at 0x7fb1bfa943c8>,
 'model_list1_6': <gensim.models.ldamulticore.LdaMulticore at 0x7fb1bf4e15f8>,
 'model_list1_11': <gensim.models.ldamulticore.LdaMulticore at 0x7fb1bf11f6a0>,
 'model_list1_16': <gensim.models.ldamulticore.LdaMulticore at 0x7fb1bed5b748>,
 'model_list1_21': <gensim.models.ldamulticore.LdaMulticore at 0x7fb1be9997f0>,
 'model_list1_26': <gensim.models.ldamulticore.LdaMulticore at 0x7fb1be5d5898>,
 'model_list1_31': <gensim.models.ldamulticore.LdaMulticore at 0x7fb1be211940>,
 'model_list1_36': <gensim.models.ldamulticore.LdaMulticore at 0x7fb1bde4e9e8>,
 'model_list2_1': <gensim.models.ldamulticore.LdaMulticore at 0x7fb1bda8bac8>,
 'model_list2_6': <gensim.models.ldamulticore.LdaMulticore at 0x7fb1bd6c7ba8>,
 'model_list2_11': <gensim.models.ldamulticore.LdaMulticore at 0x7fb1bd305c88>,
 'model_list2_16': <gensim.models.ldamulticore.LdaMulticore at 0x7fb1bcf40d68>,
 'model_list2_21': <gensim.models.ldamultico

### Pick the model based on the highest coherence value

In [25]:
num_words = 10
num_topics = 36

In [22]:
## Only run this the first time ##
# number of words per topic to display (can be any number within vocabulary size)


# Model 1
top_topics1 = models_dict['model_list1_36'].top_topics(corpus=bow_corpus, 
                                      texts=data_preprocessed,
                                      coherence='c_v', 
                                      topn=num_words)

# Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
avg_topic_coherence1 = sum([t[1] for t in top_topics1]) / num_topics

# Model 2
top_topics2 = models_dict['model_list2_36'].top_topics(corpus=bow_corpus, 
                                      texts=data_preprocessed,
                                      coherence='c_v', 
                                      topn=num_words)

# Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
avg_topic_coherence2 = sum([t[1] for t in top_topics2]) / num_topics

# Model 3
top_topics3 = models_dict['model_list3_36'].top_topics(corpus=bow_corpus, 
                                      texts=data_preprocessed,
                                      coherence='c_v', 
                                      topn=num_words)

# Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
avg_topic_coherence3 = sum([t[1] for t in top_topics3]) / num_topics

# Model 4
top_topics4 = models_dict['model_list4_36'].top_topics(corpus=bow_corpus, 
                                      texts=data_preprocessed,
                                      coherence='c_v', 
                                      topn=num_words)

# Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
avg_topic_coherence4 = sum([t[1] for t in top_topics4]) / num_topics


In [25]:
## Save top_topics and avg_topic_coherence ##

with open(os.path.join(outdata_path, "top_topics.txt"), "w") as output:
    output.write(str(top_topics))

with open(os.path.join(outdata_path, "avg_topic_coherence.txt"), "w") as output:
    output.write(str(avg_topic_coherence))

In [15]:
## Load top_topics and avg_topic coherence ##

with open(os.path.join(outdata_path, "top_topics.txt"), "r") as output:
    top_topics = output.read()

with open(os.path.join(outdata_path, "avg_topic_coherence.txt"), "r") as output:
    avg_topic_coherence = output.read()

In [19]:
avg_topic_coherence

'[0.5714211825039608, 0.5845008274680911, 0.5863295522987295, 0.5921006908178101]'

In [20]:
# Lists of data
models = ['Model 1', 'Model 2', 'Model 3', 'Model 4']
#top_topics = [top_topics1, top_topics2, top_topics3, top_topics4]
avg_topic_coherence = [0.571, 0.585, 0.586, 0.592]
passes = [5, 10, 15, 20]
iterations = [5, 10, 15, 20]

In [25]:
model_comparisons = pd.DataFrame(
    {'Model Number': models,
     'Coherence Values': cvs,
     'Average Topic Coherence': avg_topic_coherence,
     'Number of Topics': num_topics,
     'Passes': passes,
     'Iterations': iterations
     #'Top Topics': top_topics top 10 words/score
    })

In [26]:
model_comparisons

Unnamed: 0,Model Number,Coherence Values,Average Topic Coherence,Number of Topics,Passes,Iterations
0,Model 1,0.53,0.571,36,5,5
1,Model 2,0.54,0.585,36,10,10
2,Model 3,0.55,0.586,36,15,15
3,Model 4,0.56,0.592,36,20,20


### We see that Model 4 is performing the best in terms of coherence, with 20 passes and 20 iterations. We agree that 36 topics is a reasonable number of topics to have across nearly 40 years.

In [19]:
optimal_model = models_dict['model_list4_36']
num_topics = 36

In [20]:
def get_topics_keywords_dict(ldamodel, num_topics):
    topic_dict = {}
    for i in range(num_topics):
        topic_dict[i] = [word[0] for word in ldamodel.show_topic(i)]
    return topic_dict
keywords_dict = get_topics_keywords_dict(optimal_model, num_topics)

In [21]:
keywords_dict_names = keywords_dict.copy()

In [22]:
keywords_dict_names["Environment"] = keywords_dict_names.pop(0)
keywords_dict_names["Energy"] = keywords_dict_names.pop(1)
keywords_dict_names["Air Travel"] = keywords_dict_names.pop(2)
keywords_dict_names["Random Verbs"] = keywords_dict_names.pop(3)
keywords_dict_names["Great Nation"] = keywords_dict_names.pop(4)
keywords_dict_names["Scientific Research"] = keywords_dict_names.pop(5)
keywords_dict_names["State and Local"] = keywords_dict_names.pop(6)
keywords_dict_names["Foreign Policy"] = keywords_dict_names.pop(7)
keywords_dict_names["Healthcare"] = keywords_dict_names.pop(8)
keywords_dict_names["Judicial System"] = keywords_dict_names.pop(9)
keywords_dict_names["Common Congressional Phrases"] = keywords_dict_names.pop(10)
keywords_dict_names["Economy"] = keywords_dict_names.pop(11)
keywords_dict_names["Programs and Budget"] = keywords_dict_names.pop(12)
keywords_dict_names["Veterans"] = keywords_dict_names.pop(13)
keywords_dict_names["Elections and Parties"] = keywords_dict_names.pop(14)
keywords_dict_names["Education"] = keywords_dict_names.pop(15)
keywords_dict_names["Freedom"] = keywords_dict_names.pop(16)
keywords_dict_names["Children's and Family Programs"] = keywords_dict_names.pop(17)
keywords_dict_names["The Navy"] = keywords_dict_names.pop(18)
keywords_dict_names["Positive Words"] = keywords_dict_names.pop(19)
keywords_dict_names["Military Service"] = keywords_dict_names.pop(20)
keywords_dict_names["Foreign Trade"] = keywords_dict_names.pop(21)
keywords_dict_names["Random Verbs 2"] = keywords_dict_names.pop(22)
keywords_dict_names["Common Congressional Phrases 2"] = keywords_dict_names.pop(23)
keywords_dict_names["Common Congressional Phrases 3"] = keywords_dict_names.pop(24)
keywords_dict_names["Bankruptcy and Liability"] = keywords_dict_names.pop(25)
keywords_dict_names["Drug and Violent Crime"] = keywords_dict_names.pop(26)
keywords_dict_names["War and Defense"] = keywords_dict_names.pop(27)
keywords_dict_names["Art and Culture"] = keywords_dict_names.pop(28)
keywords_dict_names["Finance"] = keywords_dict_names.pop(29)
keywords_dict_names["Intelligence and Security"] = keywords_dict_names.pop(30)
keywords_dict_names["Taxes and Budget"] = keywords_dict_names.pop(31)
keywords_dict_names["Women's and Labor Rights"] = keywords_dict_names.pop(32)
keywords_dict_names["Common Congressional Phrases 4"] = keywords_dict_names.pop(33)
keywords_dict_names["Common Congressional Phrases 5"] = keywords_dict_names.pop(34)
keywords_dict_names["Healthcare Studies"] = keywords_dict_names.pop(35)

In [23]:
## Mapping from keywords_dict to keywords_dict_names:

mapping = {0: "Environment", 1: "Energy", 2: "Air Travel", 3: "Random Verbs",
          4: "Great Nation", 5: "Scientific Research", 6: "State and Local", 7: "Foreign Policy",
          8: "Healthcare", 9: "Judicial System", 10: "Common Congressional Phrases", 11: "Economy",
          12: "Programs and Budget", 13: "Veterans", 14: "Elections and Parties", 15: "Education", 16: "Freedom",
          17: "Children's and Family Programs", 18: "The Navy", 19: "Positive Words", 
          20: "Military Service", 21: "Foreign Trade", 22: "Random Verbs 2", 23: "Common Congressional Phrases 2",
          24: "Common Congressional Phrases 3", 25: "Bankruptcy and Liability", 26: "Drug and Violent Crime", 27: "War and Defense",
          28: "Art and Culture", 29: "Finance", 30: "Intelligence and Security", 31: "Taxes and Budget", 32: "Women's and Labor Rights",
          33: "Common Congressional Phrases 4", 34: "Common Congressional Phrases 5", 35: "Healthcare Studies"}


In [24]:
keywords_dict

{0: ['water',
  'land',
  'forest',
  'environmental',
  'act',
  'state',
  'national',
  'project',
  'legislation',
  'protect'],
 1: ['energy',
  'oil',
  'price',
  'fuel',
  'use',
  'cost',
  'production',
  'power',
  'increase',
  'plant'],
 2: ['border',
  'air',
  'aircraft',
  'aviation',
  'fly',
  'airport',
  'travel',
  'haiti',
  'flight',
  'security'],
 3: ['life',
  'know',
  'many',
  'live',
  'family',
  'man',
  'us',
  'great',
  'come',
  'love'],
 4: ['history',
  'nation',
  'american',
  'americans',
  'day',
  'today',
  'black',
  'america',
  'great',
  'first'],
 5: ['research',
  'technology',
  'new',
  'science',
  'space',
  'develop',
  'national',
  'advance',
  'center',
  'engineer'],
 6: ['state',
  'district',
  'local',
  'city',
  'build',
  'county',
  'park',
  'center',
  'area',
  'residents'],
 7: ['world',
  'countries',
  'nuclear',
  'international',
  'china',
  'agreement',
  'nations',
  'treaty',
  'economic',
  'policy'],
 8: ['

In [25]:
keywords_dict_names

{'Environment': ['water',
  'land',
  'forest',
  'environmental',
  'act',
  'state',
  'national',
  'project',
  'legislation',
  'protect'],
 'Energy': ['energy',
  'oil',
  'price',
  'fuel',
  'use',
  'cost',
  'production',
  'power',
  'increase',
  'plant'],
 'Air Travel': ['border',
  'air',
  'aircraft',
  'aviation',
  'fly',
  'airport',
  'travel',
  'haiti',
  'flight',
  'security'],
 'Random Verbs': ['life',
  'know',
  'many',
  'live',
  'family',
  'man',
  'us',
  'great',
  'come',
  'love'],
 'Great Nation': ['history',
  'nation',
  'american',
  'americans',
  'day',
  'today',
  'black',
  'america',
  'great',
  'first'],
 'Scientific Research': ['research',
  'technology',
  'new',
  'science',
  'space',
  'develop',
  'national',
  'advance',
  'center',
  'engineer'],
 'State and Local': ['state',
  'district',
  'local',
  'city',
  'build',
  'county',
  'park',
  'center',
  'area',
  'residents'],
 'Foreign Policy': ['world',
  'countries',
  'nuclea

In [42]:
import matplotlib.pyplot as plt
import pandas as pd
from pandas.plotting import table
import subprocess

df=pd.DataFrame.from_dict(keywords_dict_names ,orient='index')


df.to_html('table.html')
#df

127

In [35]:
f = open(os.path.join(outdata_path, "keywords_dict_ethnicity.txt"),"w")
f.write( str(keywords_dict) )
f.close()

In [36]:
f = open(os.path.join(outdata_path, "keywords_dict_ethnicity_names.txt"),"w")
f.write( str(keywords_dict_names) )
f.close()

### Append primary and secondary topics to the speech file

In [34]:
def append_topic(ldamodel, corpus, speeches, ids, kw_dict, mapping):
    # Init output
    speech_topics_df = pd.DataFrame()
    
    # Get main topic in each document
    for i, row in enumerate(ldamodel[corpus]):
        sorted_topics = sorted(row, key=lambda x: -x[1])
        topic_count = len(sorted_topics)
        
        topic1_num, topic1_contrib = sorted_topics[0]
        topic1_name = mapping[topic1_num]
        topic1_keywords = ','.join(kw_dict[topic1_num])
        
        if topic_count > 1:
            topic2_num, topic2_contrib = sorted_topics[1]
            topic2_name = mapping[topic2_num]
        else:
            topic2_num, topic2_contrib = -1, 0
            topic2_name = "None"
            
        
        new_row = [topic_count, int(topic1_num), topic1_name, round(topic1_contrib,2), topic1_keywords, int(topic2_num), topic2_name, round(topic2_contrib,2)]
        speech_topics_df = speech_topics_df.append(pd.Series(new_row), ignore_index=True)

    speech_topics_df = pd.concat([speech_topics_df, pd.Series(ids)], axis=1)
    speech_topics_df.columns = ['Topic_Count', 'Prim_Topic', 'Prim_Topic_Name', 'Prim_Topic_Contrib', 'Prim_Topic_Keywords', 
                                'Sec_Topic', 'Sec_Topic_Name', 'Sec_Topic_Contrib', 'Speech_id']
    
    return speech_topics_df


all_speeches_topics_df = append_topic(ldamodel=optimal_model, 
                                      corpus=bow_corpus, 
                                      speeches=data_preprocessed, 
                                      ids=main_ids,
                                      kw_dict=keywords_dict,
                                      mapping = mapping)
all_speeches_topics_df.shape

(62840, 9)

In [36]:
all_speeches_topics_df.head()

Unnamed: 0,Topic_Count,Prim_Topic,Prim_Topic_Name,Prim_Topic_Contrib,Prim_Topic_Keywords,Sec_Topic,Sec_Topic_Name,Sec_Topic_Contrib,Speech_id
0,8.0,22.0,Random Verbs 2,0.35,"get,think,want,come,know,talk,us,way,see,try",32.0,Women's and Labor Rights,0.21,1010243717
1,10.0,19.0,Positive Words,0.2,"community,serve,service,honor,award,dr,univers...",20.0,Military Service,0.14,1120097305
2,4.0,23.0,Common Congressional Phrases 2,0.37,"amendment,vote,amendments,senators,ask_unanimo...",29.0,Finance,0.21,1020183502
3,10.0,17.0,Children's and Family Programs,0.27,"children,program,families,provide,help,child,n...",19.0,Positive Words,0.27,1080187463
4,11.0,13.0,Veterans,0.31,"veterans,service,federal,legislation,benefit,t...",21.0,Foreign Trade,0.18,1090010764


In [37]:
all_speeches_topics_df.to_pickle(outdata_path + 'speeches_topics_ethnicity_' + str(num_topics))
# all_speeches_topics_df = pd.read_pickle(outdata_path+'speeches_topics_'+str(num_topics))

### Pick one speech per document with the highest score

In [38]:
# Selecting n top speeches per topic
top_speeches = 1
top_speeches_df = pd.DataFrame()

all_speeches_topics_df_grpd = all_speeches_topics_df.groupby('Prim_Topic')

for i, grp in all_speeches_topics_df_grpd:
    top_speeches_df = pd.concat([top_speeches_df, grp.sort_values(by='Prim_Topic_Contrib', ascending=False).head(top_speeches)], axis=0)

top_speeches_df.reset_index(drop=True, inplace=True)

In [39]:
all_speeches_topics_df.to_pickle(outdata_path + 'topics_summary_' + str(num_topics))
# topics_df = pd.read_pickle(outdata_path+'topics_summary_'+str(num_topics))

In [40]:
for i in range(top_speeches_df.shape[0]):
    print("Topic number: ", top_speeches_df.Prim_Topic[i])
    print("Topic contribution: {:.2f}".format(top_speeches_df.Prim_Topic_Contrib[i]))
    print("Keywords: \n", top_speeches_df.Prim_Topic_Keywords[i])
    print("Speech: \n", main_data[main_ids.index(top_speeches_df.Speech_id[i])])
    print("-"*50)

Topic number:  0.0
Topic contribution: 0.90
Keywords: 
 water,land,forest,environmental,act,state,national,project,legislation,protect
Speech: 
 Mr. Speaker. I move to suspend the rules and pass the bill to direct the Secretary of the Interior to continue stocking fish in certain lakes in the North Cascades National Park. Ross Lake National Recreation Area. and Lake Chelan National Recreation Area.
--------------------------------------------------
Topic number:  1.0
Topic contribution: 0.85
Keywords: 
 energy,oil,price,fuel,use,cost,production,power,increase,plant
Speech: 
 No. It so happens that the ozone model is one that is attractive. but the ozone model is not a five exceedences per year model. An ozone model is one that allows one exceedence per year. 1 O 1750
--------------------------------------------------
Topic number:  2.0
Topic contribution: 0.52
Keywords: 
 border,air,aircraft,aviation,fly,airport,travel,haiti,flight,security
Speech: 
 The slot conversion provision ensur

### Primary and secondary topic distribution in the speech file

In [41]:
topics_df = pd.DataFrame(index=range(num_topics))
# Number of Documents for Each Topic
prim_topic_counts = all_speeches_topics_df['Prim_Topic'].value_counts().sort_index()
sec_topic_counts = all_speeches_topics_df['Sec_Topic'].value_counts().sort_index()
# Percentage of Documents for Each Topic
prim_topic_share = round(prim_topic_counts/len(main_data), 2)
prim_topic_share.name = 'Prim_Topic_Contr'
sec_topic_share = round(sec_topic_counts/len(main_data), 2)
sec_topic_share.name = 'Sec_Topic_Contr'

In [42]:
topics_df_joined= topics_df.join(prim_topic_counts)\
                    .join(prim_topic_share)\
                    .join(sec_topic_counts)\
                    .join(sec_topic_share)\
                    .join(pd.DataFrame.from_dict(keywords_dict, orient='index'))
topics_df_joined.reset_index(inplace=True)
topics_df_joined.columns = ['Topic_Num', "Prim_Cnt", "Prim_Share", "Sec_Cnt", "Sec_Share"] + ['kw'+str(i) for i in range(num_words)]
topics_df_joined.Prim_Cnt.sum(), topics_df_joined.Prim_Share.sum(), topics_df_joined.Sec_Cnt.sum(), topics_df_joined.Sec_Share.sum()

(62840, 1.01, 62104, 0.9400000000000001)

In [43]:
topics_df_joined.sort_values(by='Prim_Cnt', ascending=False)

Unnamed: 0,Topic_Num,Prim_Cnt,Prim_Share,Sec_Cnt,Sec_Share,kw0,kw1,kw2,kw3,kw4,kw5,kw6,kw7,kw8,kw9
22,22,8004,0.13,6523,0.1,get,think,want,come,know,talk,us,way,see,try
23,23,6832,0.11,2731,0.04,amendment,vote,amendments,senators,ask_unanimous_consent,committee,order,rule,resolution,debate
10,10,6335,0.1,5170,0.08,chairman,gentleman,amendment,committee,thank,want,think,issue,like,distinguish
19,19,5194,0.08,2293,0.04,community,serve,service,honor,award,dr,university,state,member,recognize
24,24,3984,0.06,4598,0.07,amendment,provision,act,legislation,law,require,congress,section,state,change
31,31,2579,0.04,2171,0.03,tax,budget,billion,percent,spend,pay,cut,increase,year,debt
14,14,2538,0.04,3611,0.06,vote,congress,house,pass,republican,us,american_people,rule,debate,members
12,12,2342,0.04,2583,0.04,program,fund,million,provide,budget,billion,need,appropriations,house,fiscal_year
3,3,2248,0.04,3062,0.05,life,know,many,live,family,man,us,great,come,love
4,4,1722,0.03,2092,0.03,history,nation,american,americans,day,today,black,america,great,first
