In [4]:
import pandas as pd
raw_corpus = pd.read_csv("~/Downlods/speech_w_data.csv", encoding = 'latin1')
headers = list(raw_corpus)

In [261]:
records = raw_corpus.loc[raw_corpus['congress_id'] >= 100, ['speech_id', 'speech']]

In [327]:
import nltk
import re
from nltk import word_tokenize          
from nltk.stem import SnowballStemmer 
class LemmaTokenizer(object):
     def __init__(self):
         self.wnl = SnowballStemmer(language = 'english')
     def __call__(self, doc):
        doc = re.sub(r'[^A-Za-z0-9\s]',r' ',doc)
        doc = re.sub(r'\n',r' ',doc)
        doc = re.sub(r'[0-9]',r' ',doc)
        #doc = re.sub(r'[a-z]\040' ,r'',doc) #remove singletons
        return [self.wnl.stem(t) for t in word_tokenize(doc)]
        

In [328]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

no_features = 1500 #words

tfidf_vectorizer = TfidfVectorizer(tokenizer=LemmaTokenizer(), max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(records['speech'])
tfidf_feature_names = tfidf_vectorizer.get_feature_names()

tf_vectorizer = CountVectorizer(tokenizer=LemmaTokenizer(), max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
tf = tf_vectorizer.fit_transform(records['speech'])
tf_feature_names = tf_vectorizer.get_feature_names()

In [None]:
word_list = [tfidf_feature_names[np.asarray(tfidf.sum(0).argsort())[0][-i]] for i in range(no_features)]
print(pd.DataFrame(word_list))
pd.DataFrame(word_list).to_csv('../Data/list_of_{}words.csv'.format(no_features))

In [316]:
def display_topics(df, num_of_top_words = 10):
    for t in df.columns[1:] :
        words = df.sort_values(by = t, ascending = False)['word']
        print('topic {}: '.format(t) +", ".format(t).join([w for w in words[:num_of_top_words]] ))


In [329]:
from sklearn.decomposition import NMF, LatentDirichletAllocation
no_topics = 75

# Run NMF
nmf = NMF(n_components=no_topics, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd').fit(tfidf)

# Run LDA
lda = LatentDirichletAllocation(n_components=no_topics, max_iter=5, learning_method='online', learning_offset=50.,random_state=0).fit(tf)


In [330]:
dic = {'word' : tfidf_feature_names}
for t,l in enumerate(nmf.components_) :
    dic.update({ 'T{}'.format(t) : l})

word_75topics_NMF =pd.DataFrame(dic)
word_75topics_NMF.to_csv('../Data/word_75topics_NMF.csv')

dic = {'word' : tf_feature_names}
for t,l in enumerate(lda.components_) :
    dic.update({ 'T{}'.format(t) : l})

word_75topics_LDA = pd.DataFrame(dic)
word_75topics_LDA.to_csv('../Data/word_75topics_LDA.csv')

In [336]:
print("nmf:")
display_topics(word_75topics_NMF, num_of_top_words = 10)


nmf:
topic T0: peopl, think, say, dont, want, know, thing, becaus, talk, just
topic T1: iran, nuclear, agreement, deal, sanction, iranian, weapon, negoti, israel, regim
topic T2: court, suprem, constitut, justic, scalia, decis, case, elect, law, v
topic T3: famili, communiti, life, love, live, year, citi, young, day, honor
topic T4: republican, democrat, leader, parti, hous, major, polit, shutdown, said, govern
topic T5: veteran, va, affair, care, servic, medic, serv, homeless, employe, facil
topic T6: school, educ, student, teacher, high, parent, state, kid, achiev, learn
topic T7: secur, homeland, depart, border, dhs, threat, nation, social, protect, secretari
topic T8: health, care, afford, medicar, patient, medicaid, access, medic, coverag, act
topic T9: terrorist, attack, terror, intellig, threat, radic, fbi, islam, american, foreign
topic T10: water, clean, flint, drink, california, lake, river, infrastructur, corp, michigan
topic T11: committe, legisl, chairman, work, thank, mem

In [343]:
from sklearn.decomposition import NMF, LatentDirichletAllocation
num_of_topics = 25

# Run NMF
nmf = NMF(n_components=num_of_topics, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd').fit(tfidf)

# Run LDA
lda = LatentDirichletAllocation(n_components=num_of_topics, max_iter=5, learning_method='online', learning_offset=50.,random_state=0).fit(tf)

dic = {'word' : tfidf_feature_names}
for t,l in enumerate(nmf.components_) :
    dic.update({ 'T{}'.format(t) : l})

word_topics_NMF =pd.DataFrame(dic)
word_topics_NMF.to_csv('../Data/word_{}topics_NMF.csv'.format(num_of_topics))

dic = {'word' : tf_feature_names}
for t,l in enumerate(lda.components_) :
    dic.update({ 'T{}'.format(t) : l})

word_topics_LDA = pd.DataFrame(dic)
word_topics_LDA.to_csv('../Data/word_{}topics_LDA.csv'.format(num_of_topics))

print("nmf:")
display_topics(word_topics_NMF, num_of_top_words = 5)

print("LDA")
display_topics(word_topics_LDA, num_of_top_words = 5)

nmf:
topic T0: peopl, say, want, think, dont
topic T1: iran, nuclear, agreement, deal, sanction
topic T2: court, suprem, nomine, judg, justic
topic T3: serv, communiti, offic, honor, servic
topic T4: budget, spend, debt, cut, republican
topic T5: veteran, va, care, affair, servic
topic T6: student, school, educ, colleg, teacher
topic T7: secur, homeland, immigr, depart, border
topic T8: health, care, insur, plan, women
topic T9: amend, vote, chairman, offer, chair
topic T10: water, epa, clean, drink, flint
topic T11: senat, vote, republican, democrat, committe
topic T12: energi, coal, electr, effici, technolog
topic T13: opioid, drug, addict, heroin, prescript
topic T14: gun, violenc, background, check, terrorist
topic T15: trade, agreement, worker, tpa, negoti
topic T16: fund, program, transport, highway, infrastructur
topic T17: traffick, victim, human, sex, children
topic T18: puerto, rico, zika, virus, debt
topic T19: isi, terrorist, militari, defens, attack
topic T20: rule, h, r, 

In [333]:
def display_topics(df, num_of_top_words = 10):
    for t in df.columns[1:] :
        words = df.sort_values(by = t, ascending = False)['word']
        print('topic {}: '.format(t) +", ".format(t).join([w for w in words[:num_of_top_words]] ))


In [334]:
print("nmf:")
display_topics(word_topics_NMF, num_of_top_words = 10)

print("LDA")
display_topics(word_topics_LDA, num_of_top_words = 10)

nmf:
topic T0: peopl, say, want, think, dont, know, becaus, thing, just, talk
topic T1: iran, nuclear, agreement, deal, sanction, weapon, iranian, negoti, israel, regim
topic T2: court, suprem, nomine, judg, justic, constitut, nomin, presid, confirm, vacanc
topic T3: serv, communiti, offic, honor, servic, famili, life, year, polic, nation
topic T4: budget, spend, debt, cut, republican, defens, trillion, billion, year, fiscal
topic T5: veteran, va, care, affair, servic, militari, mental, serv, medic, homeless
topic T6: student, school, educ, colleg, teacher, children, child, program, loan, parent
topic T7: secur, homeland, immigr, depart, border, enforc, fund, dhs, cyber, law
topic T8: health, care, insur, plan, women, parenthood, obamacar, afford, abort, coverag
topic T9: amend, vote, chairman, offer, chair, debat, order, process, right, balanc
topic T10: water, epa, clean, drink, flint, california, land, state, rule, river
topic T11: senat, vote, republican, democrat, committe, work, 