In [19]:
import numpy as np
import pandas as pd
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer
from nltk import PorterStemmer
from nltk.stem.porter import *
import nltk
nltk.download('wordnet')

from gensim import corpora, models

[nltk_data] Downloading package wordnet to /Users/andrei/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [246]:
# loading in the data
# data = pd.read_csv('wiki_movie_plots_deduped.csv')
data = pd.read_csv('inaug_speeches.csv', encoding='windows-1252')
data = np.array(data)
# plots = list(data[:,7])
plots = list(data[:,4])

In [247]:
# returns an array with the lemmatized and stemmed data
def preprocess(text):
    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()
    
    wordsArr = []
    process1 = gensim.utils.simple_preprocess(text)
    for token in process1:
        if token not in gensim.parsing.preprocessing.STOPWORDS:
#             wordsArr.append(stemmer.stem(lemmatizer.lemmatize(token)))
#             wordsArr.append(lemmatizer.lemmatize(token))
            wordsArr.append(token)
    return wordsArr

In [248]:
clean_data = list(map(preprocess, plots))

In [249]:
# print(len(clean_data))
# print(plots[0])
# print(clean_data[0])

In [250]:
# create dict of the words
word_dict = gensim.corpora.Dictionary(clean_data)

# filter out words that are in less than 20 documents and that are in above 20% of documents
word_dict.filter_extremes(no_below=4, no_above=0.5)

In [251]:
# counter of how many times words appear
word_counts = [word_dict.doc2bow(doc) for doc in clean_data]

In [252]:
# using tfidf
tfidf = models.TfidfModel(word_counts)
words_tfidf = tfidf[word_counts]

In [253]:
# Running LDA using TF-IDF
lda_model = gensim.models.LdaMulticore(words_tfidf, num_topics=20, id2word=word_dict, passes=5, workers=6)

In [254]:
# show the generated topics
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} Word: {} \n'.format(idx, topic))

Topic: 0 Word: 0.002*"specie" + 0.002*"dream" + 0.002*"necessarily" + 0.002*"deal" + 0.002*"urge" + 0.001*"treatment" + 0.001*"consummation" + 0.001*"desirable" + 0.001*"intentions" + 0.001*"administrations" 

Topic: 1 Word: 0.004*"importance" + 0.004*"circumstances" + 0.003*"sentiments" + 0.003*"principle" + 0.003*"effect" + 0.003*"objects" + 0.003*"event" + 0.003*"extent" + 0.003*"actual" + 0.003*"connected" 

Topic: 2 Word: 0.004*"story" + 0.002*"compassion" + 0.001*"commitment" + 0.001*"schools" + 0.001*"birth" + 0.001*"commitments" + 0.001*"jefferson" + 0.001*"ambitions" + 0.001*"petty" + 0.001*"build" 

Topic: 3 Word: 0.003*"jobs" + 0.003*"slaves" + 0.002*"cease" + 0.002*"workers" + 0.002*"survive" + 0.002*"righteous" + 0.002*"expressly" + 0.002*"city" + 0.002*"plainly" + 0.002*"address" 

Topic: 4 Word: 0.004*"today" + 0.004*"congress" + 0.003*"americans" + 0.003*"business" + 0.003*"economic" + 0.003*"peoples" + 0.003*"change" + 0.003*"question" + 0.003*"republic" + 0.003*"milli

In [255]:
import os
# f = open('shore.txt', 'r')
# content = f.read()
# content = 'We should, at all costs, reduce our carbon footprint and go green. We need solar panels.'
# content = 'The battle began in the skies above us. In those first tense midnight hours, 1,000 aircraft roared overhead, with 17,000 allied airborne troops preparing to leap into the dark just beyond these trees. Then came dawn. The enemy who had occupied these heights saw the largest naval armada in the history of the world.'
# content = "There's new energy to harness, new jobs to be created, new schools to build, and threats to meet, alliances to repair"
content = "Now is the time to rise from the dark and desolate valley of segregation to the sunlit path of racial justice. Now is the time [applause] to lift our nation from the quicksands of racial injustice to the solid rock of brotherhood. Now is the time (Yes) [applause] (Now) to make justice a reality for all of God’s children."
bow_vector = word_dict.doc2bow(preprocess(content))
for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):
    print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 3)))

Score: 0.33216527104377747	 Topic: 0.004*"democracy" + 0.004*"mr" + 0.004*"friends"
Score: 0.12886330485343933	 Topic: 0.004*"problems" + 0.003*"self" + 0.003*"generation"
Score: 0.12593846023082733	 Topic: 0.002*"drawn" + 0.002*"creed" + 0.002*"journey"
Score: 0.11245398223400116	 Topic: 0.004*"revenue" + 0.003*"federal" + 0.003*"respect"
Score: 0.11058087646961212	 Topic: 0.004*"today" + 0.004*"congress" + 0.003*"americans"
Score: 0.10993235558271408	 Topic: 0.004*"dreams" + 0.003*"nuclear" + 0.003*"th"
