In [3]:
import numpy
import pandas as pd
import nltk
import re

In [1]:
titles = open('locations.txt').read().split('\n')
contents = open('trump_speeches.txt').read().split('\n')
print(str(len(titles)) + ' titles')
print(str(len(contents)) + ' contents')
print(titles[:3])
print(contents[2])

60 titles
60 contents
['Remarks Announcing Candidacy for President in New York City ', 'Remarks at the AIPAC Policy Conference in Washington, DC', 'Remarks on Foreign Policy at the National Press Club in Washington, DC']
I would like to talk today about how to develop a new foreign policy direction for our country - one that replaces randomness with purpose, ideology with strategy, and chaos with peace. It is time to shake the rust off of America's foreign policy. It's time to invite new voices and new visions into the fold. The direction I will outline today will also return us to a timeless principle. My foreign policy will always put the interests of the American people, and American security, above all else. That will be the foundation of every decision that I will make. America First will be the major and overriding theme of my administration. But to chart our path forward, we must first briefly look back. We have a lot to be proud of. In the 1940s we saved the world. The Greatest

In [4]:
# generates index for each item in the corpora (in this case it's just rank) and this scoring will be used later
ranks = []

for i in range(0,len(titles)):
    ranks.append(i)

In [33]:
stopwords = nltk.corpus.stopwords.words('english')
stopwords += []

In [34]:
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")

In [35]:
# here a tokenizer and stemmer are defined which returns the set of stems in the text that it is passed

def tokenize_and_stem(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    stems = [stemmer.stem(t) for t in filtered_tokens]
    return stems


def tokenize_only(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
    tokens = [word.lower() for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    return filtered_tokens

In [36]:
totalvocab_stemmed = []
totalvocab_tokenized = []
for i in contents:
    allwords_stemmed = tokenize_and_stem(i)
    totalvocab_stemmed.extend(allwords_stemmed)
    
    allwords_tokenized = tokenize_only(i)
    totalvocab_tokenized.extend(allwords_tokenized)

In [37]:
vocab_frame = pd.DataFrame({'words': totalvocab_tokenized}, index = totalvocab_stemmed)

In [38]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(max_df=0.7, max_features=200000,
                                 min_df=0.1, stop_words= stopwords,
                                 use_idf=True, tokenizer=tokenize_and_stem, ngram_range=(1,3))

%time tfidf_matrix = tfidf_vectorizer.fit_transform(contents)

print(tfidf_matrix.shape)

CPU times: user 5.35 s, sys: 34 ms, total: 5.38 s
Wall time: 5.4 s
(60, 3729)


In [39]:
terms = tfidf_vectorizer.get_feature_names()
print(terms)

["'d", "'d like", "'d say", "'m ask", "'m elect", "'m elect presid", "'m go", "'m go fight", "'m go repeal", "'m presid", "'m run", "'m say", "'m talk", "'m tell", "'s 's", "'s also", "'s alway", "'s amaz", "'s bad", "'s becaus", "'s begin", "'s call", "'s campaign", "'s corrupt", "'s done", "'s donor", "'s entranc", "'s entranc world", "'s entri", "'s entri world", "'s ever", "'s fail", "'s foreign", "'s foreign polici", "'s get", "'s go", "'s go america", "'s go happen", "'s gon", "'s gon na", "'s good", "'s got", "'s great", "'s happen", "'s hard", "'s interest", "'s largest", "'s like", "'s look", "'s lot", "'s made", "'s made thing", "'s messag", "'s nafta", "'s never", "'s one", "'s onli", "'s plan", "'s polici", "'s power", "'s realli", "'s reason", "'s rig", "'s right", "'s say", "'s state", "'s state depart", "'s talk", "'s terribl", "'s time", "'s true", "'s veri", "'s watch", "'s way", "'s whi", "'s year", '2nd', '2nd amend', '2nd amend appoint', '8th', '8th go', 'abandon', 

In [40]:
from sklearn.metrics.pairwise import cosine_similarity
dist = 1 - cosine_similarity(tfidf_matrix)

In [41]:
from sklearn.cluster import KMeans

num_clusters = 4

km = KMeans(n_clusters=num_clusters)

%time km.fit(tfidf_matrix)

clusters = km.labels_.tolist()

CPU times: user 689 ms, sys: 3.39 ms, total: 693 ms
Wall time: 696 ms


In [42]:
from sklearn.externals import joblib

joblib.dump(km,  'doc_cluster.pkl')
km = joblib.load('doc_cluster.pkl')
clusters = km.labels_.tolist()

In [43]:
speeches = { 'titles': titles, 'rank': ranks, 'contents': contents, 'cluster': clusters}

frame = pd.DataFrame(speeches, index = [clusters] , columns = ['rank', 'titles', 'cluster'])

In [44]:
frame['cluster'].value_counts() #number of speeches per cluster (clusters from 0 to 4)

2    23
3    14
0    12
1    11
Name: cluster, dtype: int64

In [45]:
grouped = frame['rank'].groupby(frame['cluster'])

grouped.mean()

cluster
0    46.750000
1    30.545455
2    20.000000
3    29.500000
Name: rank, dtype: float64

In [46]:
from __future__ import print_function

print("Top terms per cluster:")
print()
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
for i in range(num_clusters):
    print("Cluster %d words:" % i, end='')
    for ind in order_centroids[i, :6]:
        print(' %s' % vocab_frame.loc[terms[ind].split(' ')].values.tolist()[0][0], end=',')
    print()
    print()
    print("Cluster %d titles:" % i, end='')
    for titles in frame.loc[i]['titles'].values.tolist():
        print(' %s,' % titles, end='')
    print()
    print()

Top terms per cluster:

Cluster 0 words: going, illegal, going, illegal, 'm, trump,

Cluster 0 titles: Remarks at the Collier County Fairgrounds in Naples, Florida, Remarks at the Collier County Fairgrounds in Naples, Florida, Remarks at the Collier County Fairgrounds in Naples, Florida, Remarks at the Collier County Fairgrounds in Naples, Florida, Remarks at the Collier County Fairgrounds in Naples, Florida, Remarks at the Collier County Fairgrounds in Naples, Florida, Remarks at the Collier County Fairgrounds in Naples, Florida, Remarks at the Collier County Fairgrounds in Naples, Florida, Remarks at the Collier County Fairgrounds in Naples, Florida, Remarks at the Collier County Fairgrounds in Naples, Florida, Remarks at the Collier County Fairgrounds in Naples, Florida, Remarks at the Collier County Fairgrounds in Naples, Florida,

Cluster 1 words: african-american, 'm, american, flint, nation, trade,

Cluster 1 titles: Remarks at the Collier County Fairgrounds in Naples, Florida, 