# data.world
## C-Span Inaugural Address


### Getting the data

The data is stored in data.world

> #### Setup  

> Before running data.world notebooks for the first time, you'll need to:  
1. Install data.world's Python package, including optional `pandas` dependencies: 
```shell
pip install git+git://github.com/datadotworld/data.world-py.git#egg=project[PANDAS]
```
1. Obtain an API access token at https://data.world/settings/advanced
1. Store API access token using the `dw` command-line tool: 
```shell
dw configure
```

> Once your environment is set up, these steps do not need to be repeated for other data.world notebooks.

In [1]:
import datadotworld as dw
import pandas as pd
import numpy as np

In [2]:
# Load dataset (onto the local file system)
force_update = True
dataset_local = dw.load_dataset('brianray/c-span-inaugural-address',
                                force_update=force_update)  # cached under ~/.dw/cache

In [3]:
# See what is in it
dataset_local.describe()

{'description': 'C-Span all US inaugural Presidential addresses from 1789-2017\n\nScript used to create found here: https://github.com/brianray/data.world-scripts/blob/master/load%20inaugural%20addresses.ipynb\n\nOriginal data from http://www.nltk.org/\n\nAdded Obama 2013 and Trump 2017.',
 'homepage': 'https://data.world/brianray/c-span-inaugural-address',
 'keywords': ['politics',
  'inaugural',
  'presidential',
  'text',
  'nltk',
  'corpus'],
 'license': 'Public Domain',
 'name': 'brianray_c-span-inaugural-address',
 'resources': [{'format': 'csv',
   'name': 'c_span_inagural_addresses',
   'path': 'data/c_span_inagural_addresses.csv'},
  {'format': 'csv',
   'name': 'uspresident_wikipedia_urls_thmbs_hs_brray_presidents',
   'path': 'data/uspresident_wikipedia_urls_thmbs_hs_brray_presidents.csv'},
  {'bytes': 6861,
   'format': 'csv',
   'mediatype': 'text/csv',
   'name': 'original/USPresident-Wikipedia-URLs-Thmbs-HS_brray - presidents.csv',
   'path': 'original/USPresident-Wikip

In [4]:
dataset_local.dataframes

LazyLoadedDict({'original/c-span-inagural-addresses.csv': LazyLoadedValue(<pandas.DataFrame>), 'uspresident_wikipedia_urls_thmbs_hs_brray_presidents': LazyLoadedValue(<pandas.DataFrame>), 'original/USPresident-Wikipedia-URLs-Thmbs-HS_brray - presidents.csv': LazyLoadedValue(<pandas.DataFrame>), 'c_span_inagural_addresses': LazyLoadedValue(<pandas.DataFrame>)})

In [5]:
df_presidents = dataset_local.dataframes['uspresident_wikipedia_urls_thmbs_hs_brray_presidents']
df_inagural = dataset_local.dataframes['c_span_inagural_addresses']

In [6]:
df_inagural

Unnamed: 0,year,president,speach_content
0,1789,Washington,Fellow-Citizens of the Senate and of the House...
1,1793,Washington,"Fellow citizens, I am again called upon by the..."
2,1797,Adams,"When it was first perceived, in early times, t..."
3,1801,Jefferson,Friends and Fellow Citizens:\n\nCalled upon to...
4,1805,Jefferson,"Proceeding, fellow citizens, to that qualifica..."
5,1809,Madison,Unwilling to depart from examples of the most ...
6,1813,Madison,About to add the solemnity of an oath to the o...
7,1817,Monroe,I should be destitute of feeling if I was not ...
8,1821,Monroe,"Fellow citizens, I shall not attempt to descri..."
9,1825,Adams,In compliance with an usage coeval with the ex...


In [7]:
df_presidents

Unnamed: 0,presidency,president,wikipedia_entry,took_office,left_office,party,portrait,thumbnail,home_state
0,1,George Washington,http://en.wikipedia.org/wiki/George_Washington,1789-04-03,4/3/1797,Independent,GeorgeWashington.jpg,thmb_GeorgeWashington.jpg,Virginia
1,2,John Adams,http://en.wikipedia.org/wiki/John_Adams,1797-04-03,4/3/1801,Federalist,JohnAdams.jpg,thmb_JohnAdams.jpg,Massachusetts
2,3,Thomas Jefferson,http://en.wikipedia.org/wiki/Thomas_Jefferson,1801-04-03,4/3/1809,Democratic-Republican,Thomasjefferson.gif,thmb_Thomasjefferson.gif,Virginia
3,4,James Madison,http://en.wikipedia.org/wiki/James_Madison,1809-04-03,4/3/1817,Democratic-Republican,JamesMadison.gif,thmb_JamesMadison.gif,Virginia
4,5,James Monroe,http://en.wikipedia.org/wiki/James_Monroe,1817-04-03,4/3/1825,Democratic-Republican,JamesMonroe.gif,thmb_JamesMonroe.gif,Virginia
5,6,John Quincy Adams,http://en.wikipedia.org/wiki/John_Quincy_Adams,1825-04-03,4/3/1829,Democratic-Republican/National Republican,JohnQuincyAdams.gif,thmb_JohnQuincyAdams.gif,Massachusetts
6,7,Andrew Jackson,http://en.wikipedia.org/wiki/Andrew_Jackson,1829-04-03,4/3/1837,Democratic,Andrew_jackson_head.gif,thmb_Andrew_jackson_head.gif,Tennessee
7,8,Martin Van Buren,http://en.wikipedia.org/wiki/Martin_Van_Buren,1837-04-03,4/3/1841,Democratic,MartinVanBuren.gif,thmb_MartinVanBuren.gif,New York
8,9,William Henry Harrison,http://en.wikipedia.org/wiki/William_Henry_Har...,1841-04-03,4/4/1841,Whig,WilliamHenryHarrison.gif,thmb_WilliamHenryHarrison.gif,Ohio
9,10,John Tyler,http://en.wikipedia.org/wiki/John_Tyler,1841-04-04,4/3/1845,Whig,JohnTyler.jpg,thmb_JohnTyler.jpg,Virginia


In [8]:
def parse_year(row):
    return np.int64(row['took_office'].year)

df_presidents['year'] = df_presidents.apply(parse_year, axis=1)
df_presidents

Unnamed: 0,presidency,president,wikipedia_entry,took_office,left_office,party,portrait,thumbnail,home_state,year
0,1,George Washington,http://en.wikipedia.org/wiki/George_Washington,1789-04-03,4/3/1797,Independent,GeorgeWashington.jpg,thmb_GeorgeWashington.jpg,Virginia,1789
1,2,John Adams,http://en.wikipedia.org/wiki/John_Adams,1797-04-03,4/3/1801,Federalist,JohnAdams.jpg,thmb_JohnAdams.jpg,Massachusetts,1797
2,3,Thomas Jefferson,http://en.wikipedia.org/wiki/Thomas_Jefferson,1801-04-03,4/3/1809,Democratic-Republican,Thomasjefferson.gif,thmb_Thomasjefferson.gif,Virginia,1801
3,4,James Madison,http://en.wikipedia.org/wiki/James_Madison,1809-04-03,4/3/1817,Democratic-Republican,JamesMadison.gif,thmb_JamesMadison.gif,Virginia,1809
4,5,James Monroe,http://en.wikipedia.org/wiki/James_Monroe,1817-04-03,4/3/1825,Democratic-Republican,JamesMonroe.gif,thmb_JamesMonroe.gif,Virginia,1817
5,6,John Quincy Adams,http://en.wikipedia.org/wiki/John_Quincy_Adams,1825-04-03,4/3/1829,Democratic-Republican/National Republican,JohnQuincyAdams.gif,thmb_JohnQuincyAdams.gif,Massachusetts,1825
6,7,Andrew Jackson,http://en.wikipedia.org/wiki/Andrew_Jackson,1829-04-03,4/3/1837,Democratic,Andrew_jackson_head.gif,thmb_Andrew_jackson_head.gif,Tennessee,1829
7,8,Martin Van Buren,http://en.wikipedia.org/wiki/Martin_Van_Buren,1837-04-03,4/3/1841,Democratic,MartinVanBuren.gif,thmb_MartinVanBuren.gif,New York,1837
8,9,William Henry Harrison,http://en.wikipedia.org/wiki/William_Henry_Har...,1841-04-03,4/4/1841,Whig,WilliamHenryHarrison.gif,thmb_WilliamHenryHarrison.gif,Ohio,1841
9,10,John Tyler,http://en.wikipedia.org/wiki/John_Tyler,1841-04-04,4/3/1845,Whig,JohnTyler.jpg,thmb_JohnTyler.jpg,Virginia,1841


In [74]:
df = pd.merge(df_inagural, df_presidents, how='right', on=['year'])

df

Unnamed: 0,year,president_x,speach_content,party_short,presidency,president_y,wikipedia_entry,took_office,left_office,party,portrait,thumbnail,home_state
0,1789,Washington,Fellow-Citizens of the Senate and of the House...,,1,George Washington,http://en.wikipedia.org/wiki/George_Washington,1789-04-03,4/3/1797,Independent,GeorgeWashington.jpg,thmb_GeorgeWashington.jpg,Virginia
1,1797,Adams,"When it was first perceived, in early times, t...",,2,John Adams,http://en.wikipedia.org/wiki/John_Adams,1797-04-03,4/3/1801,Federalist,JohnAdams.jpg,thmb_JohnAdams.jpg,Massachusetts
2,1801,Jefferson,Friends and Fellow Citizens:\n\nCalled upon to...,,3,Thomas Jefferson,http://en.wikipedia.org/wiki/Thomas_Jefferson,1801-04-03,4/3/1809,Democratic-Republican,Thomasjefferson.gif,thmb_Thomasjefferson.gif,Virginia
3,1809,Madison,Unwilling to depart from examples of the most ...,,4,James Madison,http://en.wikipedia.org/wiki/James_Madison,1809-04-03,4/3/1817,Democratic-Republican,JamesMadison.gif,thmb_JamesMadison.gif,Virginia
4,1817,Monroe,I should be destitute of feeling if I was not ...,,5,James Monroe,http://en.wikipedia.org/wiki/James_Monroe,1817-04-03,4/3/1825,Democratic-Republican,JamesMonroe.gif,thmb_JamesMonroe.gif,Virginia
5,1825,Adams,In compliance with an usage coeval with the ex...,,6,John Quincy Adams,http://en.wikipedia.org/wiki/John_Quincy_Adams,1825-04-03,4/3/1829,Democratic-Republican/National Republican,JohnQuincyAdams.gif,thmb_JohnQuincyAdams.gif,Massachusetts
6,1829,Jackson,"Fellow citizens, about to undertake the arduou...",,7,Andrew Jackson,http://en.wikipedia.org/wiki/Andrew_Jackson,1829-04-03,4/3/1837,Democratic,Andrew_jackson_head.gif,thmb_Andrew_jackson_head.gif,Tennessee
7,1837,VanBuren,Fellow citizens: The practice of all my predec...,,8,Martin Van Buren,http://en.wikipedia.org/wiki/Martin_Van_Buren,1837-04-03,4/3/1841,Democratic,MartinVanBuren.gif,thmb_MartinVanBuren.gif,New York
8,1841,Harrison,Called from a retirement which I had supposed ...,,9,William Henry Harrison,http://en.wikipedia.org/wiki/William_Henry_Har...,1841-04-03,4/4/1841,Whig,WilliamHenryHarrison.gif,thmb_WilliamHenryHarrison.gif,Ohio
9,1841,Harrison,Called from a retirement which I had supposed ...,,10,John Tyler,http://en.wikipedia.org/wiki/John_Tyler,1841-04-04,4/3/1845,Whig,JohnTyler.jpg,thmb_JohnTyler.jpg,Virginia


In [104]:
# Three presidents didn't have inauguration addresses:
#   Because William Henry Harrison death
#   Because Garfield assination,
#   Because of Kennedy assisnation,
#     and Nixon impeachment, the following presidents did not have formal speachs:
print(df.shape[0])
df = df[~df['presidency'].isin([10, #  John Tyler
                                21, #  Chester A. Arthur
                                36, #  Lyndon B. Johnson
                                38  #  Gerald Ford
                               ])]
print(df.shape[0])

42
41


In [105]:
def get_code(row):
    party = row['party']
    if party.startswith("Republican"):
        return 'R'
    elif party.startswith("Democratic"):
        return 'D'
    elif party.startswith("Independent"):
        return 'I'
    elif party.startswith("Whig"):
        return 'W'
    else: 
        return 'O'

df['party_short'] = df.apply(get_code, axis=1) 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


### Clustring



In [106]:
from nltk.stem.snowball import SnowballStemmer
import nltk
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import text 

stemmer = SnowballStemmer("english")

#  http://brandonrose.org/clustering
def tokenize_and_stem(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    stems = [stemmer.stem(t) for t in filtered_tokens]
    return stems

def tokenize_only(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
    tokens = [word.lower() for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    return filtered_tokens

#define vectorizer parameters


stop_words = text.ENGLISH_STOP_WORDS.union(["'s", "mr.", "let", "come"])
tfidf_vectorizer = TfidfVectorizer(max_df=0.8, 
                                   max_features=100000,
                                   min_df=0.2,
                                   stop_words=stop_words,
                                   use_idf=True,
                                   tokenizer=tokenize_and_stem,
                                   ngram_range=(1,3))
synopses = list(df['speach_content'])
%time tfidf_matrix = tfidf_vectorizer.fit_transform(synopses) #fit the vectorizer to synopses

print(tfidf_matrix.shape)

CPU times: user 2.76 s, sys: 5.05 ms, total: 2.77 s
Wall time: 2.77 s
(41, 914)


In [107]:
totalvocab_stemmed = []
totalvocab_tokenized = []
for i in synopses:
    allwords_stemmed = tokenize_and_stem(i) #for each item in 'synopses', tokenize/stem
    totalvocab_stemmed.extend(allwords_stemmed) #extend the 'totalvocab_stemmed' list
    
    allwords_tokenized = tokenize_only(i)
    totalvocab_tokenized.extend(allwords_tokenized)

In [108]:
vocab_frame = pd.DataFrame({'words': totalvocab_tokenized}, index = totalvocab_stemmed)
print('there are ' + str(vocab_frame.shape[0]) + ' items in vocab_frame')

there are 111436 items in vocab_frame


In [109]:
terms = tfidf_vectorizer.get_feature_names()

In [110]:
from sklearn.cluster import KMeans

num_clusters = 6

km = KMeans(n_clusters=num_clusters)

%time km.fit(tfidf_matrix)

clusters = km.labels_.tolist()
df['cluster'] = clusters
df

CPU times: user 566 ms, sys: 5.72 ms, total: 572 ms
Wall time: 202 ms


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,year,president_x,speach_content,party_short,presidency,president_y,wikipedia_entry,took_office,left_office,party,portrait,thumbnail,home_state,cluster
0,1789,Washington,Fellow-Citizens of the Senate and of the House...,I,1,George Washington,http://en.wikipedia.org/wiki/George_Washington,1789-04-03,4/3/1797,Independent,GeorgeWashington.jpg,thmb_GeorgeWashington.jpg,Virginia,2
1,1797,Adams,"When it was first perceived, in early times, t...",O,2,John Adams,http://en.wikipedia.org/wiki/John_Adams,1797-04-03,4/3/1801,Federalist,JohnAdams.jpg,thmb_JohnAdams.jpg,Massachusetts,2
2,1801,Jefferson,Friends and Fellow Citizens:\n\nCalled upon to...,D,3,Thomas Jefferson,http://en.wikipedia.org/wiki/Thomas_Jefferson,1801-04-03,4/3/1809,Democratic-Republican,Thomasjefferson.gif,thmb_Thomasjefferson.gif,Virginia,2
3,1809,Madison,Unwilling to depart from examples of the most ...,D,4,James Madison,http://en.wikipedia.org/wiki/James_Madison,1809-04-03,4/3/1817,Democratic-Republican,JamesMadison.gif,thmb_JamesMadison.gif,Virginia,2
4,1817,Monroe,I should be destitute of feeling if I was not ...,D,5,James Monroe,http://en.wikipedia.org/wiki/James_Monroe,1817-04-03,4/3/1825,Democratic-Republican,JamesMonroe.gif,thmb_JamesMonroe.gif,Virginia,2
5,1825,Adams,In compliance with an usage coeval with the ex...,D,6,John Quincy Adams,http://en.wikipedia.org/wiki/John_Quincy_Adams,1825-04-03,4/3/1829,Democratic-Republican/National Republican,JohnQuincyAdams.gif,thmb_JohnQuincyAdams.gif,Massachusetts,2
6,1829,Jackson,"Fellow citizens, about to undertake the arduou...",D,7,Andrew Jackson,http://en.wikipedia.org/wiki/Andrew_Jackson,1829-04-03,4/3/1837,Democratic,Andrew_jackson_head.gif,thmb_Andrew_jackson_head.gif,Tennessee,5
7,1837,VanBuren,Fellow citizens: The practice of all my predec...,D,8,Martin Van Buren,http://en.wikipedia.org/wiki/Martin_Van_Buren,1837-04-03,4/3/1841,Democratic,MartinVanBuren.gif,thmb_MartinVanBuren.gif,New York,1
8,1841,Harrison,Called from a retirement which I had supposed ...,W,9,William Henry Harrison,http://en.wikipedia.org/wiki/William_Henry_Har...,1841-04-03,4/4/1841,Whig,WilliamHenryHarrison.gif,thmb_WilliamHenryHarrison.gif,Ohio,1
10,1845,Polk,"Fellow citizens, without solicitation on my pa...",D,11,James K. Polk,http://en.wikipedia.org/wiki/James_K._Polk,1845-04-03,4/3/1849,Democratic,JamesKPolk.gif,thmb_JamesKPolk.gif,Tennessee,5


In [111]:
print("Top terms per cluster:")
print()
#sort cluster centers by proximity to centroid
order_centroids = km.cluster_centers_.argsort()[:, ::-1] 

cluster_words = {}
for i in range(num_clusters):
    cluster_words[i] = []
    for ind in order_centroids[i, :20]: #replace 6 with n words per cluster
        cluster_words[i].append(vocab_frame.ix[terms[ind].split(' ')].values.tolist()[0][0])
pd.DataFrame.from_dict(cluster_words, orient="index")

Top terms per cluster:



.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,america,today,know,freedom,together,god,strength,live,presides,man,help,day,generation,words,children,earth,friends,need,because,way
1,congress,union,executing,subject,question,protected,united,important,respect,party,best,necessary,revenue,instituted,territory,policy,regard,administration,properly,foreign
2,union,principles,happiness,foreign,honorable,form,preservation,present,improve,author,united,opinions,best,fellow-citizens,republican,success,respect,general,difficulty,commerce
3,justice,help,america,task,support,things,business,human,activity,party,responsibility,old,seeking,order,progress,industry,congress,ideal,executing,use
4,god,union,ceased,addressed,came,answer,judged,years,said,ago,extended,occasion,conflict,cause,need,party,seeking,continuance,toiling,anxious
5,union,protected,executing,revenue,federal,foreign,objects,congress,cares,branches,appointed,honorable,properly,preservation,limited,extended,general,enlarged,examples,perform


In [112]:
df

Unnamed: 0,year,president_x,speach_content,party_short,presidency,president_y,wikipedia_entry,took_office,left_office,party,portrait,thumbnail,home_state,cluster
0,1789,Washington,Fellow-Citizens of the Senate and of the House...,I,1,George Washington,http://en.wikipedia.org/wiki/George_Washington,1789-04-03,4/3/1797,Independent,GeorgeWashington.jpg,thmb_GeorgeWashington.jpg,Virginia,2
1,1797,Adams,"When it was first perceived, in early times, t...",O,2,John Adams,http://en.wikipedia.org/wiki/John_Adams,1797-04-03,4/3/1801,Federalist,JohnAdams.jpg,thmb_JohnAdams.jpg,Massachusetts,2
2,1801,Jefferson,Friends and Fellow Citizens:\n\nCalled upon to...,D,3,Thomas Jefferson,http://en.wikipedia.org/wiki/Thomas_Jefferson,1801-04-03,4/3/1809,Democratic-Republican,Thomasjefferson.gif,thmb_Thomasjefferson.gif,Virginia,2
3,1809,Madison,Unwilling to depart from examples of the most ...,D,4,James Madison,http://en.wikipedia.org/wiki/James_Madison,1809-04-03,4/3/1817,Democratic-Republican,JamesMadison.gif,thmb_JamesMadison.gif,Virginia,2
4,1817,Monroe,I should be destitute of feeling if I was not ...,D,5,James Monroe,http://en.wikipedia.org/wiki/James_Monroe,1817-04-03,4/3/1825,Democratic-Republican,JamesMonroe.gif,thmb_JamesMonroe.gif,Virginia,2
5,1825,Adams,In compliance with an usage coeval with the ex...,D,6,John Quincy Adams,http://en.wikipedia.org/wiki/John_Quincy_Adams,1825-04-03,4/3/1829,Democratic-Republican/National Republican,JohnQuincyAdams.gif,thmb_JohnQuincyAdams.gif,Massachusetts,2
6,1829,Jackson,"Fellow citizens, about to undertake the arduou...",D,7,Andrew Jackson,http://en.wikipedia.org/wiki/Andrew_Jackson,1829-04-03,4/3/1837,Democratic,Andrew_jackson_head.gif,thmb_Andrew_jackson_head.gif,Tennessee,5
7,1837,VanBuren,Fellow citizens: The practice of all my predec...,D,8,Martin Van Buren,http://en.wikipedia.org/wiki/Martin_Van_Buren,1837-04-03,4/3/1841,Democratic,MartinVanBuren.gif,thmb_MartinVanBuren.gif,New York,1
8,1841,Harrison,Called from a retirement which I had supposed ...,W,9,William Henry Harrison,http://en.wikipedia.org/wiki/William_Henry_Har...,1841-04-03,4/4/1841,Whig,WilliamHenryHarrison.gif,thmb_WilliamHenryHarrison.gif,Ohio,1
10,1845,Polk,"Fellow citizens, without solicitation on my pa...",D,11,James K. Polk,http://en.wikipedia.org/wiki/James_K._Polk,1845-04-03,4/3/1849,Democratic,JamesKPolk.gif,thmb_JamesKPolk.gif,Tennessee,5


In [113]:
# https://medium.com/@mishra.thedeepak/doc2vec-in-a-simple-way-fa80bfe81104

In [114]:
#Import all the dependencies
import gensim
from nltk import RegexpTokenizer
from nltk.corpus import stopwords
from os import listdir
from os.path import isfile, join

In [115]:
docLabels = ["{year}_{president_x}_{party_short}".format(**x) for x in df[['year', 'president_x', 'party_short']].to_dict(orient="records")]
data = list(df['speach_content'])

In [116]:
docLabels

['1789_Washington_I',
 '1797_Adams_O',
 '1801_Jefferson_D',
 '1809_Madison_D',
 '1817_Monroe_D',
 '1825_Adams_D',
 '1829_Jackson_D',
 '1837_VanBuren_D',
 '1841_Harrison_W',
 '1845_Polk_D',
 '1849_Taylor_W',
 '1850_Fillmore_W',
 '1853_Pierce_D',
 '1857_Buchanan_D',
 '1861_Lincoln_R',
 '1865_Lincoln_D',
 '1869_Grant_R',
 '1877_Hayes_R',
 '1881_Garfield_R',
 '1885_Cleveland_D',
 '1889_Harrison_R',
 '1893_Cleveland_D',
 '1897_McKinley_R',
 '1901_McKinley_R',
 '1909_Taft_R',
 '1913_Wilson_D',
 '1921_Harding_R',
 '1923_Coolidge_R',
 '1929_Hoover_R',
 '1933_Roosevelt_D',
 '1945_Roosevelt_D',
 '1953_Eisenhower_R',
 '1961_Kennedy_D',
 '1969_Nixon_R',
 '1977_Carter_D',
 '1981_Reagan_R',
 '1989_Bush_R',
 '1993_Clinton_D',
 '2001_Bush_R',
 '2009_Obama_D',
 '2017_Trump_R']

In [117]:
tokenizer = RegexpTokenizer(r'\w+')
stopword_set = set(stopwords.words('english'))
#This function does all cleaning of data using two objects above
def nlp_clean(data):
    new_data = []
    for d in data:
        new_str = d.lower()
        dlist = tokenizer.tokenize(new_str)
        dlist = list(set(dlist).difference(stopword_set))
        new_data.append(dlist)
    return new_data

In [118]:
class LabeledLineSentence(object):
    def __init__(self, doc_list, labels_list):
        self.labels_list = labels_list
        self.doc_list = doc_list
    def __iter__(self):
        for idx, doc in enumerate(self.doc_list):
              yield gensim.models.doc2vec.LabeledSentence(doc,    
[self.labels_list[idx]])

In [119]:
data = nlp_clean(data)

In [120]:
it = LabeledLineSentence(data, docLabels)

In [None]:
model = gensim.models.Doc2Vec(size=300, min_count=0, alpha=0.025, min_alpha=0.025)
model.build_vocab(it)
#training of model
for epoch in range(100):
    print('iteration {}'.format(epoch+1))
    model.train(it)
    model.alpha -= 0.002
    model.min_alpha = model.alpha
    model.train(it)
#saving the created model
#model.save(‘doc2vec.model’)
# print("model saved")

iteration 1
iteration 2
iteration 3
iteration 4
iteration 5
iteration 6
iteration 7
iteration 8
iteration 9
iteration 10
iteration 11
iteration 12
iteration 13
iteration 14
iteration 15
iteration 16
iteration 17
iteration 18
iteration 19
iteration 20
iteration 21
iteration 22
iteration 23
iteration 24
iteration 25
iteration 26
iteration 27
iteration 28
iteration 29
iteration 30
iteration 31
iteration 32
iteration 33
iteration 34
iteration 35
iteration 36
iteration 37
iteration 38
iteration 39
iteration 40
iteration 41
iteration 42
iteration 43
iteration 44
iteration 45
iteration 46
iteration 47
iteration 48
iteration 49
iteration 50
iteration 51
iteration 52
iteration 53
iteration 54
iteration 55
iteration 56
iteration 57
iteration 58
iteration 59
iteration 60
iteration 61
iteration 62
iteration 63
iteration 64
iteration 65
iteration 66
iteration 67
iteration 68
iteration 69
iteration 70
iteration 71
iteration 72
iteration 73
iteration 74
iteration 75
iteration 76
iteration 77
iteratio

In [None]:

d2v_model = model
d2v_model.docvecs.similarity('2017_Trump_R', '2009_Obama_D')

In [None]:
import numpy as np 
from pandas import DataFrame
%matplotlib


dfh = DataFrame(index=docLabels, columns=docLabels)

for x in docLabels:
    for y in docLabels:
        val = np.float64(d2v_model.docvecs.similarity(x, y))
        dfh.set_value(x, y, val*1000)
dfh = dfh[dfh.columns].astype(float)  # or int
dfh

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
plt.figure(figsize=(15, 15))

ax = sns.heatmap(dfh)
ax

In [None]:
ax.get_figure().savefig("heatmap.png")

In [None]:
!open heatmap.png