In [1]:
from datetime import datetime
from matplotlib import pyplot as plt

import pandas as pd

from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import PorterStemmer

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF
from sklearn.decomposition import PCA
from sklearn.decomposition import SparsePCA
from sklearn.decomposition import ProjectedGradientNMF
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import FactorAnalysis
from sklearn.decomposition import LatentDirichletAllocation

import utils

In [2]:
%matplotlib inline

In [82]:
suas = utils.get_suas()
years = utils.get_years(suas)

In [83]:
stopwords_fname = 'stopwords1.txt'
stops = open(stopwords_fname, 'r').read().splitlines()

In [85]:
def print_top_words(vectorizer, tfm, n_top_words):
    feature_names = vectorizer.get_feature_names()
    for row_idx in range(tfm.shape[0]):
        row = tf_s_c.getrow(row_idx).toarray()[0].ravel()
        top_n_indexes = row.argsort()[-10:]
        top_n_values = row[top_n_indexes]
        print ("year:", years[row_idx])
        print (" ".join([feature_names[i]
                        for i in top_n_indexes]))
        print ()
    print ()

In [86]:
v_tf_nostops_tokens = CountVectorizer(tokenizer=utils.basic_tokenizer, decode_error='ignore')

In [87]:
m_tf_nostops_tokens = v_tf_nostops_tokens.fit_transform(utils.get_texts(suas))

In [88]:
print_top_words(tfv_no_stops, tf_no_stops, 10)

year: 1789
190 114,536. 17,710,114.27 19,901,325.45 19,814,550 19,796. 17,753,931.24 1,697,490.00 118 12,505,360

year: 1790
1835 1833. 10-10. 1831. 10-percent 183,000 183 10.6 1863. 2,527,129,552

year: 1791
1883. 1883 16,292,600 16,375,214.39 1,700,000 1,700,000,000 1880 1,700. 16,030,923.79 1838.

year: 1792
154,747 154,977,876 156 156,100,000 157,267,722.35 16,039 16,077,974.54 16,276,970.00 16,883,153.44 2,527,129,552

year: 1793
16,233,234.40 16,292,600 16,405 16,684,253 16,770 16,871,534.72 16,886,581.32 16,931,000,000 158,713,049 145,543,810.71

year: 1794
144,956,000 145,918 146,497,595.45 147,040.16 148,513 149,277,504 15 15,247,790.58 14,222 2,527,129,552

year: 1795
10.6 181 181,471,939.34 10-10. 1815. 1817 1818. 10,793 1809 1840

year: 1796
18,877,500 10,508,621 18,997,163.76 10,407,868 18.5 180,000,000 10,223 180,921 18,633,580.27 1.

year: 1797
183,000 1818. 1820 1821 1822 10,813 10,770 1827 10-10. 145,543,810.71

year: 1798
1885. 112,512,613.06 1,651,461.61 112,498,725.

In [89]:
v_tf_stops_tokens = CountVectorizer(stop_words=stops, tokenizer=utils.basic_tokenizer, decode_error='ignore')

In [91]:
m_tf_stops_tokens = v_tf_stops_tokens.fit_transform(utils.get_texts(suas))

In [92]:
print_top_words(v_tf_stops_tokens, m_tf_stops_tokens, 10)

year: 1789
190 114,536. 17,710,114.27 19,901,325.45 19,814,550 19,796. 17,753,931.24 1,697,490.00 118 12,505,360

year: 1790
1835 1833. 10-10. 1831. 10-percent 183,000 183 10.6 1863. 2,527,129,552

year: 1791
1883. 1883 16,292,600 16,375,214.39 1,700,000 1,700,000,000 1880 1,700. 16,030,923.79 1838.

year: 1792
154,747 154,977,876 156 156,100,000 157,267,722.35 16,039 16,077,974.54 16,276,970.00 16,883,153.44 2,527,129,552

year: 1793
16,233,234.40 16,292,600 16,405 16,684,253 16,770 16,871,534.72 16,886,581.32 16,931,000,000 158,713,049 145,543,810.71

year: 1794
144,956,000 145,918 146,497,595.45 147,040.16 148,513 149,277,504 15 15,247,790.58 14,222 2,527,129,552

year: 1795
10.6 181 181,471,939.34 10-10. 1815. 1817 1818. 10,793 1809 1840

year: 1796
18,877,500 10,508,621 18,997,163.76 10,407,868 18.5 180,000,000 10,223 180,921 18,633,580.27 1.

year: 1797
183,000 1818. 1820 1821 1822 10,813 10,770 1827 10-10. 145,543,810.71

year: 1798
1885. 112,512,613.06 1,651,461.61 112,498,725.

In [93]:
v_tf_stops_stems = CountVectorizer(stop_words=stops, tokenizer=utils.stem_tokenizer, decode_error='ignore')

In [94]:
m_tf_stops_stems = tfv_stem.fit_transform(utils.get_texts(suas))

In [44]:
tfv_stem_cutoffs = CountVectorizer(stop_words=stops, tokenizer=utils.stem_tokenizer, decode_error='ignore',\
                          max_df=0.98, min_df=0.2)

In [45]:
tf_s_c = tfv_stem_cutoffs.fit_transform(utils.get_texts(suas))

In [46]:
tf_s_c.shape

(224, 1727)

In [37]:
tf_s_c.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 1, ..., 0, 0, 0],
       ..., 
       [1, 1, 1, ..., 1, 0, 0],
       [1, 1, 1, ..., 1, 0, 0],
       [0, 0, 1, ..., 1, 0, 0]], dtype=int64)

In [70]:
print_top_words(tfv_stem_cutoffs, tf_s_c, 10)

row_index: 0
scienc enlighten nor satisfactori satisfact sanction north comfort essenti expect

row_index: 1
proof promot debt progress decid product produc deem recommend zeal

row_index: 2
requisit requir manner manufactur commenc commend repres commerc main prospect

row_index: 3
likewis limit loan local look maintain mainten manifest matur zeal

row_index: 4
mani manner march mark market materi mean measur love intellig

row_index: 5
instruct intercours intern interrupt invas invit issu judg immedi zeal

row_index: 6
deem practic preciou debt preserv press prevent danger power protect

row_index: 7
perman cultiv perpetu cruiser plan pleasur creation polici perhap constant

row_index: 8
product prevent pride princip principl day damag proceed debt intellig

row_index: 9
resist encourag coast enact requisit requir collect enabl magnitud sens

row_index: 10
safe mission moder engag enforc moment commerc commerci satisfactori zeal

row_index: 11
rais cours creat credit purpos public pr

In [72]:
print_top_words(tfv_stem, tf_stem, 10)

row_index: 0
prescrib discard judici prefer preexist preemin judiciari cathol dispos durabl

row_index: 1
necessari near constitut natur consult mutual murder content organ sovereignti

row_index: 2
perman period indirectli indispens caus ceas perceiv celer incur news

row_index: 3
implor import impress improv inadmiss incurs inde indic inflict sovereignti

row_index: 4
indian indirectli indistinctli indulg industri inexpedi influenc infract inclin graciou

row_index: 5
glori grate greatli ground guard guidanc half health forgotten sovereignti

row_index: 6
content midst mile constitut miscarri misconcept misinterpret consist michilimackinac none

row_index: 7
manufactori congratul march confirm maryland matur conduc measur manifest commend

row_index: 8
mutual misinterpret mississippi mitig mode conspicu consider motion constitut graciou

row_index: 9
persev digest capac diffus perman period captiv difficulti increas principl




In [73]:
print_top_words(tfv, tf, 10)

row_index: 0
humanity condition establishing hostility hostilities hostile establishment attained conform constantly

row_index: 1
foot flourishing capture fled care fit fisheries carolina fullest maturity

row_index: 2
guidance guarding efforts effusion attempt attempts growing attended effected foresight

row_index: 3
doubtful doubtless drawn driven durable effecting effects efficient embraced maturity

row_index: 4
efficacy efforts egbert eligible embarked emboldened emergencies eminent ease direct

row_index: 5
dignity directions directory disappeared discern discharging disclose discover descriptions maturity

row_index: 6
carolina falls families capture favorite feature feeling candid fallen forged

row_index: 7
express calculations expressing calculated extensive extinguished bullion extortion explicit beneficial

row_index: 8
fit feeling fellow-citizens fervent feudal capable canada fired capture direct

row_index: 9
habit conciliating assured conciliate guidance guarding assur

In [74]:
print_top_words(tfv_no_stops, tf_no_stops, 10)

row_index: 0
190 114,536. 17,710,114.27 19,901,325.45 19,814,550 19,796. 17,753,931.24 1,697,490.00 118 12,505,360

row_index: 1
1835 1833. 10-10. 1831. 10-percent 183,000 183 10.6 1863. 2,527,129,552

row_index: 2
1883. 1883 16,292,600 16,375,214.39 1,700,000 1,700,000,000 1880 1,700. 16,030,923.79 1838.

row_index: 3
154,747 154,977,876 156 156,100,000 157,267,722.35 16,039 16,077,974.54 16,276,970.00 16,883,153.44 2,527,129,552

row_index: 4
16,233,234.40 16,292,600 16,405 16,684,253 16,770 16,871,534.72 16,886,581.32 16,931,000,000 158,713,049 145,543,810.71

row_index: 5
144,956,000 145,918 146,497,595.45 147,040.16 148,513 149,277,504 15 15,247,790.58 14,222 2,527,129,552

row_index: 6
10.6 181 181,471,939.34 10-10. 1815. 1817 1818. 10,793 1809 1840

row_index: 7
18,877,500 10,508,621 18,997,163.76 10,407,868 18.5 180,000,000 10,223 180,921 18,633,580.27 1.

row_index: 8
183,000 1818. 1820 1821 1822 10,813 10,770 1827 10-10. 145,543,810.71

row_index: 9
1885. 112,512,613.06 1,651