# 1. Packages

In [1]:
import pandas as pd
import re
import nltk
import spacy
import numpy as np
from nltk.corpus import inaugural
from spacy.tokens import Token
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE
from gensim.models import Word2Vec
from sklearn.svm import SVR, SVC
from sklearn.model_selection import train_test_split, GridSearchCV, RepeatedKFold
from sklearn.metrics import mean_squared_error, f1_score

nltk.download('all-nltk', quiet=True)

nlp = spacy.load('en_core_web_sm')
future_getter = lambda token: token.text in ("will", "going", "shall")
Token.set_extension("is_future", getter=future_getter)

# 2. Preprocessing

In [2]:
def get_tenses_count(doc):
  past = 0
  present = 0
  future = 0
  all = 0
  for token in doc:
    tense = token.morph.get('Tense')
    if tense:
      if tense[0] == 'Past':
        past += 1
        all += 1
      elif tense[0] == 'Pres':
        present += 1
        all += 1
    if token._.is_future:
      future += 1
      all += 1
  return past / all, present / all, future / all

def get_pos_count(doc):
  nn = 0
  vrb = 0
  adj = 0
  adv = 0
  all = 0
  for token in doc:
    pos = token.pos_
    if pos == 'NOUN':
      nn += 1
      all += 1
    elif pos == 'VERB':
      vrb += 1
      all += 1
    elif pos == 'ADJ':
      adj += 1
      all += 1
    elif pos == 'ADV':
      adv += 1
      all += 1
  return  nn / all, vrb / all, (adj + adv) / all

def sentence_statistics(doc):
  length = []
  for s in doc.sents:
    length.append(len(s))
  length = np.array(length)
  return np.min(length), np.max(length), np.mean(length), np.median(length)

def preprocessing(nlp, doc):
  new_doc = []
  for token in doc:
    if not (token.is_stop or token.is_punct or (not token.is_ascii) or token._.is_future or len(token.text) < 2):
      new_doc.append(token.lemma_)
  return nlp(' '.join(new_doc))


data = {}
data_raw = {}
data_w2v = []
data_w2v_raw = []
for speech in inaugural.fileids():
  text = inaugural.raw(speech).lower()
  text_raw = text
  text = re.sub('\d\dth|\dth|1st|2nd|3rd', '', text)
  text = re.sub('(¡x)|¨|â|¡', '', text)
  text = re.sub('\s{1,}', ' ', text)
  doc = nlp(text)
  doc_raw = nlp(text_raw)
  data_w2v_raw += [[t.lemma_ for t in s] for s in doc_raw.sents]
  past, pres, future = get_tenses_count(doc)
  past_raw, pres_raw, future_raw = get_tenses_count(doc_raw)
  min_, max_, mean, median = sentence_statistics(doc)
  min_raw, max_raw, mean_raw, median_raw = sentence_statistics(doc_raw)
  noun, verb, ad = get_pos_count(doc)
  noun_raw, verb_raw, ad_raw = get_pos_count(doc_raw)
  doc = preprocessing(nlp, doc)
  data_w2v += [[t.lemma_ for t in s] for s in doc.sents]
  frequent_words = nltk.FreqDist([token.text for token in doc]).most_common(5)
  frequent_words_raw = nltk.FreqDist([token.text for token in doc_raw]).most_common(5)
  data[speech.split('.')[0]] = {'noun': noun, 'verb': verb, 'adj': ad,
                                'past': past, 'pres': pres, 'future': future,
                                'min': min_, 'max': max_, 'mean': mean.round(2), 'median': median,
                                '1st_freq': frequent_words[0][0], '2nd_freq': frequent_words[1][0],
                                '3rd_freq': frequent_words[2][0], '4th_freq': frequent_words[3][0],
                                '5th_freq': frequent_words[4][0]}
  
  data_raw[speech.split('.')[0]] = {'noun': noun_raw, 'verb': verb_raw, 'adj': ad_raw,
                                'past': past_raw, 'pres': pres_raw, 'future': future_raw,
                                'min': min_raw, 'max': max_raw, 'mean': mean_raw.round(2), 'median': median_raw,
                                '1st_freq': frequent_words_raw[0][0], '2nd_freq': frequent_words_raw[1][0],
                                '3rd_freq': frequent_words_raw[2][0], '4th_freq': frequent_words_raw[3][0],
                                '5th_freq': frequent_words_raw[4][0]}

# 3. Creating DataFrame

In [3]:
df = pd.DataFrame.from_dict(data, orient='index')
df.head()

Unnamed: 0,noun,verb,adj,past,pres,future,min,max,mean,median,1st_freq,2nd_freq,3rd_freq,4th_freq,5th_freq
1789-Washington,0.469841,0.230159,0.3,0.490066,0.423841,0.086093,13,150,66.87,58.0,government,public,citizen,present,country
1793-Washington,0.464286,0.232143,0.303571,0.3125,0.5,0.1875,19,64,36.75,32.0,oath,fellow,citizen,call,voice
1797-Adams,0.554275,0.166186,0.279539,0.598901,0.368132,0.032967,10,810,69.84,45.0,people,nation,government,country,state
1801-Jefferson,0.493719,0.229899,0.276382,0.358025,0.530864,0.111111,4,270,47.15,38.0,government,principle,man,good,fellow
1805-Jefferson,0.504569,0.262944,0.232487,0.465909,0.458333,0.075758,15,139,52.93,41.0,public,state,citizen,fellow,duty


Below, we can observe that, without preprocessing, the most frequent tokens consist of stopwords and punctuation marks

In [4]:
df_raw = pd.DataFrame.from_dict(data_raw, orient='index')
df_raw.select_dtypes('object').head()

Unnamed: 0,1st_freq,2nd_freq,3rd_freq,4th_freq,5th_freq
1789-Washington,the,of,",",and,to
1793-Washington,the,of,i,",",to
1797-Adams,",",the,of,and,to
1801-Jefferson,the,",",of,and,to
1805-Jefferson,the,",",of,and,to


# 4. Normalization

In [5]:
scaler = StandardScaler()
scaled = scaler.fit_transform(df.select_dtypes('number'))
df[df.select_dtypes('number').columns] = scaled
df.select_dtypes('number').head()

Unnamed: 0,noun,verb,adj,past,pres,future,min,max,mean,median
1789-Washington,-0.58688,-0.321907,0.997758,1.239301,-1.322275,-0.070993,1.479079,0.253096,2.904883,2.989814
1793-Washington,-0.757372,-0.258658,1.119702,-0.651301,-0.433522,2.260775,2.853342,-0.541096,0.366499,0.400105
1797-Adams,2.004263,-2.361185,0.299132,2.398099,-1.972386,-1.29257,0.791948,6.348055,3.155182,1.69496
1801-Jefferson,0.145881,-0.33017,0.191339,-0.166585,-0.073346,0.504282,-0.582315,1.36127,1.242966,0.99773
1805-Jefferson,0.47885,0.72321,-1.307403,0.982093,-0.91976,-0.308641,1.937167,0.151513,1.73008,1.296543


In [6]:
scaler_raw = StandardScaler()
scaled_raw = scaler_raw.fit_transform(df_raw.select_dtypes('number'))
df_raw[df_raw.select_dtypes('number').columns] = scaled_raw
df_raw.select_dtypes('number').head()

Unnamed: 0,noun,verb,adj,past,pres,future,min,max,mean,median
1789-Washington,-0.584785,-0.33042,1.003492,1.240356,-1.324357,-0.07039,1.478977,0.19456,2.7772,2.955934
1793-Washington,-1.277802,-0.25549,1.695019,-0.65055,-0.435082,2.265301,2.849547,-0.604789,0.322265,0.398601
1797-Adams,2.019308,-2.359543,0.269381,2.39934,-1.97485,-1.294022,0.793692,6.329099,3.35141,1.953058
1801-Jefferson,0.167965,-0.327054,0.161939,-0.165757,-0.074694,0.505853,-0.576878,1.300636,1.34536,0.950182
1805-Jefferson,0.468635,0.727091,-1.297381,0.983106,-0.921606,-0.308437,1.935834,0.083023,1.836838,1.251045


# 5. Word Embedding (Word2Vec)

In [7]:
model = Word2Vec(data_w2v, min_count=1, vector_size=5, sg=1)

embeddings = []
for i in range(df.shape[0]):
  words = df.select_dtypes('object').iloc[i].values
  emb = []
  for w in words:
    emb += list(model.wv[w])
  embeddings.append(emb)
embeddings = np.array(embeddings)
for attr in range(embeddings.shape[1]):
  df[f'{attr}'] = embeddings[:, attr]
df = df.select_dtypes('number')
df.head()

Unnamed: 0,noun,verb,adj,past,pres,future,min,max,mean,median,...,15,16,17,18,19,20,21,22,23,24
1789-Washington,-0.58688,-0.321907,0.997758,1.239301,-1.322275,-0.070993,1.479079,0.253096,2.904883,2.989814,...,-0.378656,0.740103,0.976573,-0.866303,0.463789,-0.358406,0.836906,1.089879,-0.635956,0.540957
1793-Washington,-0.757372,-0.258658,1.119702,-0.651301,-0.433522,2.260775,2.853342,-0.541096,0.366499,0.400105,...,-0.480206,0.92852,0.695885,-1.10641,0.216744,-0.002661,0.991658,0.904866,-0.899023,0.373025
1797-Adams,2.004263,-2.361185,0.299132,2.398099,-1.972386,-1.29257,0.791948,6.348055,3.155182,1.69496,...,-0.358406,0.836906,1.089879,-0.635956,0.540957,-1.439569,0.643511,0.264713,-1.34514,0.674462
1801-Jefferson,0.145881,-0.33017,0.191339,-0.166585,-0.073346,0.504282,-0.582315,1.36127,1.242966,0.99773,...,-0.35655,0.60357,1.175406,-0.699271,0.849161,-1.629942,1.022455,0.99622,-0.43979,-0.265505
1805-Jefferson,0.47885,0.72321,-1.307403,0.982093,-0.91976,-0.308641,1.937167,0.151513,1.73008,1.296543,...,-1.629942,1.022455,0.99622,-0.43979,-0.265505,-0.601932,0.208415,0.867257,-1.438185,0.653744


In [8]:
model_raw = Word2Vec(data_w2v_raw, min_count=1, vector_size=5, sg=1)

embeddings_raw = []
for i in range(df_raw.shape[0]):
  words = df_raw.select_dtypes('object').iloc[i].values
  emb = []
  for w in words:
    emb += list(model_raw.wv[w])
  embeddings_raw.append(emb)
embeddings_raw = np.array(embeddings_raw)
for attr in range(embeddings_raw.shape[1]):
  df_raw[f'{attr}'] = embeddings_raw[:, attr]
df_raw = df_raw.select_dtypes('number')
df_raw.head()

Unnamed: 0,noun,verb,adj,past,pres,future,min,max,mean,median,...,15,16,17,18,19,20,21,22,23,24
1789-Washington,-0.584785,-0.33042,1.003492,1.240356,-1.324357,-0.07039,1.478977,0.19456,2.7772,2.955934,...,-0.106616,0.254264,0.771636,0.048364,-1.449779,-0.810487,0.217499,0.927087,-0.056826,-1.251179
1793-Washington,-1.277802,-0.25549,1.695019,-0.65055,-0.435082,2.265301,2.849547,-0.604789,0.322265,0.398601,...,-0.791904,0.301797,0.997741,0.109041,-1.152525,-0.810487,0.217499,0.927087,-0.056826,-1.251179
1797-Adams,2.019308,-2.359543,0.269381,2.39934,-1.97485,-1.294022,0.793692,6.329099,3.35141,1.953058,...,-0.106616,0.254264,0.771636,0.048364,-1.449779,-0.810487,0.217499,0.927087,-0.056826,-1.251179
1801-Jefferson,0.167965,-0.327054,0.161939,-0.165757,-0.074694,0.505853,-0.576878,1.300636,1.34536,0.950182,...,-0.106616,0.254264,0.771636,0.048364,-1.449779,-0.810487,0.217499,0.927087,-0.056826,-1.251179
1805-Jefferson,0.468635,0.727091,-1.297381,0.983106,-0.921606,-0.308437,1.935834,0.083023,1.836838,1.251045,...,-0.106616,0.254264,0.771636,0.048364,-1.449779,-0.810487,0.217499,0.927087,-0.056826,-1.251179


# 6. Generating Target for Training

In [9]:
average_inflation = [2.99, 6.42, 0.0, -2.03, 1.26, 6.08, -4.52, -3.99, -3.25, -1.55, -2.51, 3.39, -2.43, -3.32, 0.33, 0.0,
 3.1, 0.29, 16.66, -4.43, -3.08, -3.28, -0.72, -1.25, -1.31, -0.55, -1.18, 0.6, 0.87, 0.84, 2.13, 6.63,
 8.75, -0.56, -0.58, -6.62, 2.59, 0.52, 5.19, 7.23, 2.92, 1.29, 1.56, 1.31, 3.89, 4.88, 8.09, 10.67,
 4.31, 3.61, 3.90, 2.66, 2.49, 2.48, 2.38, 2.08, 1.28, 2.54, 6.58]

df['inflation'] = average_inflation
df_raw['inflation'] = average_inflation
df['inflation'].describe()

count    59.000000
mean      1.654746
std       4.080609
min      -6.620000
25%      -0.950000
50%       1.290000
75%       3.500000
max      16.660000
Name: inflation, dtype: float64

# 7. Dimensionality Reduction (tSNE)

In [10]:
tsne = TSNE(n_components=3)
data = tsne.fit_transform(df[df.columns[:-1]])
data = pd.DataFrame(data)
data['target'] = df[df.columns[-1]].values
data.head()

Unnamed: 0,0,1,2,target
0,9.325642,116.6959,82.16922,2.99
1,41.731297,83.989624,37.317051,6.42
2,-90.81942,29.960852,-14.669362,0.0
3,-74.9664,-34.594482,-25.349226,-2.03
4,-93.853081,-98.170319,2.542092,1.26


In [11]:
tsne_raw = TSNE(n_components=3)
data_raw = tsne_raw.fit_transform(df_raw[df_raw.columns[:-1]])
data_raw = pd.DataFrame(data_raw)
data_raw['target'] = df_raw[df_raw.columns[-1]].values
data_raw.head()

Unnamed: 0,0,1,2,target
0,-18.627115,111.067673,-80.688667,2.99
1,-72.231529,64.095329,-3.084186,6.42
2,-107.044403,80.322861,52.702499,0.0
3,46.619778,55.121151,-47.84816,-2.03
4,-63.8932,13.676625,-54.740559,1.26


# 8. Training Model

In [12]:
X_train, X_test, y_train, y_test = train_test_split(
    data[data.columns[:-1]],
    data['target'],
    test_size=0.2,
)

regressor = SVR()
regressor.fit(X_train, y_train)
train_pred = regressor.predict(X_train)
train_loss = mean_squared_error(y_train, train_pred)
test_pred = regressor.predict(X_test)
test_loss = mean_squared_error(y_test, test_pred)
print(f"Train Loss: {train_loss}, Test Loss: {test_loss}")

Train Loss: 15.999721409189839, Test Loss: 14.649146224041935


In [13]:
X_train, X_test, y_train, y_test = train_test_split(
    data_raw[data_raw.columns[:-1]],
    data_raw['target'],
    test_size=0.2,
)

regressor = SVR()
regressor.fit(X_train, y_train)
train_pred = regressor.predict(X_train)
train_loss = mean_squared_error(y_train, train_pred)
test_pred = regressor.predict(X_test)
test_loss = mean_squared_error(y_test, test_pred)
print(f"Train Loss: {train_loss}, Test Loss: {test_loss}")

Train Loss: 9.13347218162066, Test Loss: 33.45190290786114


In [14]:
inflation_class = []
for i in average_inflation:
  if i > 0:
    inflation_class.append(1)
  elif i < 0:
    inflation_class.append(-1)
  else:
    inflation_class.append(0)

data['target'] = inflation_class
data.groupby('target')['target'].count()

target
-1    19
 0     2
 1    38
Name: target, dtype: int64

In [15]:
X_train, X_test, y_train, y_test = train_test_split(
    data[data.columns[:-1]],
    data['target'],
    test_size=0.2,
    stratify=data['target']
)

classify = SVC(C=0.005, gamma= 0.005, kernel= 'rbf') # using parameters from GridSearchCV below
classify.fit(X_train, y_train)
train_pred = classify.predict(X_train)
train_loss = f1_score(y_train, train_pred, average='micro')
test_pred = classify.predict(X_test)
test_loss = f1_score(y_test, test_pred, average='micro')
print(f"Train Loss: {train_loss}, Test Loss: {test_loss}")

Train Loss: 0.6382978723404256, Test Loss: 0.6666666666666666


In [43]:
### Search took 34 mins
# model = SVC()
# params = {
#     'gamma': [0.005, 0.01, 0.02, 1, 2, 5],
#     'kernel': ['rbf', 'linear', 'sigmoid'],
#     'C': [0.005, 0.01, 0.02, 1, 2, 5]
# }

# cv = RepeatedKFold(n_splits=4)

# search = GridSearchCV(model, params, scoring='f1_micro', n_jobs=-1, cv=cv, verbose=3)

# result = search.fit(X_train, y_train)

# print('Best Score:', result.best_score_)
# print('Best Hyperparameters:', result.best_params_)

# ### Score: 0.6382575757575758
# ### Best Hyperparameters: {'C': 0.005, 'gamma': 0.005, 'kernel': 'rbf'}

Fitting 40 folds for each of 108 candidates, totalling 4320 fits
[CV 1/40] END .C=0.005, gamma=0.005, kernel=rbf;, score=0.833 total time=   0.0s[CV 4/40] END .C=0.005, gamma=0.005, kernel=rbf;, score=0.364 total time=   0.0s[CV 3/40] END .C=0.005, gamma=0.005, kernel=rbf;, score=0.750 total time=   0.0s

[CV 7/40] END .C=0.005, gamma=0.005, kernel=rbf;, score=0.667 total time=   0.0s
[CV 6/40] END .C=0.005, gamma=0.005, kernel=rbf;, score=0.667 total time=   0.0s
[CV 2/40] END .C=0.005, gamma=0.005, kernel=rbf;, score=0.583 total time=   0.0s
[CV 8/40] END .C=0.005, gamma=0.005, kernel=rbf;, score=0.636 total time=   0.0s
[CV 5/40] END .C=0.005, gamma=0.005, kernel=rbf;, score=0.583 total time=   0.0s

[CV 9/40] END .C=0.005, gamma=0.005, kernel=rbf;, score=0.333 total time=   0.0s
[CV 10/40] END C=0.005, gamma=0.005, kernel=rbf;, score=0.583 total time=   0.0s
[CV 14/40] END C=0.005, gamma=0.005, kernel=rbf;, score=0.500 total time=   0.0s
[CV 11/40] END C=0.005, gamma=0.005, kernel=