In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
import sys
sys.path.append('../src/features/')
import build_features as bf

In [8]:
# Re-import build_features so that edits made to the module on disk are
# picked up without restarting the kernel.
import importlib
importlib.reload(bf)

<module 'build_features' from '../src/features/build_features.py'>

In [4]:
# Relative path to the interim data directory.
INT_PATH = '../data/interim/'
# Load the training and validation splits through the project helper; each
# call is unpacked into a (raw features, labels) pair.
X_train_raw, y_train = bf.get_raw_data('train', path=INT_PATH)
X_val_raw, y_val = bf.get_raw_data('val', path=INT_PATH)

In [9]:
# Bag-of-words transformer from build_features, with stemming left at its default.
# NOTE(review): high_abs / low_abs are presumably upper / lower absolute
# word-frequency cut-offs — confirm against build_features.BagOfWords.
bow = bf.BagOfWords(high_abs=2000, low_abs=6)

In [6]:
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression

In [10]:
# Baseline model: the `bow` transformer feeding a default LogisticRegression,
# fitted end-to-end on the raw training data.
pipe1 = make_pipeline(bow, LogisticRegression()).fit(X_train_raw, y_train)

In [11]:
def scoring(clf, X_train, y_train, X_val, y_val):
    """Print the estimator's score on the training and validation splits.

    Parameters
    ----------
    clf : object exposing ``score(X, y)``
        A fitted estimator or pipeline.
    X_train, y_train : training features and labels passed to ``clf.score``.
    X_val, y_val : validation features and labels passed to ``clf.score``.

    Returns
    -------
    None
        The two scores are only printed, training first, then validation.
    """
    splits = (
        ("Training score:", X_train, y_train),
        ("Validation score:", X_val, y_val),
    )
    # Score and print one split at a time so the training line is already
    # emitted before the validation split is evaluated.
    for label, features, targets in splits:
        print(label, clf.score(features, targets))

In [13]:
# Score of the baseline (pipe1) on the training and validation data.
scoring(pipe1, X_train_raw, y_train, X_val_raw, y_val)

Training score: 0.9988833054159687
Validation score: 0.9799163179916318


In [14]:
# Same filters and classifier as pipe1, but with stemming enabled in the
# bag-of-words step, to compare against the baseline.
pipe2 = make_pipeline(bf.BagOfWords(high_abs=2000, low_abs=6, stemming=True),
                      LogisticRegression()).fit(X_train_raw, y_train)

In [15]:
# Score of the stemmed pipeline (pipe2) on the training and validation data.
scoring(pipe2, X_train_raw, y_train, X_val_raw, y_val)

Training score: 0.9988833054159687
Validation score: 0.9765690376569037


In [16]:
# List the step names that make_pipeline generated, so a step can be looked up by name.
pipe2.named_steps.keys()

dict_keys(['bagofwords', 'logisticregression'])

In [17]:
# Reference to the fitted bag-of-words step inside pipe2. This is the same
# object the pipeline uses, not a copy, so changing it also changes pipe2.
bow2 = pipe2.named_steps['bagofwords']

In [19]:
# Length of the stemmed transformer's working_word_freq.
# NOTE(review): presumably the number of words kept after filtering — confirm in build_features.
bow2.working_word_freq.shape

(7466,)

In [20]:
# Same measurement for the unstemmed transformer, for comparison with bow2.
bow.working_word_freq.shape

(10039,)

In [23]:
# Tighten the frequency filters on the already-fitted step, in place.
# bow2 is the step held by pipe2, so this changes the number of features the
# pipeline's transformer emits without retraining its LogisticRegression.
bow2.set_filters(high_abs=1500, low_abs=100)

In [24]:
# This call raises ValueError: the classifier inside pipe2 was fitted on 7466
# features, but the re-filtered bag of words now produces 1115.
scoring(pipe2, X_train_raw, y_train, X_val_raw, y_val)

ValueError: X has 1115 features per sample; expecting 7466

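Changing the filters broke our pipeline: the logistic regression inside it was fitted on 7466 features, but the re-filtered bag of words now produces only 1115, and the model was not retrained on the new bag of words. It's probably not possible to update the pipeline without re-fitting the bag of words, which is the slowest part (well... maybe transforming is slower). To avoid that cost, the cells below reuse the already-fitted bag of words to transform the data and train a fresh classifier on the result.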

In [25]:
# Encode the training data with the already-fitted, re-filtered bag of words
# (transform only; the transformer is not fitted again).
X_train2 = bow2.transform(X_train_raw)

In [26]:
# Encode the validation data with the same re-filtered bag of words.
X_val2 = bow2.transform(X_val_raw)

In [27]:
# Train a fresh default LogisticRegression directly on the re-filtered
# features, outside of any pipeline.
clf2 = LogisticRegression().fit(X_train2, y_train)

In [28]:
# Score the stand-alone classifier on the pre-transformed features
# (clf2 is not a pipeline, so it takes X_train2 / X_val2, not the raw data).
scoring(clf2, X_train2, y_train, X_val2, y_val)

Training score: 0.9980457844779452
Validation score: 0.9807531380753138


In [29]:
# Check the number of samples and the reduced feature count after tightening the filters.
X_train2.shape

(3582, 1115)

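We seem to have improved the validation score (0.9766 → 0.9808) and decreased overfitting (training score 0.9989 → 0.9980) by filtering out more words. The differences are small, so this conclusion should be treated as tentative.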

In [35]:
# Label each logistic-regression coefficient with the name of its feature.
# coef_[0] is the first (here: only used) row of the coefficient matrix, and
# the labels come from the columns of the transformed training data.
feature_weights = pd.Series(index=X_train2.columns, data=clf2.coef_[0])

In [47]:
# The (up to) 50 largest strictly positive weights, i.e. the features that
# push predictions most strongly towards the positive class.
feature_weights[feature_weights.gt(0.0)].sort_values(ascending=False)[:50]

sight       1.759620
remov       1.608940
websit      1.174485
guarante    1.076444
offer       1.070636
opt         0.968608
below       0.958504
market      0.947140
price       0.927196
pleas       0.926523
credit      0.925995
repli       0.865705
money       0.828253
window      0.809870
contact     0.792579
improv      0.776050
access      0.763946
color       0.761498
visit       0.760643
name        0.751352
limit       0.737242
instruct    0.729157
web         0.721511
dollar      0.719117
amp         0.682896
opportun    0.677532
china       0.665205
parti       0.663037
payment     0.662804
minut       0.641375
final       0.633853
size        0.627231
record      0.619909
site        0.617344
invest      0.609582
ever        0.608503
receiv      0.604273
low         0.599970
longer      0.598845
transfer    0.590153
letter      0.589095
track       0.585905
care        0.584621
hundr       0.578068
paid        0.570889
industri    0.570611
cost        0.569355
util        0

In [48]:
from sklearn.metrics import precision_score, recall_score, f1_score

def score(truth, preds):
    """Print precision, recall and F1 for a set of predictions.

    Parameters
    ----------
    truth : true labels, passed as the first argument to each sklearn metric.
    preds : predicted labels, passed as the second argument to each metric.

    Returns
    -------
    None
        Each metric is printed on its own line, formatted to three decimals.
    """
    # Compute and print one metric at a time, in the order
    # precision -> recall -> F1.
    precision = precision_score(truth, preds)
    print(f'Precision: {precision:.3f}')

    recall = recall_score(truth, preds)
    print(f'Recall: {recall:.3f}')

    f1 = f1_score(truth, preds)
    print(f'F_1 score: {f1:.3f}')

In [49]:
# The output has no split labels: the first three lines are the training-set
# metrics, the last three are the validation-set metrics.
score(y_train, clf2.predict(X_train2))
score(y_val, clf2.predict(X_val2))

Precision: 0.997
Recall: 0.996
F_1 score: 0.997
Precision: 0.965
Recall: 0.973
F_1 score: 0.969


It might be useful to transform the raw data lists into arrays of lists/counters containing the words we extract from each 