In [110]:
import pandas as pd
import numpy as np
import pickle

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from  sklearn.metrics import accuracy_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import FeatureUnion

In [67]:
import sys
sys.path.append('../')

In [70]:
from utils.modelsearch import *
from utils.preprocessing import *

In [71]:
RANDOM_STATE=0

# Load Train Set

In [72]:
train_file_name = '..//data//raw//train//task1.train.txt'

In [73]:
articles_id, articles_content, gold_labels = ([], [], [])
with open(train_file_name, "r", encoding='utf-8') as f:
    for line in f.readlines():
        article_content, article_id, gold_label = line.rstrip().split("\t")
        articles_id.append(article_id)
        articles_content.append(article_content)
        gold_labels.append(gold_label)
print("Number of documents in the training set: %d"%(len(articles_content)))

Number of documents in the training set: 35993


In [74]:
train = pd.DataFrame({'id':articles_id, 'text': articles_content, 'target': gold_labels})

In [75]:
train.shape

(35993, 3)

In [76]:
train.head()

Unnamed: 0,id,target,text
0,727600136,non-propaganda,"Et tu, Rhody? A recent editorial in the Provi..."
1,731714618,non-propaganda,A recent post in The Farmington Mirror — our t...
2,731714635,non-propaganda,"President Donald Trump, as he often does while..."
3,728627182,non-propaganda,"February is Black History Month, and nothing l..."
4,728627443,non-propaganda,"The snow was so heavy, whipped up by gusting w..."


# Preprocessing

Load into dataframe

In [77]:
target = pd.Series(train['target'].map({'propaganda':1,'non-propaganda':0}))
train = train.drop('target', axis=1)

In [78]:
train.head()

Unnamed: 0,id,text
0,727600136,"Et tu, Rhody? A recent editorial in the Provi..."
1,731714618,A recent post in The Farmington Mirror — our t...
2,731714635,"President Donald Trump, as he often does while..."
3,728627182,"February is Black History Month, and nothing l..."
4,728627443,"The snow was so heavy, whipped up by gusting w..."


## Text Preprocessing

Overview of preprocessing steps;

* add vader sentiments
* tokenize words
* convert to lowercase
* replace links with tag

### Sentiment Features

In [80]:
#defined in preprocessing.py in utils
sentiment_features_dict = get_sentiment_features(train)

In [84]:
#save features
with open('..//data//processed//sentiment_features.pickle', 'wb') as handle:
    pickle.dump(sentiment_features_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [85]:
#load features
with open('..//data//processed//sentiment_features.pickle', 'rb') as handle:
    sentiment_features_dict = pickle.load(handle)

In [100]:
train = train.merge(pd.DataFrame(sentiment_features_dict), on='id')

In [101]:
train.head()

Unnamed: 0,id,text,neg_max,neg_median,neg_min,neu_max,neu_median,neu_min,pos_max,pos_median,pos_min
0,727600136,"Et tu, Rhody? A recent editorial in the Provi...",0.42,0.0,0.0,1.0,0.822,0.471,0.444,0.087,0.0
1,731714618,A recent post in The Farmington Mirror — our t...,0.565,0.0,0.0,1.0,0.896,0.374,0.626,0.0,0.0
2,731714635,"President Donald Trump, as he often does while...",0.672,0.0,0.0,1.0,0.885,0.328,0.375,0.0,0.0
3,728627182,"February is Black History Month, and nothing l...",0.49,0.0,0.0,1.0,0.9635,0.51,0.487,0.0,0.0
4,728627443,"The snow was so heavy, whipped up by gusting w...",0.5,0.0,0.0,1.0,0.896,0.0,0.277,0.0,0.0


### Tokenizer

In [87]:
from spacy.en import English
parser = English()

In [106]:
def tokenize(text):
    final_tokens = []
    tokens = parser(text)
    for token in tokens:
        if token.orth_.isspace():
            continue
        elif token.like_url:
            final_tokens.append('URL')
        else:
            final_tokens.append(token.lower_)
    return final_tokens

## Get Data

Split into Train and Valid

In [102]:
X_train, X_valid, y_train, y_valid = train_test_split(train.values, target.values, test_size=0.2, random_state=RANDOM_STATE, shuffle=True)

## Vectorize

In [108]:
vectorizer = TfidfVectorizer(analyzer="word", ngram_range=(1, 1), stop_words='english', lowercase=False, tokenizer=tokenize)

In [109]:
text_train = vectorizer.fit_transform(X_train[:,1])
text_valid = vectorizer.transform(X_valid[:,1])
print("Checking that the number of features in train and dev correspond: %s - %s" % (text_train.shape[1], text_valid.shape[1]))

Checking that the number of features in train and dev correspond: 186428 - 186428


In [119]:
from scipy.sparse import hstack

In [124]:
train_matrix = hstack([text_train, X_train[:, 2:].astype(float)])
train_matrix.shape

(28794, 186437)

In [125]:
valid_matrix = hstack([text_valid, X_valid[:, 2:].astype(float)])
valid_matrix.shape

(7199, 186437)

# Models

In [126]:
metrics_dict = {'accuracy':lambda y,y_pred: accuracy_score(y, y_pred),
                'f1':lambda y,y_pred: f1_score(y, y_pred, labels=[0,1])}

del get_metrics,print_metrics,model_search

In [127]:
model_list = [('Logistic Regression', '', LogisticRegression()), \
              ('Naive Bayes', '', MultinomialNB()), \
              ('SVM', '', SVC()), \
              ('Decision Trees', '', DecisionTreeClassifier()), \
              ('Random Forest', '', RandomForestClassifier()), \
              ('AdaBoost', '', AdaBoostClassifier())]

In [128]:
#model_search is defined in modelsearch.py in utils
models_d, models_i = model_search(model_list, (train_matrix, y_train), metrics_dict, valid=(valid_matrix, y_valid))

Fitting Logistic Regression ...
Training scores:
accuracy = 0.9548864346738903
f1 = 0.7585950566809143
Validation scores:
accuracy = 0.9441589109598555
f1 = 0.6959152798789712

Fitting Naive Bayes ...


  'precision', 'predicted', average, warn_for)


Training scores:
accuracy = 0.8881364173091616
f1 = 0.0
Validation scores:
accuracy = 0.8888734546464787
f1 = 0.0

Fitting SVM ...
Training scores:
accuracy = 0.8881364173091616
f1 = 0.0
Validation scores:
accuracy = 0.8888734546464787
f1 = 0.0

Fitting Decision Trees ...
Training scores:
accuracy = 1.0
f1 = 1.0
Validation scores:
accuracy = 0.9208223364356161
f1 = 0.6346153846153846

Fitting Random Forest ...
Training scores:
accuracy = 0.9896506216572897
f1 = 0.9514973958333334
Validation scores:
accuracy = 0.9127656618974858
f1 = 0.3643724696356275

Fitting AdaBoost ...
Training scores:
accuracy = 0.9439119260957144
f1 = 0.7199583839084445
Validation scores:
accuracy = 0.9409640227809418
f1 = 0.7034193998604327



In [17]:
#load models and scores
with open('../models/task1-ml/models_dict.pickle','rb') as handle:
    models_d = pickle.load(handle)
with open('../models/task1-ml/models_info.pickle','rb') as handle:
    models_i = pickle.load(handle)

**Conclusion**

Models based on decision trees and ensembles show some promise (Decision Trees and Random Forest in particular is overfit).

Other models do not do very well.

In [18]:
#looking at decision tree mode
models_d['Decision Trees']

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [19]:
#looking at Random Forest model
models_d['Random Forest']

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

**Hyperparameter tuning**

In [20]:
random_grid = \
{'bootstrap': [True, False],
 'max_depth': list(range(5,50,5)),
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': list(range(1,12)),
 'min_samples_split': list(range(2,21)),
 'n_estimators': list(range(1,25))}

In [21]:
from sklearn.model_selection import RandomizedSearchCV
rf_random = RandomizedSearchCV(estimator = RandomForestClassifier(), param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=RANDOM_STATE, n_jobs = -1)
rf_random.fit(train_matrix, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  5.1min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed: 21.4min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed: 46.0min finished


RandomizedSearchCV(cv=3, error_score='raise',
          estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
          fit_params=None, iid=True, n_iter=100, n_jobs=-1,
          param_distributions={'bootstrap': [True, False], 'max_depth': [5, 10, 15, 20, 25, 30, 35, 40, 45], 'max_features': ['auto', 'sqrt'], 'min_samples_leaf': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], 'min_samples_split': [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20], 'n_estimators': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24]},
          pre_dispatch='2*n_jobs', random_state=0, refi

In [24]:
rf_random.best_params_

{'bootstrap': False,
 'max_depth': 45,
 'max_features': 'auto',
 'min_samples_leaf': 1,
 'min_samples_split': 3,
 'n_estimators': 17}

In [27]:
rf = [('optimized Random Forest', str(rf_random.best_params_), RandomForestClassifier().set_params(**rf_random.best_params_))]
models, info = model_search(rf, (train_matrix, y_train), metrics_dict, valid=(text_valid, y_valid))

Fitting optimized Random Forest ...
Training scores:
accuracy = 0.9662776967423768
f1 = 0.8225187351489672
Validation scores:
accuracy = 0.9092929573551882
f1 = 0.31335436382754994



No performance improvement

**Hand-tuning decision tree to reduce variance**

In [32]:
dt_params = {'max_depth':50, 'min_samples_split':3}
dt = [('Decision Tree 2', str(rf_params), DecisionTreeClassifier().set_params(**dt_params))]

In [33]:
models, info = model_search(dt, (text_train, y_train), metrics_dict, valid=(text_valid, y_valid))

Fitting Decision Tree 2 ...
Training scores:
accuracy = 0.9784330068764326
f1 = 0.8933172994330871
Validation scores:
accuracy = 0.9459647173218503
f1 = 0.7307958477508651



Improved validation f1 performance