# Baseline for submission

## Copied from fnc_kfold.py

In [22]:
import sys
import numpy as np
import os
import re

from sklearn.ensemble import GradientBoostingClassifier
from feature_engineering import refuting_features, polarity_features, hand_features, gen_or_load_feats
from feature_engineering import word_overlap_features
from utils.dataset import DataSet
from utils.generate_test_splits import kfold_split, get_stances_for_folds
from utils.score import report_score, LABELS, score_submission

from utils.system import parse_params, check_version

In [23]:
def generate_features(stances,dataset,name):
    """Build the feature matrix and label list for a collection of stances.

    Args:
        stances: iterable of records indexed by 'Stance', 'Headline' and 'Body ID'.
        dataset: object whose ``articles`` attribute is indexed by body ID.
        name: tag embedded in the cache file names under ``features/``.

    Returns:
        A pair ``(X, y)``: ``X`` is the column-wise concatenation of the hand,
        polarity, refuting and overlap features (in that order) and ``y`` is a
        list of indices into ``LABELS``.
    """
    headlines, bodies, labels = [], [], []
    for record in stances:
        labels.append(LABELS.index(record['Stance']))
        headlines.append(record['Headline'])
        bodies.append(dataset.articles[record['Body ID']])

    def cached(feature_fn, path_prefix):
        # One cache file per feature family and per split name.
        return gen_or_load_feats(feature_fn, headlines, bodies, path_prefix + name + ".npy")

    # Computation order: overlap, refuting, polarity, hand.
    overlap = cached(word_overlap_features, "features/overlap.")
    refuting = cached(refuting_features, "features/refuting.")
    polarity = cached(polarity_features, "features/polarity.")
    hand = cached(hand_features, "features/hand.")

    # Column order of the final matrix differs from the computation order.
    return np.c_[hand, polarity, refuting, overlap], labels

In [24]:
def clean_cache():
    """Delete cached feature arrays and split-id files so they get rebuilt.

    Removes every regular file whose name ends in ``.npy`` directly under
    ``features/`` and the ``hold_out_ids.txt`` / ``training_ids.txt`` files under
    ``splits/``. Both paths are relative to the current working directory.
    A missing ``features`` directory is treated as an already-empty cache
    instead of raising ``FileNotFoundError``. Prints "All clear" when done.
    """
    dr = "features"
    # Nothing has been cached yet if the directory does not exist.
    if os.path.isdir(dr):
        for f in os.listdir(dr):
            # Raw string: '\.' in a plain literal is an invalid escape sequence.
            if re.search(r'\.npy$', f):
                fname = os.path.join(dr, f)
                # Only remove regular files, mirroring the guard used for splits.
                if os.path.isfile(fname):
                    os.remove(fname)
    for f in ['hold_out_ids.txt', 'training_ids.txt']:
        fname = os.path.join('splits', f)
        if os.path.isfile(fname):
            os.remove(fname)
    print("All clear")

In [25]:
# check_version() comes from utils.system; presumably it validates the running
# Python version -- TODO confirm.
check_version()
# Start from a clean slate: remove cached .npy features and the split-id files.
clean_cache()

All clear


In [26]:
# Load the training dataset, split it into 10 cross-validation folds plus a
# hold-out set, then collect the stances belonging to each split.
d = DataSet()
folds,hold_out = kfold_split(d,n_folds=10)
fold_stances, hold_out_stances = get_stances_for_folds(d,folds,hold_out)

# Load the competition dataset and build its features (cache tag "competition")
competition_dataset = DataSet("competition_test")
X_competition, y_competition = generate_features(competition_dataset.stances, competition_dataset, "competition")

# Feature matrices (Xs) and label lists (ys), keyed by fold
Xs = dict()
ys = dict()

# Load/Precompute all features now: hold-out first, then one entry per fold
X_holdout,y_holdout = generate_features(hold_out_stances,d,"holdout")
for fold in fold_stances:
    Xs[fold],ys[fold] = generate_features(fold_stances[fold],d,str(fold))


Reading dataset
Total stances: 49972
Total bodies: 1683
Reading dataset
Total stances: 25413
Total bodies: 904


25413it [02:02, 206.95it/s]
25413it [00:05, 5050.56it/s]
25413it [01:28, 288.37it/s]
25413it [02:08, 198.23it/s]
9622it [00:34, 279.75it/s]
9622it [00:01, 4901.76it/s]
9622it [00:41, 233.31it/s]
9622it [00:59, 161.92it/s]
4124it [00:18, 229.05it/s]
4124it [00:01, 4047.46it/s]
4124it [00:18, 229.11it/s]
4124it [00:25, 164.75it/s]
4663it [00:20, 230.65it/s]
4663it [00:01, 3974.44it/s]
4663it [00:20, 226.21it/s]
4663it [00:29, 159.27it/s]
3783it [00:13, 274.28it/s]
3783it [00:00, 4193.39it/s]
3783it [00:14, 267.96it/s]
3783it [00:19, 190.69it/s]
3388it [00:15, 213.59it/s]
3388it [00:00, 3863.45it/s]
3388it [00:15, 219.91it/s]
3388it [00:21, 158.66it/s]
3644it [00:15, 228.01it/s]
3644it [00:00, 4029.00it/s]
3644it [00:16, 224.23it/s]
3644it [00:21, 165.75it/s]
4644it [00:24, 191.95it/s]
4644it [00:02, 2202.82it/s]
4644it [00:21, 215.18it/s]
4644it [10:05,  7.67it/s] 
3848it [00:16, 230.14it/s]
3848it [00:00, 4290.60it/s]
3848it [00:20, 184.49it/s]
3848it [05:57, 10.77it/s] 
4273it [00:19, 

In [27]:
# Best normalised fold score seen so far and the classifier that achieved it.
# NOTE(review): despite its name, best_fold holds the fitted classifier, not a
# fold index; it stays None if no fold scores above 0.
best_score = 0
best_fold = None


# Classifier for each fold
for fold in fold_stances:
    # Indices of every fold except the one held out for this iteration.
    # Assumes fold keys are the integers 0..len(folds)-1 -- TODO confirm.
    ids = list(range(len(folds)))
    del ids[fold]

    # Stack the features/labels of the remaining folds into one training set
    X_train = np.vstack(tuple([Xs[i] for i in ids]))
    y_train = np.hstack(tuple([ys[i] for i in ids]))

    X_test = Xs[fold]
    y_test = ys[fold]

    # Fixed random_state keeps the fit repeatable; verbose=True prints per-iteration loss
    clf = GradientBoostingClassifier(n_estimators=200, random_state=14128, verbose=True)
    clf.fit(X_train, y_train)

    # Map class indices back to their label strings for scoring
    predicted = [LABELS[int(a)] for a in clf.predict(X_test)]
    actual = [LABELS[int(a)] for a in y_test]

    # Scoring the true labels against themselves gives the best attainable score
    fold_score, _ = score_submission(actual, predicted)
    max_fold_score, _ = score_submission(actual, actual)

    # Normalise so folds of different size are comparable
    score = fold_score/max_fold_score

    print("Score for fold "+ str(fold) + " was - " + str(score))
    # Keep the classifier from the best-scoring fold for the final evaluation
    if score > best_score:
        best_score = score
        best_fold = clf

      Iter       Train Loss   Remaining Time 
         1           0.6946            2.04m
         2           0.6307            1.89m
         3           0.5834            1.92m
         4           0.5463            1.94m
         5           0.5175            2.13m
         6           0.4928            2.20m
         7           0.4729            2.17m
         8           0.4560            2.19m
         9           0.4427         2547.62m
        10           0.4299         2281.71m
        20           0.3696         1081.58m
        30           0.3488         1343.39m
        40           0.3378          948.60m
        50           0.3311          711.89m
        60           0.3258         1208.42m
        70           0.3219          962.03m
        80           0.3186          777.24m
        90           0.3158          633.40m
       100           0.3133          518.32m
       200           0.2943            0.00s
Score for fold 6 was - 0.7700373455903476
      Iter  

In [28]:
# Run on the hold-out set and report the final score on it.
# best_fold is the classifier kept from the best-scoring cross-validation fold.
predicted = [LABELS[int(a)] for a in best_fold.predict(X_holdout)]
actual = [LABELS[int(a)] for a in y_holdout]

print("Scores on the dev set")
report_score(actual,predicted)
print("")
print("")

# Run the same classifier on the competition dataset
predicted = [LABELS[int(a)] for a in best_fold.predict(X_competition)]
actual = [LABELS[int(a)] for a in y_competition]

print("Scores on the test set")
report_score(actual,predicted)

Scores on the dev set
-------------------------------------------------------------
|           |   agree   | disagree  |  discuss  | unrelated |
-------------------------------------------------------------
|   agree   |    115    |     8     |    557    |    82     |
-------------------------------------------------------------
| disagree  |    16     |     3     |    128    |    15     |
-------------------------------------------------------------
|  discuss  |    62     |     3     |   1528    |    207    |
-------------------------------------------------------------
| unrelated |     5     |     1     |    96     |   6796    |
-------------------------------------------------------------
Score: 3538.5 out of 4448.5	(79.54366640440598%)


Scores on the test set
-------------------------------------------------------------
|           |   agree   | disagree  |  discuss  | unrelated |
-------------------------------------------------------------
|   agree   |    167    |    11     

75.0885098165433