In [1]:
import pickle
import pandas as pd
import numpy as np
from scipy.stats import uniform as sp_rand
from sklearn.model_selection import RandomizedSearchCV
import random
from sklearn.metrics import average_precision_score, roc_auc_score
from sklearn.preprocessing import MaxAbsScaler
from sklearn.decomposition import TruncatedSVD as PCA

In [2]:
def get_data(dataset,vectorizer):
    '''
    Load the pickled, subsampled feature matrix for one dataset split.

    The file is read from
    ``../data/{dataset}_{vectorizer}_subsampled_data.pckl`` relative to the
    current working directory.

    @param dataset: string naming the split, e.g. "train", "dev"
    @param vectorizer: string naming the vectorizer, e.g. "binary", "count"
    @return: whatever object was pickled into that file

    NOTE: unpickling executes arbitrary code, so only point this at files
    produced by this project.
    '''
    path = f'../data/{dataset}_{vectorizer}_subsampled_data.pckl'
    with open(path, 'rb') as handle:
        features = pickle.load(handle)
    return features


In [3]:
# Labels for the training split; read as a module-level global by the
# evaluation helpers defined further down.
with open('../data/train_labels.pckl', 'rb') as train_label_file:
    trainY = pickle.load(train_label_file)

# Labels for the dev split, used to score every experiment.
with open('../data/dev_labels.pckl', 'rb') as dev_label_file:
    devY = pickle.load(dev_label_file)

In [4]:
def _frequent_column_mask(matrix, min_count=None):
    """Return a 1-D boolean mask of columns that are > 0 in more than ``min_count`` rows.

    When ``min_count`` is None the mean per-column count is used as threshold.
    """
    # sum(axis=0) is a (1, n) np.matrix for scipy sparse matrices but a 1-D
    # array for dense / sparse-array inputs; ravel() normalises both to 1-D.
    counts = np.asarray((matrix > 0).sum(axis=0)).ravel()
    threshold = counts.mean() if min_count is None else min_count
    return counts > threshold


def data_transformation(train,
                        dev,
                        last4=False,
                        all_except_last4=False,
                        truncatedSVD=False,
                        scaling=False,
                        ngram_30=False,
                        ngram_mean=False,
                        ncomponent=None,
                        ):
    """
    Apply the selected feature transformations to a train/dev matrix pair.

    Every statistic (column counts, scaler, SVD) is fitted on ``train`` only
    and then applied to ``dev``. The inputs are copied first and never
    modified. Enabled steps run in this fixed order: column slicing,
    ``ngram_mean`` filter, ``ngram_30`` filter, scaling, truncated SVD.

    @param train: 2-D feature matrix (numpy array or scipy sparse matrix)
    @param dev: 2-D feature matrix with the same columns as ``train``
    @param last4: keep only the last four columns
    @param all_except_last4: drop the last four columns
    @param truncatedSVD: project onto ``ncomponent`` truncated-SVD components
    @param scaling: scale each column with ``MaxAbsScaler``
    @param ngram_30: keep columns that are > 0 in more than 30 training rows
    @param ngram_mean: keep columns that are > 0 in more training rows than
                       the mean column
    @param ncomponent: number of SVD components; required if ``truncatedSVD``
    @return: tuple ``(train_tf, dev_tf)`` of transformed matrices
    @raise ValueError: if ``last4`` and ``all_except_last4`` are both set, or
                       ``truncatedSVD`` is set without ``ncomponent``
    """
    if last4 and all_except_last4:
        # Applying both slices would always leave zero columns.
        raise ValueError("last4 and all_except_last4 are mutually exclusive")
    if truncatedSVD and ncomponent is None:
        raise ValueError("ncomponent must be given when truncatedSVD=True")

    train_tf = train.copy()
    dev_tf = dev.copy()

    if last4:
        train_tf = train_tf[:, -4:]
        dev_tf = dev_tf[:, -4:]

    if all_except_last4:
        train_tf = train_tf[:, :-4]
        dev_tf = dev_tf[:, :-4]

    if ngram_mean:
        # The mask is derived from the training data and reused for dev so
        # both matrices keep exactly the same columns.
        keep = _frequent_column_mask(train_tf)
        train_tf = train_tf[:, keep]
        dev_tf = dev_tf[:, keep]

    if ngram_30:
        keep = _frequent_column_mask(train_tf, min_count=30)
        train_tf = train_tf[:, keep]
        dev_tf = dev_tf[:, keep]

    if scaling:
        scaler = MaxAbsScaler()
        train_tf = scaler.fit_transform(train_tf)
        dev_tf = scaler.transform(dev_tf)

    if truncatedSVD:
        # ``PCA`` is the notebook's import alias for sklearn's TruncatedSVD.
        pca = PCA(n_components=ncomponent, random_state=0)
        train_tf = pca.fit_transform(train_tf)
        dev_tf = pca.transform(dev_tf)

    return train_tf, dev_tf
    

In [5]:
def revert():
    """
    Reload the untransformed TF-IDF feature matrices from disk.

    @return: tuple ``(train, dev)`` as loaded by ``get_data`` for the
             "train" and "dev" splits with the "tfidf" vectorizer
    """
    return tuple(get_data(split, "tfidf") for split in ("train", "dev"))

In [18]:
def getSVMresults(string, train, dev, l1=False, C=0.020329421573461483):
    """
    Fit a LinearSVC on ``train`` and print its ROC AUC and average precision on ``dev``.

    Labels are read from the module-level globals ``trainY`` and ``devY``, so
    the row order of ``train`` / ``dev`` must match those label arrays.

    @param string: label printed before the metrics to identify the experiment
    @param train: training feature matrix
    @param dev: dev feature matrix with the same columns as ``train``
    @param l1: use an L1 penalty instead of the default L2
    @param C: regularisation strength passed to LinearSVC; the default is the
              value previously hard-coded in this function
    @return: None (results are printed)
    """
    print(string)

    # The L1 penalty is fitted in the primal (dual=False); the L2 penalty
    # keeps the dual formulation used by the original experiments.
    penalty, dual = ('l1', False) if l1 else ('l2', True)

    clf = LinearSVC(C=C,
                    dual=dual,
                    penalty=penalty,
                    random_state=0,
                    )
    clf.fit(train, trainY)

    # Signed distances to the hyperplane serve as ranking scores for both metrics.
    yval_score = clf.decision_function(dev)

    print("Val AUC", roc_auc_score(devY, yval_score))
    print("Val AP", average_precision_score(devY, yval_score))
    print("-----------------------------")

In [33]:
def getMNBresults(string, train, dev, svd=False, alpha=0.17693089816649998):
    """
    Fit a MultinomialNB on ``train`` and print its ROC AUC and average precision on ``dev``.

    Labels are read from the module-level globals ``trainY`` and ``devY``.
    When ``string == 'no mods'`` the fitted model is also pickled to
    ``../data/best_mnb_tfidf.pckl``.

    @param string: label printed before the metrics to identify the experiment
    @param train: training feature matrix
    @param dev: dev feature matrix with the same columns as ``train``
    @param svd: set when the features may be negative (e.g. SVD output); both
                matrices are then shifted by the same training-derived offset
                so that the training features become non-negative
    @param alpha: smoothing parameter passed to MultinomialNB; the default is
                  the value previously hard-coded in this function
    @return: None (results are printed)
    """
    print(string)

    nb_multi = MultinomialNB(alpha=alpha)

    if svd:
        # MultinomialNB needs non-negative features. The offset is taken from
        # the training data only and applied identically to dev, so the model
        # is scored on the same feature scale it was fitted on. New arrays are
        # built instead of using += so the caller's matrices are not modified.
        shift = max(0.0, -train.min())
        train = train + shift
        # Dev values below the training minimum would still be negative; clip to 0.
        dev = np.clip(dev + shift, 0, None)

    nb_multi.fit(train, trainY)

    # Probability of the class in column 1 is used as the ranking score.
    nb_dev_proba = nb_multi.predict_proba(dev)[:, 1]

    nb_dev_auc = roc_auc_score(devY, nb_dev_proba)
    print("VAL AUC", nb_dev_auc)
    nb_dev_ap = average_precision_score(devY, nb_dev_proba)
    print("VAL AP", nb_dev_ap)

    if string == 'no mods':
        # Persist the baseline model trained on the unmodified features.
        with open('../data/best_mnb_tfidf.pckl', 'wb') as f:
            pickle.dump(nb_multi, f)

    print("-----------------------------")

### SVM

In [9]:
from sklearn.svm import LinearSVC
import scipy.stats

# Start from the untransformed TF-IDF matrices.
train, dev = revert()

# Baseline: LinearSVC on the full, unmodified feature matrix.
getSVMresults("no mods", train, dev)

# Each entry pairs the printed label with the data_transformation options
# for that experiment; all experiments start from the same train/dev pair.
svm_experiments = [
    ("last4", dict(last4=True)),
    ("all but last 4", dict(all_except_last4=True)),
    ("scaling", dict(scaling=True)),
    ("truncatedSVD100", dict(truncatedSVD=True, ncomponent=100)),
    ("truncatedSVD10", dict(truncatedSVD=True, ncomponent=10)),
]
for label, options in svm_experiments:
    train_tf, dev_tf = data_transformation(train, dev, **options)
    getSVMresults(label, train_tf, dev_tf)

no mods
[LibLinear]



Val AUC 0.7514917776749358
Val AP 0.2317726239381011
-----------------------------
last4
[LibLinear]



Val AUC 0.6941519462634351
Val AP 0.18116968617844315
-----------------------------
all but last 4
[LibLinear]Val AUC 0.7125164711534802
Val AP 0.2053572619567911
-----------------------------
scaling
[LibLinear]Val AUC 0.7334506913637129
Val AP 0.21935211298471108
-----------------------------
truncatedSVD100
[LibLinear]



Val AUC 0.7402465754611585
Val AP 0.2219357880331605
-----------------------------
truncatedSVD10
[LibLinear]Val AUC 0.714849059164995
Val AP 0.20162762249021143
-----------------------------




In [19]:
# L1-penalised LinearSVC on the unmodified features.
getSVMresults('with L1', train, dev, l1=True)

# Remaining SVM experiments: printed label -> data_transformation options.
svm_experiments = [
    ("ngram_mean", dict(ngram_mean=True)),
    ("ngram_30", dict(ngram_30=True)),
    ("scaling + svd100", dict(truncatedSVD=True, scaling=True, ncomponent=100)),
    ("last 4 + scaling", dict(last4=True, scaling=True)),
]
for label, options in svm_experiments:
    train_tf, dev_tf = data_transformation(train, dev, **options)
    getSVMresults(label, train_tf, dev_tf)

with L1
Val AUC 0.7204560173481427
Val AP 0.20606592195047516
-----------------------------
ngram_mean




Val AUC 0.7512333615016391
Val AP 0.2316057835286238
-----------------------------
ngram_30




Val AUC 0.7503096814704876
Val AP 0.23059991152936476
-----------------------------
scaling + svd100
Val AUC 0.6917416193343989
Val AP 0.19332778082469648
-----------------------------
last 4 + scaling
Val AUC 0.6903680151775861
Val AP 0.176578647251165
-----------------------------


### MNB

In [34]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.utils.fixes import loguniform

# Reload the untransformed TF-IDF train/dev matrices for the MNB experiments.
train, dev = revert()

# Baseline MNB run on the unmodified features. The "no mods" label also makes
# getMNBresults pickle the fitted model to ../data/best_mnb_tfidf.pckl.
getMNBresults("no mods", train, dev)

no mods
VAL AUC 0.7498575657215163
VAL AP 0.2293512205357953
-----------------------------


In [21]:
# MNB experiments whose transformed features stay non-negative:
# printed label -> data_transformation options.
mnb_experiments = [
    ("last4", dict(last4=True)),
    ("all but last 4", dict(all_except_last4=True)),
    ("scaling", dict(scaling=True)),
]
for label, options in mnb_experiments:
    train_tf, dev_tf = data_transformation(train, dev, **options)
    getMNBresults(label, train_tf, dev_tf)

no mods
VAL AUC 0.7498575657215163
VAL AP 0.2293512205357953
-----------------------------
last4
VAL AUC 0.6581026055173183
VAL AP 0.1437128131114489
-----------------------------
all but last 4
VAL AUC 0.7070023469057676
VAL AP 0.20591464071739765
-----------------------------
scaling
VAL AUC 0.6536068300836146
VAL AP 0.159505790364585
-----------------------------


In [31]:
# Truncated-SVD features can be negative, so getMNBresults is called with
# svd=True to shift them before fitting.
for label, n_components in (("truncatedSVD100", 100), ("truncatedSVD10", 10)):
    train_tf, dev_tf = data_transformation(train, dev, truncatedSVD=True, ncomponent=n_components)
    getMNBresults(label, train_tf, dev_tf, svd=True)

truncatedSVD100
VAL AUC 0.7096800773626037
VAL AP 0.18745670852850765
-----------------------------
truncatedSVD10
VAL AUC 0.6878866303842578
VAL AP 0.15705489949022805
-----------------------------


In [32]:
# Remaining MNB experiments: (printed label, data_transformation options,
# whether the features come out of SVD and need the non-negativity shift).
mnb_experiments = [
    ("ngram_mean", dict(ngram_mean=True), False),
    ("ngram_30", dict(ngram_30=True), False),
    ("scaling + svd100", dict(truncatedSVD=True, scaling=True, ncomponent=100), True),
    ("last 4 + scaling", dict(last4=True, scaling=True), False),
]
for label, options, uses_svd in mnb_experiments:
    train_tf, dev_tf = data_transformation(train, dev, **options)
    getMNBresults(label, train_tf, dev_tf, svd=uses_svd)

ngram_mean
VAL AUC 0.7157689633180022
VAL AP 0.18875477473370467
-----------------------------
ngram_30
VAL AUC 0.6931802543914015
VAL AP 0.16490223532898143
-----------------------------
scaling + svd100
VAL AUC 0.67029364609327
VAL AP 0.1807745196643682
-----------------------------
last 4 + scaling
VAL AUC 0.6793984223370249
VAL AP 0.16104718364310758
-----------------------------
