In [1]:
import pickle
import pandas as pd
import numpy as np
from scipy.stats import uniform as sp_rand
from sklearn.model_selection import RandomizedSearchCV
import random
from sklearn.metrics import average_precision_score, roc_auc_score
from sklearn.preprocessing import MaxAbsScaler
from sklearn.decomposition import TruncatedSVD as PCA

In [2]:
def get_data(dataset,vectorizer):
    '''
    Load one pickled, subsampled feature matrix from the ../data directory.

    The file read is ../data/{dataset}_{vectorizer}_subsampled_data.pckl.

    @param dataset: string naming the split, e.g. "train" or "dev"
    @param vectorizer: string naming the vectorizer, e.g. "binary", "count", "tfidf"
    @return: the object stored in the pickle file
    '''
    pickle_path = f'../data/{dataset}_{vectorizer}_subsampled_data.pckl'
    # NOTE(review): pickle can execute arbitrary code on load; only read trusted files.
    with open(pickle_path, 'rb') as handle:
        features = pickle.load(handle)
    return features


In [None]:
# Load the label vectors for the train and dev splits into notebook globals.
# `trainY` is what the model-selection helpers fit against; `devY` is what they
# score against.
# NOTE(review): pickle can execute arbitrary code on load; only read trusted files.
with open('../data/train_labels.pckl', 'rb') as f:
    trainY = pickle.load(f)

# Dev-split labels, used for the validation AUC / average-precision printouts.
with open('../data/dev_labels.pckl', 'rb') as f:
    devY = pickle.load(f)

In [19]:
def data_transformation(train,dev,last4=False,truncatedSVD=False,scaling=False,ngram=False,ncomponent=None):
    '''
    Apply the selected preprocessing steps to the train and dev feature matrices.

    Steps always run in the same order (column drop, frequency filter, scaling,
    SVD). Every statistic (column filter, scaler, SVD basis) is derived from
    `train` only and then applied unchanged to `dev`.

    @param train: 2-D training feature matrix (dense array or scipy sparse matrix)
    @param dev: 2-D dev feature matrix with the same columns as `train`
    @param last4: if True, drop the last four columns of both matrices
    @param truncatedSVD: if True, project both matrices onto `ncomponent` TruncatedSVD components
    @param scaling: if True, scale both matrices with a MaxAbsScaler
    @param ngram: if True, keep only the columns that are non-zero in more training
                  rows than the average column
    @param ncomponent: number of SVD components; required when truncatedSVD is True
    @return: tuple (train, dev) of transformed matrices
    @raise ValueError: if truncatedSVD is True but ncomponent was not given
    '''
    # Fail early with a readable message instead of deep inside scikit-learn.
    if truncatedSVD and ncomponent is None:
        raise ValueError("ncomponent must be set when truncatedSVD=True")

    if last4:
        train = train[:, :-4]
        dev = dev[:, :-4]

    if ngram:
        # Number of training rows in which each column is non-zero. Sparse
        # matrices return a 1xN np.matrix here and dense arrays a 1-D array;
        # flattening makes the mask one-dimensional in both cases.
        doc_freq = np.asarray((train > 0).sum(axis=0)).ravel()
        keep = doc_freq > doc_freq.mean()
        train = train[:, keep]
        dev = dev[:, keep]

    if scaling:
        scaler = MaxAbsScaler()
        train = scaler.fit_transform(train)
        dev = scaler.transform(dev)

    if truncatedSVD:
        # PCA is this notebook's alias for sklearn's TruncatedSVD.
        pca = PCA(n_components=ncomponent, random_state=0)
        train = pca.fit_transform(train)
        dev = pca.transform(dev)

    return train, dev
    

In [20]:
def revert():
    '''
    Reload the untransformed tf-idf feature matrices through get_data.

    @return: tuple (train, dev); the train split is loaded first, then the dev split
    '''
    return get_data("train","tfidf"), get_data("dev","tfidf")

In [None]:
def getSVMresults(string,train,dev):
    '''
    Tune a LinearSVC with a randomized search on the training data and print
    its dev-set ROC AUC and average precision.

    Reads the notebook globals trainY and devY for the labels.

    @param string: label printed before the results to identify the experiment
    @param train: training feature matrix, fitted against trainY
    @param dev: dev feature matrix, scored against devY
    @return: None; results are printed
    '''
    print(string)
    # dual=False: liblinear only supports the l1 penalty with squared_hinge loss
    # in the primal formulation. With the dual solver every l1 candidate fails
    # to fit and is silently scored as NaN.
    clf = LinearSVC(dual=False)
    param_grid = {
        "penalty": ["l1", "l2"],
        "loss": ["squared_hinge"],
        "C": scipy.stats.reciprocal(a=1e-4, b=1e2),
    }
    # Fixed seed (same value as getMNBresults) so the sampled candidates, and
    # therefore the reported numbers, are reproducible between runs.
    rsearch = RandomizedSearchCV(
        n_jobs=-1,
        estimator=clf,
        param_distributions=param_grid,
        random_state=22,
    )

    rsearch.fit(train, trainY)
    print(rsearch.best_estimator_)

    # LinearSVC has no predict_proba; the signed margin is a valid ranking score.
    yval_score = rsearch.decision_function(dev)

    print("Val AUC", roc_auc_score(devY, yval_score))
    print("Val AP", average_precision_score(devY, yval_score))
    print("-----------------------------")

In [None]:
def getMNBresults(string,train,dev):
    '''
    Tune a MultinomialNB smoothing parameter with a randomized search on the
    training data and print its dev-set ROC AUC and average precision.

    Reads the notebook globals trainY and devY for the labels.

    @param string: label printed before the results to identify the experiment
    @param train: training feature matrix, fitted against trainY
    @param dev: dev feature matrix, scored against devY
    @return: None; results are printed
    '''
    print(string)
    param_dist = {'alpha': loguniform(1e-4, 1e0)}
    nb_multi = MultinomialNB()
    rsearch = RandomizedSearchCV(nb_multi, param_distributions=param_dist, random_state=22)
    rsearch.fit(train, trainY)
    print(rsearch.best_params_)

    # Score with the fitted search object; the original referenced an undefined
    # name here. Column 1 of the probability matrix is used as the ranking score.
    nb_dev_proba = rsearch.predict_proba(dev)
    positive_scores = nb_dev_proba[:, 1]

    print("VAL AUC", roc_auc_score(devY, positive_scores))
    print("VAL AP", average_precision_score(devY, positive_scores))

    print("-----------------------------")

### SVM (linear support vector machine) experiments

In [None]:
from sklearn.svm import LinearSVC
import scipy.stats

# One entry per experiment: (label printed with the results, keyword options
# forwarded to data_transformation). An empty dict means "raw tf-idf features".
svm_experiments = [
    ("no mods", {}),
    ("last4", {"last4": True}),
    ("all but last 4", {"truncatedSVD": True, "scaling": True, "ngram": True, "ncomponent": 100}),
    ("scaling", {"scaling": True}),
    ("truncatedSVD100", {"truncatedSVD": True, "ncomponent": 100}),
    ("truncatedSVD10", {"truncatedSVD": True, "ncomponent": 10}),
    ("NGRAMS", {"ngram": True}),
    ("scaling + svd100", {"truncatedSVD": True, "scaling": True, "ncomponent": 100}),
    ("last 4+ scaling", {"last4": True, "scaling": True}),
]

for label, options in svm_experiments:
    # Reload the pristine matrices for every experiment so no transformation
    # leaks into the next one. The original relied on train/dev already
    # existing before the first run, and a `train_dev = revert()` typo let the
    # "last4" truncation carry over into the following experiment.
    train, dev = revert()
    train, dev = data_transformation(train, dev, **options)
    getSVMresults(label, train, dev)

# Leave the untransformed data in train/dev for any following cell.
train, dev = revert()

### MNB (multinomial naive Bayes) experiments

In [None]:
from sklearn.naive_bayes import MultinomialNB
try:
    # Canonical home of the distribution (SciPy >= 1.4).
    from scipy.stats import loguniform
except ImportError:
    # Backport shipped by older scikit-learn releases for older SciPy.
    from sklearn.utils.fixes import loguniform
# Kept for any cell that reads this global; getMNBresults builds its own copy.
param_dist = {'alpha': loguniform(1e-4, 1e0)}

# One entry per experiment: (label printed with the results, keyword options
# forwarded to data_transformation). An empty dict means "raw tf-idf features".
mnb_experiments = [
    ("no mod", {}),
    ("last4", {"last4": True}),
    ("all except last 4", {"truncatedSVD": True, "scaling": True, "ngram": True, "ncomponent": 100}),
    ("scaling", {"scaling": True}),
    ("truncatedSVD 100", {"truncatedSVD": True, "ncomponent": 100}),
    ("truncatedSVD 10", {"truncatedSVD": True, "ncomponent": 10}),
    ("ngram", {"ngram": True}),
    ("svd + scaling", {"truncatedSVD": True, "scaling": True, "ncomponent": 100}),
    ("last4+scaling", {"last4": True, "scaling": True}),
]

for label, options in mnb_experiments:
    # Reload the pristine matrices so every experiment starts from the same data.
    train, dev = revert()
    train, dev = data_transformation(train, dev, **options)
    try:
        getMNBresults(label, train, dev)
    except ValueError as err:
        # NOTE(review): MultinomialNB rejects negative feature values, and
        # TruncatedSVD output generally contains them, so the SVD-based
        # configurations are expected to land here. Report and continue
        # instead of aborting the remaining experiments.
        print(f"skipped '{label}': {err}")
        print("-----------------------------")

# Leave the untransformed data in train/dev for any following cell.
train, dev = revert()