In [16]:
import pickle
import pandas as pd
import numpy as np
from scipy.stats import uniform as sp_rand
from sklearn.model_selection import RandomizedSearchCV
import random
from sklearn.metrics import average_precision_score, roc_auc_score
from sklearn.preprocessing import MaxAbsScaler
from sklearn.decomposition import TruncatedSVD as PCA

In [17]:
def get_data(dataset,vectorizer):
    '''
    Load one pickled, subsampled feature matrix from the ../data directory.

    @param dataset: string naming the split used in the file name, "train","dev",etc
    @param vectorizer: string naming the vectorizer used in the file name, "binary","count",etc
    @return: the object unpickled from
             ../data/{dataset}_{vectorizer}_subsampled_data.pckl
    '''
    pickle_path = f'../data/{dataset}_{vectorizer}_subsampled_data.pckl'
    # NOTE: pickle.load can execute arbitrary code stored in the file, so this
    # must only ever be pointed at trusted, locally produced data.
    with open(pickle_path, 'rb') as pickle_file:
        features = pickle.load(pickle_file)
    return features


In [18]:
# Load the pickled label vectors for the train and dev splits.
# NOTE: pickle.load can execute arbitrary code; only use trusted files here.
with open('../data/train_labels.pckl', 'rb') as train_label_file:
    trainY = pickle.load(train_label_file)

with open('../data/dev_labels.pckl', 'rb') as dev_label_file:
    devY = pickle.load(dev_label_file)

In [19]:
def data_transformation(train,dev,last4=False,truncatedSVD=False,scaling=False,ngram=False,ncomponent=None):
    '''
    Apply the selected feature transformations to the train and dev matrices.

    The steps run in a fixed order (column drop, frequency filter, scaling,
    SVD). Every fitted quantity (column mask, scaler, SVD) is derived from
    `train` only and then applied unchanged to `dev`.

    @param train: 2-D training feature matrix (scipy sparse or numpy dense)
    @param dev: 2-D dev feature matrix with the same columns as `train`
    @param last4: if True, drop the last four columns of both matrices
                  (i.e. keep everything except the last four).
                  NOTE(review): callers in this notebook label this run
                  "LAST 4 FEATURES" -- confirm the slice matches the intent.
    @param truncatedSVD: if True, project onto `ncomponent` TruncatedSVD components
    @param scaling: if True, scale the columns with MaxAbsScaler
    @param ngram: if True, keep only the columns whose count of non-zero
                  training rows is above the mean of that count over all columns
    @param ncomponent: number of SVD components; required when truncatedSVD is True
    @return: tuple (train, dev) of the transformed matrices
    @raise ValueError: if truncatedSVD is True but ncomponent is None
    '''
    if truncatedSVD and ncomponent is None:
        # Fail early with a clear message instead of an opaque error from the SVD.
        raise ValueError("ncomponent must be given when truncatedSVD=True")

    if last4:
        train = train[:, :-4]
        dev = dev[:, :-4]

    if ngram:
        # Sparse matrices return a 1 x n np.matrix from sum(axis=0), while dense
        # arrays return a 1-D array; ravel() yields a flat per-column vector in
        # both cases so the boolean mask always indexes columns correctly.
        nonzero_rows = np.asarray((train > 0).sum(axis=0)).ravel()
        keep = nonzero_rows > nonzero_rows.mean()
        train = train[:, keep]
        dev = dev[:, keep]

    if scaling:
        scaler = MaxAbsScaler()
        train = scaler.fit_transform(train)
        dev = scaler.transform(dev)

    if truncatedSVD:
        # `PCA` is this notebook's alias for sklearn's TruncatedSVD.
        pca = PCA(n_components=ncomponent,random_state=0)
        train = pca.fit_transform(train)
        dev = pca.transform(dev)

    return train,dev
    

In [20]:
def revert():
    '''
    Reload the untouched TF-IDF feature matrices from disk.

    Used to reset `train`/`dev` after an experiment has transformed them.

    @return: tuple (train, dev) as returned by get_data for the "tfidf" vectorizer
    '''
    return get_data("train","tfidf"), get_data("dev","tfidf")

### Linear SVM (LinearSVC)

In [None]:
from sklearn.svm import LinearSVC
import scipy.stats

# Hyper-parameter search space shared by every LinearSVC experiment below;
# C is sampled log-uniformly from [1e-4, 1e2].
# NOTE(review): LinearSVC only supports penalty="l1" with loss="squared_hinge"
# when dual=False. Depending on the installed scikit-learn version the "l1"
# candidates may fail inside the search -- confirm and set `dual` explicitly.
param_grid = {"penalty":["l1","l2"],"loss":["squared_hinge"],"C":scipy.stats.reciprocal(a=1e-4,b=1e2)}

# One entry per experiment: (label printed before the run, keyword arguments
# forwarded to data_transformation). An empty dict leaves the features as-is.
# NOTE(review): the last entry repeats the third one (same label and same
# transformation), exactly as in the original copy-pasted cell.
svm_experiments = [
    ("NO MODIFICATIONS", {}),
    ("LAST 4 FEATURES", {"last4": True}),
    ("ALL EXCEPT LAST 4", {"truncatedSVD": True, "scaling": True, "ngram": True, "ncomponent": 100}),
    ("SCALING", {"scaling": True}),
    ("SVD100", {"truncatedSVD": True, "ncomponent": 100}),
    ("SVD10", {"truncatedSVD": True, "ncomponent": 10}),
    ("NGRAMS", {"ngram": True}),
    ("Scaling + SVD100", {"truncatedSVD": True, "scaling": True, "ncomponent": 100}),
    ("last 4 + scaling", {"last4": True, "scaling": True}),
    ("ALL EXCEPT LAST 4", {"truncatedSVD": True, "scaling": True, "ngram": True, "ncomponent": 100}),
]

for experiment_label, transform_kwargs in svm_experiments:
    print(experiment_label)
    clf = LinearSVC()
    # Seeded so the sampled candidates are reproducible (same seed as the
    # MultinomialNB search in this notebook).
    rsearch = RandomizedSearchCV(n_jobs=-1, estimator=clf,param_distributions=param_grid,random_state=22)

    # Always start from freshly loaded TF-IDF matrices: this makes the cell
    # runnable on a fresh kernel and guarantees transformations from a
    # previous experiment never leak into the next one.
    train, dev = revert()
    train, dev = data_transformation(train, dev, **transform_kwargs)

    rsearch.fit(train,trainY)
    print(rsearch.best_estimator_)

    # Signed distances to the separating hyperplane serve as ranking scores.
    yval_score = rsearch.decision_function(dev)

    print("Val AUC",roc_auc_score(devY,yval_score))
    print("Val AP",average_precision_score(devY,yval_score))
    print("-----------------------------")

# Leave the untouched TF-IDF matrices in `train`/`dev` for later cells.
train, dev = revert()




### Multinomial Naive Bayes (MNB)

In [None]:
from sklearn.naive_bayes import MultinomialNB
# scipy ships loguniform itself; the sklearn.utils.fixes shim used previously
# was a deprecated backport and is not available in newer scikit-learn releases.
from scipy.stats import loguniform

# Smoothing parameter alpha is sampled log-uniformly from [1e-4, 1].
param_dist = {'alpha': loguniform(1e-4, 1e0)}

# One entry per experiment: (label printed before the run, keyword arguments
# forwarded to data_transformation). An empty dict leaves the features as-is.
nb_experiments = [
    ("NO MODIFICATIONS", {}),
    ("LAST 4", {"last4": True}),
    ("ALL EXCEPT LAST 4", {"truncatedSVD": True, "scaling": True, "ngram": True, "ncomponent": 100}),
    ("scaling", {"scaling": True}),
    ("TRUNCATED SVD 100", {"truncatedSVD": True, "ncomponent": 100}),
    ("TRUNCATED SVD 10", {"truncatedSVD": True, "ncomponent": 10}),
    ("NGRAMS", {"ngram": True}),
    ("SCALING + PCA 100", {"truncatedSVD": True, "scaling": True, "ncomponent": 100}),
    ("ONLY LAST 4 WITH SCALING", {"last4": True, "scaling": True}),
    ("ALL EXCEPT LAST 4, WITH SCALING", {"truncatedSVD": True, "scaling": True, "ngram": True, "ncomponent": 100}),
]

for experiment_label, transform_kwargs in nb_experiments:
    print(experiment_label)
    nb_multi = MultinomialNB()
    rsearch = RandomizedSearchCV(nb_multi, param_distributions=param_dist, random_state=22)

    # Every experiment starts from freshly loaded TF-IDF matrices so that
    # transformations never stack across experiments.
    train, dev = revert()
    train, dev = data_transformation(train, dev, **transform_kwargs)

    try:
        rsearch.fit(train, trainY)
    except ValueError as fit_error:
        # MultinomialNB rejects negative feature values, which TruncatedSVD
        # output generally contains; report the failure and carry on with the
        # remaining experiments instead of aborting the whole sweep.
        print("SKIPPED:", fit_error)
        print("-----------------------------")
        continue

    print(rsearch.best_params_)

    # Use the searched estimator (`rsearch`); column 1 holds the probability
    # of the second class in `classes_`, used here as the positive-class score.
    nb_dev_proba = rsearch.predict_proba(dev)

    # Keep the metric values themselves (print() returns None).
    nb_dev_auc = roc_auc_score(devY, nb_dev_proba[:, 1])
    nb_dev_ap = average_precision_score(devY, nb_dev_proba[:, 1])
    print("VAL AUC",nb_dev_auc)
    print("VAL AP",nb_dev_ap)

    print("-----------------------------")

# Leave the untouched TF-IDF matrices in `train`/`dev` for later cells.
train, dev = revert()