## Data and Imports

In [3]:
import pickle
import pandas as pd
import numpy as np
from scipy.stats import uniform as sp_rand
from sklearn.model_selection import RandomizedSearchCV
import random
from sklearn.metrics import average_precision_score, roc_auc_score

In [2]:
def get_data(dataset,vectorizer):
    '''
    Load the pickled, downsampled feature matrix for one dataset/vectorizer pair.
    @param dataset: string naming the split, e.g. "train" or "dev"
    @param vectorizer: string naming the vectorizer, e.g. "binary" or "count"
    @return: the object unpickled from
             ../data/{dataset}_{vectorizer}_downsampled_data.pckl

    '''
    # The path is relative to the current working directory.
    pickle_path = f'../data/{dataset}_{vectorizer}_downsampled_data.pckl'
    with open(pickle_path, 'rb') as pickle_file:
        features = pickle.load(pickle_file)
    return features


In [4]:
## load the pickled label vectors for the train and dev splits
def _read_pickle(path):
    '''
    returns the object stored in the pickle file at path
    @param path: string path of the pickle file to read
    '''
    with open(path, 'rb') as handle:
        return pickle.load(handle)


trainY = _read_pickle('../data/train_labels.pckl')
devY = _read_pickle('../data/dev_labels.pckl')

## Perceptron

In [6]:
## sanity check: print the training-matrix shape produced by each vectorizer
vecs = ['count', 'tfidf', 'hashing', 'binary', 'hashing_binary']
for vec in vecs:
    trainX = get_data(dataset="train", vectorizer=vec)
    train_shape = trainX.shape
    print(train_shape)

(51638, 2997374)
(51638, 2997374)
(51638, 8388612)
(51638, 2997374)
(51638, 8388612)


In [None]:
from sklearn.linear_model import Perceptron


# Randomized hyper-parameter search for a Perceptron on each vectorized
# feature set, reporting ROC AUC and average precision on train and dev.
#
# One seed drives every source of randomness in this cell. Seeding the stdlib
# `random` module alone does not affect scikit-learn (it draws from NumPy-style
# RNGs), so the seed is also passed explicitly as `random_state` below.
SEED = 100
random.seed(SEED)

##TODO pass in loguniform

vectorizers = ['count', 'tfidf', 'hashing', 'binary', 'hashing_binary']

# Candidate settings sampled by RandomizedSearchCV.
param_grid = {"penalty":["l1","l2","elasticnet",None],"fit_intercept":[True,False],"eta0":[.10,.5,1]}


for vectorizer in vectorizers:
    print("--------------------------")
    print(vectorizer)
    trainX = get_data("train",vectorizer)
    valX = get_data("dev",vectorizer)

    clf = Perceptron(early_stopping=True,random_state=SEED)

    rsearch = RandomizedSearchCV(estimator=clf,param_distributions=param_grid,random_state=SEED)

    rsearch.fit(trainX,trainY)

    # AUC and average precision are ranking metrics: they need continuous
    # scores, not the hard 0/1 labels returned by predict(). The signed distance
    # to the hyperplane of the refitted best estimator is used instead.
    ytrain_score = rsearch.decision_function(trainX)
    yval_score = rsearch.decision_function(valX)

    print("Train AUC",roc_auc_score(trainY,ytrain_score))
    print("Train AP",average_precision_score(trainY,ytrain_score))
    print("Val AUC",roc_auc_score(devY,yval_score))
    print("Val AP",average_precision_score(devY,yval_score))
    print("--------------------------")


## SVM

In [None]:
from sklearn import svm
from sklearn.kernel_approximation import Nystroem

# Sweep a linear SVM over Nystroem kernel approximations (kernel type and
# number of components) for each vectorized feature set, reporting ROC AUC
# and average precision on train and dev.
vectorizers = ['count', 'tfidf', 'hashing', 'binary', 'hashing_binary']

# Fixed seed so the Nystroem basis sample and the SVM solver are reproducible.
SEED = 100

for vectorizer in vectorizers:
    # The feature matrices depend only on the vectorizer, so load them once
    # per vectorizer instead of once per (kernel, n) combination.
    trainX = get_data("train",vectorizer)
    valX = get_data("dev",vectorizer)

    for kernel in ["rbf","polynomial"]:
        for n in [1000,5000,10000,25000,45000]:
            print("---------------------------------")
            print(kernel)
            print(vectorizer)
            print(n)

            clf = svm.LinearSVC(max_iter=400,tol=1e-2,C=.1,random_state=SEED)

            # NOTE(review): `n` was printed but never used (n_components was
            # hard-coded to 500, so every value of n ran the same experiment).
            # It is wired to n_components here on the assumption that this was
            # the intended sweep -- confirm. Large values produce a dense
            # (n_samples, n) matrix and may need a lot of memory.
            nystroem_kwargs = {"kernel": kernel, "n_components": n, "random_state": SEED}
            if kernel != "rbf":
                # only the polynomial kernel takes a degree
                nystroem_kwargs["degree"] = 2.0
            feature_map_nystroem = Nystroem(**nystroem_kwargs)

            train_transformed = feature_map_nystroem.fit_transform(trainX)

            # Reuse the map fitted on the training data. Re-fitting on the
            # validation set would place its features in a different basis
            # from the one the classifier was trained on.
            val_transformed = feature_map_nystroem.transform(valX)

            clf.fit(train_transformed,trainY)

            # Ranking metrics need continuous scores, not hard predictions.
            ytrain_score = clf.decision_function(train_transformed)
            yval_score = clf.decision_function(val_transformed)

            print("Train AUC",roc_auc_score(trainY,ytrain_score))
            print("Train AP",average_precision_score(trainY,ytrain_score))
            print("Val AUC",roc_auc_score(devY,yval_score))
            print("Val AP",average_precision_score(devY,yval_score))

            print("-----------------------------------")
    
    
