## Data and Imports

In [7]:
import pickle
import pandas as pd
import numpy as np
from scipy.stats import uniform as sp_rand
from sklearn.model_selection import RandomizedSearchCV
import random
from sklearn.metrics import average_precision_score, roc_auc_score

In [8]:
def get_data(dataset,vectorizer):
    '''
    returns feature matrix for specified dataset and vectorizer
    @param dataset: string specifying dataset, "train","dev",etc
    @param vectorizer: string specifying vectorizer "binary","count",etc

    '''
    with open(f'../data/{dataset}_{vectorizer}_downsampled_data.pckl', 'rb') as f:
        return pickle.load(f)


In [9]:
## getting train_y and dev_y
with open('../data/train_labels.pckl', 'rb') as f:
    trainY = pickle.load(f)

with open('../data/dev_labels.pckl', 'rb') as f:
    devY = pickle.load(f)

## Perceptron

In [10]:
vecs = ['count', 'tfidf', 'hashing', 'binary', 'hashing_binary']
for vec in vecs:
    trainX = get_data("train",vec)
    print(trainX.shape)

(51638, 2997374)
(51638, 2997374)
(51638, 8388612)
(51638, 2997374)
(51638, 8388612)


In [None]:
from sklearn.linear_model import Perceptron


random.seed(100)

##TODO pass in loguniform

vectorizers = ['count', 'tfidf', 'hashing', 'binary', 'hashing_binary']

param_grid = {"penalty":["l1","l2","elasticnet",None],"fit_intercept":[True,False],"eta0":[.10,.5,1]}


for vectorizer in vectorizers:
    print("--------------------------")
    print(vectorizer)
    trainX = get_data("train",vectorizer)
    valX = get_data("dev",vectorizer)
    
    clf = Perceptron(early_stopping=True)
    
    rsearch = RandomizedSearchCV(estimator=clf,param_distributions=param_grid)
    
    rsearch.fit(trainX,trainY)
    
    ytrain_score = rsearch.predict(trainX)
    yval_score = rsearch.predict(valX)
    
    
    print("Train AUC",roc_auc_score(trainY,ytrain_score))
    print("Train AP",average_precision_score(trainY,ytrain_score))
    print("Val AUC",roc_auc_score(devY,yval_score))
    print("Val AP",average_precision_score(devY,yval_score))
    print("--------------------------")


## SVM

In [13]:
from sklearn import svm
from sklearn.kernel_approximation import Nystroem
vectorizers = ['count', 'tfidf', 'hashing', 'binary', 'hashing_binary']

for vectorizer in vectorizers:
    for kernel in ["rbf","polynomial"]:
        for n in [1000,5000,10000,25000,45000]:
            print("---------------------------------")
            print(kernel)
            print(vectorizer)
            print(n)
            trainX = get_data("train",vectorizer)
            valX = get_data("dev",vectorizer)
    
            clf = svm.LinearSVC(max_iter=4000,tol=1e-2,C=.1)
   
    
            if(kernel=="rbf"):
                feature_map_nystroem = Nystroem(kernel=kernel,n_components=500)
        
            else:
                feature_map_nystroem = Nystroem(kernel=kernel,degree=2.0,n_components=500)
        
            train_transformed = feature_map_nystroem.fit_transform(trainX)
    
            val_transformed = feature_map_nystroem.fit_transform(valX)

    
            clf.fit(train_transformed,trainY)
    
            ytrain_score = clf.predict(train_transformed)
            yval_score = clf.predict(val_transformed)
    
    
            print("Train AUC",roc_auc_score(trainY,ytrain_score))
            print("Train AP",average_precision_score(trainY,ytrain_score))
            print("Val AUC",roc_auc_score(devY,yval_score))
            print("Val AP",average_precision_score(devY,yval_score))
    
            print("-----------------------------------")

    
    


---------------------------------
rbf
count
1000
Train AUC 0.6389674270885782
Train AP 0.5815807373783863
Val AUC 0.41858483824800613
Val AP 0.0930772023978781
-----------------------------------
---------------------------------
rbf
count
5000
Train AUC 0.6372826213253806
Train AP 0.5804093885191757
Val AUC 0.49351999847775624
Val AP 0.10039936681147975
-----------------------------------
---------------------------------
rbf
count
10000
Train AUC 0.6366822882373446
Train AP 0.579993279474494
Val AUC 0.45868582791033985
Val AP 0.09885415240917325
-----------------------------------
---------------------------------
rbf
count
25000
Train AUC 0.6380185134978118
Train AP 0.5809170836101376
Val AUC 0.5139542100234317
Val AP 0.10419224956996948
-----------------------------------
---------------------------------
rbf
count
45000
Train AUC 0.6378829544134165
Train AP 0.5808250611342676
Val AUC 0.45608517803456583
Val AP 0.09880714270333457
-----------------------------------
---------------

Train AUC 0.6256051744839072
Train AP 0.5724250378904301
Val AUC 0.5459561661746558
Val AP 0.11071843345002423
-----------------------------------
---------------------------------
rbf
hashing_binary
10000
Train AUC 0.6307176885239552
Train AP 0.5758995858582012
Val AUC 0.4390869221589766
Val AP 0.0992326128406433
-----------------------------------
---------------------------------
rbf
hashing_binary
25000
Train AUC 0.6311050001936558
Train AP 0.5761674643538008
Val AUC 0.6245093397131658
Val AP 0.13458560032253414
-----------------------------------
---------------------------------
rbf
hashing_binary
45000
Train AUC 0.6256051744839073
Train AP 0.5724254925301964
Val AUC 0.4818738396288987
Val AP 0.10026388767185847
-----------------------------------
---------------------------------
polynomial
hashing_binary
1000
Train AUC 0.6266702815755839
Train AP 0.5731444168489988
Val AUC 0.4857062837408054
Val AP 0.09918162733031775
-----------------------------------
------------------------