## Data and Imports

In [1]:
import pickle
import pandas as pd
import numpy as np
from scipy.stats import uniform as sp_rand
from sklearn.model_selection import RandomizedSearchCV
import random
from sklearn.metrics import average_precision_score, roc_auc_score

In [2]:
def get_data(dataset,vectorizer):
    '''
    returns feature matrix for specified dataset and vectorizer
    @param dataset: string specifying dataset, "train","dev",etc
    @param vectorizer: string specifying vectorizer "binary","count",etc

    '''
    with open(f'../data/{dataset}_{vectorizer}_subsampled_data.pckl', 'rb') as f:
        return pickle.load(f)


In [3]:
## getting train_y and dev_y
with open('../data/train_labels.pckl', 'rb') as f:
    trainY = pickle.load(f)

with open('../data/dev_labels.pckl', 'rb') as f:
    devY = pickle.load(f)

## Perceptron

In [5]:
from sklearn.linear_model import Perceptron


random.seed(100)


vectorizers = ['count',
               'tfidf',
               #'hashing',
               'binary',
               #'hashing_binary',
               ]

param_grid = {"penalty":["l1","l2","elasticnet",None],"fit_intercept":[True,False],"eta0":sp_rand()}


for vectorizer in vectorizers:
    print("--------------------------")
    print(vectorizer)
    trainX = get_data("train",vectorizer)
    valX = get_data("dev",vectorizer)
    
    clf = Perceptron(early_stopping=True)
    
    rsearch = RandomizedSearchCV(n_jobs=-1, random_state=0, estimator=clf,param_distributions=param_grid)
    
    rsearch.fit(trainX,trainY)
    
    print(rsearch.best_estimator_)
    
    #ytrain_score = rsearch.predict_proba(trainX)
    yval_score = rsearch.decision_function(valX)
    
    
    #print("Train AUC",roc_auc_score(trainY,ytrain_score))
    #print("Train AP",average_precision_score(trainY,ytrain_score))
    print("Val AUC",roc_auc_score(devY,yval_score))
    print("Val AP",average_precision_score(devY,yval_score))
    print("--------------------------")


--------------------------
count
Perceptron(alpha=0.0001, class_weight=None, early_stopping=True,
           eta0=0.9636627605010293, fit_intercept=True, max_iter=1000,
           n_iter_no_change=5, n_jobs=None, penalty='l2', random_state=0,
           shuffle=True, tol=0.001, validation_fraction=0.1, verbose=0,
           warm_start=False)
Val AUC 0.7060673349928509
Val AP 0.17427758321442435
--------------------------
--------------------------
tfidf
Perceptron(alpha=0.0001, class_weight=None, early_stopping=True,
           eta0=0.4375872112626925, fit_intercept=True, max_iter=1000,
           n_iter_no_change=5, n_jobs=None, penalty='l1', random_state=0,
           shuffle=True, tol=0.001, validation_fraction=0.1, verbose=0,
           warm_start=False)
Val AUC 0.7394226525165951
Val AP 0.2157023479250944
--------------------------
--------------------------
binary
Perceptron(alpha=0.0001, class_weight=None, early_stopping=True,
           eta0=0.4236547993389047, fit_intercept=Fa

## Linear SVM

In [11]:
from sklearn.svm import LinearSVC
import scipy.stats
param_grid = {"penalty":["l1","l2"],"loss":["squared_hinge"],"C":scipy.stats.reciprocal(a=1e-4,b=1e2)}

for vectorizer in vectorizers:
    print("--------------------------")
    print(vectorizer)
    trainX = get_data("train",vectorizer)
    valX = get_data("dev",vectorizer)
    
    clf = LinearSVC()
    
    rsearch = RandomizedSearchCV(n_jobs=-1, n_iter=400, estimator=clf,param_distributions=param_grid)
    
    rsearch.fit(trainX,trainY)
    
    print(rsearch.best_estimator_)
    print(rsearch.best_estimator_.n_iter_)
    
    #ytrain_score = rsearch.predict(trainX)
    yval_score = rsearch.decision_function(valX)
    
    
    #print("Train AUC",roc_auc_score(trainY,ytrain_score))
    #print("Train AP",average_precision_score(trainY,ytrain_score))
    print("Val AUC",roc_auc_score(devY,yval_score))
    print("Val AP",average_precision_score(devY,yval_score))
    print("--------------------------")

--------------------------
count
LinearSVC(C=0.00011009618185158206, class_weight=None, dual=True,
          fit_intercept=True, intercept_scaling=1, loss='squared_hinge',
          max_iter=1000, multi_class='ovr', penalty='l2', random_state=None,
          tol=0.0001, verbose=0)
34
Val AUC 0.7486690093251024
Val AP 0.22622179912459403
--------------------------
--------------------------
tfidf




LinearSVC(C=0.020329421573461483, class_weight=None, dual=True,
          fit_intercept=True, intercept_scaling=1, loss='squared_hinge',
          max_iter=1000, multi_class='ovr', penalty='l2', random_state=None,
          tol=0.0001, verbose=0)
1000
Val AUC 0.751496891462659
Val AP 0.23177498413147507
--------------------------
--------------------------
binary
LinearSVC(C=0.00010810023585783914, class_weight=None, dual=True,
          fit_intercept=True, intercept_scaling=1, loss='squared_hinge',
          max_iter=1000, multi_class='ovr', penalty='l2', random_state=None,
          tol=0.0001, verbose=0)
35
Val AUC 0.7448183526535971
Val AP 0.2216096474871898
--------------------------


In [None]:
from sklearn.svm import LinearSVC
import scipy.stats
param_grid = {"penalty":["l1","l2"],"loss":["squared_hinge"],"C":scipy.stats.reciprocal(a=1e-4,b=1e2)}

for vectorizer in ['tfidf']:
    print("--------------------------")
    print(vectorizer)
    trainX = get_data("train",vectorizer)
    valX = get_data("dev",vectorizer)
    
    clf = LinearSVC(C=0.020329421573461483, class_weight=None, dual=True,
          fit_intercept=True, intercept_scaling=1, loss='squared_hinge',
          max_iter=1000, multi_class='ovr', penalty='l2', random_state=None,
          tol=0.0001, verbose=0)
    
    #rsearch = RandomizedSearchCV(n_jobs=-1, random_state=0, estimator=clf,param_distributions=param_grid)
    
    clf.fit(trainX,trainY)
    
    #print(rsearch.best_estimator_)
    #print(rsearch.best_estimator_.n_iter_)
    
    #ytrain_score = rsearch.predict(trainX)
    
    
    from sklearn.calibration import CalibratedClassifierCV

    calibration = CalibratedClassifierCV(clf).fit(trainX, trainY)
    
    yval_score = calibration.predict_proba(valX)
    yval_score = yval_score[:, 1]
    
    print("Val AUC",roc_auc_score(devY,yval_score))
    print("Val AP",average_precision_score(devY,yval_score))
    print("--------------------------")

In [6]:
with open('../data/best_svm_tfidf.pckl', 'wb') as f:
    pickle.dump(calibration, f)

In [10]:
with open('../data/best-lr-tfidf.pckl', 'rb') as f:
    lr = pickle.load(f)

with open('../data/best_svm_tfidf.pckl', 'rb') as f:
    svm = pickle.load(f)

with open('../data/best_mnb_tfidf.pckl', 'rb') as f:
    mnb = pickle.load(f)

In [14]:
with open(f'../data/dev_tfidf_subsampled_data.pckl', 'rb') as f:
    dev_3 = pickle.load(f)

with open(f'../data/train_tfidf_downsampled_data.pckl', 'rb') as f:
    train_2 = pickle.load(f)
    
with open(f'../data/dev_tfidf_downsampled_data.pckl', 'rb') as f:
    dev_2 = pickle.load(f)
    
    
sums = (train_2 > 0).sum(axis=0)
lim = sums.mean()
print(np.asarray(sums > lim)[0].sum())
dev_2 = dev_2[:, np.asarray(sums > lim)[0]]

83235


In [15]:
dev_2.shape, dev_3.shape

((35918, 83235), (35918, 2997374))

In [16]:
predicted_lr = lr.predict_proba(dev_2)[:, 1]
predicted_svm = svm.predict_proba(dev_3)[:, 1]
predicted_mnb = mnb.predict_proba(dev_3)[:, 1]

ensemble_df = pd.DataFrame([predicted_lr, predicted_svm, predicted_mnb]).T
ensemble_df.columns = ['lr', 'svm', 'mnb']
ensemble_df.head()

Unnamed: 0,lr,svm,mnb
0,0.433468,0.440476,0.580745
1,0.15782,0.114336,4.623869e-06
2,0.404492,0.394189,0.6714519
3,0.499979,0.383133,0.008584521
4,0.061132,0.029665,2.530761e-12


In [21]:
ensemble_df['vote'] = ((ensemble_df[['lr', 'svm', 'mnb']] > 0.5).sum(axis=1) > 2).astype(int)
ensemble_df['vote_proba'] = (ensemble_df[['lr', 'svm', 'mnb']] > 0.5).sum(axis=1) / 3
ensemble_df['avg'] = ensemble_df[['lr', 'svm', 'mnb']].mean(axis=1)
ensemble_df.head()

Unnamed: 0,lr,svm,mnb,vote,avg,vote_proba
0,0.433468,0.440476,0.580745,0,0.484896,0.333333
1,0.15782,0.114336,4.623869e-06,0,0.09072,0.0
2,0.404492,0.394189,0.6714519,0,0.490044,0.333333
3,0.499979,0.383133,0.008584521,0,0.297232,0.0
4,0.061132,0.029665,2.530761e-12,0,0.030266,0.0


In [22]:
print('vote:')
print("Val AUC",roc_auc_score(devY, ensemble_df['vote']))
print("Val AP",average_precision_score(devY, ensemble_df['vote']))
print('')
print('vote_proba:')
print("Val AUC",roc_auc_score(devY, ensemble_df['vote_proba']))
print("Val AP",average_precision_score(devY, ensemble_df['vote_proba']))
print('')
print('avg:')
print("Val AUC",roc_auc_score(devY, ensemble_df['avg']))
print("Val AP",average_precision_score(devY, ensemble_df['avg']))
print('')

vote:
Val AUC 0.6792401794888523
Val AP 0.17352959562453538

vote_proba:
Val AUC 0.7303068926722991
Val AP 0.19413119702908538

avg:
Val AUC 0.7665775703833878
Val AP 0.24074470944539986



## Non-Linear SVM

In [None]:
from sklearn import svm
from sklearn.kernel_approximation import Nystroem
vectorizers = ['count', 'tfidf', 'hashing', 'binary', 'hashing_binary']

for vectorizer in vectorizers:
    for kernel in ["rbf","polynomial"]:
        for n in [1000,5000,10000,25000,45000]:
            print("---------------------------------")
            print(kernel)
            print(vectorizer)
            print(n)
            trainX = get_data("train",vectorizer)
            valX = get_data("dev",vectorizer)
    
            clf = svm.LinearSVC(max_iter=400,tol=1e-2,C=.1)
   
    
            if(kernel=="rbf"):
                feature_map_nystroem = Nystroem(kernel=kernel,n_components=n)
        
            else:
                feature_map_nystroem = Nystroem(kernel=kernel,degree=2.0,n_components=n)
        
            train_transformed = feature_map_nystroem.fit_transform(trainX)
    
            val_transformed = feature_map_nystroem.fit_transform(valX)

    
            clf.fit(train_transformed,trainY)
    
            ytrain_score = clf.predict(train_transformed)
            yval_score = clf.predict(val_transformed)
    
    
            print("Train AUC",roc_auc_score(trainY,ytrain_score))
            print("Train AP",average_precision_score(trainY,ytrain_score))
            print("Val AUC",roc_auc_score(devY,yval_score))
            print("Val AP",average_precision_score(devY,yval_score))
    
            print("-----------------------------------")

    
    


---------------------------------
rbf
count
1000
Train AUC 0.6431891242883149
Train AP 0.5846210445619413
Val AUC 0.48710227133723677
Val AP 0.1000083194710632
-----------------------------------
---------------------------------
rbf
count
5000
Train AUC 0.6435958015415004
Train AP 0.5849188233638523
Val AUC 0.4377501848438885
Val AP 0.093414327577478
-----------------------------------
---------------------------------
rbf
count
10000
