# Linear Classifiers

In [1]:
import csv
import numpy as np

# Dataset variant to run the notebook on: 'A', 'B' or 'C'.
V = 'B'

# Each variant selects its own train/test CSV paths and column layout:
#   b    -> index of the label column read by load_from_csv
#   s, t -> start/stop of the feature column slice row[s:t]
if V == 'A':
    from config import TRAIN_PATH as TRAIN_PATH
    from config import TEST_PATH as TEST_PATH
    b,s,t = 9,10,61
elif V == 'B':
    from config import TRAINB_PATH as TRAIN_PATH
    from config import TESTB_PATH as TEST_PATH
    b,s,t = 62,10,61
elif V == 'C':
    from config import TRAINC_PATH as TRAIN_PATH
    from config import TESTC_PATH as TEST_PATH
    b,s,t = 62,10,61
else:
    # Fail here rather than with a NameError on TRAIN_PATH / b further down.
    raise ValueError("Unknown dataset variant %r; expected 'A', 'B' or 'C'" % (V,))

def load_from_csv(path, label_col=None, feat_start=None, feat_stop=None):
    """Load integer features and labels from a CSV file with a header row.

    Parameters
    ----------
    path : str
        Path of the CSV file. The first line is treated as a header and skipped.
    label_col : int, optional
        Index of the label column. Defaults to the module-level ``b``.
    feat_start, feat_stop : int, optional
        Bounds of the feature slice ``row[feat_start:feat_stop]``. Default to
        the module-level ``s`` and ``t``.

    Returns
    -------
    X : numpy.ndarray
        One row of integer features per data row.
    y : numpy.ndarray
        Integer label of each data row.

    Raises
    ------
    ValueError
        If a label or feature cell cannot be parsed as an integer.
    IndexError
        If a non-blank row has no column at ``label_col``.
    """
    # Fall back to the notebook-wide column layout only when not overridden,
    # so the globals are not touched if the caller supplies everything.
    label_col = b if label_col is None else label_col
    feat_start = s if feat_start is None else feat_start
    feat_stop = t if feat_stop is None else feat_stop

    y = []
    X = []
    # newline='' is what the csv module expects so that it can handle line
    # endings (including newlines embedded in quoted fields) itself.
    with open(path, 'r', newline='') as f:
        reader = csv.reader(f)
        next(reader, None)  # skip the header line (no error on an empty file)
        for row in reader:
            if not row:
                # csv.reader yields [] for blank lines (e.g. a trailing newline).
                continue
            y.append(int(row[label_col]))
            X.append([int(e) for e in row[feat_start:feat_stop]])

    y = np.array(y)
    X = np.array(X)

    return X,y

# Read both splits from the paths chosen by the variant switch above.
X_train, y_train = load_from_csv(TRAIN_PATH)
X_test, y_test = load_from_csv(TEST_PATH)

n_train = X_train.shape[0]
print('Number of training samples:',n_train)
print()

# Sum of the training labels divided by their count; this equals the share of
# 1s provided the labels are 0/1 (as the printed heading assumes).
positive_share = np.sum(y_train)/y_train.shape[0]

print('Distribution of 0s and 1s :')
print('0\t',1-positive_share)
print('1\t',positive_share)

Number of training samples: 1200

Distribution of 0s and 1s :
0	 0.5391666666666667
1	 0.4608333333333333


### General framework for grid search with cross validation

In [2]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

def grid_search_and_test(estimator, tuned_parameters):
    """Grid-search ``estimator`` once per metric and report on the test split.

    For each of precision, recall and f1 (macro-averaged), a 5-fold
    ``GridSearchCV`` over ``tuned_parameters`` is fitted on the module-level
    ``X_train`` / ``y_train``. The best parameters and the mean (+/- two
    standard deviations) cross-validation score of every candidate are
    printed, followed by a classification report of the fitted search's
    predictions on the module-level ``X_test`` / ``y_test``.

    Nothing is returned; all results go to standard output.
    """

    def announce(message):
        # Print a line followed by one blank separator line.
        print(message)
        print()

    for metric in ['precision', 'recall', 'f1']:
        announce("# Tuning hyper-parameters for %s" % metric)

        search = GridSearchCV(
            estimator,
            tuned_parameters,
            cv=5,
            scoring='%s_macro' % metric,
            n_jobs=-1,
        )
        search.fit(X_train, y_train)

        announce("Best parameters set found on development set:")
        announce(search.best_params_)
        announce("Grid scores on development set:")

        results = search.cv_results_
        per_candidate = zip(
            results['mean_test_score'],
            results['std_test_score'],
            results['params'],
        )
        for fold_mean, fold_std, candidate in per_candidate:
            # The spread is reported as two standard deviations.
            print("%0.3f (+/-%0.03f) for %r"
                  % (fold_mean, fold_std * 2, candidate))
        print()

        announce("Detailed classification report:")
        print("The model is trained on the full development set.")
        announce("The scores are computed on the full evaluation set.")

        predictions = search.predict(X_test)
        announce(classification_report(y_test, predictions))

### Naive Bayes

In [3]:
from sklearn.naive_bayes import BernoulliNB
# Sweep the BernoulliNB smoothing parameter alpha over 0.05, 0.10, ..., 1.00.
tuned_parameters = [
    dict(alpha=np.arange(0.05, 1.05, 0.05)),
]
grid_search_and_test(BernoulliNB(), tuned_parameters)

# Tuning hyper-parameters for precision

Best parameters set found on development set:

{'alpha': 0.8}

Grid scores on development set:

0.529 (+/-0.067) for {'alpha': 0.05}
0.529 (+/-0.066) for {'alpha': 0.1}
0.529 (+/-0.066) for {'alpha': 0.15000000000000002}
0.529 (+/-0.066) for {'alpha': 0.2}
0.529 (+/-0.066) for {'alpha': 0.25}
0.529 (+/-0.066) for {'alpha': 0.3}
0.529 (+/-0.066) for {'alpha': 0.35000000000000003}
0.529 (+/-0.069) for {'alpha': 0.4}
0.529 (+/-0.069) for {'alpha': 0.45}
0.529 (+/-0.069) for {'alpha': 0.5}
0.529 (+/-0.069) for {'alpha': 0.55}
0.529 (+/-0.069) for {'alpha': 0.6000000000000001}
0.529 (+/-0.069) for {'alpha': 0.6500000000000001}
0.529 (+/-0.069) for {'alpha': 0.7000000000000001}
0.529 (+/-0.069) for {'alpha': 0.7500000000000001}
0.530 (+/-0.067) for {'alpha': 0.8}
0.530 (+/-0.067) for {'alpha': 0.8500000000000001}
0.530 (+/-0.067) for {'alpha': 0.9000000000000001}
0.530 (+/-0.067) for {'alpha': 0.9500000000000001}
0.530 (+/-0.067) for {'alpha': 1.0}

D

### Perceptron

In [4]:
from sklearn.linear_model import Perceptron
# Compare the unregularised perceptron against each available penalty.
# Perceptron shuffles the training data between epochs, so the seed is fixed
# to make the scores reproducible from one run to the next.
tuned_parameters = [{'penalty': [None, 'l2', 'l1', 'elasticnet']}]
grid_search_and_test(Perceptron(random_state=0), tuned_parameters)

# Tuning hyper-parameters for precision

Best parameters set found on development set:

{'penalty': 'l1'}

Grid scores on development set:

0.518 (+/-0.110) for {'penalty': None}
0.464 (+/-0.139) for {'penalty': 'l2'}
0.521 (+/-0.060) for {'penalty': 'l1'}
0.464 (+/-0.139) for {'penalty': 'elasticnet'}

Detailed classification report:

The model is trained on the full development set.
The scores are computed on the full evaluation set.

              precision    recall  f1-score   support

           0       0.49      0.35      0.41       156
           1       0.46      0.60      0.52       144

   micro avg       0.47      0.47      0.47       300
   macro avg       0.48      0.48      0.47       300
weighted avg       0.48      0.47      0.46       300


# Tuning hyper-parameters for recall

Best parameters set found on development set:

{'penalty': 'l1'}

Grid scores on development set:

0.502 (+/-0.053) for {'penalty': None}
0.497 (+/-0.038) for {'penalty': 'l2'}
0.511 (+/-0.042)



### Logistic Regression

In [5]:
from sklearn.linear_model import LogisticRegression
# Compare L2 and L1 regularisation.
# The solver is pinned to 'liblinear' because it supports both penalties;
# relying on the library default breaks the 'l1' candidate on scikit-learn
# releases whose default solver ('lbfgs') only supports 'l2'.
tuned_parameters = [{'penalty': ['l2', 'l1']}]
grid_search_and_test(LogisticRegression(solver='liblinear'), tuned_parameters)



# Tuning hyper-parameters for precision

Best parameters set found on development set:

{'penalty': 'l2'}

Grid scores on development set:

0.539 (+/-0.059) for {'penalty': 'l2'}
0.538 (+/-0.060) for {'penalty': 'l1'}

Detailed classification report:

The model is trained on the full development set.
The scores are computed on the full evaluation set.

              precision    recall  f1-score   support

           0       0.52      0.68      0.59       156
           1       0.49      0.33      0.40       144

   micro avg       0.51      0.51      0.51       300
   macro avg       0.51      0.51      0.49       300
weighted avg       0.51      0.51      0.50       300


# Tuning hyper-parameters for recall

Best parameters set found on development set:

{'penalty': 'l2'}

Grid scores on development set:

0.536 (+/-0.052) for {'penalty': 'l2'}
0.535 (+/-0.055) for {'penalty': 'l1'}

Detailed classification report:

The model is trained on the full development set.
The scores are com



### SVM

In [6]:
from sklearn.svm import SVC
# RBF-kernel SVM: sweep the kernel coefficient (gamma) against the
# regularisation parameter C, both on a log scale.
tuned_parameters = [
    dict(
        kernel=['rbf'],
        gamma=[1e-2, 1e-3, 1e-4, 1e-5],
        C=[10 ** exponent for exponent in range(6)],  # 1, 10, ..., 100000
    ),
]
grid_search_and_test(SVC(), tuned_parameters)

# Tuning hyper-parameters for precision

Best parameters set found on development set:

{'C': 1, 'gamma': 0.01, 'kernel': 'rbf'}

Grid scores on development set:

0.565 (+/-0.096) for {'C': 1, 'gamma': 0.01, 'kernel': 'rbf'}
0.270 (+/-0.001) for {'C': 1, 'gamma': 0.001, 'kernel': 'rbf'}
0.270 (+/-0.001) for {'C': 1, 'gamma': 0.0001, 'kernel': 'rbf'}
0.270 (+/-0.001) for {'C': 1, 'gamma': 1e-05, 'kernel': 'rbf'}
0.550 (+/-0.066) for {'C': 10, 'gamma': 0.01, 'kernel': 'rbf'}
0.557 (+/-0.098) for {'C': 10, 'gamma': 0.001, 'kernel': 'rbf'}
0.270 (+/-0.001) for {'C': 10, 'gamma': 0.0001, 'kernel': 'rbf'}
0.270 (+/-0.001) for {'C': 10, 'gamma': 1e-05, 'kernel': 'rbf'}
0.553 (+/-0.058) for {'C': 100, 'gamma': 0.01, 'kernel': 'rbf'}
0.551 (+/-0.069) for {'C': 100, 'gamma': 0.001, 'kernel': 'rbf'}
0.560 (+/-0.099) for {'C': 100, 'gamma': 0.0001, 'kernel': 'rbf'}
0.270 (+/-0.001) for {'C': 100, 'gamma': 1e-05, 'kernel': 'rbf'}
0.538 (+/-0.052) for {'C': 1000, 'gamma': 0.01, 'kernel': 'rbf'}
0.55