In [1]:
import sklearn 
import numpy as np
import scipy
import csv

def load_sparse_csr(filename):
    loader = np.load(filename)
    return scipy.sparse.csr_matrix(( loader['data'], loader['indices'], loader['indptr']),
                     shape = loader['shape'])
        
def load_csv(filename):
    with open(filename, 'r') as csvfile:
        reader = csv.reader(csvfile, delimiter = '\n')
        array = [float(row[0]) for row in reader]
        return array
    
def load_feature_names(filename):
    with open(filename, 'r') as csvfile:
        reader = csv.reader(csvfile, delimiter= '\n')
        array = [row for row in reader]
        return array

In [2]:
data_train = load_sparse_csr('data/data_train.npz')
data_test = load_sparse_csr('data/data_test.npz')
label_train = load_csv('data/label_train.csv')
label_test = load_csv('data/label_test.csv')
features_names = load_feature_names('data/feature_names.csv')

In [3]:
def score(true_label, predicted_label):
    length = len(true_label)
    total = 0
    for i, label in enumerate(true_label):
        if label == predicted_label[i]:
            total += 1
    return float(total)/float(length)

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn import cross_validation

alg = LogisticRegression(penalty = 'l1')
alg.fit(data_train, label_train)
predicted_label = alg.predict(data_test)
print("LogReg - Penalty l1 - Score on test_data : ", score(label_test, predicted_label))

alg = LogisticRegression(penalty = 'l2')
alg.fit(data_train, label_train)
predicted_label = alg.predict(data_test)
print("LogReg - Penalty l2 - Score on test_data : ", score(label_test, predicted_label))

('LogReg - Penalty l1 - Score on test_data : ', 0.8664)
('LogReg - Penalty l2 - Score on test_data : ', 0.8792)


In [9]:
from sklearn.ensemble import RandomForestClassifier

alg = RandomForestClassifier(random_state=1, n_estimators=30)
alg.fit(data_train, label_train)
predicted_label = alg.predict(data_test)

print("Random Forest - Score on test_data : ", score(label_test, predicted_label))

('Random Forest - Score on test_data : ', 0.82672)


In [10]:
from sklearn.svm import LinearSVC
from sklearn.grid_search import GridSearchCV 

Cs = {'C': np.logspace(-1, 0, 20)}

grid_search = GridSearchCV(LinearSVC(penalty="l1",dual=False), Cs)
alg = grid_search.fit(data_train, label_train)
predicted_label = alg.predict(data_test)

print("SVM - Best C & associated score", grid_search.best_params_, grid_search.best_score_)
print("SVM - Score on test_data : ", score(label_test, predicted_label))

grid_search = GridSearchCV(LinearSVC(penalty="l2",dual=False), Cs)
alg = grid_search.fit(data_train, label_train)
predicted_label = alg.predict(data_test)

print("SVM - Best C & associated score", grid_search.best_params_, grid_search.best_score_)
print("SVM - Score on test_data : ", score(label_test, predicted_label))

('SVM - Best C & associated score', {'C': 0.88586679041008254}, 0.87482666666666664)
('SVM - Score on test_data : ', 0.87984)
('SVM - Best C & associated score', {'C': 0.37926901907322497}, 0.89178666666666662)
('SVM - Score on test_data : ', 0.89392)


In [7]:
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.grid_search import GridSearchCV 

alphas = np.logspace(-6, 0, 50)

from sklearn.grid_search import GridSearchCV
for Model in [MultinomialNB, BernoulliNB]:
    gscv = GridSearchCV(Model(), dict(alpha=alphas), cv=10).fit(data_train, label_train)
    print(Model.__name__, "- Best C & associated score", gscv.best_params_, gscv.best_score_)
    print("Score on test_data : ", score(label_test, gscv.predict(data_train)))

('MultinomialNB', '- Best C & associated score', {'alpha': 0.079060432109077011}, 0.86575999999999997)
('Score on test_data : ', 0.51104)
('BernoulliNB', '- Best C & associated score', {'alpha': 0.75431200633546069}, 0.87482666666666664)
('Score on test_data : ', 0.51024)
