In [1]:
import sklearn 
import numpy as np
import scipy
import csv

def load_sparse_csr(filename):
    """Load a CSR sparse matrix stored as a NumPy ``.npz`` archive.

    The archive must contain the arrays ``data``, ``indices``, ``indptr``
    and ``shape``, which are handed to the CSR constructor.

    Parameters
    ----------
    filename : str or file-like
        Forwarded to ``numpy.load``.

    Returns
    -------
    scipy.sparse.csr_matrix
        Matrix rebuilt from the stored components.
    """
    # Imported here because a bare ``import scipy`` does not guarantee that
    # the ``scipy.sparse`` submodule has been loaded.
    from scipy import sparse

    # The archive object keeps the underlying file open until it is closed;
    # the context manager releases the handle once the arrays are read.
    with np.load(filename) as loader:
        return sparse.csr_matrix(
            (loader['data'], loader['indices'], loader['indptr']),
            shape=tuple(loader['shape']))
        
def load_csv(filename):
    """Read a one-value-per-line CSV file into a list of floats.

    Only the first field of every row is used. Empty or whitespace-only
    lines are skipped.

    Parameters
    ----------
    filename : str
        Path of the file to read.

    Returns
    -------
    list of float
        One entry per non-empty row, in file order.

    Raises
    ------
    ValueError
        If the first field of a row cannot be converted to ``float``.
    """
    with open(filename, 'r') as csvfile:
        # Default dialect on purpose: a newline terminates a record and is not
        # a valid field delimiter (newer Python releases raise ValueError
        # when it is passed as ``delimiter``).
        reader = csv.reader(csvfile)
        # csv.reader yields [] for an empty line; skip those instead of
        # failing on row[0].
        return [float(row[0]) for row in reader if row and row[0].strip()]
    
def load_feature_names(filename):
    """Read a file that stores one feature name per line.

    Parameters
    ----------
    filename : str
        Path of the file to read.

    Returns
    -------
    list of list of str
        One entry per line, in file order: ``[name]`` for a non-empty line
        and ``[]`` for an empty one, so entry ``i`` still corresponds to
        line ``i`` of the file.
    """
    with open(filename, 'r') as csvfile:
        # Lines are taken verbatim rather than parsed with
        # csv.reader(delimiter='\n'): a newline is not a valid field delimiter
        # (newer Python releases raise ValueError for it), and csv quote
        # handling would alter a name containing '"' or even merge the
        # following lines into a single field.
        names = (line.rstrip('\r\n') for line in csvfile)
        return [[name] if name else [] for name in names]

In [2]:
# Directory holding the pre-split train/test matrices and labels.
path = "simplified_data/"

# Sparse feature matrices first (train, then test), followed by the matching
# label vectors -- the same loading order as before.
data_train, data_test = (
    load_sparse_csr(path + archive)
    for archive in ('data_train.npz', 'data_test.npz')
)
label_train, label_test = (
    load_csv(path + labels)
    for labels in ('label_train.csv', 'label_test.csv')
)

# The feature names are read from a different directory than `path`.
features_names = load_feature_names('data/feature_names.csv')

In [3]:
# Import from the public sklearn.feature_selection package; the
# `univariate_selection` submodule is an implementation detail and not a
# stable import path.
from sklearn.feature_selection import SelectKBest, chi2, f_classif

# Univariate feature selection with the chi2 score function, keeping k=80000
# features. The selector is fitted on the training split only and then applied
# unchanged to the test split.
# NOTE: this cell rebinds data_train / data_test, so it is not idempotent --
# re-run the loading cell before executing it a second time.
fselect = SelectKBest(chi2, k=80000)
data_train = fselect.fit_transform(data_train, label_train)
data_test = fselect.transform(data_test)

In [4]:
def score(true_label, predicted_label):
    """Return the fraction of positions at which two label sequences agree.

    Parameters
    ----------
    true_label : sequence
        Reference labels.
    predicted_label : sequence
        Predicted labels; must have the same length as ``true_label``.

    Returns
    -------
    float
        Number of matching positions divided by the sequence length.

    Raises
    ------
    ValueError
        If the sequences differ in length or are empty.
    """
    length = len(true_label)
    # A length mismatch used to be either silently truncated or surface as an
    # IndexError, depending on which sequence was longer.
    if length != len(predicted_label):
        raise ValueError(
            "true_label and predicted_label must have the same length "
            "(got %d and %d)" % (length, len(predicted_label)))
    # Accuracy is undefined without samples; fail clearly instead of dividing
    # by zero.
    if length == 0:
        raise ValueError("cannot compute a score for empty label sequences")

    matches = sum(
        1 for expected, predicted in zip(true_label, predicted_label)
        if expected == predicted)
    return float(matches) / float(length)

In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn import cross_validation

# Fit one logistic regression per penalty (l1 first, then l2) and report its
# accuracy on the held-out test split. After the loop, `alg` and
# `predicted_label` refer to the l2 run.
logreg_runs = [
    ('l1', "LogReg - Penalty l1 - Score on test_data : "),
    ('l2', "LogReg - Penalty l2 - Score on test_data : "),
]
for penalty, message in logreg_runs:
    alg = LogisticRegression(penalty=penalty)
    alg.fit(data_train, label_train)
    predicted_label = alg.predict(data_test)
    print(message, score(label_test, predicted_label))

('LogReg - Penalty l1 - Score on test_data : ', 0.8632)
('LogReg - Penalty l2 - Score on test_data : ', 0.8944)


In [6]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 30 trees with a fixed seed, trained on the selected
# features and evaluated on the test split.
alg = RandomForestClassifier(n_estimators=30, random_state=1)
predicted_label = alg.fit(data_train, label_train).predict(data_test)

forest_accuracy = score(label_test, predicted_label)
print("Random Forest - Score on test_data : ", forest_accuracy)

('Random Forest - Score on test_data : ', 0.81904)


In [7]:
from sklearn.svm import LinearSVC
# GridSearchCV lives in sklearn.model_selection in current scikit-learn;
# sklearn.grid_search is the deprecated (later removed) location, kept only as
# a fallback for old installations.
try:
    from sklearn.model_selection import GridSearchCV
except ImportError:
    from sklearn.grid_search import GridSearchCV

# 20 log-spaced candidates for C between 0.1 and 1.
# NOTE(review): the recorded run selected C=1.0 -- the upper edge of this
# grid -- for both penalties, so the best value may lie outside the range.
Cs = {'C': np.logspace(-1, 0, 20)}

# Same search for both penalties (l1 first, then l2); after the loop the
# variables hold the l2 run.
for penalty in ("l1", "l2"):
    grid_search = GridSearchCV(LinearSVC(penalty=penalty, dual=False), Cs)
    # fit() returns the search object itself, so `alg` predicts with the
    # estimator refitted on the best parameters.
    alg = grid_search.fit(data_train, label_train)
    predicted_label = alg.predict(data_test)

    print("SVM - Best C & associated score", grid_search.best_params_, grid_search.best_score_)
    print("SVM - Score on test_data : ", score(label_test, predicted_label))

('SVM - Best C & associated score', {'C': 1.0}, 0.88362666666666667)
('SVM - Score on test_data : ', 0.88768)
('SVM - Best C & associated score', {'C': 1.0}, 0.90725333333333336)
('SVM - Score on test_data : ', 0.9064)


In [8]:
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
# Import GridSearchCV once, from its current location, falling back to the
# deprecated sklearn.grid_search module on old scikit-learn installations.
try:
    from sklearn.model_selection import GridSearchCV
except ImportError:
    from sklearn.grid_search import GridSearchCV

# 50 log-spaced smoothing values between 1e-6 and 1.
alphas = np.logspace(-6, 0, 50)

# 10-fold cross-validated search over alpha for each naive Bayes variant,
# followed by the accuracy of the selected model on the test split.
for Model in [MultinomialNB, BernoulliNB]:
    gscv = GridSearchCV(Model(), dict(alpha=alphas), cv=10).fit(data_train, label_train)
    print(Model.__name__, "- Best alpha & associated score", gscv.best_params_, gscv.best_score_)
    print("Score on test_data : ", score(label_test, gscv.predict(data_test)))

('MultinomialNB', '- Best alpha & associated score', {'alpha': 0.0082864277285468416}, 0.90581333333333336)
('Score on test_data : ', 0.88624)
('BernoulliNB', '- Best alpha & associated score', {'alpha': 0.0026826957952797246}, 0.89648000000000005)
('Score on test_data : ', 0.87712)


In [35]:
#from sklearn import cross_validation
from sklearn.linear_model import SGDClassifier 

# TODO: explore further hyper-parameters.
# NOTE(review): SGDClassifier's documented default penalty is "l2", so `alg`
# and `alg2` appear to be configured identically; confirm whether `alg` was
# meant to be unpenalized.
# NOTE(review): shuffle=True is used without a random_state, so the reported
# scores are not reproducible from run to run.
losses = ['squared_hinge', 'perceptron', 'modified_huber']
for loss in losses:
    shared_params = dict(shuffle=True, loss=loss, n_iter=500)

    # Plain model first, then the explicitly l2-penalized one; each is fitted
    # on the training split and evaluated on the test split.
    alg = SGDClassifier(**shared_params)
    alg2 = SGDClassifier(penalty="l2", **shared_params)

    predicted_label = alg.fit(data_train, label_train).predict(data_test)
    predicted_label2 = alg2.fit(data_train, label_train).predict(data_test)

    print("SGD - Score on test_data with loss ", loss, ": ", score(predicted_label, label_test))
    print("penalized SGD - Score on test_data with loss ", loss, ": ", score(predicted_label2, label_test))

('SGD - Score on test_data with loss ', 'squared_hinge', ': ', 0.90784)
('penalized SGD - Score on test_data with loss ', 'squared_hinge', ': ', 0.90768)
('SGD - Score on test_data with loss ', 'perceptron', ': ', 0.8808)
('penalized SGD - Score on test_data with loss ', 'perceptron', ': ', 0.88608)
('SGD - Score on test_data with loss ', 'modified_huber', ': ', 0.90784)
('penalized SGD - Score on test_data with loss ', 'modified_huber', ': ', 0.90784)
