In [2]:
import sklearn 
import numpy as np
import scipy
import csv

def load_sparse_csr(filename):
    """Load a CSR sparse matrix from a ``.npz`` archive.

    The archive must contain the arrays ``data``, ``indices``, ``indptr``
    and ``shape``, i.e. the raw components of a ``scipy.sparse.csr_matrix``.

    Parameters
    ----------
    filename : str
        Path of the ``.npz`` archive to read.

    Returns
    -------
    scipy.sparse.csr_matrix
        Matrix rebuilt from the stored components.
    """
    # A bare `import scipy` is not guaranteed to expose the `sparse`
    # subpackage, so import it explicitly where it is needed.
    import scipy.sparse

    # np.load keeps the archive open until it is closed; the context manager
    # releases the file handle once all arrays have been read.
    with np.load(filename) as loader:
        return scipy.sparse.csr_matrix(
            (loader['data'], loader['indices'], loader['indptr']),
            shape=loader['shape'])
        
def load_csv(filename):
    """Read a text file holding one numeric value per line.

    Blank lines (for example a trailing empty line at the end of the file)
    are skipped instead of raising ``IndexError``. The file is read line by
    line rather than through ``csv.reader`` with a newline delimiter: the
    reader already splits records on line ends, so that delimiter carried no
    meaning (and line-terminator delimiters may be rejected by stricter csv
    dialect validation).

    Parameters
    ----------
    filename : str
        Path of the file to read.

    Returns
    -------
    list of float
        The parsed values, in file order.

    Raises
    ------
    ValueError
        If a non-blank line cannot be converted to ``float``.
    """
    with open(filename, 'r') as csvfile:
        # Surrounding double quotes are dropped so that quoted numeric
        # fields, which the csv reader used to unquote, still parse.
        fields = (line.strip().strip('"') for line in csvfile)
        return [float(field) for field in fields if field]
    
def load_feature_names(filename):
    """Read a text file holding one feature name per line.

    The return shape of the previous csv-based implementation is kept: one
    row per line, each row being a one-element list with the line's text,
    and an empty list for a blank line.

    The file is read line by line rather than through ``csv.reader`` with a
    newline delimiter: the reader already splits records on line ends, so
    that delimiter carried no meaning (and line-terminator delimiters may be
    rejected by stricter csv dialect validation).

    NOTE(review): names are taken verbatim, so double quotes are no longer
    interpreted as csv quoting -- confirm the names file contains no quoted
    fields.

    Parameters
    ----------
    filename : str
        Path of the file to read.

    Returns
    -------
    list of list of str
        ``[[name], [name], ...]`` in file order; ``[]`` for blank lines.
    """
    with open(filename, 'r') as csvfile:
        # Only the line terminator is removed; inner and leading whitespace
        # is part of the name.
        names = (line.rstrip('\r\n') for line in csvfile)
        return [[name] if name else [] for name in names]

In [17]:
# Sparse feature matrices, read in train-then-test order.
data_train, data_test = map(
    load_sparse_csr,
    ('data/data_train.npz', 'data/data_test.npz'),
)
# Labels parsed as lists of floats, again train first, then test.
label_train, label_test = map(
    load_csv,
    ('data/label_train.csv', 'data/label_test.csv'),
)
# One row per line of the names file.
features_names = load_feature_names('data/feature_names.csv')


In [18]:
def score(true_label, predicted_label):
    """Return the fraction of predictions that equal the true labels.

    Parameters
    ----------
    true_label : sequence
        Ground-truth labels.
    predicted_label : sequence
        Predicted labels, aligned position by position with ``true_label``.

    Returns
    -------
    float
        Accuracy in ``[0.0, 1.0]``. ``0.0`` is returned when both sequences
        are empty, instead of failing with a division by zero.

    Raises
    ------
    ValueError
        If the two sequences do not have the same length; comparing them
        would either fail part-way or silently ignore trailing predictions.
    """
    length = len(true_label)
    if len(predicted_label) != length:
        raise ValueError(
            "true_label and predicted_label must have the same length "
            "(got %d and %d)" % (length, len(predicted_label)))
    if length == 0:
        return 0.0
    total = sum(1 for expected, predicted in zip(true_label, predicted_label)
                if expected == predicted)
    return float(total) / float(length)

In [19]:
from sklearn.linear_model import LogisticRegression
from sklearn import cross_validation

# Fit one logistic regression per regularisation penalty and report its
# accuracy on the test set.
# solver='liblinear' is set explicitly because it supports both the 'l1' and
# the 'l2' penalty; relying on the library default breaks the 'l1' run on
# scikit-learn releases whose default solver does not accept that penalty.
for penalty in ('l1', 'l2'):
    alg = LogisticRegression(penalty=penalty, solver='liblinear')
    alg.fit(data_train, label_train)
    predicted_label = alg.predict(data_test)
    print("LogReg - Penalty %s - Score on test_data : " % penalty, score(label_test, predicted_label))

('LogReg - Penalty l1 - Score on test_data : ', 0.86784)
('LogReg - Penalty l2 - Score on test_data : ', 0.89136)


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with 30 trees, seeded with random_state=1.
# fit() is chained onto the constructor, so `alg` is the fitted estimator.
alg = RandomForestClassifier(n_estimators=30, random_state=1).fit(data_train, label_train)
predicted_label = alg.predict(data_test)
print("Random Forest - Score on test_data : ", score(label_test, predicted_label))

In [9]:
from sklearn.svm import LinearSVC
from sklearn.grid_search import GridSearchCV 

# Parameter grid: 20 candidate values of C, log-spaced from 10**-1 to 10**0.
Cs = {'C': np.logspace(-1, 0, 20)}

# Run the same grid search once per penalty, l1 first and then l2.
for svm_penalty in ("l1", "l2"):
    grid_search = GridSearchCV(LinearSVC(penalty=svm_penalty, dual=False), Cs)
    # `alg` is whatever fit() returns for the search; predictions below go
    # through it.
    alg = grid_search.fit(data_train, label_train)
    predicted_label = alg.predict(data_test)

    print("SVM - Best C & associated score", grid_search.best_params_, grid_search.best_score_)
    print("SVM - Score on test_data : ", score(label_test, predicted_label))

('SVM - Best C & associated score', {'C': 0.78475997035146106}, 0.87786666666666668)
('SVM - Score on test_data : ', 0.88688)
('SVM - Best C & associated score', {'C': 0.61584821106602639}, 0.89461333333333337)
('SVM - Score on test_data : ', 0.90144)


In [20]:
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.grid_search import GridSearchCV 

# Candidate smoothing values: 20 alphas, log-spaced from 10**-2 to 10**0.
alphas = np.logspace(-2, 0, 20)

# Tune alpha for each naive Bayes variant with a 10-fold grid search, then
# report the accuracy of the refitted search object on the test set.
# The label says "alpha" because that is the parameter being searched; these
# models are not given a C parameter here.
for Model in [MultinomialNB, BernoulliNB]:
    gscv = GridSearchCV(Model(), dict(alpha=alphas), cv=10).fit(data_train, label_train)
    print(Model.__name__, "- Best alpha & associated score", gscv.best_params_, gscv.best_score_)
    print("Score on test_data : ", score(label_test, gscv.predict(data_test)))

('MultinomialNB', '- Best C & associated score', {'alpha': 0.020691380811147901}, 0.86453333333333338)
('Score on test_data : ', 0.87104)
('BernoulliNB', '- Best C & associated score', {'alpha': 0.61584821106602605}, 0.88031999999999999)
('Score on test_data : ', 0.88336)


In [23]:
from sklearn.feature_selection.univariate_selection import SelectKBest, chi2, f_classif
# Univariate feature selection: score features with chi2 against the training
# labels and keep k=70000 of them.
# NOTE: data_train and data_test are rebound to their reduced versions, so
# every later use of these names sees only the selected columns.
fselect = SelectKBest(score_func=chi2, k=70000)
fselect.fit(data_train, label_train)
data_train, data_test = fselect.transform(data_train), fselect.transform(data_test)


from sklearn.linear_model import SGDClassifier

# NOTE(review): `n_iter` is the pass-count argument accepted by the
# scikit-learn version this notebook was written against -- confirm it is
# still accepted by the installed release.
alg = SGDClassifier(loss='modified_huber', n_iter=5, random_state=0, shuffle=True)
alg.fit(data_train, label_train)

# Column 1 of predict_proba is the probability of alg.classes_[1]. Map the
# thresholded probabilities back onto the labels the model was trained with
# instead of hard-coded 0/1, which only matches when the labels happen to be
# exactly 0 and 1. Unpacking into two names assumes a binary problem.
negative_class, positive_class = alg.classes_
proba = alg.predict_proba(data_test)[:, 1]
predicted_label = [positive_class if p > 0.5 else negative_class for p in proba]

print("SGD Classifier - Score on test_data : ", score(label_test, predicted_label))

('SGD Classifier - Score on test_data : ', 0.49808)


0.50688
