In [1]:
import sklearn 
import numpy as np
import scipy
import csv

def load_sparse_csr(filename):
    """Rebuild a CSR sparse matrix from a NumPy ``.npz`` archive.

    The archive must contain four arrays named ``data``, ``indices``,
    ``indptr`` and ``shape`` (the raw components of a CSR matrix).

    Parameters
    ----------
    filename : str
        Path to the ``.npz`` archive.

    Returns
    -------
    scipy.sparse.csr_matrix
        The matrix described by the stored components.

    Raises
    ------
    KeyError
        If one of the four expected arrays is missing from the archive.
    """
    # Import the subpackage explicitly: a bare ``import scipy`` does not
    # guarantee that ``scipy.sparse`` has been loaded.
    from scipy import sparse

    # An .npz archive keeps its file handle open until it is closed, so read
    # everything we need inside a ``with`` block.
    with np.load(filename) as loader:
        components = (loader['data'], loader['indices'], loader['indptr'])
        # Stored as an integer array; hand scipy a plain tuple of ints.
        shape = tuple(int(dim) for dim in loader['shape'])
    return sparse.csr_matrix(components, shape=shape)
        
def load_csv(filename):
    """Read a single-column file of numbers, one value per line.

    Parameters
    ----------
    filename : str
        Path to a text file holding one numeric value per line.

    Returns
    -------
    list of float
        The parsed values in file order. Blank lines (for example a
        trailing empty line) are skipped.

    Raises
    ------
    ValueError
        If a non-blank line cannot be converted by ``float``.
    """
    # The file is read line by line instead of through
    # ``csv.reader(..., delimiter='\n')``: a line break always ends the CSV
    # record before it could act as a delimiter, newer Python releases
    # reject '\n' as a delimiter, and an empty line produced an empty row
    # that made ``row[0]`` fail.
    with open(filename, 'r') as csvfile:
        return [float(line) for line in csvfile if line.strip()]
    
def load_feature_names(filename):
    """Read a file of feature names, one name per line.

    Parameters
    ----------
    filename : str
        Path to a text file holding one feature name per line.

    Returns
    -------
    list of list of str
        One entry per line of the file, in file order. Each non-empty line
        becomes a single-item list ``[name]``; an empty line becomes ``[]``
        so that row indices keep matching line numbers. Lines are returned
        verbatim apart from the line terminator (no CSV quote processing).
    """
    # Read lines directly instead of ``csv.reader(..., delimiter='\n')``:
    # a line break always ends the CSV record before it could act as a
    # delimiter, and newer Python releases reject '\n' as a delimiter.
    with open(filename, 'r') as csvfile:
        names = (line.rstrip('\r\n') for line in csvfile)
        return [[name] if name else [] for name in names]

In [2]:
# Directory prefix for the matrix and label files loaded below.
path = "simplified_data/"
# Sparse feature matrices, rebuilt from their stored CSR components.
data_train = load_sparse_csr(path+'data_train.npz')
data_test = load_sparse_csr(path+'data_test.npz')
# Labels, parsed as one float per line.
label_train = load_csv(path+'label_train.csv')
label_test = load_csv(path+'label_test.csv')
# NOTE(review): this file is read from 'data/' rather than from `path` —
# confirm that the different directory is intentional.
features_names = load_feature_names('data/feature_names.csv')

In [3]:
def score(true_label, predicted_label):
    """Return the fraction of positions at which the two label sequences agree.

    Parameters
    ----------
    true_label : sequence
        Reference labels; its length is the denominator.
    predicted_label : indexable
        Predicted labels, indexed by position in ``true_label``.

    Returns
    -------
    float
        Number of matching positions divided by ``len(true_label)``.

    Raises
    ------
    ZeroDivisionError
        If ``true_label`` is empty.
    """
    n_samples = len(true_label)
    n_correct = sum(
        1
        for position, expected in enumerate(true_label)
        if expected == predicted_label[position]
    )
    return float(n_correct) / float(n_samples)



In [4]:
# Import from the public package path: the `univariate_selection` submodule
# is an implementation detail and is no longer importable in later
# scikit-learn releases, while `sklearn.feature_selection` exports the same
# names.
from sklearn.feature_selection import SelectKBest, chi2, f_classif

# Keep the 80000 features ranked highest by the chi-squared statistic.
fselect = SelectKBest(chi2 , k=80000)
# The selector is fitted on the training set only; the test set is reduced
# to the same columns. Both names are rebound to the reduced matrices.
data_train = fselect.fit_transform(data_train, label_train)
data_test = fselect.transform(data_test)


# One classifier

In [5]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with an L2 penalty, trained on the training set.
alg1 = LogisticRegression(penalty = 'l2')
alg1.fit(data_train, label_train)
predicted_label1 = alg1.predict(data_test)
# Per-class probabilities for the test set (not used further in this cell).
probability1 = alg1.predict_proba(data_test)

# score() expects (true_label, predicted_label), in that order.
print(score(label_test, predicted_label1))

0.8944


In [6]:
from sklearn.linear_model import SGDClassifier

# Linear classifier trained by stochastic gradient descent with the
# 'modified_huber' loss; random_state is pinned for repeatable runs.
# NOTE(review): `n_iter` was renamed to `max_iter` in later scikit-learn
# releases — revisit this argument when upgrading the library.
alg2 = SGDClassifier(loss='modified_huber', n_iter=5, random_state=0, shuffle=True)
alg2.fit( data_train, label_train )
# Accuracy on the test set.
predicted_label = alg2.predict(data_test)
print(score(label_test, predicted_label))

0.9056


In [7]:
from sklearn.svm import LinearSVC
from sklearn import cross_validation
# Linear SVM with an L2 penalty, solved in the primal (dual=False).
# NOTE(review): the value of C looks like the output of a parameter search;
# its origin is not shown in this notebook section.
alg3 = LinearSVC(penalty="l2",dual=False, C=0.48329302385717521)
alg3.fit(data_train, label_train)
predicted_label = alg3.predict(data_test)

# Optional 10-fold cross-validation (disabled). The reported value would be
# the mean of the per-fold scores.
#scores = cross_validation.cross_val_score(alg3, data_train, label_train, cv=10)
#print("linear svc - Mean score on cross val", scores.mean())
# score() expects (true_label, predicted_label), in that order.
print("linear svc  - Score on test_data : ", score(label_test, predicted_label))
# The label previously read "Bernouilli Naive Bayes", but the model scored
# here is the LinearSVC fitted above.
print("linear svc - Score on train_data : ", score(label_train, alg3.predict(data_train)))

('linear svc  - Score on test_data : ', 0.90688)
('Bernouilli Naive Bayes - Score on train_data : ', 0.9937066666666666)


# Majority Vote

In [8]:
from sklearn.ensemble import VotingClassifier
# Hard (majority) vote over the three classifiers, each with equal weight.
mv = VotingClassifier(estimators=[('lr', alg1), ('sgd', alg2), ('svm', alg3)], voting='hard',weights=[1,1,1])
mv.fit(data_train, label_train)
predicted_label = mv.predict(data_test)

# Optional 10-fold cross-validation (disabled). The reported value would be
# the mean of the per-fold scores.
#scores = cross_validation.cross_val_score(mv, data_train, label_train, cv=10)
#print("Majority vote - Mean score on cross val", scores.mean())
# score() expects (true_label, predicted_label), in that order.
print("Majority vote  - Score on test_data : ", score(label_test, predicted_label))
print("Majority vote - Score on train_data : ", score(label_train, mv.predict(data_train)))

('Majority vote  - Score on test_data : ', 0.90592)
('Majority vote - Score on train_data : ', 0.9937066666666666)


# Average Probability

In [9]:
from sklearn.ensemble import VotingClassifier
# Soft vote: averages the predicted class probabilities of the two models,
# with equal weights. Soft voting should normally be used on well-calibrated
# classifiers (not the case here).
# NOTE(review): alg3 (LinearSVC) is left out, presumably because soft voting
# needs probability estimates — confirm.
ap = VotingClassifier(estimators=[('lr', alg1), ('sgd', alg2)], voting = 'soft', weights=[1,1])
ap.fit(data_train, label_train)
predicted_label = ap.predict(data_test)

# Optional 10-fold cross-validation (disabled). The reported value would be
# the mean of the per-fold scores.
#scores = cross_validation.cross_val_score(ap, data_train, label_train, cv=10)
#print("AV - Mean score on cross val", scores.mean())
# score() expects (true_label, predicted_label), in that order.
print("AV  - Score on test_data : ", score(label_test, predicted_label))
print("AV - Score on train_data : ", score(label_train, ap.predict(data_train)))

('AV  - Score on test_data : ', 0.90256)
('AV - Score on train_data : ', 0.9833066666666667)
