In [1]:
import sklearn 
import numpy as np
import scipy
import csv

def load_sparse_csr(filename):
    loader = np.load(filename)
    return scipy.sparse.csr_matrix(( loader['data'], loader['indices'], loader['indptr']),
                     shape = loader['shape'])
        
def load_csv(filename):
    with open(filename, 'r') as csvfile:
        reader = csv.reader(csvfile, delimiter = '\n')
        array = [float(row[0]) for row in reader]
        return array
    
def load_feature_names(filename):
    with open(filename, 'r') as csvfile:
        reader = csv.reader(csvfile, delimiter= '\n')
        array = [row for row in reader]
        return array
    
def load_sparse_coo(filename):
    loader = np.load(filename)
    return scipy.sparse.coo_matrix((loader['data'],(loader['row'],loader['col'])),
                     shape = loader['shape'])

In [2]:
path = "simplified_data/"
data_train = load_sparse_csr(path+'data_train.npz')
data_test = load_sparse_csr(path+'data_test.npz')
label_train = load_csv(path+'label_train.csv')
label_test = load_csv(path+'label_test.csv')
features_names = load_feature_names('data/feature_names.csv')

In [24]:
from sklearn.metrics import zero_one_loss
def score(true_label, predicted_label):
    return 1 - zero_one_loss(true_label,predicted_label)

In [4]:
data_train_all = [load_sparse_csr('tw_sw{}_all_train_train.npz'.format(k)) for k in range(1,6)]
data_test_all = [load_sparse_csr('tw_sw{}_all_train_test.npz'.format(k)) for k in range(1,6)]
data_train_sep = [load_sparse_coo('tw_sw{}_train_train.npz'.format(k)) for k in range(1,6)]
data_test_sep = [load_sparse_coo('tw_sw{}_train_test.npz'.format(k)) for k in range(1,6)]

In [5]:
label_train_tw = load_csv('labels_train_train.csv')
label_test_tw = load_csv('labels_train_test.csv')

In [6]:
from sklearn.feature_selection.univariate_selection import SelectKBest, chi2, f_classif
fselect = SelectKBest(chi2 , k=80000)
data_train = fselect.fit_transform(data_train, label_train)
data_test = fselect.transform(data_test)


In [7]:
from sklearn.preprocessing import MaxAbsScaler
normalizer_all = map(lambda x: MaxAbsScaler().fit(x), data_train_all)
normalizer_sep = map(lambda x: MaxAbsScaler().fit(x), data_train_sep)

scaler =MaxAbsScaler()
scaler.partial_fit(data_test)
scaler.partial_fit(data_train)
scaler.transform(data_test)
scaler.transform(data_train)

data_train_all_norm = [normalizer_all[i].transform(data_train_all[i]) for i in range(len(data_train_all))]
data_test_all_norm = [normalizer_all[i].transform(data_test_all[i]) for i in range(len(data_test_all))]
data_train_sep_norm = [normalizer_sep[i].transform(data_train_sep[i]) for i in range(len(data_train_sep))]
data_test_sep_norm = [normalizer_sep[i].transform(data_test_sep[i]) for i in range(len(data_test_sep))]

In [16]:
SelectKBest(f_classif , k=data_train_all[0].shape[1]/100).fit(data_train_all[0],label_train_tw)
fselect_all = [SelectKBest(f_classif , k=data_train_all[i].shape[1]/100).fit(
        data_train_all[i],label_train_tw) for i in range(len(data_train_all))]
fselect_sep = [SelectKBest(f_classif , k=data_train_sep[i].shape[1]/100).fit(
        data_train_sep[i], label_train_tw) for i in range(len(data_train_sep))]
data_train_all_selec = [fselect_all[i].transform(data_train_all[i]) for i in range(len(data_train_all))]
data_test_all_selec = [fselect_all[i].transform(data_test_all[i]) for i in range(len(data_test_all))]

data_train_sep_selec = [fselect_sep[i].transform(data_train_sep[i]) for i in range(len(data_train_sep))]
data_test_sep_selec = [fselect_sep[i].transform(data_test_sep[i]) for i in range(len(data_test_sep))]
fselect_all_norm = [SelectKBest(f_classif , k=data_train_all_norm[i].shape[1]/100).fit(
        data_train_all_norm[i],label_train_tw) for i in range(len(data_train_all_norm))]
fselect_sep_norm = [SelectKBest(f_classif , k=data_train_sep[i].shape[1]/100).fit(
        data_train_sep_norm[i], label_train_tw) for i in range(len(data_train_sep_norm))]

data_train_all_norm_selec = [fselect_all_norm[i].transform(
        data_train_all_norm[i]) for i in range(len(data_train_all_norm))]
data_test_all_norm_selec = [fselect_all_norm[i].transform(
        data_test_all_norm[i]) for i in range(len(data_test_all_norm))]
data_train_sep_norm_selec = [fselect_sep_norm[i].transform(
        data_train_sep_norm[i]) for i in range(len(data_train_sep_norm))]
data_test_sep_norm_selec = [fselect_sep_norm[i].transform(
        data_test_sep_norm[i]) for i in range(len(data_test_sep_norm))]

data_train_tw = data_train_all_norm [3]
data_test_tw = data_test_all_norm [3]

# One classifier

In [None]:
Cs = {'C': np.logspace(-1, 3, 30)}
from sklearn.grid_search import GridSearchCV
from sklearn.linear_model import LogisticRegression

lr_tf = GridSearchCV(LogisticRegression(penalty = 'l2'), Cs, n_jobs = -1)
lr_tf = grid_search.fit(data_train, label_train)
predicted_label_lr_tf = lr_tf.predict(data_test)

print("SVM - Best C & associated score", lr_tf.best_params_, lr_tf.best_score_)
print("SVM - Score on test_data : ", score(label_test, predicted_label_lr_tf))

In [48]:
Cs = {'C': np.logspace(-1, 3, 30)}
from sklearn.grid_search import GridSearchCV
from sklearn.linear_model import LogisticRegression

lr_tw = GridSearchCV(LogisticRegression(penalty = 'l2'), Cs, n_jobs = -1)
lr_tw = grid_search.fit(data_train_tw, label_train_tw)
predicted_label_lr_tw = lr_tw.predict(data_test_tw)

print("SVM - Best C & associated score", lr_tw.best_params_, lr_tw.best_score_)
print("SVM - Score on test_data : ", score(label_test, predicted_label_lr_tw))

('SVM - Best C & associated score', {'C': 0.6723357536499337}, 0.87669333333333332)
('SVM - Score on test_data : ', 0.88624000000000003)


In [6]:
from sklearn.linear_model import LogisticRegression

alg1 = LogisticRegression(penalty = 'l2')
alg1.fit(data_train, label_train)
predicted_label1 = alg1.predict(data_test)
probability1 = alg1.predict_proba(data_test)

print(score(predicted_label1, label_test))

0.8944


In [30]:
from sklearn.linear_model import SGDClassifier

alg2 = SGDClassifier(loss='modified_huber', n_iter=400, random_state=0, shuffle=True)
alg2.fit( data_train, label_train )
predicted_label = alg2.predict(data_test)
print(score(label_test, predicted_label))

0.90784


In [31]:
from sklearn.svm import LinearSVC
from sklearn import cross_validation
alg3 = LinearSVC(penalty="l2",dual=False, C=0.48329302385717521)
alg3.fit(data_train, label_train)
predicted_label = alg3.predict(data_test)

# Take the mean of the scores (because we have one for each fold)
#scores = cross_validation.cross_val_score(alg, data_train, label_train, cv=10)
#print("linear svc - Mean score on cross val", scores.mean())
print("linear svc  - Score on test_data : ", score(predicted_label, label_test))
print("Bernouilli Naive Bayes - Score on train_data : ", score(alg3.predict(data_train), label_train))

('linear svc  - Score on test_data : ', 0.90688000000000002)
('Bernouilli Naive Bayes - Score on train_data : ', 0.99370666666666663)


In [32]:
from sklearn.naive_bayes import MultinomialNB
from sklearn import cross_validation
alg4 = MultinomialNB(alpha = 0.0082864277285468416)
alg4.fit(data_train, label_train)
predicted_label = alg4.predict(data_test)

# Take the mean of the scores (because we have one for each fold)
#scores = cross_validation.cross_val_score(alg, data_train, label_train, cv=10)
#print("linear svc - Mean score on cross val", scores.mean())
print("MNB  - Score on test_data : ", score(predicted_label, label_test))
print("MNB - Score on train_data : ", score(alg4.predict(data_train), label_train))

('MNB  - Score on test_data : ', 0.88624000000000003)
('MNB - Score on train_data : ', 0.95498666666666665)


In [50]:
def error_similarity(predicted_label_SGD, predicted_label_SVM):
    ref_diff =0
    all_diff =0

    for i, label in enumerate(predicted_label_SGD):
        if(label != label_test[i]):
            ref_diff+=1
            if(label_test[i] != predicted_label_SVM[i]):
                all_diff+=1
    return ref_diff, all_diff, float(all_diff)/ref_diff *100

In [None]:
predicted_label_SGD = alg2.predict(data_test)
predicted_label_SVM = alg3.predict(data_test)
predicted_label_lr_tf = lr_tf.predict(data_test)
print error_similarity(predicted_label_SGD, predicted_label_SVM)
print error_similarity(predicted_label_SGD, predicted_label_lr_tw)

# Majority Vote

In [10]:
from sklearn.ensemble import VotingClassifier
mv = VotingClassifier(estimators=[('lr', alg1), ('sgd', alg2), ('svm', alg3),('mnb', alg4)], voting='hard',weights=[1,1,1,0])
mv.fit(data_train, label_train)
predicted_label = mv.predict(data_test)

# Take the mean of the scores (because we have one for each fold)
#scores = cross_validation.cross_val_score(alg, data_train, label_train, cv=10)
#print("linear svc - Mean score on cross val", scores.mean())
print("Majority vote  - Score on test_data : ", score(predicted_label, label_test))
print("Majority vote - Score on train_data : ", score(mv.predict(data_train), label_train))

('Majority vote  - Score on test_data : ', 0.90688)
('Majority vote - Score on train_data : ', 0.9937066666666666)


# Average Probability

In [11]:
from sklearn.ensemble import VotingClassifier
ap = VotingClassifier(estimators=[('lr', alg1), ('sgd', alg2)], voting = 'soft', weights=[1,1]) #should normally be used on well-calibrated classifiers (not the case here)
ap.fit(data_train, label_train)
predicted_label = ap.predict(data_test)

# Take the mean of the scores (because we have one for each fold)
#scores = cross_validation.cross_val_score(alg, data_train, label_train, cv=10)
#print("linear svc - Mean score on cross val", scores.mean())
print("AV  - Score on test_data : ", score(predicted_label, label_test))
print("AV - Score on train_data : ", score(ap.predict(data_train), label_train))

('AV  - Score on test_data : ', 0.90304)
('AV - Score on train_data : ', 0.9809066666666667)
