In [1]:
import sklearn 
import numpy as np
import scipy
import csv

def load_sparse_csr(filename):
    loader = np.load(filename)
    return scipy.sparse.csr_matrix(( loader['data'], loader['indices'], loader['indptr']),
                     shape = loader['shape'])
        
def load_csv(filename):
    with open(filename, 'r') as csvfile:
        reader = csv.reader(csvfile, delimiter = '\n')
        array = [float(row[0]) for row in reader]
        return array
    
def load_feature_names(filename):
    with open(filename, 'r') as csvfile:
        reader = csv.reader(csvfile, delimiter= '\n')
        array = [row for row in reader]
        return array
    
def load_sparse_coo(filename):
    loader = np.load(filename)
    return scipy.sparse.coo_matrix((loader['data'],(loader['row'],loader['col'])),
                     shape = loader['shape'])

# Tf- Idf

In [13]:
data_train_train_tf = load_sparse_csr('bapt_tfidf/data_train.npz')
data_train_test_tf = load_sparse_csr('bapt_tfidf/data_test.npz')

label_train= load_csv('labels_train_train.csv')
label_test = load_csv('labels_train_test.csv')

created_feat_train_train = load_sparse_csr('bapt_tfidf/train_new_feat.npz')
created_feat_train_test = load_sparse_csr('bapt_tfidf/test_new_feat.npz')

In [14]:
import scipy

data_train_tf = scipy.sparse.vstack([data_train_train_tf, data_train_test_tf], format ='csr')
label = label_train + label_test
created_feat = scipy.sparse.vstack([created_feat_train_train, created_feat_train_test], format = 'csr')

In [53]:
from sklearn.cross_validation import train_test_split
data_train_train_tf, data_train_test_tf, label_train, label_test = train_test_split(data_train_tf, label, test_size = 0.25, random_state = 13)
created_features_train, created_features_test = train_test_split(created_feat, test_size = 0.25, random_state = 13)

In [21]:
nb_feat = 80000
from sklearn.feature_selection.univariate_selection import SelectKBest, chi2, f_classif
fselect = SelectKBest(chi2 , k=nb_feat)
data_train = fselect.fit_transform(data_train_train_tf, label_train)
data_test = fselect.transform(data_train_test_tf)
print data_train.shape

(18750, 80000)


# Feature Selection

In [20]:
data_train_all = [load_sparse_csr('tw_sw{}_all_train_train.npz'.format(k)) for k in range(1,6)]
data_test_all = [load_sparse_csr('tw_sw{}_all_train_test.npz'.format(k)) for k in range(1,6)]
data_train_sep = [load_sparse_coo('tw_sw{}_train_train.npz'.format(k)) for k in range(1,6)]
data_test_sep = [load_sparse_coo('tw_sw{}_train_test.npz'.format(k)) for k in range(1,6)]

label_train_tw = load_csv('labels_train_train.csv')
label_test_tw = load_csv('labels_train_test.csv')

In [22]:
from sklearn.preprocessing import MaxAbsScaler
normalizer_all = map(lambda x: MaxAbsScaler().fit(x), data_train_all)
normalizer_sep = map(lambda x: MaxAbsScaler().fit(x), data_train_sep)

scaler =MaxAbsScaler()
scaler.partial_fit(data_test)
scaler.partial_fit(data_train)
scaler.transform(data_test)
scaler.transform(data_train)

data_train_all_norm = [normalizer_all[i].transform(data_train_all[i]) for i in range(len(data_train_all))]
data_test_all_norm = [normalizer_all[i].transform(data_test_all[i]) for i in range(len(data_test_all))]
data_train_sep_norm = [normalizer_sep[i].transform(data_train_sep[i]) for i in range(len(data_train_sep))]
data_test_sep_norm = [normalizer_sep[i].transform(data_test_sep[i]) for i in range(len(data_test_sep))]

In [23]:
SelectKBest(f_classif , k=data_train_all[0].shape[1]/100).fit(data_train_all[0],label_train_tw)
fselect_all = [SelectKBest(f_classif , k=data_train_all[i].shape[1]/100).fit(
        data_train_all[i],label_train_tw) for i in range(len(data_train_all))]
fselect_sep = [SelectKBest(f_classif , k=data_train_sep[i].shape[1]/100).fit(
        data_train_sep[i], label_train_tw) for i in range(len(data_train_sep))]
data_train_all_selec = [fselect_all[i].transform(data_train_all[i]) for i in range(len(data_train_all))]
data_test_all_selec = [fselect_all[i].transform(data_test_all[i]) for i in range(len(data_test_all))]

data_train_sep_selec = [fselect_sep[i].transform(data_train_sep[i]) for i in range(len(data_train_sep))]
data_test_sep_selec = [fselect_sep[i].transform(data_test_sep[i]) for i in range(len(data_test_sep))]
fselect_all_norm = [SelectKBest(f_classif , k=data_train_all_norm[i].shape[1]/100).fit(
        data_train_all_norm[i],label_train_tw) for i in range(len(data_train_all_norm))]
fselect_sep_norm = [SelectKBest(f_classif , k=data_train_sep[i].shape[1]/100).fit(
        data_train_sep_norm[i], label_train_tw) for i in range(len(data_train_sep_norm))]

data_train_all_norm_selec = [fselect_all_norm[i].transform(
        data_train_all_norm[i]) for i in range(len(data_train_all_norm))]
data_test_all_norm_selec = [fselect_all_norm[i].transform(
        data_test_all_norm[i]) for i in range(len(data_test_all_norm))]
data_train_sep_norm_selec = [fselect_sep_norm[i].transform(
        data_train_sep_norm[i]) for i in range(len(data_train_sep_norm))]
data_test_sep_norm_selec = [fselect_sep_norm[i].transform(
        data_test_sep_norm[i]) for i in range(len(data_test_sep_norm))]

data_train_train_tw = data_train_all_norm [3]
data_train_test_tw = data_test_all_norm [3]

data_train_tw = data_train_tw[:,0:-25]
data_test_tw = data_train_tw[:,0:-25]

  f = msb / msw


In [24]:
data_train_tw = scipy.sparse.vstack([data_train_train_tw, data_train_test_tw], format ='csr')
label_tw = scipy.sparse.vstack([label_train_tw, label_test_tw], format = 'csr')

In [None]:
data_train_tw, data_test_tw, label_train, label_test = train_test_split(data_train_tw, label, test_size = 0.25, random_state = 13)
label_train_tw, label_test_tw = train_test_split(label_tw, test_size = 0.25, random_state = 13)

In [27]:
from sklearn.metrics import zero_one_loss
def score(true_label, predicted_label):
    return 1 - zero_one_loss(true_label,predicted_label)

In [28]:
Cs = {'C': np.linspace(1000, 1500, 10)}
from sklearn.grid_search import GridSearchCV
from sklearn.linear_model import LogisticRegression

lrtf = GridSearchCV(LogisticRegression(penalty = 'l2'), Cs, n_jobs = 1)
lrtf = lrtf.fit(data_train, label_train)
predicted_label_lrtf = lrtf.predict(data_test)


print("SVM - Best C & associated score", lrtf.best_params_, lrtf.best_score_)
print("SVM - Score on test_data : ", score(label_test, predicted_label_lrtf))

('SVM - Best C & associated score', {'C': 1000.0}, 0.92378666666666664)
('SVM - Score on test_data : ', 0.90447999999999995)


In [None]:
Cs = {'C': np.linspace(0.4, 1, 20)}

lrtw = GridSearchCV(LogisticRegression(penalty = 'l2'), Cs, n_jobs = 1)
lrtw = lrtw.fit(data_train_tw, label_train_tw)
predicted_label_lrtw = lrtw.predict(data_test_tw)

print("SVM - Best C & associated score", lrtw.best_params_, lrtw.best_score_)
print("SVM - Score on test_data : ", score(label_test, predicted_label_lrtw))

In [30]:
from sklearn.linear_model import SGDClassifier

sgd_tf = SGDClassifier(loss='modified_huber', n_iter=100, random_state=0, shuffle=True, penalty='l2')
sgd_tf.fit( data_train, label_train )
predicted_label_SGD_TF = sgd_tf.predict(data_test)

print("SGD - Score on test data : ", score(label_test, predicted_label_SGD_TF))

('SGD - Score on test data : ', 0.90512000000000004)


In [31]:
sgd_tw = SGDClassifier(loss='modified_huber', n_iter=100, random_state=0, shuffle=True, penalty='l2')
sgd_tw.fit( data_train_tw, label_train )
predicted_label_SGD_TW = sgd_tw.predict(data_test_tw)

print("SGD - Score on test data : ", score(label_test, predicted_label_SGD_TW))

('SGD - Score on test data : ', 0.86895999999999995)


In [32]:
from sklearn.svm import LinearSVC

Cs = {'C': np.linspace(3, 5, 10)}
svc_tf = GridSearchCV(LinearSVC(penalty = 'l2'), Cs, n_jobs = 1)
svc_tf.fit(data_train, label_train)
predicted_label_SVC_TF = svc_tf.predict(data_test)

print("Linear SVC - Best C & associated score", svc_tf.best_params_, svc_tf.best_score_)
print("Linear svc  - Score on test_data : ", score(predicted_label_SVC_TF, label_test))

('Linear SVC - Best C & associated score', {'C': 3.0}, 0.92517333333333329)
('Linear svc  - Score on test_data : ', 0.90464)


In [33]:
Cs = {'C': np.linspace(0.01, 0.1, 5)}
svc_tw = GridSearchCV(LinearSVC(penalty = 'l2'), Cs, n_jobs = 1)
svc_tw.fit(data_train_tw, label_train)
predicted_label_SVC_TW = svc_tw.predict(data_test_tw)

print("Linear SVC - Best C & associated score", svc_tw.best_params_, svc_tw.best_score_)
print("linear svc  - Score on test_data : ", score(predicted_label_SVC_TW, label_test))

('Linear SVC - Best C & associated score', {'C': 0.032500000000000001}, 0.87551999999999996)
('linear svc  - Score on test_data : ', 0.88624000000000003)


In [37]:
from sklearn.naive_bayes import MultinomialNB

alphas = {'alpha': np.linspace(0.0001, 0.001, 10)}
multinom_tf = GridSearchCV(MultinomialNB(), alphas, n_jobs = 1)
multinom_tf.fit(data_train, label_train)
predicted_label_MN_TF = multinom_tf.predict(data_test)

print("Multinomial - Best alpha & associated score", multinom_tf.best_params_, multinom_tf.best_score_)
print("MNB  - Score on test_data : ", score(predicted_label_MN_TF, label_test))

('Multinomial - Best alpha & associated score', {'alpha': 0.00029999999999999997}, 0.95317333333333332)
('MNB  - Score on test_data : ', 0.88192000000000004)


In [38]:
from sklearn.ensemble import ExtraTreesClassifier

extratrees_tf = ExtraTreesClassifier(n_estimators=40, max_depth=None, min_samples_split=1, random_state=0, n_jobs = 1)
extratrees_tf.fit(data_train, label_train)
predicted_label_extratrees_tf = extratrees_tf.predict(data_test)

print("ExtraTrees - Score on test_data : ", score(label_test, predicted_label_extratrees_tf))

('ExtraTrees - Score on test_data : ', 0.84896000000000005)


In [39]:
extratrees_tw = ExtraTreesClassifier(n_estimators=40, max_depth=None, min_samples_split=1, random_state=0, n_jobs = 1)
extratrees_tw.fit(data_train_tw, label_train)
predicted_label_extratrees_tw = extratrees_tw.predict(data_test_tw)

print("ExtraTrees - Score on test_data : ", score(label_test, predicted_label_extratrees_tw))

('ExtraTrees - Score on test_data : ', 0.82879999999999998)


In [40]:
from sklearn.ensemble import AdaBoostClassifier

adaboost_tf = AdaBoostClassifier(n_estimators=40)
adaboost_tf.fit(data_train, label_train)
predicted_label_adaboost_tf = adaboost_tf.predict(data_test)

print("AdaBoost - Score on test_data : ", score(label_test, predicted_label_adaboost_tf))

('AdaBoost - Score on test_data : ', 0.78879999999999995)


In [41]:
adaboost_tw = AdaBoostClassifier(n_estimators=40)
adaboost_tw.fit(data_train_tw, label_train)
predicted_label_adaboost_tw = adaboost_tw.predict(data_test_tw)

print("AdaBoost - Score on test_data : ", score(label_test, predicted_label_adaboost_tw))

('AdaBoost - Score on test_data : ', 0.79488000000000003)


# Comparing algo

In [42]:
def error_similarity(l1, l2):#
    ref_diff =0
    all_diff =0

    for i, label in enumerate(l1):
        if(label != label_test[i]):
            ref_diff+=1
            if(label_test[i] != l2[i]):
                all_diff+=1
    return ref_diff, all_diff, float(all_diff)/ref_diff *100

In [43]:
predicted_label_TF = {"lrtf": predicted_label_lrtf, 
                      "sgd_tf" : predicted_label_SGD_TF, 
                      "svc_tf" :predicted_label_SVC_TF,
                      "multinomial_tf" : predicted_label_MN_TF,
                      "extratrees_tf" : predicted_label_extratrees_tf,
                      "adaboost_tf" : predicted_label_adaboost_tf,
                      }
                      
predicted_label_TW = {"lrtw": predicted_label_lrtw, 
                      "sgd_tw" : predicted_label_SGD_TW, 
                      "svc_tw" :predicted_label_SVC_TW,
                      "extratrees_tw" : predicted_label_extratrees_tw,
                      "adaboost_tw" : predicted_label_adaboost_tw,
                      }

for tfkey, tfvalue in predicted_label_TF.items():
    for twkey, twvalue in predicted_label_TW.items():
        print error_similarity(tfvalue,twvalue), tfkey, twkey

print '------------------------------------------'

for tfkey, tfvalue in predicted_label_TF.items():
    for tfkey2, tfvalue2 in predicted_label_TF.items():
        if tfkey != tfkey2:
            print error_similarity(tfvalue,tfvalue2), tfkey, tfkey2
            
print '------------------------------------------'

for twkey, twvalue in predicted_label_TW.items():
    for twkey2, twvalue2 in predicted_label_TW.items():
        if twkey != twkey2:
            print error_similarity(twvalue,twvalue2), twkey, twkey2

(597, 72, 12.060301507537687) lrtf sgd_tw
(597, 295, 49.413735343383586) lrtf lrtw
(597, 69, 11.557788944723619) lrtf svc_tw
(597, 118, 19.765494137353436) lrtf adaboost_tw
(597, 92, 15.41038525963149) lrtf extratrees_tw
(596, 71, 11.912751677852349) svc_tf sgd_tw
(596, 292, 48.99328859060403) svc_tf lrtw
(596, 67, 11.241610738255034) svc_tf svc_tw
(596, 117, 19.630872483221477) svc_tf adaboost_tw
(596, 94, 15.771812080536913) svc_tf extratrees_tw
(944, 126, 13.347457627118645) extratrees_tf sgd_tw
(944, 470, 49.78813559322034) extratrees_tf lrtw
(944, 110, 11.652542372881355) extratrees_tf svc_tw
(944, 189, 20.021186440677965) extratrees_tf adaboost_tw
(944, 157, 16.63135593220339) extratrees_tf extratrees_tw
(1320, 175, 13.257575757575758) adaboost_tf sgd_tw
(1320, 649, 49.166666666666664) adaboost_tf lrtw
(1320, 156, 11.818181818181818) adaboost_tf svc_tw
(1320, 257, 19.46969696969697) adaboost_tf adaboost_tw
(1320, 241, 18.257575757575758) adaboost_tf extratrees_tw
(738, 93, 12.601

# Assembling

In [44]:
import math
def csr_vappend(a,b): #b est un vecteur ligne (np.array ou liste) et a est une sparse matrix
    if(type(a)== list):
        a=np.array([a]).T
    if(type(b)== list):
        b=np.array([b]).T
    if(type(a)!= scipy.sparse.csr.csr_matrix):
        a=scipy.sparse.csr_matrix(a)
    if(type(b)!= scipy.sparse.csr.csr_matrix):
        b=scipy.sparse.csr_matrix(b)
        
    return scipy.sparse.hstack([a,b], format ='csr')

In [85]:
###### LOG REG ######

# Add PROBA Logistic Regression TF
new_feat_train = lrtw.predict_proba(data_train_tw)[:,0].tolist()
new_feat_test = lrtw.predict_proba(data_test_tw)[:,0].tolist()

# Add PROBA Logistic Regression TW
new_feat_train = csr_vappend(new_feat_train, lrtf.predict_proba(data_train)[:,0].tolist())
new_feat_test = csr_vappend(new_feat_test, lrtf.predict_proba(data_test)[:,0].tolist())

###### SGD ########

# Add SGD TF
new_feat_train = csr_vappend(new_feat_train, sgd_tf.predict(data_train).tolist())
new_feat_test = csr_vappend(new_feat_test, sgd_tf.predict(data_test).tolist())

# Add SGD TW
new_feat_train = csr_vappend(new_feat_train, sgd_tw.predict(data_train_tw).tolist())
new_feat_test = csr_vappend(new_feat_test, sgd_tw.predict(data_test_tw).tolist())

###### LINEAR SVC #######

# Add Linear SVC TF
new_feat_train = csr_vappend(new_feat_train, svc_tf.predict(data_train).tolist())
new_feat_test = csr_vappend(new_feat_test, svc_tf.predict(data_test).tolist())

# Add Linear SVC TW
new_feat_train = csr_vappend(new_feat_train, svc_tw.predict(data_train_tw).tolist())
new_feat_test = csr_vappend(new_feat_test, svc_tw.predict(data_test_tw).tolist())

###### MULTINOMIAL NAIVE BAYES ######

# Add Multinomial TF
new_feat_train = csr_vappend(new_feat_train, multinom_tf.predict(data_train).tolist())
new_feat_test = csr_vappend(new_feat_test, multinom_tf.predict(data_test).tolist())

# Add PROBA Multinomial TF
new_feat_train = csr_vappend(new_feat_train, multinom_tf.predict_proba(data_train)[:,0].tolist())
new_feat_test = csr_vappend(new_feat_test, multinom_tf.predict_proba(data_test)[:,0].tolist())

In [86]:
##### EXTRA TREES #######

# Add TREES TF
new_feat_train = csr_vappend(new_feat_train, extratrees_tf.predict(data_train).tolist())
new_feat_test = csr_vappend(new_feat_test, extratrees_tf.predict(data_test).tolist())

# Add PROBA TREES TF
new_feat_train = csr_vappend(new_feat_train, extratrees_tf.predict_proba(data_train)[:,0].tolist())
new_feat_test = csr_vappend(new_feat_test, extratrees_tf.predict_proba(data_test)[:,0].tolist())

# Add TREES TW
new_feat_train = csr_vappend(new_feat_train, extratrees_tw.predict(data_train_tw).tolist())
new_feat_test = csr_vappend(new_feat_test, extratrees_tw.predict(data_test_tw).tolist())

# Add PROBA TREES TW
new_feat_train = csr_vappend(new_feat_train, extratrees_tw.predict_proba(data_train_tw)[:,0].tolist())
new_feat_test = csr_vappend(new_feat_test, extratrees_tw.predict_proba(data_test_tw)[:,0].tolist())


In [87]:
##### ADABOOST #######

# Add adaboost TF
new_feat_train = csr_vappend(new_feat_train, adaboost_tf.predict(data_train).tolist())
new_feat_test = csr_vappend(new_feat_test, adaboost_tf.predict(data_test).tolist())

# Add PROBA adaboost TF
new_feat_train = csr_vappend(new_feat_train, adaboost_tf.predict_proba(data_train)[:,0].tolist())
new_feat_test = csr_vappend(new_feat_test, adaboost_tf.predict_proba(data_test)[:,0].tolist())

# Add adaboost TW
new_feat_train = csr_vappend(new_feat_train, adaboost_tw.predict(data_train_tw).tolist())
new_feat_test = csr_vappend(new_feat_test, adaboost_tw.predict(data_test_tw).tolist())

# Add PROBA adaboost TW
new_feat_train = csr_vappend(new_feat_train, adaboost_tw.predict_proba(data_train_tw)[:,0].tolist())
new_feat_test = csr_vappend(new_feat_test, adaboost_tw.predict_proba(data_test_tw)[:,0].tolist())

In [88]:
# Add created features
new_feat_train = csr_vappend(new_feat_train, created_features_train)
new_feat_test = csr_vappend(new_feat_test, created_features_test)

In [89]:
data_train_ = csr_vappend(data_train, new_feat_train)
data_test_ = csr_vappend(data_test, new_feat_test)

In [82]:
data_train__ = scipy.sparse.hstack([data_train_tw, data_train])
data_test__ = scipy.sparse.hstack([data_test_tw, data_test])

# Second Layer Algorithm

In [None]:
from sklearn.ensemble import RandomForestClassifier

alg = RandomForestClassifier(n_estimators=100, min_samples_split=2, min_samples_leaf=1, n_jobs = 1)
alg.fit(data_train_, label_train)
predicted_label_rf = alg.predict(data_test_)

print("Random Forest - Score on test_data : ", score(label_test, predicted_label_rf))

In [83]:
logreg = LogisticRegression(penalty = 'l2', C = 0.01)
logreg.fit(new_feat_train, label_train)
predicted_label_logreg = logreg.predict(new_feat_test)
print("Num & TOTAL : ", score(label_test, predicted_label_logreg))

('Num & TOTAL : ', 0.94672000000000001)


In [None]:
train = new_feat_train[:,0]
test = new_feat_test[:,0]

for i in range(1,43):
    train_draft = scipy.sparse.hstack([train, new_feat_train[:,i]])
    test_draft = scipy.sparse.hstack([test, new_feat_test[:,i]])
    
    logreg = LogisticRegression(penalty = 'l2', C = 0.01)
    logreg.fit(train_draft, label_train)
    predicted_label_logreg = logreg.predict(test_draft)
    print("Num & TOTAL : ", score(label_test, predicted_label_logreg))
    new_feat_train = 
    new_feat_test = 