In [1]:
import sklearn 
import numpy as np
import scipy
import csv

def load_sparse_csr(filename):
    loader = np.load(filename)
    return scipy.sparse.csr_matrix(( loader['data'], loader['indices'], loader['indptr']),
                     shape = loader['shape'])
        
def load_csv(filename):
    with open(filename, 'r') as csvfile:
        reader = csv.reader(csvfile, delimiter = '\n')
        array = [float(row[0]) for row in reader]
        return array
    
def load_feature_names(filename):
    with open(filename, 'r') as csvfile:
        reader = csv.reader(csvfile, delimiter= '\n')
        array = [row for row in reader]
        return array
    
def load_sparse_coo(filename):
    loader = np.load(filename)
    return scipy.sparse.coo_matrix((loader['data'],(loader['row'],loader['col'])),
                     shape = loader['shape'])

In [2]:
path = "bapt_tfidf/"
data_train = load_sparse_csr(path+'data_train.npz')
data_test = load_sparse_csr(path+'data_test.npz')
label_train = load_csv(path+'label_train.csv')
label_test = load_csv(path+'label_test.csv')
features_names = load_feature_names('data/feature_names.csv')
print data_train.shape

(18750, 200000)


In [3]:
from sklearn.metrics import zero_one_loss
def score(true_label, predicted_label):
    return 1 - zero_one_loss(true_label,predicted_label)

In [4]:
data_train_all = [load_sparse_csr('tw_sw{}_all_train_train.npz'.format(k)) for k in range(1,6)]
data_test_all = [load_sparse_csr('tw_sw{}_all_train_test.npz'.format(k)) for k in range(1,6)]
data_train_sep = [load_sparse_coo('tw_sw{}_train_train.npz'.format(k)) for k in range(1,6)]
data_test_sep = [load_sparse_coo('tw_sw{}_train_test.npz'.format(k)) for k in range(1,6)]

In [5]:
label_train_tw = load_csv('labels_train_train.csv')
label_test_tw = load_csv('labels_train_test.csv')

In [6]:
nb_feat = 80000
from sklearn.feature_selection.univariate_selection import SelectKBest, chi2, f_classif
fselect = SelectKBest(chi2 , k=nb_feat)
data_train = fselect.fit_transform(data_train, label_train)
data_test = fselect.transform(data_test)
print data_train.shape

(18750, 80000)


  chisq /= f_exp


In [7]:
from sklearn.preprocessing import MaxAbsScaler
normalizer_all = map(lambda x: MaxAbsScaler().fit(x), data_train_all)
normalizer_sep = map(lambda x: MaxAbsScaler().fit(x), data_train_sep)

scaler =MaxAbsScaler()
scaler.partial_fit(data_test)
scaler.partial_fit(data_train)
scaler.transform(data_test)
scaler.transform(data_train)

data_train_all_norm = [normalizer_all[i].transform(data_train_all[i]) for i in range(len(data_train_all))]
data_test_all_norm = [normalizer_all[i].transform(data_test_all[i]) for i in range(len(data_test_all))]
data_train_sep_norm = [normalizer_sep[i].transform(data_train_sep[i]) for i in range(len(data_train_sep))]
data_test_sep_norm = [normalizer_sep[i].transform(data_test_sep[i]) for i in range(len(data_test_sep))]

In [8]:
SelectKBest(f_classif , k=data_train_all[0].shape[1]/100).fit(data_train_all[0],label_train_tw)
fselect_all = [SelectKBest(f_classif , k=data_train_all[i].shape[1]/100).fit(
        data_train_all[i],label_train_tw) for i in range(len(data_train_all))]
fselect_sep = [SelectKBest(f_classif , k=data_train_sep[i].shape[1]/100).fit(
        data_train_sep[i], label_train_tw) for i in range(len(data_train_sep))]
data_train_all_selec = [fselect_all[i].transform(data_train_all[i]) for i in range(len(data_train_all))]
data_test_all_selec = [fselect_all[i].transform(data_test_all[i]) for i in range(len(data_test_all))]

data_train_sep_selec = [fselect_sep[i].transform(data_train_sep[i]) for i in range(len(data_train_sep))]
data_test_sep_selec = [fselect_sep[i].transform(data_test_sep[i]) for i in range(len(data_test_sep))]
fselect_all_norm = [SelectKBest(f_classif , k=data_train_all_norm[i].shape[1]/100).fit(
        data_train_all_norm[i],label_train_tw) for i in range(len(data_train_all_norm))]
fselect_sep_norm = [SelectKBest(f_classif , k=data_train_sep[i].shape[1]/100).fit(
        data_train_sep_norm[i], label_train_tw) for i in range(len(data_train_sep_norm))]

data_train_all_norm_selec = [fselect_all_norm[i].transform(
        data_train_all_norm[i]) for i in range(len(data_train_all_norm))]
data_test_all_norm_selec = [fselect_all_norm[i].transform(
        data_test_all_norm[i]) for i in range(len(data_test_all_norm))]
data_train_sep_norm_selec = [fselect_sep_norm[i].transform(
        data_train_sep_norm[i]) for i in range(len(data_train_sep_norm))]
data_test_sep_norm_selec = [fselect_sep_norm[i].transform(
        data_test_sep_norm[i]) for i in range(len(data_test_sep_norm))]

data_train_tw = data_train_all_norm [3]
data_test_tw = data_test_all_norm [3]

data_train_tw = data_train_tw[:,0:-25]
data_test_tw = data_train_tw[:,0:-25]

  f = msb / msw


In [165]:
print data_train_all_norm_selec.shape

AttributeError: 'list' object has no attribute 'shape'

# Logistic Regression
- Adding the proba (instead of the label) is better for the final predition)

In [10]:
Cs = {'C': np.linspace(1000, 1500, 10)}
from sklearn.grid_search import GridSearchCV
from sklearn.linear_model import LogisticRegression

lrtf = GridSearchCV(LogisticRegression(penalty = 'l2'), Cs, n_jobs = 1)
lrtf = lrtf.fit(data_train, label_train)
predicted_label_lrtf = lrtf.predict(data_test)


print("SVM - Best C & associated score", lrtf.best_params_, lrtf.best_score_)
print("SVM - Score on test_data : ", score(label_test, predicted_label_lrtf))

('SVM - Best C & associated score', {'C': 1388.8888888888889}, 0.92437333333333338)
('SVM - Score on test_data : ', 0.90544000000000002)


In [11]:
Cs = {'C': np.linspace(0.4, 1, 20)}

lrtw = GridSearchCV(LogisticRegression(penalty = 'l2'), Cs, n_jobs = 1)
lrtw = lrtw.fit(data_train_tw, label_train_tw)
predicted_label_lrtw = lrtw.predict(data_test_tw)

print("SVM - Best C & associated score", lrtw.best_params_, lrtw.best_score_)
print("SVM - Score on test_data : ", score(label_test, predicted_label_lrtw))

('SVM - Best C & associated score', {'C': 0.62105263157894741}, 0.87680000000000002)
('SVM - Score on test_data : ', 0.88127999999999995)


# SGD Classifier
- Impossible to predict a proba, just labels

In [12]:
from sklearn.linear_model import SGDClassifier

sgd_tf = SGDClassifier(loss='modified_huber', n_iter=100, random_state=0, shuffle=True, penalty='l2')
sgd_tf.fit( data_train, label_train )
predicted_label_SGD_TF = sgd_tf.predict(data_test)

print("SGD - Score on test data : ", score(label_test, predicted_label_SGD_TF))

('SGD - Score on test data : ', 0.90383999999999998)


In [13]:
sgd_tw = SGDClassifier(loss='modified_huber', n_iter=100, random_state=0, shuffle=True, penalty='l2')
sgd_tw.fit( data_train_tw, label_train )
predicted_label_SGD_TW = sgd_tw.predict(data_test_tw)

print("SGD - Score on test data : ", score(label_test, predicted_label_SGD_TW))

('SGD - Score on test data : ', 0.86736000000000002)


# LinearSVC
- Do not predict proba, only labels

In [14]:
from sklearn.svm import LinearSVC

Cs = {'C': np.linspace(3, 5, 10)}
svc_tf = GridSearchCV(LinearSVC(penalty = 'l2'), Cs, n_jobs = 1)
svc_tf.fit(data_train, label_train)
predicted_label_SVC_TF = svc_tf.predict(data_test)

print("Linear SVC - Best C & associated score", svc_tf.best_params_, svc_tf.best_score_)
print("Linear svc  - Score on test_data : ", score(predicted_label_SVC_TF, label_test))

('Linear SVC - Best C & associated score', {'C': 4.333333333333333}, 0.92458666666666667)
('Linear svc  - Score on test_data : ', 0.90527999999999997)


In [15]:
Cs = {'C': np.linspace(0.01, 0.1, 5)}
svc_tw = GridSearchCV(LinearSVC(penalty = 'l2'), Cs, n_jobs = 1)
svc_tw.fit(data_train_tw, label_train)
predicted_label_SVC_TW = svc_tw.predict(data_test_tw)

print("Linear SVC - Best C & associated score", svc_tw.best_params_, svc_tw.best_score_)
print("linear svc  - Score on test_data : ", score(predicted_label_SVC_TW, label_test))

('Linear SVC - Best C & associated score', {'C': 0.032500000000000001}, 0.87690666666666661)
('linear svc  - Score on test_data : ', 0.88160000000000005)


# Multinomial Naive Bayes
- Able to return the label, the proba, and the log_proba

In [16]:
from sklearn.naive_bayes import MultinomialNB

alphas = {'alpha': np.linspace(0.0001, 0.001, 10)}
multinom_tf = GridSearchCV(MultinomialNB(), alphas, n_jobs = 1)
multinom_tf.fit(data_train, label_train)
predicted_label_MN_TF = multinom_tf.predict(data_test)

print("Multinomial - Best alpha & associated score", multinom_tf.best_params_, multinom_tf.best_score_)
print("MNB  - Score on test_data : ", score(predicted_label_MN_TF, label_test))

('Multinomial - Best alpha & associated score', {'alpha': 0.00029999999999999997}, 0.95413333333333339)
('MNB  - Score on test_data : ', 0.88736000000000004)


In [17]:
# DO NOT WORK BECAUSE THE VALUES NEED TO BE POSITIVE APPARENTLY
'''
alphas = {'alpha': np.logspace(-3, 0, 10)}
multinom_tw = GridSearchCV(MultinomialNB(), alphas, n_jobs = 1)
multinom_tw.fit(data_train_tw, label_train)
predicted_label_MN_TW = multinom_tw.predict(data_test_tw)

print("Multinomial - Best alpha & associated score", multinom_tw.best_params_, multinom_tw.best_score_)
print("MNB  - Score on test_data : ", score(predicted_label_MN_TW, label_test))
'''

'\nalphas = {\'alpha\': np.logspace(-3, 0, 10)}\nmultinom_tw = GridSearchCV(MultinomialNB(), alphas, n_jobs = 1)\nmultinom_tw.fit(data_train_tw, label_train)\npredicted_label_MN_TW = multinom_tw.predict(data_test_tw)\n\nprint("Multinomial - Best alpha & associated score", multinom_tw.best_params_, multinom_tw.best_score_)\nprint("MNB  - Score on test_data : ", score(predicted_label_MN_TW, label_test))\n'

# ExtraTrees
- Can predict label, proba AND log proba

In [18]:
from sklearn.ensemble import ExtraTreesClassifier

extratrees_tf = ExtraTreesClassifier(n_estimators=40, max_depth=None, min_samples_split=1, random_state=0, n_jobs = 1)
extratrees_tf.fit(data_train, label_train)
predicted_label_extratrees_tf = extratrees_tf.predict(data_test)

print("ExtraTrees - Score on test_data : ", score(label_test, predicted_label_extratrees_tf))

('ExtraTrees - Score on test_data : ', 0.84192)


In [21]:
extratrees_tw = ExtraTreesClassifier(n_estimators=40, max_depth=None, min_samples_split=1, random_state=0, n_jobs = 1)
extratrees_tw.fit(data_train_tw, label_train)
predicted_label_extratrees_tw = extratrees_tw.predict(data_test_tw)

print("ExtraTrees - Score on test_data : ", score(label_test, predicted_label_extratrees_tw))

('ExtraTrees - Score on test_data : ', 0.82320000000000004)


# AdaBoost
- Can predict label, proba AND log proba

In [22]:
from sklearn.ensemble import AdaBoostClassifier

adaboost_tf = AdaBoostClassifier(n_estimators=40)
adaboost_tf.fit(data_train, label_train)
predicted_label_adaboost_tf = adaboost_tf.predict(data_test)

print("AdaBoost - Score on test_data : ", score(label_test, predicted_label_adaboost_tf))

('AdaBoost - Score on test_data : ', 0.78895999999999999)


In [23]:
adaboost_tw = AdaBoostClassifier(n_estimators=40)
adaboost_tw.fit(data_train_tw, label_train)
predicted_label_adaboost_tw = adaboost_tw.predict(data_test_tw)

print("AdaBoost - Score on test_data : ", score(label_test, predicted_label_adaboost_tw))

('AdaBoost - Score on test_data : ', 0.79520000000000002)


# Comparing the difference in predictions

In [24]:
def error_similarity(l1, l2):#
    ref_diff =0
    all_diff =0

    for i, label in enumerate(l1):
        if(label != label_test[i]):
            ref_diff+=1
            if(label_test[i] != l2[i]):
                all_diff+=1
    return ref_diff, all_diff, float(all_diff)/ref_diff *100

In [25]:
predicted_label_TF = {"lrtf": predicted_label_lrtf, 
                      "sgd_tf" : predicted_label_SGD_TF, 
                      "svc_tf" :predicted_label_SVC_TF,
                      "multinomial_tf" : predicted_label_MN_TF,
                      "extratrees_tf" : predicted_label_extratrees_tf,
                      "adaboost_tf" : predicted_label_adaboost_tf,
                      }
                      
predicted_label_TW = {"lrtw": predicted_label_lrtw, 
                      "sgd_tw" : predicted_label_SGD_TW, 
                      "svc_tw" :predicted_label_SVC_TW,
                      "extratrees_tw" : predicted_label_extratrees_tw,
                      "adaboost_tw" : predicted_label_adaboost_tw,
                      }

for tfkey, tfvalue in predicted_label_TF.items():
    for twkey, twvalue in predicted_label_TW.items():
        print error_similarity(tfvalue,twvalue), tfkey, twkey

print '------------------------------------------'

for tfkey, tfvalue in predicted_label_TF.items():
    for tfkey2, tfvalue2 in predicted_label_TF.items():
        if tfkey != tfkey2:
            print error_similarity(tfvalue,tfvalue2), tfkey, tfkey2
            
print '------------------------------------------'

for twkey, twvalue in predicted_label_TW.items():
    for twkey2, twvalue2 in predicted_label_TW.items():
        if twkey != twkey2:
            print error_similarity(twvalue,twvalue2), twkey, twkey2

(591, 75, 12.690355329949238) lrtf sgd_tw
(591, 69, 11.6751269035533) lrtf lrtw
(591, 72, 12.18274111675127) lrtf svc_tw
(591, 125, 21.150592216582066) lrtf adaboost_tw
(591, 119, 20.135363790186126) lrtf extratrees_tw
(592, 71, 11.993243243243242) svc_tf sgd_tw
(592, 65, 10.97972972972973) svc_tf lrtw
(592, 68, 11.486486486486488) svc_tf svc_tw
(592, 126, 21.283783783783782) svc_tf adaboost_tw
(592, 117, 19.763513513513516) svc_tf extratrees_tw
(988, 120, 12.145748987854251) extratrees_tf sgd_tw
(988, 108, 10.931174089068826) extratrees_tf lrtw
(988, 108, 10.931174089068826) extratrees_tf svc_tw
(988, 197, 19.939271255060728) extratrees_tf adaboost_tw
(988, 177, 17.91497975708502) extratrees_tf extratrees_tw
(1319, 163, 12.357846853677028) adaboost_tf sgd_tw
(1319, 144, 10.917361637604246) adaboost_tf lrtw
(1319, 144, 10.917361637604246) adaboost_tf svc_tw
(1319, 254, 19.257012888551934) adaboost_tf adaboost_tw
(1319, 209, 15.845337376800606) adaboost_tf extratrees_tw
(704, 91, 12.926

# Assembling the results

In [117]:
import math
def csr_vappend(a,b): #b est un vecteur ligne (np.array ou liste) et a est une sparse matrix
    if(type(a)== list):
        a=np.array([a]).T
    if(type(b)== list):
        b=np.array([b]).T
    if(type(a)!= scipy.sparse.csr.csr_matrix):
        a=scipy.sparse.csr_matrix(a)
    if(type(b)!= scipy.sparse.csr.csr_matrix):
        b=scipy.sparse.csr_matrix(b)
        
    return scipy.sparse.hstack([a,b], format ='csr')

In [118]:
created_features_train = load_sparse_csr(path+'train_new_feat.npz')
created_features_test  = load_sparse_csr(path+'test_new_feat.npz')

In [119]:
###### LOG REG ######

# Add PROBA Logistic Regression TF
new_feat_train = lrtf.predict_proba(data_train)[:,0].tolist()
new_feat_test = lrtf.predict_proba(data_test)[:,0].tolist()

# Add PROBA Logistic Regression TW
new_feat_train = csr_vappend(new_feat_train, lrtw.predict_proba(data_train_tw)[:,0].tolist())
new_feat_test = csr_vappend(new_feat_test, lrtw.predict_proba(data_test_tw)[:,0].tolist())

###### SGD ########

# Add SGD TF
new_feat_train = csr_vappend(new_feat_train, sgd_tf.predict(data_train).tolist())
new_feat_test = csr_vappend(new_feat_test, sgd_tf.predict(data_test).tolist())

# Add SGD TW
new_feat_train = csr_vappend(new_feat_train, sgd_tw.predict(data_train_tw).tolist())
new_feat_test = csr_vappend(new_feat_test, sgd_tw.predict(data_test_tw).tolist())

###### LINEAR SVC #######

# Add Linear SVC TF
new_feat_train = csr_vappend(new_feat_train, svc_tf.predict(data_train).tolist())
new_feat_test = csr_vappend(new_feat_test, svc_tf.predict(data_test).tolist())

# Add Linear SVC TW
new_feat_train = csr_vappend(new_feat_train, svc_tw.predict(data_train_tw).tolist())
new_feat_test = csr_vappend(new_feat_test, svc_tw.predict(data_test_tw).tolist())

###### MULTINOMIAL NAIVE BAYES ######

# Add Multinomial TF
new_feat_train = csr_vappend(new_feat_train, multinom_tf.predict(data_train).tolist())
new_feat_test = csr_vappend(new_feat_test, multinom_tf.predict(data_test).tolist())

# Add PROBA Multinomial TF
new_feat_train = csr_vappend(new_feat_train, multinom_tf.predict_proba(data_train)[:,0].tolist())
new_feat_test = csr_vappend(new_feat_test, multinom_tf.predict_proba(data_test)[:,0].tolist())

In [120]:
##### EXTRA TREES #######

# Add TREES TF
new_feat_train = csr_vappend(new_feat_train, extratrees_tf.predict(data_train).tolist())
new_feat_test = csr_vappend(new_feat_test, extratrees_tf.predict(data_test).tolist())

# Add PROBA TREES TF
new_feat_train = csr_vappend(new_feat_train, extratrees_tf.predict_proba(data_train)[:,0].tolist())
new_feat_test = csr_vappend(new_feat_test, extratrees_tf.predict_proba(data_test)[:,0].tolist())

# Add TREES TW
new_feat_train = csr_vappend(new_feat_train, extratrees_tw.predict(data_train_tw).tolist())
new_feat_test = csr_vappend(new_feat_test, extratrees_tw.predict(data_test_tw).tolist())

# Add PROBA TREES TW
new_feat_train = csr_vappend(new_feat_train, extratrees_tw.predict_proba(data_train_tw)[:,0].tolist())
new_feat_test = csr_vappend(new_feat_test, extratrees_tw.predict_proba(data_test_tw)[:,0].tolist())


In [121]:
##### ADABOOST #######

# Add adaboost TF
new_feat_train = csr_vappend(new_feat_train, adaboost_tf.predict(data_train).tolist())
new_feat_test = csr_vappend(new_feat_test, adaboost_tf.predict(data_test).tolist())

# Add PROBA adaboost TF
new_feat_train = csr_vappend(new_feat_train, adaboost_tf.predict_proba(data_train)[:,0].tolist())
new_feat_test = csr_vappend(new_feat_test, adaboost_tf.predict_proba(data_test)[:,0].tolist())

# Add adaboost TW
new_feat_train = csr_vappend(new_feat_train, adaboost_tw.predict(data_train_tw).tolist())
new_feat_test = csr_vappend(new_feat_test, adaboost_tw.predict(data_test_tw).tolist())

# Add PROBA adaboost TW
new_feat_train = csr_vappend(new_feat_train, adaboost_tw.predict_proba(data_train_tw)[:,0].tolist())
new_feat_test = csr_vappend(new_feat_test, adaboost_tw.predict_proba(data_test_tw)[:,0].tolist())

In [122]:
# Add created features
new_feat_train = csr_vappend(new_feat_train, created_features_train)
new_feat_test = csr_vappend(new_feat_test, created_features_test)

In [123]:
data_train_ = csr_vappend(data_train, new_feat_train)
data_test_ = csr_vappend(data_test, new_feat_test)

In [104]:
data_train__ = scipy.sparse.hstack([data_train_tw, data_train])
data_test__ = scipy.sparse.hstack([data_test_tw, data_test])

# Second Layer Algorithm

In [112]:
from sklearn.ensemble import RandomForestClassifier

alg = RandomForestClassifier(n_estimators=100, min_samples_split=2, min_samples_leaf=1, n_jobs = 1)
alg.fit(data_train_, label_train)
predicted_label_rf = alg.predict(data_test_)

print("Random Forest - Score on test_data : ", score(label_test, predicted_label_rf))

('Random Forest - Score on test_data : ', 0.93711999999999995)


In [164]:
train_ = new_feat_train
test_ = new_feat_test
logreg = LogisticRegression(penalty = 'l2', C = 0.001)
logreg.fit(train_, label_train)
predicted_label_logreg = logreg.predict(test_)
print("Num & TOTAL : ", score(label_test, predicted_label_logreg))


train_ = new_feat_train[:,1:]
test_ = new_feat_test[:,1:]
logreg = LogisticRegression(penalty = 'l2', C = 0.001)
logreg.fit(train_, label_train)
predicted_label_logreg = logreg.predict(test_)
print("Num & Score : ", 1, score(label_test, predicted_label_logreg))

         
Num = 1
for i in range(2,43):
    train_ = scipy.sparse.hstack([new_feat_train[:,0:i-1], new_feat_train[:,i:]])
    test_ = scipy.sparse.hstack([new_feat_test[:,0:i-1], new_feat_test[:,i:]])
    Num += 1
    logreg = LogisticRegression(penalty = 'l2', C = 0.001)
    logreg.fit(train_, label_train)
    predicted_label_logreg = logreg.predict(test_)
    print("Num & Score : ", Num, score(label_test, predicted_label_logreg))
         
         
train_ = new_feat_train[:,:43]
test_ = new_feat_test[:,:43]
logreg = LogisticRegression(penalty = 'l2', C = 0.001)
logreg.fit(train_, label_train)
predicted_label_logreg = logreg.predict(test_)
print("Num & Score : ", 43, score(label_test, predicted_label_logreg))

('Num & TOTAL : ', 0.94144000000000005)
('Num & Score : ', 1, 0.94272)
('Num & Score : ', 2, 0.93584000000000001)
('Num & Score : ', 3, 0.94576000000000005)
('Num & Score : ', 4, 0.93503999999999998)
('Num & Score : ', 5, 0.94543999999999995)
('Num & Score : ', 6, 0.93567999999999996)
('Num & Score : ', 7, 0.94352000000000003)
('Num & Score : ', 8, 0.94352000000000003)
('Num & Score : ', 9, 0.94479999999999997)
('Num & Score : ', 10, 0.94272)
('Num & Score : ', 11, 0.93679999999999997)
('Num & Score : ', 12, 0.93776000000000004)
('Num & Score : ', 13, 0.94159999999999999)
('Num & Score : ', 14, 0.94128000000000001)
('Num & Score : ', 15, 0.93903999999999999)
('Num & Score : ', 16, 0.94111999999999996)
('Num & Score : ', 17, 0.94144000000000005)
('Num & Score : ', 18, 0.93984000000000001)
('Num & Score : ', 19, 0.94047999999999998)
('Num & Score : ', 20, 0.94047999999999998)
('Num & Score : ', 21, 0.94111999999999996)
('Num & Score : ', 22, 0.94144000000000005)
('Num & Score : ', 23, 0.

In [None]:
Num = 1
for i in range(2,43):
    train_ = scipy.sparse.hstack([new_feat_train[:,0:i-1], new_feat_train[:,i:]])
    test_ = scipy.sparse.hstack([new_feat_test[:,0:i-1], new_feat_test[:,i:]])
    Num += 1
    logreg = LogisticRegression(penalty = 'l2', C = 0.001)
    logreg.fit(train_, label_train)
    predicted_label_logreg = logreg.predict(test_)
    print("Num & Score : ", Num, score(label_test, predicted_label_logreg))


In [111]:
error_similarity(predicted_label, predicted_label_logreg)

(408, 317, 77.69607843137256)

In [25]:
alg = SGDClassifier(loss='modified_huber', n_iter=100, random_state=0, shuffle=True, penalty='l2')
alg.fit( new_feat_train, label_train )
predicted_label = alg.predict(new_feat_test)

print("SGD - Score on test data : ", score(label_test, predicted_label_SGD_TF))

('SGD - Score on test data : ', 0.90383999999999998)


In [177]:
from sklearn.ensemble import ExtraTreesClassifier


alg = ExtraTreesClassifier(n_estimators=40, max_depth=None, min_samples_split=1, n_jobs = 1)
alg.fit(data_train_, label_train)
predicted_label = alg.predict(data_test_)

print("ExtraTrees - Score on test_data : ", score(label_test, predicted_label))

('ExtraTrees - Score on test_data : ', 0.93391999999999997)


In [181]:
from sklearn.tree import DecisionTreeClassifier

alg = DecisionTreeClassifier(max_depth=None, min_samples_split=1, random_state=0)
alg.fit(data_train_, label_train)
predicted_label = alg.predict(data_test_)

print("DecisionTrees - Score on test_data : ", score(label_test, predicted_label))

('DecisionTrees - Score on test_data : ', 0.90544000000000002)


In [182]:
from sklearn.ensemble import AdaBoostClassifier

alg = AdaBoostClassifier(n_estimators=40)
alg.fit(data_train_, label_train)
predicted_label = alg.predict(data_test_)

print("AdaBoost - Score on test_data : ", score(label_test, predicted_label))

('AdaBoost - Score on test_data : ', 0.90544000000000002)


In [183]:
from sklearn.ensemble import GradientBoostingClassifier

alg = GradientBoostingClassifier(n_estimators=100, max_depth=1, random_state=0)
alg.fit(data_train_, label_train)
predicted_label = alg.predict(data_test_)

print("AdaBoost - Score on test_data : ", score(label_test, predicted_label))

TypeError: A sparse matrix was passed, but dense data is required. Use X.toarray() to convert to a dense numpy array.

In [None]:
add_prob_lrtw = 1
add_prob_lrtf = 1
add_prob_sgdtf = 1
add_prob_sgdtw = 1
add_lab_linearsvc_tf = 1 
add_lab_linearsvc_tw = 1
add_lab_multinomial_tf = 1
add_lab_multinomail_tw = 1
add_lab_lrtw = 0
add_lab_lrtf = 0
add_data_tw = 0

if(add_prob_lrtw):
    data_train = csr_vappend(data_train, lrtw.predict_proba(data_train_tw)[:,0].tolist())
    data_test = csr_vappend(data_test, lrtw.predict_proba(data_test_tw)[:,0].tolist())
if(add_lab_lrtw):
    data_train = csr_vappend(data_train, lrtw.predict(data_train_tw).tolist())
    data_test = csr_vappend(data_test, lrtw.predict(data_test_tw).tolist())    

if(add_prob_sgdtf):
    data_train = csr_vappend(data_train, sgd_tf.predict(data_train).tolist())
    data_test = csr_vappend(data_test, sgd_tf.predict(data_test).tolist())  

if(add_prob_sgdtw):
    data_train = csr_vappend(data_train, sgd_tw.predict(data_train_tw).tolist())
    data_test = csr_vappend(data_test, sgd_tw.predict(data_test_tw).tolist())
    
if(add_prob_lrtf):
    data_train = csr_vappend(data_train, lrtf.predict_proba(data_train[:,0:nb_feat])[:,0].tolist())
    data_test = csr_vappend(data_test, lrtf.predict_proba(data_test[:,0:nb_feat])[:,0].tolist())
    
if(add_lab_lrtw):
    data_train = csr_vappend(data_train, lrtf.predict(data_train[:,0:nb_feat]).tolist())
    data_test = csr_vappend(data_test, lrtf.predict(data_test[:,0:nb_feat]).tolist())    

if(add_data_tw):   
    data_train = scipy.sparse.hstack([data_train,data_train_tw])
    data_test = scipy.sparse.hstack([data_test,data_test_tw])

## adding features IGNORER APRES ICI

In [112]:
def csr_vappend(a,b): #b est un vecteur ligne (np.array ou liste) et a est une sparse matrix
    if(type(a)!= scipy.sparse.csr.csr_matrix):
        a=scipy.sparse.csr_matrix(a)
        
    if(type(b)== list):
        b=np.array([b]).T
    if(type(b)!= scipy.sparse.csr.csr_matrix):
        b=scipy.sparse.csr_matrix(b)
        
    return scipy.sparse.hstack([a,b], format ='csr')

In [113]:
b = lr_tw.predict(data_test_tw).tolist()
print np.array([b]).T.shape, data_test.shape


(6250, 1) (6250, 80000)


In [155]:
data_train = csr_vappend(data_train, lr_tw.predict(data_train_tw).tolist())
data_test = csr_vappend(data_test, lr_tw.predict(data_test_tw).tolist())
#data_train = csr_vappend(data_train, alg2.predict(data_train).tolist())
#data_test = csr_vappend(data_test, alg2.predict(data_test).tolist())

In [156]:
data_train = csr_vappend(data_train, predicted_train_SGD.tolist())
data_test = csr_vappend(data_test, predicted_label_SGD.tolist())