In [1]:
import sklearn 
import numpy as np
import scipy
import csv

def load_sparse_csr(filename):
    loader = np.load(filename)
    return scipy.sparse.csr_matrix(( loader['data'], loader['indices'], loader['indptr']),
                     shape = loader['shape'])
        
def load_csv(filename):
    with open(filename, 'r') as csvfile:
        reader = csv.reader(csvfile, delimiter = '\n')
        array = [float(row[0]) for row in reader]
        return array
    
def load_feature_names(filename):
    with open(filename, 'r') as csvfile:
        reader = csv.reader(csvfile, delimiter= '\n')
        array = [row for row in reader]
        return array
    
def load_sparse_coo(filename):
    loader = np.load(filename)
    return scipy.sparse.coo_matrix((loader['data'],(loader['row'],loader['col'])),
                     shape = loader['shape'])

In [2]:
from sklearn.metrics import zero_one_loss
def score(true_label, predicted_label):
    return 1 - zero_one_loss(true_label,predicted_label)

## TFIDF

In [3]:
path = "bapt_tfidf/"
data_train_train_tf = load_sparse_csr(path+'data_train.npz')
data_train_test_tf = load_sparse_csr(path+'data_test.npz')
data_test = load_sparse_csr(path+'data_test_test.npz')


label_train = load_csv(path+'label_train.csv')
label_test = load_csv(path+'label_test.csv')
features_names = load_feature_names('data/feature_names.csv')

created_feat_train_train = load_sparse_csr(path+'train_new_feat.npz')
created_feat_train_test = load_sparse_csr(path+'test_new_feat.npz')

In [4]:
import scipy

data_train = scipy.sparse.vstack([data_train_train_tf, data_train_test_tf], format ='csr')
label_train = label_train + label_test
created_feat = scipy.sparse.vstack([created_feat_train_train, created_feat_train_test], format = 'csr')

In [5]:
nb_feat = 80000
from sklearn.feature_selection.univariate_selection import SelectKBest, chi2, f_classif
fselect = SelectKBest(chi2 , k=nb_feat)
data_train = fselect.fit_transform(data_train, label_train)
data_test = fselect.transform(data_test)
print data_train.shape, data_test.shape

(25000, 80000) (25000, 80000)


  chisq /= f_exp


## TWIDF

In [11]:
data_train_all = [load_sparse_csr('tw_sw{}_all_train.npz'.format(k)) for k in range(1,5)]
data_test_all = [load_sparse_csr('tw_sw{}_all_test.npz'.format(k)) for k in range(1,5)]

label_train_tw = load_csv('labels_train_train.csv')
label_test_tw = load_csv('labels_train_test.csv')
label_train_tw = label_train_tw +label_test_tw

In [12]:
data_test_all[3].shape

(25000, 99627)

In [13]:
from sklearn.preprocessing import MaxAbsScaler
normalizer_all = map(lambda x: MaxAbsScaler().fit(x), data_train_all)

scaler =MaxAbsScaler()
scaler.partial_fit(data_test)
scaler.partial_fit(data_train)
scaler.transform(data_test)
scaler.transform(data_train)

data_train_all_norm = [normalizer_all[i].transform(data_train_all[i]) for i in range(len(data_train_all))]
data_test_all_norm = [normalizer_all[i].transform(data_test_all[i]) for i in range(len(data_test_all))]


In [14]:
SelectKBest(f_classif , k=data_train_all[0].shape[1]/100).fit(data_train_all[0],label_train_tw)
fselect_all = [SelectKBest(f_classif , k=data_train_all[i].shape[1]/100).fit(
        data_train_all[i],label_train_tw) for i in range(len(data_train_all))]
data_train_all_selec = [fselect_all[i].transform(data_train_all[i]) for i in range(len(data_train_all))]
data_test_all_selec = [fselect_all[i].transform(data_test_all[i]) for i in range(len(data_test_all))]

fselect_all_norm = [SelectKBest(f_classif , k=data_train_all_norm[i].shape[1]/100).fit(
        data_train_all_norm[i],label_train_tw) for i in range(len(data_train_all_norm))]

data_train_all_norm_selec = [fselect_all_norm[i].transform(
        data_train_all_norm[i]) for i in range(len(data_train_all_norm))]
data_test_all_norm_selec = [fselect_all_norm[i].transform(
        data_test_all_norm[i]) for i in range(len(data_test_all_norm))]


  f = msb / msw


In [15]:
data_train_tw = data_train_all_norm [3]
data_test_tw = data_test_all_norm [3]

data_train_tw = data_train_tw[:,0:-25]
data_test_tw = data_test_tw[:,0:-25]



In [16]:
print data_train_tw.shape, data_test_tw.shape

(25000, 99602) (25000, 99602)


# Logistic Regression
- Adding the proba (instead of the label) is better for the final predition)

In [17]:
Cs = {'C': np.linspace(1000, 1500, 10)}
from sklearn.grid_search import GridSearchCV
from sklearn.linear_model import LogisticRegression

lrtf = GridSearchCV(LogisticRegression(penalty = 'l2'), Cs, n_jobs = 1)
lrtf = lrtf.fit(data_train, label_train)
predicted_label_lrtf = lrtf.predict(data_test)


print("SVM - Best C & associated score", lrtf.best_params_, lrtf.best_score_)

('SVM - Best C & associated score', {'C': 1222.2222222222222}, 0.92352000000000001)


In [18]:
Cs = {'C': np.linspace(0.4, 1, 20)}

lrtw = GridSearchCV(LogisticRegression(penalty = 'l2'), Cs, n_jobs = 1)
lrtw = lrtw.fit(data_train_tw, label_train_tw)
predicted_label_lrtw = lrtw.predict(data_test_tw)

print("SVM - Best C & associated score", lrtw.best_params_, lrtw.best_score_)

('SVM - Best C & associated score', {'C': 0.90526315789473688}, 0.49715999999999999)


# SGD Classifier
- Impossible to predict a proba, just labels

In [12]:
from sklearn.linear_model import SGDClassifier

sgd_tf = SGDClassifier(loss='modified_huber', n_iter=100, random_state=0, shuffle=True, penalty='l2')
sgd_tf.fit( data_train, label_train )
predicted_label_SGD_TF = sgd_tf.predict(data_test)


('SGD - Score on test data : ', 0.90383999999999998)


In [13]:
sgd_tw = SGDClassifier(loss='modified_huber', n_iter=100, random_state=0, shuffle=True, penalty='l2')
sgd_tw.fit( data_train_tw, label_train )
predicted_label_SGD_TW = sgd_tw.predict(data_test_tw)


('SGD - Score on test data : ', 0.86528000000000005)


# LinearSVC
- Do not predict proba, only labels

In [14]:
from sklearn.svm import LinearSVC

Cs = {'C': np.linspace(3, 5, 10)}
svc_tf = GridSearchCV(LinearSVC(penalty = 'l2'), Cs, n_jobs = 1)
svc_tf.fit(data_train, label_train)
predicted_label_SVC_TF = svc_tf.predict(data_test)

print("Linear SVC - Best C & associated score", svc_tf.best_params_, svc_tf.best_score_)

('Linear SVC - Best C & associated score', {'C': 4.333333333333333}, 0.92458666666666667)
('Linear svc  - Score on test_data : ', 0.90527999999999997)


In [15]:
Cs = {'C': np.linspace(0.01, 0.1, 5)}
svc_tw = GridSearchCV(LinearSVC(penalty = 'l2'), Cs, n_jobs = 1)
svc_tw.fit(data_train_tw, label_train)
predicted_label_SVC_TW = svc_tw.predict(data_test_tw)

print("Linear SVC - Best C & associated score", svc_tw.best_params_, svc_tw.best_score_)

('Linear SVC - Best C & associated score', {'C': 0.055000000000000007}, 0.87648000000000004)
('linear svc  - Score on test_data : ', 0.88016000000000005)


# Multinomial Naive Bayes
- Able to return the label, the proba, and the log_proba

In [16]:
from sklearn.naive_bayes import MultinomialNB

alphas = {'alpha': np.linspace(0.0001, 0.001, 10)}
multinom_tf = GridSearchCV(MultinomialNB(), alphas, n_jobs = 1)
multinom_tf.fit(data_train, label_train)
predicted_label_MN_TF = multinom_tf.predict(data_test)

print("Multinomial - Best alpha & associated score", multinom_tf.best_params_, multinom_tf.best_score_)

('Multinomial - Best alpha & associated score', {'alpha': 0.00029999999999999997}, 0.95413333333333339)
('MNB  - Score on test_data : ', 0.88736000000000004)


# ExtraTrees
- Can predict label, proba AND log proba

In [18]:
from sklearn.ensemble import ExtraTreesClassifier

extratrees_tf = ExtraTreesClassifier(n_estimators=40, max_depth=None, min_samples_split=1, random_state=0, n_jobs = 1)
extratrees_tf.fit(data_train, label_train)
predicted_label_extratrees_tf = extratrees_tf.predict(data_test)

('ExtraTrees - Score on test_data : ', 0.84192)


In [19]:
extratrees_tw = ExtraTreesClassifier(n_estimators=40, max_depth=None, min_samples_split=1, random_state=0, n_jobs = 1)
extratrees_tw.fit(data_train_tw, label_train)
predicted_label_extratrees_tw = extratrees_tw.predict(data_test_tw)

('ExtraTrees - Score on test_data : ', 0.83199999999999996)


# AdaBoost
- Can predict label, proba AND log proba

In [20]:
from sklearn.ensemble import AdaBoostClassifier

adaboost_tf = AdaBoostClassifier(n_estimators=40)
adaboost_tf.fit(data_train, label_train)
predicted_label_adaboost_tf = adaboost_tf.predict(data_test)

('AdaBoost - Score on test_data : ', 0.78895999999999999)


In [21]:
adaboost_tw = AdaBoostClassifier(n_estimators=40)
adaboost_tw.fit(data_train_tw, label_train)
predicted_label_adaboost_tw = adaboost_tw.predict(data_test_tw)

('AdaBoost - Score on test_data : ', 0.78847999999999996)


# Comparing the difference in predictions

In [22]:
def error_similarity(l1, l2):#
    ref_diff =0
    all_diff =0

    for i, label in enumerate(l1):
        if(label != label_test[i]):
            ref_diff+=1
            if(label_test[i] != l2[i]):
                all_diff+=1
    return ref_diff, all_diff, float(all_diff)/ref_diff *100

In [23]:
predicted_label_TF = {"lrtf": predicted_label_lrtf, 
                      "sgd_tf" : predicted_label_SGD_TF, 
                      "svc_tf" :predicted_label_SVC_TF,
                      "multinomial_tf" : predicted_label_MN_TF,
                      "extratrees_tf" : predicted_label_extratrees_tf,
                      "adaboost_tf" : predicted_label_adaboost_tf,
                      }
                      
predicted_label_TW = {"lrtw": predicted_label_lrtw, 
                      "sgd_tw" : predicted_label_SGD_TW, 
                      "svc_tw" :predicted_label_SVC_TW,
                      "extratrees_tw" : predicted_label_extratrees_tw,
                      "adaboost_tw" : predicted_label_adaboost_tw,
                      }

for tfkey, tfvalue in predicted_label_TF.items():
    for twkey, twvalue in predicted_label_TW.items():
        print error_similarity(tfvalue,twvalue), tfkey, twkey

print '------------------------------------------'

for tfkey, tfvalue in predicted_label_TF.items():
    for tfkey2, tfvalue2 in predicted_label_TF.items():
        if tfkey != tfkey2:
            print error_similarity(tfvalue,tfvalue2), tfkey, tfkey2
            
print '------------------------------------------'

for twkey, twvalue in predicted_label_TW.items():
    for twkey2, twvalue2 in predicted_label_TW.items():
        if twkey != twkey2:
            print error_similarity(twvalue,twvalue2), twkey, twkey2

(591, 76, 12.859560067681894) lrtf sgd_tw
(591, 68, 11.505922165820643) lrtf lrtw
(591, 69, 11.6751269035533) lrtf svc_tw
(591, 135, 22.84263959390863) lrtf adaboost_tw
(591, 106, 17.93570219966159) lrtf extratrees_tw
(592, 72, 12.162162162162163) svc_tf sgd_tw
(592, 64, 10.81081081081081) svc_tf lrtw
(592, 65, 10.97972972972973) svc_tf svc_tw
(592, 137, 23.14189189189189) svc_tf adaboost_tw
(592, 106, 17.905405405405407) svc_tf extratrees_tw
(988, 122, 12.348178137651821) extratrees_tf sgd_tw
(988, 107, 10.82995951417004) extratrees_tf lrtw
(988, 111, 11.234817813765183) extratrees_tf svc_tw
(988, 208, 21.052631578947366) extratrees_tf adaboost_tw
(988, 161, 16.295546558704455) extratrees_tf extratrees_tw
(1319, 160, 12.130401819560273) adaboost_tf sgd_tw
(1319, 144, 10.917361637604246) adaboost_tf lrtw
(1319, 146, 11.06899166034875) adaboost_tf svc_tw
(1319, 270, 20.47005307050796) adaboost_tf adaboost_tw
(1319, 203, 15.390447308567095) adaboost_tf extratrees_tw
(704, 91, 12.92613636

# Assembling the results

In [24]:
import math
def csr_vappend(a,b): #b est un vecteur ligne (np.array ou liste) et a est une sparse matrix
    if(type(a)== list):
        a=np.array([a]).T
    if(type(b)== list):
        b=np.array([b]).T
    if(type(a)!= scipy.sparse.csr.csr_matrix):
        a=scipy.sparse.csr_matrix(a)
    if(type(b)!= scipy.sparse.csr.csr_matrix):
        b=scipy.sparse.csr_matrix(b)
        
    return scipy.sparse.hstack([a,b], format ='csr')

In [25]:
created_features_train = load_sparse_csr(path+'train_new_feat.npz')
created_features_test  = load_sparse_csr(path+'test_new_feat.npz')

In [79]:
###### LOG REG ######
add_label = 1
add_proba = 1
# Add PROBA Logistic Regression TW
if(add_proba):
    new_feat_train = lrtf.predict_proba(data_train)[:,0].tolist()
    new_feat_test = lrtf.predict_proba(data_test)[:,0].tolist()

# Add PROBA Logistic Regression TW
if(add_proba):
    new_feat_train = csr_vappend(new_feat_train, lrtw.predict_proba(data_train_tw)[:,0].tolist())
    new_feat_test = csr_vappend(new_feat_test, lrtw.predict_proba(data_test_tw)[:,0].tolist())

###### SGD ########

# Add SGD TF
#new_feat_train = csr_vappend(new_feat_train, sgd_tf.predict(data_train).tolist())
#new_feat_test = csr_vappend(new_feat_test, sgd_tf.predict(data_test).tolist())

# Add SGD TW
new_feat_train = csr_vappend(new_feat_train, sgd_tw.predict(data_train_tw).tolist())
new_feat_test = csr_vappend(new_feat_test, sgd_tw.predict(data_test_tw).tolist())

###### LINEAR SVC #######

# Add Linear SVC TF
#new_feat_train = csr_vappend(new_feat_train, svc_tf.predict(data_train).tolist())
#new_feat_test = csr_vappend(new_feat_test, svc_tf.predict(data_test).tolist())

# Add Linear SVC TW
new_feat_train = csr_vappend(new_feat_train, svc_tw.predict(data_train_tw).tolist())
new_feat_test = csr_vappend(new_feat_test, svc_tw.predict(data_test_tw).tolist())

###### MULTINOMIAL NAIVE BAYES ######

# Add Multinomial TF
if(add_label):
    new_feat_train = csr_vappend(new_feat_train, multinom_tf.predict(data_train).tolist())
    new_feat_test = csr_vappend(new_feat_test, multinom_tf.predict(data_test).tolist())

# Add PROBA Multinomial TF
if(add_proba):
    new_feat_train = csr_vappend(new_feat_train, multinom_tf.predict_proba(data_train)[:,0].tolist())
    new_feat_test = csr_vappend(new_feat_test, multinom_tf.predict_proba(data_test)[:,0].tolist())



In [80]:
##### EXTRA TREES #######

# Add TREES TF
if(add_label):
    new_feat_train = csr_vappend(new_feat_train, extratrees_tf.predict(data_train).tolist())
    new_feat_test = csr_vappend(new_feat_test, extratrees_tf.predict(data_test).tolist())

# Add PROBA TREES TF
if(add_proba):
    new_feat_train = csr_vappend(new_feat_train, extratrees_tf.predict_proba(data_train)[:,0].tolist())
    new_feat_test = csr_vappend(new_feat_test, extratrees_tf.predict_proba(data_test)[:,0].tolist())



# Add TREES TW
if(add_label):
    new_feat_train = csr_vappend(new_feat_train, extratrees_tw.predict(data_train_tw).tolist())
    new_feat_test = csr_vappend(new_feat_test, extratrees_tw.predict(data_test_tw).tolist())

# Add PROBA TREES TW
if(add_proba):
    new_feat_train = csr_vappend(new_feat_train, extratrees_tw.predict_proba(data_train_tw)[:,0].tolist())
    new_feat_test = csr_vappend(new_feat_test, extratrees_tw.predict_proba(data_test_tw)[:,0].tolist())



In [81]:
##### ADABOOST #######

# Add adaboost TF
if(add_label):
    new_feat_train = csr_vappend(new_feat_train, adaboost_tf.predict(data_train).tolist())
    new_feat_test = csr_vappend(new_feat_test, adaboost_tf.predict(data_test).tolist())

# Add PROBA adaboost TF
if(add_proba):
    new_feat_train = csr_vappend(new_feat_train, adaboost_tf.predict_proba(data_train)[:,0].tolist())
    new_feat_test = csr_vappend(new_feat_test, adaboost_tf.predict_proba(data_test)[:,0].tolist())


# Add adaboost TW
if(add_label):
    new_feat_train = csr_vappend(new_feat_train, adaboost_tw.predict(data_train_tw).tolist())
    new_feat_test = csr_vappend(new_feat_test, adaboost_tw.predict(data_test_tw).tolist())

# Add PROBA adaboost TW
if(add_proba):
    new_feat_train = csr_vappend(new_feat_train, adaboost_tw.predict_proba(data_train_tw)[:,0].tolist())
    new_feat_test = csr_vappend(new_feat_test, adaboost_tw.predict_proba(data_test_tw)[:,0].tolist())


In [82]:
# Add created features
new_feat_train = csr_vappend(new_feat_train, created_features_train)
new_feat_test = csr_vappend(new_feat_test, created_features_test)

In [83]:
data_train_ = csr_vappend(new_feat_train, data_train)
data_test_ = csr_vappend(new_feat_test, data_test)

# Second Layer Algorithm

In [85]:
from sklearn.ensemble import RandomForestClassifier

randomF_final = RandomForestClassifier(n_estimators=2000, min_samples_split=2, min_samples_leaf=1, n_jobs = -1)
randomF_final.fit(data_train_, label_train)
predicted_label = randomF_final.predict(data_test_)

('Random Forest - Score on test_data : ', 0.95023999999999997)


In [84]:
Cs = {'C': np.linspace(0.001, 0.6, 15)}

logreg_final = GridSearchCV(LogisticRegression(penalty = 'l2'), Cs, n_jobs = -1)
logreg_final.fit(new_feat_train, label_train)
predicted_label = logreg_final.predict(new_feat_test)


print("SVM - Best C & associated score", logreg_final.best_params_, logreg_final.best_score_)

('SVM - Best C & associated score', {'C': 0.001}, 1.0)
('SVM - Score on test_data : ', 0.95135999999999998)


In [37]:
alg = SGDClassifier(loss='modified_huber', n_iter=100, random_state=0, shuffle=True, penalty='l2')
alg.fit( new_feat_train, label_train )
predicted_label = alg.predict(new_feat_test)

print("SGD - Score on test data : ", score(label_test, predicted_label_SGD_TF))

('SGD - Score on test data : ', 0.90383999999999998)


In [40]:
from sklearn.ensemble import ExtraTreesClassifier


alg = ExtraTreesClassifier(n_estimators=200, max_depth=None, min_samples_split=1, n_jobs = 1)
alg.fit(data_train_, label_train)
predicted_label = alg.predict(data_test_)

print("ExtraTrees - Score on test_data : ", score(label_test, predicted_label))

('ExtraTrees - Score on test_data : ', 0.94223999999999997)
