In [1]:
import sklearn 
import numpy as np
import scipy
import csv

def load_sparse_csr(filename):
    """Load a CSR sparse matrix stored in a NumPy ``.npz`` archive.

    The archive must contain the arrays ``data``, ``indices``, ``indptr``
    and ``shape``.

    Parameters
    ----------
    filename : str
        Path of the ``.npz`` archive to read.

    Returns
    -------
    scipy.sparse.csr_matrix
        Matrix rebuilt from the stored arrays.
    """
    # ``import scipy`` alone does not guarantee the ``sparse`` submodule is loaded.
    import scipy.sparse

    # The archive object keeps its file handle open until it is closed, so
    # every array is read inside the ``with`` block.
    with np.load(filename) as loader:
        return scipy.sparse.csr_matrix(
            (loader['data'], loader['indices'], loader['indptr']),
            shape=tuple(loader['shape']))
        
def load_csv(filename):
    """Read a text file holding one numeric value per line.

    Parameters
    ----------
    filename : str
        Path of the file to read.

    Returns
    -------
    list of float
        The parsed values, in file order. Blank lines are skipped.

    Raises
    ------
    ValueError
        If a non-blank line cannot be converted to ``float``.
    """
    values = []
    with open(filename, 'r') as handle:
        for line in handle:
            text = line.strip()
            # A blank line (for instance a trailing newline at the end of
            # the file) carries no value; skip it instead of failing.
            if text:
                values.append(float(text))
    return values
    
def load_feature_names(filename):
    """Read a text file holding one feature name per line.

    Parameters
    ----------
    filename : str
        Path of the file to read.

    Returns
    -------
    list of list of str
        One entry per line, in file order: ``[name]`` for a non-empty line
        and ``[]`` for an empty one.
    """
    # Lines are read directly rather than through ``csv.reader`` with a
    # newline delimiter: newline already separates the records, and csv
    # quote handling would alter names that contain a double quote.
    with open(filename, 'r') as handle:
        lines = handle.read().splitlines()
    return [[line] if line else [] for line in lines]
    
def load_sparse_coo(filename):
    """Load a COO sparse matrix stored in a NumPy ``.npz`` archive.

    The archive must contain the arrays ``data``, ``row``, ``col`` and
    ``shape``.

    Parameters
    ----------
    filename : str
        Path of the ``.npz`` archive to read.

    Returns
    -------
    scipy.sparse.coo_matrix
        Matrix rebuilt from the stored arrays.
    """
    # ``import scipy`` alone does not guarantee the ``sparse`` submodule is loaded.
    import scipy.sparse

    # Read every array inside the ``with`` block so the archive's file
    # handle is released as soon as the matrix is built.
    with np.load(filename) as loader:
        return scipy.sparse.coo_matrix(
            (loader['data'], (loader['row'], loader['col'])),
            shape=tuple(loader['shape']))

In [2]:
from sklearn.metrics import zero_one_loss
def score(true_label, predicted_label):
    """Return one minus scikit-learn's ``zero_one_loss`` for the two label sequences."""
    error_rate = zero_one_loss(true_label, predicted_label)
    return 1 - error_rate

## TFIDF

In [3]:
import scipy

#### Loading the data ####
# 'data_train.npz' and 'data_test.npz' are stacked row-wise into the training
# matrix; 'data_test_test.npz' is the matrix predictions are made on.
path = "bapt_tfidf/"
data_train_train_tf = load_sparse_csr(path + 'data_train.npz')
data_train_test_tf = load_sparse_csr(path + 'data_test.npz')
data_train = scipy.sparse.vstack(
    [data_train_train_tf, data_train_test_tf],
    format='csr')
data_test = load_sparse_csr(path + 'data_test_test.npz')

#### Loading the labels ####
label_train = load_csv(path + 'label_train.csv')
label_test = load_csv(path + 'label_test.csv')
# List concatenation, in the same order as the row stacking above.
label_train = label_train + label_test

#### Loading the created features ####
created_feat_train_train = load_sparse_csr(path + 'train_new_feat.npz')
created_feat_train_test = load_sparse_csr(path + 'test_new_feat.npz')
created_features_train = scipy.sparse.vstack(
    [created_feat_train_train, created_feat_train_test],
    format='csr')
created_features_test = load_sparse_csr(path + 'test_test_new_feat.npz')

In [4]:
### Adding the good and bad grades ####
# Append the last two columns of the created-feature matrices to the TF-IDF
# matrices. NOTE: the cell rebinds data_train / data_test, so running it
# twice appends the two columns twice.
data_train = scipy.sparse.hstack(
    [data_train, created_features_train[:, -2:]])
data_test = scipy.sparse.hstack(
    [data_test, created_features_test[:, -2:]])

In [5]:
nb_feat = 80000
from sklearn.feature_selection.univariate_selection import SelectKBest, chi2, f_classif
fselect = SelectKBest(chi2 , k=nb_feat)
data_train = fselect.fit_transform(data_train, label_train)
data_test = fselect.transform(data_test)
print data_train.shape, data_test.shape

(25000, 80000) (25000, 80000)


  chisq /= f_exp


## TWIDF

In [6]:
# Load the four matrix sets stored as tw_sw1 ... tw_sw4; for each one the
# 'train_train' and 'train_test' parts are stacked row-wise into a single
# training matrix.
data_train_train_all = [
    load_sparse_csr('tw_sw{}_all_train_train.npz'.format(k))
    for k in range(1, 5)]
data_train_test_all = [
    load_sparse_csr('tw_sw{}_all_train_test.npz'.format(k))
    for k in range(1, 5)]
data_train_all = [
    scipy.sparse.vstack([train_part, test_part], format='csr')
    for train_part, test_part in zip(data_train_train_all, data_train_test_all)]
data_test_all = [
    load_sparse_csr('tw_sw{}_all_test.npz'.format(k))
    for k in range(1, 5)]

# The labels are shared with the TF-IDF pipeline (same list object).
label_train_tw = label_train
#label_train_tw = load_csv('label_train.csv')
#label_test_tw = load_csv('labels_train_test.csv')
#label_train_tw = label_train_tw +label_test_tw

In [7]:
# Display the shape of the fourth test matrix (index 3).
data_test_all[3].shape

(25000, 99627)

In [8]:
from sklearn.preprocessing import MaxAbsScaler
# One MaxAbsScaler per matrix set, fitted on the training rows only.
# A list comprehension (rather than ``map``) guarantees an indexable list,
# which the ``normalizer_all[i]`` lookups below require.
normalizer_all = [MaxAbsScaler().fit(matrix) for matrix in data_train_all]

# NOTE(review): the two ``transform`` results below are discarded, so
# ``data_train`` / ``data_test`` stay unscaled, and this scaler is fitted on
# the test matrix as well as the training one. Confirm whether scaling the
# TF-IDF matrices was actually intended before relying on it.
scaler = MaxAbsScaler()
scaler.partial_fit(data_test)
scaler.partial_fit(data_train)
scaler.transform(data_test)
scaler.transform(data_train)

# Scale every training / test matrix with the scaler fitted on its own training rows.
data_train_all_norm = [normalizer_all[i].transform(data_train_all[i]) for i in range(len(data_train_all))]
data_test_all_norm = [normalizer_all[i].transform(data_test_all[i]) for i in range(len(data_test_all))]


In [9]:
# Univariate selection with f_classif: keep 1/100 of the columns of each
# matrix. ``//`` keeps ``k`` an integer whatever division semantics are
# active. (A stand-alone fit on data_train_all[0] whose result was thrown
# away has been dropped; the comprehension below already covers index 0.)
fselect_all = [
    SelectKBest(f_classif, k=matrix.shape[1] // 100).fit(matrix, label_train_tw)
    for matrix in data_train_all]
data_train_all_selec = [
    selector.transform(matrix)
    for selector, matrix in zip(fselect_all, data_train_all)]
data_test_all_selec = [
    selector.transform(matrix)
    for selector, matrix in zip(fselect_all, data_test_all)]

# Same selection, this time on the MaxAbs-scaled matrices.
fselect_all_norm = [
    SelectKBest(f_classif, k=matrix.shape[1] // 100).fit(matrix, label_train_tw)
    for matrix in data_train_all_norm]

data_train_all_norm_selec = [
    selector.transform(matrix)
    for selector, matrix in zip(fselect_all_norm, data_train_all_norm)]
data_test_all_norm_selec = [
    selector.transform(matrix)
    for selector, matrix in zip(fselect_all_norm, data_test_all_norm)]


  f = msb / msw


In [10]:
# Work with the fourth scaled matrix set (index 3) and drop its last 25
# columns. NOTE(review): the reason for trimming 25 columns is not stated
# in the notebook - confirm.
data_train_tw = data_train_all_norm[3][:, 0:-25]
data_test_tw = data_test_all_norm[3][:, 0:-25]



In [None]:
# Sanity check: show the shapes of the trimmed training and test matrices.
print data_train_tw.shape, data_test_tw.shape

(25000, 99602) (25000, 99602)


# Logistic Regression
- Adding the predicted probability (instead of the predicted label) works better for the final prediction.

In [None]:
from sklearn.grid_search import GridSearchCV
from sklearn.linear_model import LogisticRegression

# Grid for the inverse regularisation strength C. A coarser sweep,
# np.logspace(2, 5, 20), used to be assigned first and was immediately
# overwritten; only the range that is actually searched is kept.
Cs = {'C': np.linspace(1000, 1500, 10)}

# Grid-search an L2-penalised logistic regression on the TF-IDF features.
lrtf = GridSearchCV(LogisticRegression(penalty='l2'), Cs, n_jobs=-1)
lrtf = lrtf.fit(data_train, label_train)
predicted_label_lrtf = lrtf.predict(data_test)


print("LogReg - Best C & associated score", lrtf.best_params_, lrtf.best_score_)

In [None]:
from sklearn.grid_search import GridSearchCV
from sklearn.linear_model import LogisticRegression

# C grid: 20 log-spaced values between 1 and 100.
Cs = {'C': np.logspace(0, 2, 20)}

# Grid-search an L2-penalised logistic regression on the TW-IDF features.
lrtw = GridSearchCV(LogisticRegression(penalty='l2'), Cs, n_jobs=-1)
lrtw = lrtw.fit(data_train_tw, label_train_tw)
predicted_label_lrtw = lrtw.predict(data_test_tw)

print("LogReg - Best C & associated score", lrtw.best_params_, lrtw.best_score_)

# SGD Classifier
- Only the predicted labels are used here (note: scikit-learn does expose `predict_proba` for `SGDClassifier` when `loss='modified_huber'`)

In [None]:
#from sklearn.linear_model import SGDClassifier
#alphas = np.logspace(-6, -2, 10)
#sgd_tf = GridSearchCV(SGDClassifier(loss='modified_huber', n_iter=200, random_state=0, shuffle=True, penalty='l2')
#                     ,dict(alpha=alphas)
#                     ,n_jobs = -1
#                     ,cv=10)
#sgd_tf.fit( data_train, label_train )
#predicted_label_SGD_TF = sgd_tf.predict(data_test)
#print("SGD squared hinge: Best alpha and associated score: ", sgd_tf.best_params_, sgd_tf.best_score_)


In [None]:
from sklearn.linear_model import SGDClassifier
# Regularisation grid: 15 log-spaced alpha values between 1e-5 and 1.
alphas = np.logspace(-5, 0, 15)
# 10-fold grid search of a linear model trained by SGD with the
# modified Huber loss and an L2 penalty.
sgd_tw = GridSearchCV(
    SGDClassifier(loss='modified_huber', n_iter=500, random_state=0,
                  shuffle=True, penalty='l2'),
    dict(alpha=alphas),
    n_jobs=-1,
    cv=10)
sgd_tw.fit(data_train_tw, label_train)
predicted_label_SGD_TW = sgd_tw.predict(data_test_tw)
# The label now names the loss that is actually used (it said "squared hinge").
print("SGD modified huber: Best alpha and associated score: ", sgd_tw.best_params_, sgd_tw.best_score_)

# LinearSVC
- Does not predict probabilities, only labels

In [None]:
#from sklearn.svm import LinearSVC

#Cs = {'C': np.logspace(-2, 1, 15)}
#svc_tf = GridSearchCV(LinearSVC(penalty = 'l2'), Cs, n_jobs = -1)
#svc_tf.fit(data_train, label_train)
#predicted_label_SVC_TF = svc_tf.predict(data_test)

#print("Linear SVC - Best C & associated score", svc_tf.best_params_, svc_tf.best_score_)

In [None]:
from sklearn.svm import LinearSVC
# C grid: 30 log-spaced values between 1e-3 and 1.
Cs = {'C': np.logspace(-3, 0, 30)}
# Grid-search an L2-penalised linear SVM on the TW-IDF features.
svc_tw = GridSearchCV(LinearSVC(penalty='l2'), Cs, n_jobs=-1)
svc_tw.fit(data_train_tw, label_train)
predicted_label_SVC_TW = svc_tw.predict(data_test_tw)

print("Linear SVC - Best C & associated score", svc_tw.best_params_, svc_tw.best_score_)

# Multinomial Naive Bayes
- Able to return the label, the proba, and the log_proba

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Smoothing grid: 80 log-spaced alpha values between 1e-4 and 10.
alphas = {'alpha': np.logspace(-4, 1, 80)}
# Grid-search a multinomial naive Bayes model on the TF-IDF features.
multinom_tf = GridSearchCV(MultinomialNB(), alphas, n_jobs=-1)
multinom_tf.fit(data_train, label_train)
predicted_label_MN_TF = multinom_tf.predict(data_test)

print("Multinomial - Best alpha & associated score", multinom_tf.best_params_, multinom_tf.best_score_)

In [None]:
from sklearn.naive_bayes import BernoulliNB

# Smoothing grid: 80 log-spaced alpha values between 1e-2 and 1e4.
alphas = {'alpha': np.logspace(-2, 4, 80)}
# Grid-search a Bernoulli naive Bayes model on the TW-IDF features.
nbtw = GridSearchCV(BernoulliNB(), alphas, n_jobs=-1)
nbtw.fit(data_train_tw, label_train_tw)
predicted_label_NB_TW = nbtw.predict(data_test_tw)

print("Bernouilli - Best alpha & associated score", nbtw.best_params_, nbtw.best_score_)

# ExtraTrees
- Can predict label, proba AND log proba

In [None]:
from sklearn.ensemble import ExtraTreesClassifier

# 300 fully grown extremely-randomised trees on the TF-IDF features.
# min_samples_split is 2 (the smallest node that can be split at all):
# an integer value of 1 is rejected by newer scikit-learn releases and a
# one-sample node cannot be split anyway, so the fitted trees are unchanged.
extratrees_tf = ExtraTreesClassifier(
    n_estimators=300,
    max_depth=None,
    min_samples_split=2,
    random_state=0,
    n_jobs=-1)
extratrees_tf.fit(data_train, label_train)
predicted_label_extratrees_tf = extratrees_tf.predict(data_test)

In [None]:
# 300 fully grown extremely-randomised trees on the TW-IDF features.
# min_samples_split is 2 (the smallest node that can be split at all):
# an integer value of 1 is rejected by newer scikit-learn releases and a
# one-sample node cannot be split anyway, so the fitted trees are unchanged.
extratrees_tw = ExtraTreesClassifier(
    n_estimators=300,
    max_depth=None,
    min_samples_split=2,
    random_state=0,
    n_jobs=-1)
extratrees_tw.fit(data_train_tw, label_train)
predicted_label_extratrees_tw = extratrees_tw.predict(data_test_tw)

# AdaBoost
- Can predict label, proba AND log proba

In [114]:
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost with 300 boosting rounds on the TF-IDF features. The seed is
# fixed (0, as for the other seeded models in this notebook) so that a
# re-run reproduces the same model.
adaboost_tf = AdaBoostClassifier(n_estimators=300, random_state=0)
adaboost_tf.fit(data_train, label_train)
predicted_label_adaboost_tf = adaboost_tf.predict(data_test)

In [115]:
# AdaBoost with 300 boosting rounds on the TW-IDF features. The seed is
# fixed (0, as for the other seeded models in this notebook) so that a
# re-run reproduces the same model.
adaboost_tw = AdaBoostClassifier(n_estimators=300, random_state=0)
adaboost_tw.fit(data_train_tw, label_train)
predicted_label_adaboost_tw = adaboost_tw.predict(data_test_tw)

# Assembling the results

In [116]:
import math
def csr_vappend(a, b):
    """Stack ``a`` and ``b`` side by side into a single CSR matrix.

    Each argument may be a plain Python list (treated as one column), a CSR
    matrix (used as is), or anything else ``scipy.sparse.csr_matrix``
    accepts (converted).

    Parameters
    ----------
    a, b : list, scipy.sparse matrix or array-like
        Blocks to place next to each other; after conversion they must have
        the same number of rows.

    Returns
    -------
    scipy.sparse.csr_matrix
        The columns of ``a`` followed by the columns of ``b``.
    """
    import scipy.sparse

    def _as_csr(block):
        # A flat list becomes an (n, 1) column vector.
        if isinstance(block, list):
            block = np.array([block]).T
        # Use the public class and ``isinstance`` rather than an exact-type
        # comparison against the private ``scipy.sparse.csr`` module path.
        if not isinstance(block, scipy.sparse.csr_matrix):
            block = scipy.sparse.csr_matrix(block)
        return block

    return scipy.sparse.hstack([_as_csr(a), _as_csr(b)], format='csr')

In [117]:
# Build the second-layer feature matrices (new_feat_train / new_feat_test):
# each first-layer model contributes its predicted label and/or the first
# column of its predict_proba output, appended column by column.
#
# NOTE(review): the training-side columns are obtained by predicting on the
# very rows the first-layer models were fitted on (data_train /
# data_train_tw), not on held-out folds, so they may look far better than
# the test-side columns. Consider out-of-fold predictions.
# NOTE(review): new_feat_train / new_feat_test are first bound inside the
# `if(add_proba)` branch below; with add_proba set to 0 the later
# csr_vappend calls would reference an undefined name.
###### LOG REG ######
# Switches: 1 enables the predicted-label / predicted-probability columns.
add_label = 1
add_proba = 1
# Add PROBA Logistic Regression TF (uses lrtf on the TF-IDF matrices)
if(add_proba):
    new_feat_train = lrtf.predict_proba(data_train)[:,0].tolist()
    new_feat_test = lrtf.predict_proba(data_test)[:,0].tolist()

# Add PROBA Logistic Regression TW
#if(add_proba):
#    new_feat_train = csr_vappend(new_feat_train, lrtw.predict_proba(data_train_tw)[:,0].tolist())
#    new_feat_test = csr_vappend(new_feat_test, lrtw.predict_proba(data_test_tw)[:,0].tolist())

###### SGD ########

# Add SGD TF
#new_feat_train = csr_vappend(new_feat_train, sgd_tf.predict(data_train).tolist())
#new_feat_test = csr_vappend(new_feat_test, sgd_tf.predict(data_test).tolist())

# Add SGD TW (labels only; appended regardless of add_label)
new_feat_train = csr_vappend(new_feat_train, sgd_tw.predict(data_train_tw).tolist())
new_feat_test = csr_vappend(new_feat_test, sgd_tw.predict(data_test_tw).tolist())

###### LINEAR SVC #######

# Add Linear SVC TF
#new_feat_train = csr_vappend(new_feat_train, svc_tf.predict(data_train).tolist())
#new_feat_test = csr_vappend(new_feat_test, svc_tf.predict(data_test).tolist())

# Add Linear SVC TW (labels only; appended regardless of add_label)
new_feat_train = csr_vappend(new_feat_train, svc_tw.predict(data_train_tw).tolist())
new_feat_test = csr_vappend(new_feat_test, svc_tw.predict(data_test_tw).tolist())

###### MULTINOMIAL NAIVE BAYES ######

# Add Multinomial TF
if(add_label):
    new_feat_train = csr_vappend(new_feat_train, multinom_tf.predict(data_train).tolist())
    new_feat_test = csr_vappend(new_feat_test, multinom_tf.predict(data_test).tolist())

# Add PROBA Multinomial TF
if(add_proba):
    new_feat_train = csr_vappend(new_feat_train, multinom_tf.predict_proba(data_train)[:,0].tolist())
    new_feat_test = csr_vappend(new_feat_test, multinom_tf.predict_proba(data_test)[:,0].tolist())

# Add Bernouilli TW
if(add_label):
    new_feat_train = csr_vappend(new_feat_train, nbtw.predict(data_train_tw).tolist())
    new_feat_test = csr_vappend(new_feat_test, nbtw.predict(data_test_tw).tolist())

# Add PROBA Bernouilli TW
if(add_proba):
    new_feat_train = csr_vappend(new_feat_train, nbtw.predict_proba(data_train_tw)[:,0].tolist())
    new_feat_test = csr_vappend(new_feat_test, nbtw.predict_proba(data_test_tw)[:,0].tolist())

In [118]:
##### EXTRA TREES #######
# Append the ExtraTrees outputs (predicted label and first predict_proba
# column) for both feature sets to the second-layer matrices.
# NOTE(review): the training-side values are predictions on the rows the
# trees were fitted on, not held-out predictions.

# Add TREES TF
if(add_label):
    new_feat_train = csr_vappend(new_feat_train, extratrees_tf.predict(data_train).tolist())
    new_feat_test = csr_vappend(new_feat_test, extratrees_tf.predict(data_test).tolist())

# Add PROBA TREES TF
if(add_proba):
    new_feat_train = csr_vappend(new_feat_train, extratrees_tf.predict_proba(data_train)[:,0].tolist())
    new_feat_test = csr_vappend(new_feat_test, extratrees_tf.predict_proba(data_test)[:,0].tolist())



# Add TREES TW
if(add_label):
    new_feat_train = csr_vappend(new_feat_train, extratrees_tw.predict(data_train_tw).tolist())
    new_feat_test = csr_vappend(new_feat_test, extratrees_tw.predict(data_test_tw).tolist())

# Add PROBA TREES TW
if(add_proba):
    new_feat_train = csr_vappend(new_feat_train, extratrees_tw.predict_proba(data_train_tw)[:,0].tolist())
    new_feat_test = csr_vappend(new_feat_test, extratrees_tw.predict_proba(data_test_tw)[:,0].tolist())



In [119]:
##### ADABOOST #######
# Append the AdaBoost outputs (predicted label and first predict_proba
# column) for both feature sets to the second-layer matrices.
# NOTE(review): the training-side values are predictions on the rows the
# models were fitted on, not held-out predictions.

# Add adaboost TF
if(add_label):
    new_feat_train = csr_vappend(new_feat_train, adaboost_tf.predict(data_train).tolist())
    new_feat_test = csr_vappend(new_feat_test, adaboost_tf.predict(data_test).tolist())

# Add PROBA adaboost TF
if(add_proba):
    new_feat_train = csr_vappend(new_feat_train, adaboost_tf.predict_proba(data_train)[:,0].tolist())
    new_feat_test = csr_vappend(new_feat_test, adaboost_tf.predict_proba(data_test)[:,0].tolist())


# Add adaboost TW
if(add_label):
    new_feat_train = csr_vappend(new_feat_train, adaboost_tw.predict(data_train_tw).tolist())
    new_feat_test = csr_vappend(new_feat_test, adaboost_tw.predict(data_test_tw).tolist())

# Add PROBA adaboost TW
if(add_proba):
    new_feat_train = csr_vappend(new_feat_train, adaboost_tw.predict_proba(data_train_tw)[:,0].tolist())
    new_feat_test = csr_vappend(new_feat_test, adaboost_tw.predict_proba(data_test_tw)[:,0].tolist())


In [120]:
# Add created features: append every column of the hand-made feature
# matrices to the second-layer matrices.
new_feat_train = csr_vappend(new_feat_train, created_features_train)
new_feat_test = csr_vappend(new_feat_test, created_features_test)

In [121]:
# Second-layer features followed by the selected TF-IDF columns.
# NOTE(review): data_train_ / data_test_ are not referenced again in the
# visible cells; the second-layer model is fitted on new_feat_train only.
data_train_ = csr_vappend(new_feat_train, data_train)
data_test_ = csr_vappend(new_feat_test, data_test)

# Second Layer Algorithm

In [123]:
# Second layer: grid-search an L2-penalised logistic regression on the
# stacked first-layer outputs (new_feat_train), then predict the test rows.
# C grid: 15 evenly spaced values between 0.001 and 0.6.
Cs = {'C': np.linspace(0.001, 0.6, 15)}

logreg_final = GridSearchCV(LogisticRegression(penalty = 'l2'), Cs, n_jobs = -1)
logreg_final.fit(new_feat_train, label_train)
predicted_label_logreg = logreg_final.predict(new_feat_test)


# NOTE(review): the message says "SVM" although the estimator is a
# LogisticRegression. The recorded best score of 1.0 is consistent with the
# training-side stacked features being in-sample predictions of the
# first-layer models; treat it as optimistic until checked on held-out data.
print("SVM - Best C & associated score", logreg_final.best_params_, logreg_final.best_score_)

('SVM - Best C & associated score', {'C': 0.043785714285714282}, 1.0)


In [127]:

def save_submission(labels, filepath):
    """Write predictions to ``filepath``, one tab-separated line per label.

    Line ``i`` holds the zero-padded five-digit index ``i``, a tab, and
    ``str(labels[i])``.

    Parameters
    ----------
    labels : sequence
        Predicted labels, in row order.
    filepath : str
        Destination file; it is overwritten if it already exists.
    """
    # Iterate over the labels themselves instead of a fixed count of 25000,
    # so a shorter input does not raise IndexError and a longer one is not
    # silently truncated.
    with open(filepath, 'w') as out:
        for index, label in enumerate(labels):
            out.write("{:05d}\t{}\n".format(index, str(label)))

# Write the second-layer logistic-regression predictions to the submission file.
save_submission(predicted_label_logreg, "submission_logreg.txt")
