In [1]:
import csv

import numpy as np
import scipy
import scipy.sparse
import sklearn

def load_sparse_csr(filename):
    """Load a CSR sparse matrix stored as an ``.npz`` archive.

    The archive is expected to hold four arrays named ``data``,
    ``indices``, ``indptr`` and ``shape`` (the CSR components).

    Parameters
    ----------
    filename : str
        Path of the ``.npz`` file.

    Returns
    -------
    scipy.sparse.csr_matrix
    """
    # np.load keeps the archive open; the with-block closes it once the
    # arrays have been read, so no file descriptor is leaked.
    with np.load(filename) as loader:
        return scipy.sparse.csr_matrix(
            (loader['data'], loader['indices'], loader['indptr']),
            shape=loader['shape'])
        
def load_csv(filename):
    """Read the first column of a CSV file as a list of floats.

    Blank (or whitespace-only) lines are skipped instead of raising
    ``IndexError``, so a trailing empty line in a label file is harmless.

    Parameters
    ----------
    filename : str
        Path of a text file holding one numeric value per line.

    Returns
    -------
    list of float

    Raises
    ------
    ValueError
        If a non-blank first field cannot be parsed as a float.
    """
    with open(filename, 'r') as csvfile:
        # The default dialect is enough for one value per line; a newline
        # "delimiter" is not needed to split records.
        reader = csv.reader(csvfile)
        return [float(row[0]) for row in reader if row and row[0].strip()]
    
def load_feature_names(filename):
    """Return every record of ``filename`` as parsed by ``csv.reader``.

    The reader is configured with a newline delimiter, so each record is
    returned as a list of fields (normally a single-element list per line).
    """
    with open(filename, 'r') as csvfile:
        return list(csv.reader(csvfile, delimiter='\n'))
    
def load_sparse_coo(filename):
    """Load a COO sparse matrix stored as an ``.npz`` archive.

    The archive is expected to hold four arrays named ``data``, ``row``,
    ``col`` and ``shape`` (the COO components).

    Parameters
    ----------
    filename : str
        Path of the ``.npz`` file.

    Returns
    -------
    scipy.sparse.coo_matrix
    """
    # Close the archive as soon as the arrays have been read.
    with np.load(filename) as loader:
        return scipy.sparse.coo_matrix(
            (loader['data'], (loader['row'], loader['col'])),
            shape=loader['shape'])

In [2]:
# Directory holding the main train/test matrices and their labels.
path = "bapt_tfidf/"

# Sparse design matrices, train first, then test.
data_train, data_test = (
    load_sparse_csr(path + matrix_file)
    for matrix_file in ('data_train.npz', 'data_test.npz')
)

# One label per row of the matrices above.
label_train, label_test = (
    load_csv(path + labels_file)
    for labels_file in ('label_train.csv', 'label_test.csv')
)

# NOTE(review): read from 'data/', not from `path` - confirm this is intended.
features_names = load_feature_names('data/feature_names.csv')

In [3]:
from sklearn.metrics import zero_one_loss
def score(true_label, predicted_label):
    """Return one minus the zero-one loss of ``predicted_label`` against ``true_label``."""
    error_rate = zero_one_loss(true_label, predicted_label)
    return 1 - error_rate

In [4]:
# Load the five 'tw_sw{k}' matrix variants (k = 1..5): the "all" files are
# stored in CSR layout, the others in COO layout.
data_train_all, data_test_all = [], []
data_train_sep, data_test_sep = [], []
for k in range(1, 6):
    data_train_all.append(load_sparse_csr('tw_sw{}_all_train_train.npz'.format(k)))
    data_test_all.append(load_sparse_csr('tw_sw{}_all_train_test.npz'.format(k)))
    data_train_sep.append(load_sparse_coo('tw_sw{}_train_train.npz'.format(k)))
    data_test_sep.append(load_sparse_coo('tw_sw{}_train_test.npz'.format(k)))

In [5]:
# Labels matching the rows of the 'tw_sw*' train / test matrices.
label_train_tw, label_test_tw = (
    load_csv(labels_file)
    for labels_file in ('labels_train_train.csv', 'labels_train_test.csv')
)

In [6]:
# Import from the public package: the `univariate_selection` submodule is a
# private implementation detail that is not importable in newer scikit-learn.
from sklearn.feature_selection import SelectKBest, chi2, f_classif

# Keep the 80000 columns with the highest chi-squared score. The selector is
# fitted on the training split only and then applied unchanged to the test split.
# NOTE: data_train / data_test are overwritten in place, so this cell should be
# run exactly once after the loading cell.
fselect = SelectKBest(chi2, k=80000)
data_train = fselect.fit_transform(data_train, label_train)
data_test = fselect.transform(data_test)


  chisq /= f_exp


In [7]:
from sklearn.preprocessing import MaxAbsScaler

# One scaler per matrix variant, fitted on that variant's training part.
# Built as lists (not map objects) because they are indexed below; a bare
# map() is not subscriptable on Python 3.
normalizer_all = [MaxAbsScaler().fit(x) for x in data_train_all]
normalizer_sep = [MaxAbsScaler().fit(x) for x in data_train_sep]

scaler = MaxAbsScaler()
scaler.partial_fit(data_test)
scaler.partial_fit(data_train)
# NOTE(review): the two results below are discarded, so data_train / data_test
# stay unscaled for every later cell. The scaler is also fitted on the test
# split. Confirm whether scaling of the main matrices was actually intended.
scaler.transform(data_test)
scaler.transform(data_train)

# Apply each variant's own scaler to both its train and test matrices.
data_train_all_norm = [normalizer_all[i].transform(data_train_all[i]) for i in range(len(data_train_all))]
data_test_all_norm = [normalizer_all[i].transform(data_test_all[i]) for i in range(len(data_test_all))]
data_train_sep_norm = [normalizer_sep[i].transform(data_train_sep[i]) for i in range(len(data_train_sep))]
data_test_sep_norm = [normalizer_sep[i].transform(data_test_sep[i]) for i in range(len(data_test_sep))]

In [None]:
def _fit_selectors(matrices, labels):
    """Fit one SelectKBest(f_classif) per matrix, keeping 1/100 of its columns."""
    # Floor division: k must be an integer (plain `/` yields a float on Python 3).
    return [SelectKBest(f_classif, k=matrix.shape[1] // 100).fit(matrix, labels)
            for matrix in matrices]


def _apply_selectors(selectors, matrices):
    """Transform each matrix with the selector at the same position."""
    return [selector.transform(matrix)
            for selector, matrix in zip(selectors, matrices)]


# Selection on the raw variants (selectors are fitted on the training part only).
fselect_all = _fit_selectors(data_train_all, label_train_tw)
fselect_sep = _fit_selectors(data_train_sep, label_train_tw)
data_train_all_selec = _apply_selectors(fselect_all, data_train_all)
data_test_all_selec = _apply_selectors(fselect_all, data_test_all)
data_train_sep_selec = _apply_selectors(fselect_sep, data_train_sep)
data_test_sep_selec = _apply_selectors(fselect_sep, data_test_sep)

# Same selection on the max-abs scaled variants.
fselect_all_norm = _fit_selectors(data_train_all_norm, label_train_tw)
fselect_sep_norm = _fit_selectors(data_train_sep_norm, label_train_tw)
data_train_all_norm_selec = _apply_selectors(fselect_all_norm, data_train_all_norm)
data_test_all_norm_selec = _apply_selectors(fselect_all_norm, data_test_all_norm)
data_train_sep_norm_selec = _apply_selectors(fselect_sep_norm, data_train_sep_norm)
data_test_sep_norm_selec = _apply_selectors(fselect_sep_norm, data_test_sep_norm)

# Variant used by the later cells: index 3, i.e. the 'tw_sw4_all' matrices,
# scaled but without feature selection.
data_train_tw = data_train_all_norm[3]
data_test_tw = data_test_all_norm[3]

  f = msb / msw


# One classifier

In [None]:
# GridSearchCV lives in sklearn.model_selection since scikit-learn 0.18;
# fall back to the old location for older installations.
# NOTE: the default number of CV folds may differ between versions.
try:
    from sklearn.model_selection import GridSearchCV
except ImportError:
    from sklearn.grid_search import GridSearchCV
from sklearn.linear_model import LogisticRegression

# 20 candidate values of C, log-spaced between 1e-1 and 1e3.
Cs = {'C': np.logspace(-1, 3, 20)}

# L2-penalised logistic regression on the main (chi2-selected) matrices.
lrtf = GridSearchCV(LogisticRegression(penalty = 'l2'), Cs, n_jobs = -1)
lrtf = lrtf.fit(data_train, label_train)
predicted_label_lrtf = lrtf.predict(data_test)

# The model is a logistic regression, so the messages no longer say "SVM".
print("LogReg - Best C & associated score", lrtf.best_params_, lrtf.best_score_)
print("LogReg - Score on test_data : ", score(label_test, predicted_label_lrtf))

In [None]:
# GridSearchCV lives in sklearn.model_selection since scikit-learn 0.18;
# fall back to the old location for older installations.
# NOTE: the default number of CV folds may differ between versions.
try:
    from sklearn.model_selection import GridSearchCV
except ImportError:
    from sklearn.grid_search import GridSearchCV
from sklearn.linear_model import LogisticRegression

# 20 candidate values of C, log-spaced between 1e-1 and 1e3.
Cs = {'C': np.logspace(-1, 3, 20)}

# L2-penalised logistic regression on the 'tw' matrices.
lrtw = GridSearchCV(LogisticRegression(penalty = 'l2'), Cs, n_jobs = -1)
lrtw = lrtw.fit(data_train_tw, label_train_tw)
predicted_label_lrtw = lrtw.predict(data_test_tw)

# The model is a logistic regression, so the messages no longer say "SVM".
print("LogReg - Best C & associated score", lrtw.best_params_, lrtw.best_score_)
# Score against the labels that belong to data_test_tw (label_test_tw), not
# the labels of the other test matrix.
print("LogReg - Score on test_data : ", score(label_test_tw, predicted_label_lrtw))

In [None]:
from sklearn.linear_model import LogisticRegression

# Plain L2 logistic regression with default hyper-parameters; reused later
# as a member of the voting ensembles.
alg1 = LogisticRegression(penalty = 'l2')
alg1.fit(data_train, label_train)
predicted_label1 = alg1.predict(data_test)
probability1 = alg1.predict_proba(data_test)

# score() expects (true_label, predicted_label), in that order.
print(score(label_test, predicted_label1))

In [None]:
from sklearn.linear_model import SGDClassifier

# Linear model trained by SGD with the modified Huber loss and a fixed seed.
alg2 = SGDClassifier(
    loss='modified_huber',
    penalty='l2',
    n_iter=100,
    shuffle=True,
    random_state=0,
)
alg2.fit(data_train, label_train)

# Predictions on both splits; the train predictions are reused later as an
# extra feature column.
predicted_train_SGD = alg2.predict(data_train)
predicted_label_SGD = alg2.predict(data_test)
print(score(label_test, predicted_label_SGD))
#print alg2.coef_

In [None]:
from sklearn.svm import LinearSVC
from sklearn import cross_validation

# Linear SVM (primal form) with a fixed, hard-coded C.
alg3 = LinearSVC(penalty="l2",dual=False, C=0.48329302385717521)
alg3.fit(data_train, label_train)
# Keep the SVM predictions under their own name: the comparison cell further
# down needs them, and `predicted_label` is overwritten by later cells.
predicted_label_SVM = alg3.predict(data_test)
predicted_label = predicted_label_SVM

# Take the mean of the scores (because we have one for each fold)
#scores = cross_validation.cross_val_score(alg3, data_train, label_train, cv=10)
#print("linear svc - Mean score on cross val", scores.mean())
# score() expects (true_label, predicted_label), in that order.
print("linear svc  - Score on test_data : ", score(label_test, predicted_label_SVM))
print("linear svc - Score on train_data : ", score(label_train, alg3.predict(data_train)))

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn import cross_validation

# Multinomial naive Bayes with a fixed, hard-coded smoothing parameter.
alg4 = MultinomialNB(alpha = 0.0082864277285468416)
alg4.fit(data_train, label_train)
predicted_label = alg4.predict(data_test)

# Take the mean of the scores (because we have one for each fold)
#scores = cross_validation.cross_val_score(alg4, data_train, label_train, cv=10)
#print("MNB - Mean score on cross val", scores.mean())
# score() expects (true_label, predicted_label), in that order.
print("MNB  - Score on test_data : ", score(label_test, predicted_label))
print("MNB - Score on train_data : ", score(label_train, alg4.predict(data_train)))

In [None]:
from sklearn import cross_validation
from sklearn.ensemble import RandomForestClassifier

# Hyper-parameters:
#   n_estimators      - number of trees to grow (higher is generally better)
#   min_samples_split - minimum number of rows required to make a split
#   min_samples_leaf  - minimum number of samples allowed in a leaf
alg = RandomForestClassifier(
    n_estimators=2000,
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=1,
)

# Both fit and predict use every column except the last one.
alg.fit(data_train[:, :-1], label_train)
predicted_label = alg.predict(data_test[:, :-1])


#scores = cross_validation.cross_val_score(alg, data_train, label_train, cv=5)
#print("Random Forest - Mean score on cross val", scores.mean())
print("Random Forest - Score on test_data : ", score(label_test, predicted_label))

In [None]:
def error_similarity(l1, l2, true_label=None):
    """Measure how many of the mistakes in ``l1`` are also mistakes in ``l2``.

    Parameters
    ----------
    l1 : sequence
        Reference predictions.
    l2 : sequence
        Predictions compared against the reference.
    true_label : sequence, optional
        Ground-truth labels. Defaults to the notebook-level ``label_test``.

    Returns
    -------
    tuple
        ``(ref_diff, all_diff, percent)`` where ``ref_diff`` is the number of
        positions where ``l1`` is wrong, ``all_diff`` the number of those
        positions where ``l2`` is wrong too, and ``percent`` is
        ``100 * all_diff / ref_diff`` (``0.0`` when ``l1`` makes no mistake,
        instead of raising ``ZeroDivisionError``).
    """
    if true_label is None:
        true_label = label_test

    ref_diff = 0
    all_diff = 0
    for i, label in enumerate(l1):
        if label != true_label[i]:
            ref_diff += 1
            if true_label[i] != l2[i]:
                all_diff += 1

    percent = float(all_diff) / ref_diff * 100 if ref_diff else 0.0
    return ref_diff, all_diff, percent

In [132]:
# Define every prediction vector compared below, so the cell also runs on a
# fresh kernel: the earlier cells name the grid-searched models `lrtf` / `lrtw`
# and only store the SVM predictions under the generic `predicted_label`.
predicted_label_SVM = alg3.predict(data_test)
predicted_label_lr_tf = predicted_label_lrtf
predicted_label_lr_tw = predicted_label_lrtw

# Each line prints (errors of first model, errors shared with second model, share in %).
# print(x) with a single argument behaves the same on Python 2 and 3.
print(error_similarity(predicted_label_SGD, predicted_label_SVM))
print(error_similarity(predicted_label_SGD, predicted_label_lr_tf))
print(error_similarity(predicted_label_lr_tw, predicted_label_lr_tf))
print(error_similarity(predicted_label_SGD, predicted_label_lr_tw))
# `predicted_label` holds whatever the most recently run classifier cell stored.
print(error_similarity(predicted_label, predicted_label_SGD))

(686, 680, 99.12536443148689)
(686, 67, 9.7667638483965)
(742, 61, 8.221024258760108)
(686, 680, 99.12536443148689)
(489, 244, 49.897750511247445)


## Adding features

In [112]:
def csr_vappend(a,b):
    """Append ``b`` as extra column(s) to the right of ``a``.

    Parameters
    ----------
    a : sparse matrix or array-like
        Left-hand matrix; converted to CSR when it is not CSR already.
    b : list, np.ndarray or sparse matrix
        A list is treated as one column. A single row whose length equals
        the number of rows of ``a`` (for instance a 1-D array) is also
        turned into a column.

    Returns
    -------
    scipy.sparse.csr_matrix
        ``a`` and ``b`` stacked horizontally.
    """
    # isinstance on the public class avoids the private scipy.sparse.csr path.
    if not isinstance(a, scipy.sparse.csr_matrix):
        a = scipy.sparse.csr_matrix(a)

    if isinstance(b, list):
        b = np.array([b]).T
    if not isinstance(b, scipy.sparse.csr_matrix):
        b = scipy.sparse.csr_matrix(b)

    # A 1-D array is converted to a single row; transpose it when it can only
    # fit as a column (this case used to fail in hstack).
    if b.shape[0] != a.shape[0] and b.shape == (1, a.shape[0]):
        b = b.T

    return scipy.sparse.hstack([a,b], format ='csr')

In [113]:
# Sanity check: the prediction column must have as many rows as data_test.
# The grid-searched model on the 'tw' matrices is named `lrtw`.
b = lrtw.predict(data_test_tw).tolist()
# Formatted so the output is the same on Python 2 and 3.
print("{} {}".format(np.array([b]).T.shape, data_test.shape))


(6250, 1) (6250, 80000)


In [155]:
# Add the predictions of the 'tw' logistic regression (`lrtw`) as one extra
# feature column. NOTE: data_train / data_test are modified in place, so
# running this cell twice appends the column twice.
data_train = csr_vappend(data_train, lrtw.predict(data_train_tw).tolist())
data_test = csr_vappend(data_test, lrtw.predict(data_test_tw).tolist())
#data_train = csr_vappend(data_train, alg2.predict(data_train).tolist())
#data_test = csr_vappend(data_test, alg2.predict(data_test).tolist())

In [156]:
# Add the SGD classifier's predictions as one extra feature column.
# NOTE(review): data_train / data_test are overwritten in place, so running
# this cell twice appends the column twice. The train column holds predictions
# the classifier made on its own training data - confirm this is intended.
data_train = csr_vappend(data_train, predicted_train_SGD.tolist())
data_test = csr_vappend(data_test, predicted_label_SGD.tolist())

# Majority Vote

In [10]:
from sklearn.ensemble import VotingClassifier

# Hard (majority) vote over the four classifiers. The weights give 'mnb' a
# weight of 0, so only 'lr', 'sgd' and 'svm' influence the vote.
mv = VotingClassifier(estimators=[('lr', alg1), ('sgd', alg2), ('svm', alg3),('mnb', alg4)], voting='hard',weights=[1,1,1,0])
mv.fit(data_train, label_train)
predicted_label = mv.predict(data_test)

# Take the mean of the scores (because we have one for each fold)
#scores = cross_validation.cross_val_score(mv, data_train, label_train, cv=10)
#print("Majority vote - Mean score on cross val", scores.mean())
# score() expects (true_label, predicted_label), in that order.
print("Majority vote  - Score on test_data : ", score(label_test, predicted_label))
print("Majority vote - Score on train_data : ", score(label_train, mv.predict(data_train)))

('Majority vote  - Score on test_data : ', 0.90688)
('Majority vote - Score on train_data : ', 0.9937066666666666)


# Average Probability

In [11]:
from sklearn.ensemble import VotingClassifier

# Soft vote: average the predicted class probabilities of 'lr' and 'sgd'
# with equal weights.
ap = VotingClassifier(estimators=[('lr', alg1), ('sgd', alg2)], voting = 'soft', weights=[1,1]) #should normally be used on well-calibrated classifiers (not the case here)
ap.fit(data_train, label_train)
predicted_label = ap.predict(data_test)

# Take the mean of the scores (because we have one for each fold)
#scores = cross_validation.cross_val_score(ap, data_train, label_train, cv=10)
#print("AV - Mean score on cross val", scores.mean())
# score() expects (true_label, predicted_label), in that order.
print("AV  - Score on test_data : ", score(label_test, predicted_label))
print("AV - Score on train_data : ", score(label_train, ap.predict(data_train)))

('AV  - Score on test_data : ', 0.90304)
('AV - Score on train_data : ', 0.9809066666666667)
