# Load Data

In [2]:
import sklearn 
import numpy as np
import scipy
import csv

def load_sparse_csr(filename):
    loader = np.load(filename)
    return scipy.sparse.csr_matrix(( loader['data'], loader['indices'], loader['indptr']),
                     shape = loader['shape'])

def load_sparse_coo(filename):
    loader = np.load(filename)
    return scipy.sparse.coo_matrix((loader['data'],(loader['row'],loader['col'])),
                     shape = loader['shape'])

def load_csv(filename):
    with open(filename, 'r') as csvfile:
        reader = csv.reader(csvfile, delimiter = '\n')
        array = [float(row[0]) for row in reader]
        return array
    
def load_feature_names(filename):
    with open(filename, 'r') as csvfile:
        reader = csv.reader(csvfile, delimiter= '\n')
        array = [row for row in reader]
        return array

tf-idf

In [None]:
path = "simplified_data/"
label_train = load_csv(path+'label_train.csv')
label_test = load_csv(path+'label_test.csv')
data_train = load_sparse_csr(path+'data_train.npz')
data_test = load_sparse_csr(path+'data_test.npz')
features_names = load_feature_names('data/feature_names.csv')

tw-idf

In [3]:
label_train_tw = load_csv('labels_train_train.csv')
label_test_tw = load_csv('labels_train_test.csv')

In [4]:
data_train_all = [load_sparse_csr('tw_sw{}_all_train_train.npz'.format(k)) for k in range(1,6)]
data_test_all = [load_sparse_csr('tw_sw{}_all_train_test.npz'.format(k)) for k in range(1,6)]
data_train_sep = [load_sparse_coo('tw_sw{}_train_train.npz'.format(k)) for k in range(1,6)]
data_test_sep = [load_sparse_coo('tw_sw{}_train_test.npz'.format(k)) for k in range(1,6)]

# Normalizing

tw-idf

In [61]:
from sklearn.preprocessing import MaxAbsScaler
normalizer_all = map(lambda x: MaxAbsScaler().fit(x), data_train_all)
normalizer_sep = map(lambda x: MaxAbsScaler().fit(x), data_train_sep)

In [62]:
data_train_all_norm = [normalizer_all[i].transform(data_train_all[i]) for i in range(len(data_train_all))]
data_test_all_norm = [normalizer_all[i].transform(data_test_all[i]) for i in range(len(data_test_all))]
data_train_sep_norm = [normalizer_sep[i].transform(data_train_sep[i]) for i in range(len(data_train_sep))]
data_test_sep_norm = [normalizer_sep[i].transform(data_test_sep[i]) for i in range(len(data_test_sep))]

# Feature selection

In [63]:
from sklearn.feature_selection.univariate_selection import SelectKBest, chi2, f_classif
fselect = SelectKBest(chi2 , k=80000)

tf-idf

In [None]:
data_train = fselect.fit_transform(data_train, label_train)
data_test = fselect.transform(data_test)

tw-idf

In [75]:
data_train_all[0], label_train_tw[0]

(<18750x99627 sparse matrix of type '<type 'numpy.float64'>'
 	with 91517 stored elements in Compressed Sparse Row format>, 1.0)

In [76]:
SelectKBest(f_classif , k=data_train_all[0].shape[1]/100).fit(data_train_all[0],label_train_tw)

SelectKBest(k=996, score_func=<function f_classif at 0x11b32fc80>)

In [77]:
fselect_all = [SelectKBest(f_classif , k=data_train_all[i].shape[1]/100).fit(
        data_train_all[i],label_train_tw) for i in range(len(data_train_all))]
fselect_sep = [SelectKBest(f_classif , k=data_train_sep[i].shape[1]/100).fit(
        data_train_sep[i], label_train_tw) for i in range(len(data_train_sep))]

In [79]:
data_train_all_selec = [fselect_all[i].transform(data_train_all[i]) for i in range(len(data_train_all))]
data_test_all_selec = [fselect_all[i].transform(data_test_all[i]) for i in range(len(data_test_all))]
data_train_sep_selec = [fselect_sep[i].transform(data_train_sep[i]) for i in range(len(data_train_sep))]
data_test_sep_selec = [fselect_sep[i].transform(data_test_sep[i]) for i in range(len(data_test_sep))]

In [83]:
fselect_all_norm = [SelectKBest(f_classif , k=data_train_all_norm[i].shape[1]/100).fit(
        data_train_all_norm[i],label_train_tw) for i in range(len(data_train_all_norm))]
fselect_sep_norm = [SelectKBest(f_classif , k=data_train_sep[i].shape[1]/100).fit(
        data_train_sep_norm[i], label_train_tw) for i in range(len(data_train_sep_norm))]

In [84]:
data_train_all_norm_selec = [fselect_all_norm[i].transform(
        data_train_all_norm[i]) for i in range(len(data_train_all_norm))]
data_test_all_norm_selec = [fselect_all_norm[i].transform(
        data_test_all_norm[i]) for i in range(len(data_test_all_norm))]
data_train_sep_norm_selec = [fselect_sep_norm[i].transform(
        data_train_sep_norm[i]) for i in range(len(data_train_sep_norm))]
data_test_sep_norm_selec = [fselect_sep_norm[i].transform(
        data_test_sep_norm[i]) for i in range(len(data_test_sep_norm))]

# Models

In [4]:
def score(true_label, predicted_label):
    length = len(true_label)
    total = 0
    for i, label in enumerate(true_label):
        if label == predicted_label[i]:
            total += 1
    return float(total)/float(length)

## Logistic regression

In [87]:
from sklearn.linear_model import LogisticRegression
from sklearn import cross_validation

tf-idf

In [5]:
alg = LogisticRegression(penalty = 'l1')
alg.fit(data_train, label_train)
predicted_label = alg.predict(data_test)
print("LogReg - Penalty l1 - Score on test_data : ", score(label_test, predicted_label))

alg = LogisticRegression(penalty = 'l2')
alg.fit(data_train, label_train)
predicted_label = alg.predict(data_test)
print("LogReg - Penalty l2 - Score on test_data : ", score(label_test, predicted_label))

('LogReg - Penalty l1 - Score on test_data : ', 0.8632)
('LogReg - Penalty l2 - Score on test_data : ', 0.8944)


tw-idf

In [100]:
data_test_all_norm_selec[0].shape

(6250, 996)

L1 penality

In [104]:
from sklearn.metrics import zero_one_loss
log_all_tw = map(lambda x: LogisticRegression(penalty = 'l1').fit(x, label_train_tw), data_train_all_norm)
log_sep_tw = map(lambda x: LogisticRegression(penalty = 'l1').fit(x, label_train_tw), data_train_sep_norm)
log_label_all_tw = [log_all_tw[i].predict(data_test_all_norm[i]) for i in range(len(data_test_all_norm_selec))]
for i, preds in enumerate(log_label_all_tw):
    print "LogReg - Penalty l1 - features computed for all - sliding window {} : ".format(i+1), 1 - zero_one_loss(label_test_tw, preds)

log_label_sep_tw = [log_sep_tw[i].predict(data_test_sep_norm[i]) for i in range(len(data_test_sep_norm_selec))]
for i, preds in enumerate(log_label_sep_tw):
    print "LogReg - Penalty l1 - features computed separetely - sliding window {} : ".format(i+1), 1 - zero_one_loss(label_test_tw, preds)

LogReg - Penalty l1 - features computed for all - sliding window 1 :  0.5864
LogReg - Penalty l1 - features computed for all - sliding window 2 :  0.87392
LogReg - Penalty l1 - features computed for all - sliding window 3 :  0.8728
LogReg - Penalty l1 - features computed for all - sliding window 4 :  0.87216
LogReg - Penalty l1 - features computed for all - sliding window 5 :  0.86992
LogReg - Penalty l1 - features computed separetely - sliding window 1 :  0.5864
LogReg - Penalty l1 - features computed separetely - sliding window 2 :  0.8744
LogReg - Penalty l1 - features computed separetely - sliding window 3 :  0.87136
LogReg - Penalty l1 - features computed separetely - sliding window 4 :  0.87184
LogReg - Penalty l1 - features computed separetely - sliding window 5 :  0.86992


L2 penality

In [105]:
from sklearn.metrics import zero_one_loss
log_all_tw = map(lambda x: LogisticRegression(penalty = 'l2').fit(x, label_train_tw), data_train_all_norm)
log_sep_tw = map(lambda x: LogisticRegression(penalty = 'l2').fit(x, label_train_tw), data_train_sep_norm)
log_label_all_tw = [log_all_tw[i].predict(data_test_all_norm[i]) for i in range(len(data_test_all_norm))]
for i, preds in enumerate(log_label_all_tw):
    print "LogReg - Penalty l1 - features computed for all - sliding window {} : ".format(i+1), 1 - zero_one_loss(label_test_tw, preds)

log_label_sep_tw = [log_sep_tw[i].predict(data_test_sep_norm[i]) for i in range(len(data_test_sep_norm))]
for i, preds in enumerate(log_label_sep_tw):
    print "LogReg - Penalty l1 - features computed separetely - sliding window {} : ".format(i+1), 1 - zero_one_loss(label_test_tw, preds)

LogReg - Penalty l1 - features computed for all - sliding window 1 :  0.58592
LogReg - Penalty l1 - features computed for all - sliding window 2 :  0.88336
LogReg - Penalty l1 - features computed for all - sliding window 3 :  0.88128
LogReg - Penalty l1 - features computed for all - sliding window 4 :  0.87984
LogReg - Penalty l1 - features computed for all - sliding window 5 :  0.88064
LogReg - Penalty l1 - features computed separetely - sliding window 1 :  0.58592
LogReg - Penalty l1 - features computed separetely - sliding window 2 :  0.88304
LogReg - Penalty l1 - features computed separetely - sliding window 3 :  0.87984
LogReg - Penalty l1 - features computed separetely - sliding window 4 :  0.87936
LogReg - Penalty l1 - features computed separetely - sliding window 5 :  0.88096


## Random forests

In [6]:
from sklearn.ensemble import RandomForestClassifier

alg = RandomForestClassifier(random_state=1, n_estimators=30)
alg.fit(data_train, label_train)
predicted_label = alg.predict(data_test)

print("Random Forest - Score on test_data : ", score(label_test, predicted_label))

('Random Forest - Score on test_data : ', 0.81904)


In [106]:
from sklearn.ensemble import RandomForestClassifier

rf_all_tw = map(lambda x: RandomForestClassifier(random_state=1, n_estimators=30).fit(x, label_train_tw), data_train_all_norm)
rf_sep_tw = map(lambda x: RandomForestClassifier(random_state=1, n_estimators=30).fit(x, label_train_tw), data_train_sep_norm)
rf_label_all_tw = [rf_all_tw[i].predict(data_test_all_norm[i]) for i in range(len(data_test_all_norm))]

for i, preds in enumerate(rf_label_all_tw):
    print "Random Forest - features computed for all - sliding window {} : ".format(i+1), 1 - zero_one_loss(label_test_tw, preds)

rf_label_sep_tw = [rf_sep_tw[i].predict(data_test_sep_norm[i]) for i in range(len(data_test_sep_norm))]
for i, preds in enumerate(rf_label_sep_tw):
    print "Random Forest - features computed separetely - sliding window {} : ".format(i+1), 1 - zero_one_loss(label_test_tw, preds)

Random Forest - features computed for all - sliding window 1 :  0.548
Random Forest - features computed for all - sliding window 2 :  0.8112
Random Forest - features computed for all - sliding window 3 :  0.80624
Random Forest - features computed for all - sliding window 4 :  0.8104
Random Forest - features computed for all - sliding window 5 :  0.80592
Random Forest - features computed separetely - sliding window 1 :  0.54528
Random Forest - features computed separetely - sliding window 2 :  0.82
Random Forest - features computed separetely - sliding window 3 :  0.80944
Random Forest - features computed separetely - sliding window 4 :  0.8112
Random Forest - features computed separetely - sliding window 5 :  0.8104


## Support vector classification

In [107]:
from sklearn.svm import LinearSVC
from sklearn.grid_search import GridSearchCV 

tf-idf

In [7]:
Cs = {'C': np.logspace(-1, 0, 20)}

grid_search = GridSearchCV(LinearSVC(penalty="l1",dual=False), Cs)
alg = grid_search.fit(data_train, label_train)
predicted_label = alg.predict(data_test)

print("SVM - Best C & associated score", grid_search.best_params_, grid_search.best_score_)
print("SVM - Score on test_data : ", score(label_test, predicted_label))

grid_search = GridSearchCV(LinearSVC(penalty="l2",dual=False), Cs)
alg = grid_search.fit(data_train, label_train)
predicted_label = alg.predict(data_test)

print("SVM - Best C & associated score", grid_search.best_params_, grid_search.best_score_)
print("SVM - Score on test_data : ", score(label_test, predicted_label))

('SVM - Best C & associated score', {'C': 1.0}, 0.88357333333333332)
('SVM - Score on test_data : ', 0.88768)
('SVM - Best C & associated score', {'C': 1.0}, 0.90725333333333336)
('SVM - Score on test_data : ', 0.9064)


In [110]:
data_test_all_norm

[<6250x99627 sparse matrix of type '<type 'numpy.float64'>'
 	with 30391 stored elements in Compressed Sparse Row format>,
 <6250x99627 sparse matrix of type '<type 'numpy.float64'>'
 	with 685836 stored elements in Compressed Sparse Row format>,
 <6250x99627 sparse matrix of type '<type 'numpy.float64'>'
 	with 685836 stored elements in Compressed Sparse Row format>,
 <6250x99627 sparse matrix of type '<type 'numpy.float64'>'
 	with 685836 stored elements in Compressed Sparse Row format>,
 <6250x99627 sparse matrix of type '<type 'numpy.float64'>'
 	with 685836 stored elements in Compressed Sparse Row format>]

In [111]:
Cs = {'C': np.logspace(-2, 2, 30)}

SVM_all_tw_l1 = map(
    lambda x: GridSearchCV(LinearSVC(penalty="l1",dual=False), Cs).fit(x, label_train_tw), data_train_all_norm) 

SVM_label_all_tw_l1 = [SVM_all_tw_l1[i].predict(data_test_all_norm[i]) for i in range(len(data_test_all_norm))]

for i, preds in enumerate(SVM_label_all_tw_l1):
    print "SVM - Best C & associated score", SVM_all_tw_l1[i].best_params_, SVM_all_tw_l1[i].best_score_
    print "SVM - features computed for all - sliding window {} : ".format(i+1), 1 - zero_one_loss(label_test_tw, preds)

SVM - Best C & associated score {'C': 72.78953843983146} 0.5888
SVM - features computed for all - sliding window 1 :  0.58688
SVM - Best C & associated score {'C': 0.23950266199874859} 0.866186666667
SVM - features computed for all - sliding window 2 :  0.87616
SVM - Best C & associated score {'C': 0.23950266199874859} 0.864426666667
SVM - features computed for all - sliding window 3 :  0.8736
SVM - Best C & associated score {'C': 0.23950266199874859} 0.8632
SVM - features computed for all - sliding window 4 :  0.87312
SVM - Best C & associated score {'C': 0.23950266199874859} 0.863786666667
SVM - features computed for all - sliding window 5 :  0.87168


In [None]:
SVM_all_tw_l2 = map(
    lambda x: GridSearchCV(LinearSVC(penalty="l2",dual=False), Cs, n_jobs=-1).fit(x, label_train_tw), data_train_all_norm)
SVM_label_tw_l2 = [SVM_all_tw_l2[i].predict(data_test_all_norm[i]) for i in range(len(data_test_all_norm))]

for i, preds in enumerate(SVM_label_all_tw_l2):
    print "SVM - Best C & associated score", SVM_all_tw_l2[i].best_params_, SVM_all_tw_l2[i].best_score_
    print "SVM - features computed for all - sliding window {} : ".format(i+1), 1 - zero_one_loss(label_test_tw, preds)

## Naïve Bayes

In [None]:
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.grid_search import GridSearchCV 

In [8]:
alphas = np.logspace(-6, 0, 50)

from sklearn.grid_search import GridSearchCV
for Model in [MultinomialNB, BernoulliNB]:
    gscv = GridSearchCV(Model(), dict(alpha=alphas), cv=10).fit(data_train, label_train)
    print(Model.__name__, "- Best alpha & associated score", gscv.best_params_, gscv.best_score_)
    print("Score on test_data : ", score(label_test, gscv.predict(data_test)))

('MultinomialNB', '- Best alpha & associated score', {'alpha': 0.0082864277285468416}, 0.90581333333333336)
('Score on test_data : ', 0.88624)
('BernoulliNB', '- Best alpha & associated score', {'alpha': 0.0026826957952797246}, 0.89648000000000005)
('Score on test_data : ', 0.87712)


In [8]:
alphas = np.logspace(-6, 0, 50)
gscv_all_tw=[]
gscv_label_all_tw=[]
for i, Model in enumerate([MultinomialNB, BernoulliNB]):
    gscv_all_tw[i] = map(
        lambda x: GridSearchCV(Model(), dict(alpha=alphas), cv=10, n_jobs=-1).fit(x, label_train_tw), 
        data_train_all_norm)
    gscv_label_all_tw.append(
        [gscv_all_tw[i][j].predict(data_test_all_norm[j]) for j in range(len(data_test_all_norm))])    
    for k, preds in enumerate(gscv_label_all_tw[i])
        print(Model.__name__, "- Best alpha & associated score", gscv_label_all_tw[i][k].best_params_, gscv_label_all_tw[i][k].best_score_)
        print("Naïve Bayes {} :  - sliding window {} : ".format(Model.__name__, k+1), 1 - zero_one_loss(label_test_tw, preds)

('MultinomialNB', '- Best alpha & associated score', {'alpha': 0.0082864277285468416}, 0.90581333333333336)
('Score on test_data : ', 0.88624)
('BernoulliNB', '- Best alpha & associated score', {'alpha': 0.0026826957952797246}, 0.89648000000000005)
('Score on test_data : ', 0.87712)


In [15]:
#from sklearn import cross_validation
from sklearn.linear_model import SGDClassifier 

alphas = np.logspace(-6, 0, 10)
# ADD other param?
losses = ['squared_hinge', 'modified_huber']
sgd_all_tw=[]
sgd_label_all_tw=[]
for i, loss in enumerate(losses):
    sgd_all_tw[i] = map(lambda x : GridSearchCV(
        SGDClassifier(shuffle = True, loss = loss, n_iter= 500), 
        dict(alpha=alphas), 
        cv=10, 
        n_jobs = -1).fit(x, label_train), data_train_all_norm)
    sgd_label_all_tw.append(
        [sgd_all_tw[i][j].predict(data_test_all_norm[j]) for j in range(len(data_test_all_norm))]
    )
    for k, preds in enumerate(sgd_label_all_tw[i]):
        print("SGD -- ", loss, " Best alpha and associated score : ", sgd_all_tw[i][k].best_params_, sgd_all_tw[i][k].best_score_)
        print("Score on test_data -- sliding window {}: ".format(k+1), 1 - zero_one_loss(label_test_tw, preds)
    

('SGD', 'squared_hinge', ' Best alpha and associated score', 'squared_hinge', ': ', {'alpha': 0.0001}, 0.91034666666666664)
('Score on test_data : ', 0.90768)
('SGD', 'modified_huber', ' Best alpha and associated score', 'modified_huber', ': ', {'alpha': 0.0001}, 0.91039999999999999)
('Score on test_data : ', 0.90784)


In [15]:
#from sklearn import cross_validation
from sklearn.linear_model import SGDClassifier 

alphas = np.logspace(-6, 0, 10)
# ADD other param?
losses = ['squared_hinge', 'modified_huber']
for loss in losses:
    model = SGDClassifier(shuffle = True, loss = loss, n_iter= 500)
    gscv = GridSearchCV(model, dict(alpha=alphas), cv=10, n_jobs = -1).fit(data_train, label_train)
    print("SGD", loss, " Best alpha and associated score", loss, ": ", gscv.best_params_, gscv.best_score_)
    print("Score on test_data : ", score(label_test, gscv.predict(data_test)))
    

('SGD', 'squared_hinge', ' Best alpha and associated score', 'squared_hinge', ': ', {'alpha': 0.0001}, 0.91034666666666664)
('Score on test_data : ', 0.90768)
('SGD', 'modified_huber', ' Best alpha and associated score', 'modified_huber', ': ', {'alpha': 0.0001}, 0.91039999999999999)
('Score on test_data : ', 0.90784)
