In [16]:
import sklearn 
import numpy as np
import scipy
import csv

def load_sparse_csr(filename):
    loader = np.load(filename)
    return scipy.sparse.csr_matrix(( loader['data'], loader['indices'], loader['indptr']),
                     shape = loader['shape'])
        
def load_csv(filename):
    with open(filename, 'r') as csvfile:
        reader = csv.reader(csvfile, delimiter = '\n')
        array = [float(row[0]) for row in reader]
        return array
    
def load_feature_names(filename):
    with open(filename, 'r') as csvfile:
        reader = csv.reader(csvfile, delimiter= '\n')
        array = [row for row in reader]
        return array
    
def load_sparse_coo(filename):
    loader = np.load(filename)
    return scipy.sparse.coo_matrix((loader['data'],(loader['row'],loader['col'])),
                     shape = loader['shape'])

In [17]:
path = "bapt_tfidf/"
data_train = load_sparse_csr(path+'data_train.npz')
data_test = load_sparse_csr(path+'data_test.npz')
label_train = load_csv(path+'label_train.csv')
label_test = load_csv(path+'label_test.csv')
features_names = load_feature_names('data/feature_names.csv')

In [18]:
from sklearn.metrics import zero_one_loss
def score(true_label, predicted_label):
    return 1 - zero_one_loss(true_label,predicted_label)
data_train_all = [load_sparse_csr('tw_sw{}_all_train_train.npz'.format(k)) for k in range(1,6)]
data_test_all = [load_sparse_csr('tw_sw{}_all_train_test.npz'.format(k)) for k in range(1,6)]
data_train_sep = [load_sparse_coo('tw_sw{}_train_train.npz'.format(k)) for k in range(1,6)]
data_test_sep = [load_sparse_coo('tw_sw{}_train_test.npz'.format(k)) for k in range(1,6)]

label_train_tw = load_csv('labels_train_train.csv')
label_test_tw = load_csv('labels_train_test.csv')

In [19]:
from sklearn.feature_selection.univariate_selection import SelectKBest, chi2, f_classif
fselect = SelectKBest(chi2 , k=80000)
data_train = fselect.fit_transform(data_train, label_train)
data_test = fselect.transform(data_test)

from sklearn.preprocessing import MaxAbsScaler
normalizer_all = map(lambda x: MaxAbsScaler().fit(x), data_train_all)
normalizer_sep = map(lambda x: MaxAbsScaler().fit(x), data_train_sep)

scaler =MaxAbsScaler()
scaler.partial_fit(data_test)
scaler.partial_fit(data_train)
scaler.transform(data_test)
scaler.transform(data_train)

data_train_all_norm = [normalizer_all[i].transform(data_train_all[i]) for i in range(len(data_train_all))]
data_test_all_norm = [normalizer_all[i].transform(data_test_all[i]) for i in range(len(data_test_all))]
data_train_sep_norm = [normalizer_sep[i].transform(data_train_sep[i]) for i in range(len(data_train_sep))]
data_test_sep_norm = [normalizer_sep[i].transform(data_test_sep[i]) for i in range(len(data_test_sep))]


data_train_tw = data_train_all_norm [3]
data_test_tw = data_test_all_norm [3]

In [20]:
data_train = scipy.sparse.hstack([data_train,data_train_tw])
data_test = scipy.sparse.hstack([data_test,data_test_tw])

In [21]:
Cs = {'C': np.logspace(-1, 3, 20)}
from sklearn.grid_search import GridSearchCV
from sklearn.linear_model import LogisticRegression

lr = GridSearchCV(LogisticRegression(penalty = 'l2'), Cs, n_jobs = -1)
lr = lr.fit(data_train, label_train)
predicted_label_lr = lr.predict(data_test)

print("Log Reg - Best C & associated score", lr.best_params_, lr.best_score_)
print("Log Reg - Score on test_data : ", score(label_test, predicted_label_lr))

('Log Reg - Best C & associated score', {'C': 7.8475997035146108}, 0.91877333333333333)
('Log Reg - Score on test_data : ', 0.92320000000000002)


In [None]:
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.grid_search import GridSearchCV 

alphas = np.logspace(-6, 0, 50)

from sklearn.grid_search import GridSearchCV
for Model in [ BernoulliNB]:
    gscv = GridSearchCV(Model(), dict(alpha=alphas), cv=10, n_jobs = -1).fit(data_train, label_train)
    print(Model.__name__, "- Best alpha & associated score", gscv.best_params_, gscv.best_score_)
    print("Score on test_data : ", score(label_test, gscv.predict(data_test)))

('BernoulliNB', '- Best alpha & associated score', {'alpha': 0.014563484775012445}, 0.9645866666666667)
('Score on test_data : ', 0.92688000000000004)


In [None]:
#from sklearn import cross_validation
from sklearn.linear_model import SGDClassifier 

alphas = np.logspace(-6, 0, 10)
# ADD other param?
losses = ['squared_hinge', 'modified_huber']
for loss in losses:
    model = SGDClassifier(shuffle = True, loss = loss, n_iter= 500)
    gscv = GridSearchCV(model, dict(alpha=alphas), cv=10, n_jobs = -1).fit(data_train, label_train)
    print("SGD", loss, " Best alpha and associated score", loss, ": ", gscv.best_params_, gscv.best_score_)
    print("Score on test_data : ", score(label_test, gscv.predict(data_test)))