In [29]:
import csv

import numpy as np
import scipy
import scipy.sparse
import sklearn

def load_sparse_csr(filename):
    """Rebuild a SciPy CSR matrix from a NumPy ``.npz`` archive.

    The archive must contain the arrays ``data``, ``indices``, ``indptr``
    and ``shape`` (the pieces of the CSR representation).

    Parameters
    ----------
    filename : str
        Path to the ``.npz`` archive.

    Returns
    -------
    scipy.sparse.csr_matrix
        The matrix described by the stored arrays.

    Raises
    ------
    KeyError
        If one of the four expected arrays is missing from the archive.
    """
    # np.load on an .npz archive keeps the underlying file open; the context
    # manager guarantees it is closed once the arrays have been read.
    with np.load(filename) as loader:
        # Normalise the stored shape array to a plain tuple of Python ints.
        shape = tuple(int(dim) for dim in loader['shape'])
        return scipy.sparse.csr_matrix(
            (loader['data'], loader['indices'], loader['indptr']),
            shape=shape)
        
def load_csv(filename):
    """Read a text file holding one numeric value per line.

    Blank (or whitespace-only) lines are skipped, so a trailing empty line
    at the end of the file does not break loading.

    Parameters
    ----------
    filename : str
        Path to the file to read.

    Returns
    -------
    list of float
        The parsed values, in file order.

    Raises
    ------
    ValueError
        If a non-blank line cannot be converted by ``float``.
    """
    with open(filename, 'r') as csvfile:
        # Iterate over the lines directly instead of going through
        # csv.reader(delimiter='\n'): a newline is not a legitimate field
        # delimiter (recent CPython releases reject it), and float() already
        # tolerates the surrounding whitespace / line terminator.
        return [float(line) for line in csvfile if line.strip()]
    
def load_feature_names(filename):
    """Read a text file holding one feature name per line.

    Each line becomes a single-element list containing the line's text
    without its line terminator; a blank line becomes an empty list. The
    row index therefore always matches the line index in the file.

    Parameters
    ----------
    filename : str
        Path to the file to read.

    Returns
    -------
    list of list of str
        One inner list per line of the file.
    """
    with open(filename, 'r') as csvfile:
        rows = []
        for line in csvfile:
            # Lines are taken verbatim rather than parsed with
            # csv.reader(delimiter='\n'): a newline is not a legitimate field
            # delimiter (recent CPython releases reject it), and names that
            # contain commas must stay in one piece.
            name = line.rstrip('\r\n')
            rows.append([name] if name else [])
        return rows

In [31]:
# Feature matrices for the train and test splits, stored as CSR archives
# (the file names suggest TF-IDF features -- confirm against the export step).
data_train, data_test = map(load_sparse_csr, (
    'DB_well_tfidf/data_train_well_tfidf.npz',
    'DB_well_tfidf/data_test_well_tfidf.npz',
))

# Matching label vectors, one numeric label per line of each file.
label_train, label_test = map(load_csv, (
    'DB_well_tfidf/label_train_well_tfidf.csv',
    'DB_well_tfidf/label_test_well_tfidf.csv',
))

# Feature names, one per line of the file.
features_names = load_feature_names('DB_well_tfidf/feature_names_well_tfidf.csv')

In [32]:
def score(true_label, predicted_label):
    """Return the fraction of positions where prediction and truth agree.

    Every element of ``true_label`` is compared with the element at the
    same index in ``predicted_label``; the number of equal pairs is divided
    by ``len(true_label)``.

    Parameters
    ----------
    true_label : sequence
        Reference labels; its length is the denominator.
    predicted_label : indexable
        Predicted labels, indexed by position.

    Returns
    -------
    float
        Share of matching positions, between 0.0 and 1.0.

    Raises
    ------
    ZeroDivisionError
        If ``true_label`` is empty.
    """
    n_samples = len(true_label)
    n_correct = sum(
        1
        for position, expected in enumerate(true_label)
        if expected == predicted_label[position]
    )
    return float(n_correct) / float(n_samples)

In [44]:
from sklearn.linear_model import LogisticRegression
from sklearn import cross_validation

# Fit one logistic regression per regularisation penalty (l1 first, then l2)
# and report the accuracy of each on the test split. After the loop, `alg`
# and `predicted_label` hold the l2 model and its predictions.
for penalty_kind, report_prefix in (
        ('l1', "LogReg - Penalty l1 - Score on test_data : "),
        ('l2', "LogReg - Penalty l2 - Score on test_data : ")):
    alg = LogisticRegression(penalty = penalty_kind)
    alg.fit(data_train, label_train)
    predicted_label = alg.predict(data_test)
    print(report_prefix, score(label_test, predicted_label))

('LogReg - Penalty l1 - Score on test_data : ', 0.87424)
('LogReg - Penalty l2 - Score on test_data : ', 0.88592)


In [45]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 30 trees with a fixed seed so the run is repeatable.
# fit() returns the estimator itself, so construction and training are chained.
alg = RandomForestClassifier(n_estimators=30, random_state=1).fit(data_train, label_train)

# Accuracy of the fitted forest on the test split.
predicted_label = alg.predict(data_test)
print("Random Forest - Score on test_data : ", score(label_test, predicted_label))

('Random Forest - Score on test_data : ', 0.82624)


In [46]:
from sklearn.svm import LinearSVC
from sklearn.grid_search import GridSearchCV 

# Candidate values for C: 20 points spaced logarithmically between 0.1 and 1.
Cs = {'C': np.logspace(-1, 0, 20)}

# Run the same grid search over C for an l1- and then an l2-penalised linear
# SVM. GridSearchCV.fit returns the search object, so `alg` is the fitted
# search. After the loop, `grid_search`, `alg` and `predicted_label` refer
# to the l2 run.
for svm_penalty in ("l1", "l2"):
    grid_search = GridSearchCV(LinearSVC(penalty=svm_penalty, dual=False), Cs)
    alg = grid_search.fit(data_train, label_train)
    predicted_label = alg.predict(data_test)

    print("SVM - Best C & associated score", grid_search.best_params_, grid_search.best_score_)
    print("SVM - Score on test_data : ", score(label_test, predicted_label))

('SVM - Best C & associated score', {'C': 0.69519279617756047}, 0.87434666666666672)
('SVM - Score on test_data : ', 0.88064)
('SVM - Best C & associated score', {'C': 0.11288378916846889}, 0.88474666666666668)
('SVM - Score on test_data : ', 0.88768)
