In [1]:
import sklearn
import numpy as np
import scipy
import csv

def load_sparse_csr(filename):
    loader = np.load(filename)
    return scipy.sparse.csr_matrix(( loader['data'], loader['indices'], loader['indptr']),
                     shape = loader['shape'])
        
def load_csv(filename):
    with open(filename, 'r') as csvfile:
        reader = csv.reader(csvfile, delimiter = '\n')
        array = [float(row[0]) for row in reader]
        return array

In [2]:
data_train = load_sparse_csr('DB_no_cleaning/data_train_no_feat.npz')
data_test = load_sparse_csr('DB_no_cleaning/data_test_no_feat.npz')

In [3]:
label_train = load_csv('DB_no_cleaning/label_train_no_feat.csv')
label_test = load_csv('DB_no_cleaning/label_test_no_feat.csv')

In [4]:
def score(true_label, predicted_label):
    length = len(true_label)
    total = 0
    for i, label in enumerate(true_label):
        if label == predicted_label[i]:
            total += 1
    return float(total)/float(length)

In [24]:
from sklearn.svm import LinearSVC
from sklearn.grid_search import GridSearchCV 

Cs = {'C': np.logspace(-1, 0, 20)}

grid_search = GridSearchCV(LinearSVC(penalty="l1",dual=False), Cs)
alg = grid_search.fit(data_train, label_train)
predicted_label = alg.predict(data_test)

#print("SVM - diff scores", grid_search.grid_scores_)
print("SVM - Best C & associated score", grid_search.best_params_, grid_search.best_score_)
print("SVM - Score on test_data : ", score(label_test, predicted_label))

('SVM - diff scores', [mean: 0.83552, std: 0.00375, params: {'C': 0.10000000000000001}, mean: 0.83968, std: 0.00318, params: {'C': 0.11288378916846889}, mean: 0.84224, std: 0.00356, params: {'C': 0.12742749857031338}, mean: 0.84795, std: 0.00359, params: {'C': 0.14384498882876631}, mean: 0.85168, std: 0.00352, params: {'C': 0.16237767391887217}, mean: 0.85509, std: 0.00253, params: {'C': 0.18329807108324356}, mean: 0.85792, std: 0.00178, params: {'C': 0.20691380811147897}, mean: 0.86064, std: 0.00229, params: {'C': 0.23357214690901223}, mean: 0.86336, std: 0.00214, params: {'C': 0.26366508987303583}, mean: 0.86651, std: 0.00189, params: {'C': 0.29763514416313175}, mean: 0.86837, std: 0.00215, params: {'C': 0.33598182862837817}, mean: 0.87077, std: 0.00242, params: {'C': 0.37926901907322497}, mean: 0.87307, std: 0.00306, params: {'C': 0.42813323987193935}, mean: 0.87504, std: 0.00279, params: {'C': 0.48329302385717521}, mean: 0.87675, std: 0.00291, params: {'C': 0.5455594781168519}, mea

In [26]:
Cs = {'C': np.logspace(-1, 0, 20)}

grid_search = GridSearchCV(LinearSVC(penalty="l2",dual=False), Cs)
alg = grid_search.fit(data_train, label_train)
predicted_label = alg.predict(data_test)

print("SVM - diff scores", grid_search.grid_scores_)
print("SVM - Best C & associated score", grid_search.best_params_, grid_search.best_score_)
print("SVM - Score on test_data : ", score(label_test, predicted_label))

('SVM - diff scores', [mean: 0.87979, std: 0.00195, params: {'C': 0.10000000000000001}, mean: 0.88165, std: 0.00177, params: {'C': 0.11288378916846889}, mean: 0.88293, std: 0.00179, params: {'C': 0.12742749857031338}, mean: 0.88400, std: 0.00159, params: {'C': 0.14384498882876631}, mean: 0.88464, std: 0.00137, params: {'C': 0.16237767391887217}, mean: 0.88571, std: 0.00071, params: {'C': 0.18329807108324356}, mean: 0.88624, std: 0.00085, params: {'C': 0.20691380811147897}, mean: 0.88709, std: 0.00135, params: {'C': 0.23357214690901223}, mean: 0.88763, std: 0.00172, params: {'C': 0.26366508987303583}, mean: 0.88843, std: 0.00152, params: {'C': 0.29763514416313175}, mean: 0.88907, std: 0.00101, params: {'C': 0.33598182862837817}, mean: 0.88880, std: 0.00085, params: {'C': 0.37926901907322497}, mean: 0.88939, std: 0.00076, params: {'C': 0.42813323987193935}, mean: 0.88955, std: 0.00055, params: {'C': 0.48329302385717521}, mean: 0.88949, std: 0.00118, params: {'C': 0.5455594781168519}, mea

In [None]:
from sklearn.svm import SVC

alg = SVC(kernel = 'rbf')
alg.fit(data_train, label_train)
predicted_label = alg.predict(data_test)

print("SVM - Score on test_data : ", score(label_test, predicted_label))

In [None]:
Cs = {'C': np.logspace(-1, 0, 20)}

grid_search = GridSearchCV(LinearSVC(penalty="l2",dual=False), Cs)
alg = grid_search.fit(data_train, label_train)
predicted_label = alg.predict(data_test)

print("SVM - diff scores", grid_search.grid_scores_)
print("SVM - Best C & associated score", grid_search.best_params_, grid_search.best_score_)
print("SVM - Score on test_data : ", score(label_test, predicted_label))

In [7]:
from sklearn import cross_validation
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import LinearSVC
print data_train.shape

lsvc = LinearSVC(C=0.01, penalty="l1", dual=False).fit(data_train, label_train)
model = SelectFromModel(lsvc, prefit=True)
X_new = model.transform(data_train)
X_new.shape

In [None]:

from sklearn.grid_search import GridSearchCV 
scores= []
for C in Cs:

    pipeline = Pipeline([
          ('feature_selection', SelectFromModel(LinearSVC(C=C, penalty="l1",dual=False))),
          ('classification', SVC(kernel="rbf"))
            ])

    scores.append(cross_validation.cross_val_score(pipeline, data_train, label_train, cv=5))
    
print("score max:",max(scores)) 

In [None]:
import matplotlib.pyplot as plt
plt.plot(scores,Cs)
plt.show()

In [2]:
param_grid =
{
C : np.logspace(-2, -0.5, 20)
}
Pipeline([
      ('linearSVC', GridSearchCV(LinearSVC(penalty="l1",dual=False), param_grid))
      ('feature_selection', SelectFromModel(GridSearchCV(LinearSVC(penalty="l1",dual=False), param_grid))),
      ('classification', GridSearcSVC(kernel="rbf"), param_grid)
        ])
from sklearn.grid_search import GridSearchCV 
rbf = GridSearchCV(pipeline, dict(param_grid=param_grid),cv=5, pre_dispatch= -1).fit(data_train, label_train)
print(model.__name__, rbf.best_params_, rbf.best_score_)

SyntaxError: invalid syntax (<ipython-input-2-10bcaef160be>, line 1)

In [34]:



rbf = Pipeline([
  ('feature_selection', SelectFromModel(LinearSVC(C=0.01,penalty="l1",dual=False))),
  ('classification', SVC(kernel="rbf"))
])
scores = cross_validation.cross_val_score(rbf, data_train, label_train, cv=5)

print(scores.mean())

KeyboardInterrupt: 

In [13]:
rbf = Pipeline([
  ('feature_selection', SelectFromModel(LinearSVC(penalty="l1",dual=False))),
  ('classification', SVC(kernel="linear"))
])
scores = cross_validation.cross_val_score(rbf, data_train, label_train, cv=5)

print(scores.mean())

0.878114049371


In [18]:
poly = SVC(kernel="poly")
scores = cross_validation.cross_val_score(poly, X_new, label_train, cv=5)

print(scores.mean())

0.500114285724


In [22]:
sigm = SVC(kernel="sigmoid")
scores = cross_validation.cross_val_score(sigm, X_new, label_train, cv=5)

print(scores.mean())

0.500114285724


In [21]:
rbf = SVC(kernel="rbf")
scores = cross_validation.cross_val_score(rbf, X_new, label_train, cv=5)

print(scores.mean())

0.743827297479


In [26]:
lin = SVC(kernel="linear")
scores = cross_validation.cross_val_score(lin, X_new, label_train, cv=5)

print(scores.mean())

0.743827297479
