In [1]:
import pandas as pd
import numpy as np
from sklearn import svm
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score

In [2]:
X_train = pd.read_csv(f'../BerkeleySETI/inputdata/X_train_4-26-22.csv', index_col=0)
y_train = pd.read_csv(f'../BerkeleySETI/inputdata/y_train_4-26-22.csv', index_col=0)
X_test = pd.read_csv(f'../BerkeleySETI/inputdata/X_test_4-26-22.csv', index_col=0)
y_test = pd.read_csv(f'../BerkeleySETI/inputdata/y_test_4-26-22.csv', index_col=0)
y_train = y_train.squeeze()
y_test = y_test.squeeze()

In [3]:
svc = make_pipeline(StandardScaler(),svm.LinearSVC(dual=False))
svc.fit(X_train,y_train)
y_predsvc = svc.predict(X_test)

In [4]:
print(accuracy_score(y_test,y_predsvc)) # Fraction of classifications that were correct
print(precision_score(y_test,y_predsvc,pos_label='EB'))

0.6212844036697248
0.6218302094818081


In [25]:
import glob
import os
files = os.path.join(f'../BerkeleySETI/inputdata', 'X_*_4-26-22.csv')
files = glob.glob(files)
df3 = pd.concat(map(pd.read_csv, files), ignore_index=True)
df3.rename(columns = {'Unnamed: 0':''}, inplace = True)
df3.to_csv(f'../BerkeleySETI/inputdata/X.csv', index = False)

In [26]:
files = os.path.join(f'../BerkeleySETI/inputdata', 'y_*_4-26-22.csv')
files = glob.glob(files)
df3 = pd.concat(map(pd.read_csv, files), ignore_index=True)
df3.rename(columns = {'Unnamed: 0':''}, inplace = True)
df3.to_csv(f'../BerkeleySETI/inputdata/y.csv', index = False)

In [30]:
X = pd.read_csv(f'../BerkeleySETI/inputdata/X.csv', index_col=[0])
y = pd.read_csv(f'../BerkeleySETI/inputdata/y.csv', index_col=[0])

In [34]:
y

Unnamed: 0,Label
2256,NonEB
3702,NonEB
2948,NonEB
9846,EB
10969,EB
...,...
11337,EB
3448,NonEB
10555,EB
10611,EB


In [36]:
y = y.squeeze()

In [32]:
from sklearn.model_selection import cross_val_score

In [38]:
cross_val_score(svc,X,y,cv=5)

array([0.62147307, 0.62220594, 0.6218395 , 0.62243402, 0.62170088])

In [39]:
svc2 = make_pipeline(StandardScaler(),svm.LinearSVC(dual=False))

In [39]:
from sklearn.model_selection import GridSearchCV

In [46]:
svc.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'standardscaler', 'linearsvc', 'standardscaler__copy', 'standardscaler__with_mean', 'standardscaler__with_std', 'linearsvc__C', 'linearsvc__class_weight', 'linearsvc__dual', 'linearsvc__fit_intercept', 'linearsvc__intercept_scaling', 'linearsvc__loss', 'linearsvc__max_iter', 'linearsvc__multi_class', 'linearsvc__penalty', 'linearsvc__random_state', 'linearsvc__tol', 'linearsvc__verbose'])

In [55]:
clf = GridSearchCV(svc, {
    'linearsvc__C': [0.01,0.1,1,10,100,1000],
    'linearsvc__tol': [1e-6,1e-5,5e-5,1e-4,5e-4,1e-3,1e-2]
}, cv=5, return_train_score=False)


In [56]:
clf.fit(X,y)
clf.cv_results_


{'mean_fit_time': array([0.16881089, 0.16384439, 0.15654001, 0.1458549 , 0.13469968,
        0.12649636, 0.11257377, 0.18932486, 0.16832485, 0.16405115,
        0.15991116, 0.13827696, 0.13858051, 0.11710858, 0.19524989,
        0.17294035, 0.15811601, 0.15871835, 0.14151168, 0.13376555,
        0.10505266, 0.19197903, 0.1755589 , 0.15989408, 0.15878453,
        0.14229064, 0.13215761, 0.1106524 , 0.19036889, 0.17003183,
        0.15867381, 0.18688068, 0.19773126, 0.17788091, 0.14780097,
        0.2619266 , 0.22667265, 0.22105331, 0.21633677, 0.16590104,
        0.13190317, 0.10442128]),
 'std_fit_time': array([0.02135372, 0.00545253, 0.00739627, 0.00897725, 0.00707312,
        0.00835529, 0.00937077, 0.01470846, 0.01253482, 0.00757712,
        0.01109466, 0.00889808, 0.00831269, 0.01174737, 0.01088657,
        0.01538881, 0.01091345, 0.00928979, 0.00869798, 0.00825585,
        0.01759463, 0.01231431, 0.01394155, 0.01074705, 0.01153375,
        0.009547  , 0.00835042, 0.01833423, 0.005

In [57]:
svcdf = pd.DataFrame(clf.cv_results_)

In [58]:
svcdf

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_linearsvc__C,param_linearsvc__tol,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.168811,0.021354,0.011021,0.001302,0.01,1e-06,"{'linearsvc__C': 0.01, 'linearsvc__tol': 1e-06}",0.621473,0.622206,0.62184,0.622434,0.621701,0.621931,0.000346,1
1,0.163844,0.005453,0.010756,0.001927,0.01,1e-05,"{'linearsvc__C': 0.01, 'linearsvc__tol': 1e-05}",0.621473,0.622206,0.62184,0.622434,0.621701,0.621931,0.000346,1
2,0.15654,0.007396,0.009291,0.002135,0.01,5e-05,"{'linearsvc__C': 0.01, 'linearsvc__tol': 5e-05}",0.621473,0.622206,0.62184,0.622434,0.621701,0.621931,0.000346,1
3,0.145855,0.008977,0.010531,0.001615,0.01,0.0001,"{'linearsvc__C': 0.01, 'linearsvc__tol': 0.0001}",0.621473,0.622206,0.62184,0.622434,0.621701,0.621931,0.000346,1
4,0.1347,0.007073,0.010024,0.002143,0.01,0.0005,"{'linearsvc__C': 0.01, 'linearsvc__tol': 0.0005}",0.621473,0.622206,0.62184,0.622434,0.621701,0.621931,0.000346,1
5,0.126496,0.008355,0.010742,0.001857,0.01,0.001,"{'linearsvc__C': 0.01, 'linearsvc__tol': 0.001}",0.621473,0.622206,0.62184,0.622434,0.621701,0.621931,0.000346,1
6,0.112574,0.009371,0.011232,0.001986,0.01,0.01,"{'linearsvc__C': 0.01, 'linearsvc__tol': 0.01}",0.621473,0.622206,0.62184,0.622434,0.621701,0.621931,0.000346,1
7,0.189325,0.014708,0.010609,0.001923,0.1,1e-06,"{'linearsvc__C': 0.1, 'linearsvc__tol': 1e-06}",0.621473,0.622206,0.62184,0.622434,0.621701,0.621931,0.000346,1
8,0.168325,0.012535,0.011716,0.000105,0.1,1e-05,"{'linearsvc__C': 0.1, 'linearsvc__tol': 1e-05}",0.621473,0.622206,0.62184,0.622434,0.621701,0.621931,0.000346,1
9,0.164051,0.007577,0.010634,0.001825,0.1,5e-05,"{'linearsvc__C': 0.1, 'linearsvc__tol': 5e-05}",0.621473,0.622206,0.62184,0.622434,0.621701,0.621931,0.000346,1


In [59]:
svcdf[['param_linearsvc__C', 'param_linearsvc__tol', 'mean_test_score']]

Unnamed: 0,param_linearsvc__C,param_linearsvc__tol,mean_test_score
0,0.01,1e-06,0.621931
1,0.01,1e-05,0.621931
2,0.01,5e-05,0.621931
3,0.01,0.0001,0.621931
4,0.01,0.0005,0.621931
5,0.01,0.001,0.621931
6,0.01,0.01,0.621931
7,0.1,1e-06,0.621931
8,0.1,1e-05,0.621931
9,0.1,5e-05,0.621931


I'm getting the same values here regardless of the parameter values, hm

In [62]:
linearsvc__C= [0.01,0.1,1,10,100,1000],
linearsvc__tol= [1e-6,1e-5,5e-5,1e-4,5e-4,1e-3,1e-2]
avg_scores = {}
for cval in linearsvc__C:
    for tval in linearsvc__tol:
        cv_scores = cross_val_score(make_pipeline(StandardScaler(),svm.LinearSVC(C = cval, tol = tval, dual=False)), X, y, cv=5)
        avg_scores[str(cval) + '_' + str(tval)] = np.average(cv_scores)
        
avg_scores


5 fits failed out of a total of 5.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/conda/lib/python3.7/site-packages/sklearn/pipeline.py", line 394, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/opt/conda/lib/python3.7/site-packages/sklearn/svm/_classes.py", line 243, in fit
    if self.C < 0:
TypeError: '<' not supported between instances of 'list' and 'int'

5 fits failed out of a total of 5.
The score on these train-test partitions for these parameters wi

{'[0.01, 0.1, 1, 10, 100, 1000]_1e-06': nan,
 '[0.01, 0.1, 1, 10, 100, 1000]_1e-05': nan,
 '[0.01, 0.1, 1, 10, 100, 1000]_5e-05': nan,
 '[0.01, 0.1, 1, 10, 100, 1000]_0.0001': nan,
 '[0.01, 0.1, 1, 10, 100, 1000]_0.0005': nan,
 '[0.01, 0.1, 1, 10, 100, 1000]_0.001': nan,
 '[0.01, 0.1, 1, 10, 100, 1000]_0.01': nan}

In [None]:
model = svm.SVC(kernel='linear')
model.fit (X_train, y_train)
model.score(X_test, y_test)