In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

In [34]:
#Read in data
data = pd.read_csv('Resources/exoplanet_red1.csv')
data.head()

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_time0bk,koi_impact,koi_duration,koi_depth,koi_prad,koi_teq,koi_insol,koi_model_snr,koi_steff,koi_srad
0,CONFIRMED,0,0,0,0,9.488036,170.53875,0.146,2.9575,616.0,2.26,793.0,93.59,35.8,5455.0,0.927
1,CONFIRMED,0,0,0,0,54.418383,162.51384,0.586,4.507,875.0,2.83,443.0,9.11,25.8,5455.0,0.927
2,FALSE POSITIVE,0,1,0,0,19.89914,175.850252,0.969,1.7822,10800.0,14.6,638.0,39.3,76.3,5853.0,0.868
3,FALSE POSITIVE,0,1,0,0,1.736952,170.307565,1.276,2.40641,8080.0,33.46,1395.0,891.96,505.6,5805.0,0.791
4,CONFIRMED,0,0,0,0,2.525592,171.59555,0.701,1.6545,603.0,2.75,1406.0,926.16,40.9,6031.0,1.046


In [36]:
#Set independent variables
X = data.iloc[:,1:16]
X.head()

Unnamed: 0,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_time0bk,koi_impact,koi_duration,koi_depth,koi_prad,koi_teq,koi_insol,koi_model_snr,koi_steff,koi_srad
0,0,0,0,0,9.488036,170.53875,0.146,2.9575,616.0,2.26,793.0,93.59,35.8,5455.0,0.927
1,0,0,0,0,54.418383,162.51384,0.586,4.507,875.0,2.83,443.0,9.11,25.8,5455.0,0.927
2,0,1,0,0,19.89914,175.850252,0.969,1.7822,10800.0,14.6,638.0,39.3,76.3,5853.0,0.868
3,0,1,0,0,1.736952,170.307565,1.276,2.40641,8080.0,33.46,1395.0,891.96,505.6,5805.0,0.791
4,0,0,0,0,2.525592,171.59555,0.701,1.6545,603.0,2.75,1406.0,926.16,40.9,6031.0,1.046


In [37]:
#Set dependent variable
y = data.iloc[:,0]

In [38]:
#Split into training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [39]:
#Scale data
from sklearn.preprocessing import StandardScaler
X_scaler = StandardScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [40]:
#Run initial model
model = SVC(kernel='linear')
model.fit(X_train_scaled, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [41]:
#Accuracy of initial model
print('Test Acc: %.3f' % model.score(X_test_scaled, y_test))

Test Acc: 0.786


In [25]:
#Perform Grid Search to tune the model; set Grid Search parameters
from sklearn.model_selection import GridSearchCV
param_grid = {'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
              'C': [1, 5, 10],
              'gamma': [0.0001, 0.001, 0.01]}
grid = GridSearchCV(model, param_grid, verbose=3)

In [27]:
grid.fit(X_train_scaled, y_train)

Fitting 5 folds for each of 36 candidates, totalling 180 fits
[CV] C=1, gamma=0.0001, kernel=linear ................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] .... C=1, gamma=0.0001, kernel=linear, score=0.812, total=   0.7s
[CV] C=1, gamma=0.0001, kernel=linear ................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.6s remaining:    0.0s


[CV] .... C=1, gamma=0.0001, kernel=linear, score=0.799, total=   1.6s
[CV] C=1, gamma=0.0001, kernel=linear ................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    2.2s remaining:    0.0s


[CV] .... C=1, gamma=0.0001, kernel=linear, score=0.807, total=   1.9s
[CV] C=1, gamma=0.0001, kernel=linear ................................
[CV] .... C=1, gamma=0.0001, kernel=linear, score=0.803, total=   1.4s
[CV] C=1, gamma=0.0001, kernel=linear ................................
[CV] .... C=1, gamma=0.0001, kernel=linear, score=0.795, total=   2.0s
[CV] C=1, gamma=0.0001, kernel=poly ..................................
[CV] ...... C=1, gamma=0.0001, kernel=poly, score=0.516, total=   1.2s
[CV] C=1, gamma=0.0001, kernel=poly ..................................
[CV] ...... C=1, gamma=0.0001, kernel=poly, score=0.517, total=   1.2s
[CV] C=1, gamma=0.0001, kernel=poly ..................................
[CV] ...... C=1, gamma=0.0001, kernel=poly, score=0.517, total=   1.2s
[CV] C=1, gamma=0.0001, kernel=poly ..................................
[CV] ...... C=1, gamma=0.0001, kernel=poly, score=0.517, total=   1.2s
[CV] C=1, gamma=0.0001, kernel=poly ..................................
[CV] .

[CV] .... C=5, gamma=0.0001, kernel=linear, score=0.814, total=   2.5s
[CV] C=5, gamma=0.0001, kernel=linear ................................
[CV] .... C=5, gamma=0.0001, kernel=linear, score=0.796, total=   2.0s
[CV] C=5, gamma=0.0001, kernel=linear ................................
[CV] .... C=5, gamma=0.0001, kernel=linear, score=0.808, total=   4.2s
[CV] C=5, gamma=0.0001, kernel=linear ................................
[CV] .... C=5, gamma=0.0001, kernel=linear, score=0.806, total=   2.3s
[CV] C=5, gamma=0.0001, kernel=linear ................................
[CV] .... C=5, gamma=0.0001, kernel=linear, score=0.795, total=   6.0s
[CV] C=5, gamma=0.0001, kernel=poly ..................................
[CV] ...... C=5, gamma=0.0001, kernel=poly, score=0.516, total=   1.1s
[CV] C=5, gamma=0.0001, kernel=poly ..................................
[CV] ...... C=5, gamma=0.0001, kernel=poly, score=0.517, total=   1.2s
[CV] C=5, gamma=0.0001, kernel=poly ..................................
[CV] .

[CV] ..... C=5, gamma=0.01, kernel=sigmoid, score=0.773, total=   0.7s
[CV] C=5, gamma=0.01, kernel=sigmoid .................................
[CV] ..... C=5, gamma=0.01, kernel=sigmoid, score=0.781, total=   0.6s
[CV] C=10, gamma=0.0001, kernel=linear ...............................
[CV] ... C=10, gamma=0.0001, kernel=linear, score=0.814, total=   3.9s
[CV] C=10, gamma=0.0001, kernel=linear ...............................
[CV] ... C=10, gamma=0.0001, kernel=linear, score=0.796, total=   2.2s
[CV] C=10, gamma=0.0001, kernel=linear ...............................
[CV] ... C=10, gamma=0.0001, kernel=linear, score=0.809, total=   6.1s
[CV] C=10, gamma=0.0001, kernel=linear ...............................
[CV] ... C=10, gamma=0.0001, kernel=linear, score=0.808, total=   2.8s
[CV] C=10, gamma=0.0001, kernel=linear ...............................
[CV] ... C=10, gamma=0.0001, kernel=linear, score=0.795, total=   7.8s
[CV] C=10, gamma=0.0001, kernel=poly .................................
[CV] .

[CV] .... C=10, gamma=0.01, kernel=sigmoid, score=0.782, total=   0.6s
[CV] C=10, gamma=0.01, kernel=sigmoid ................................
[CV] .... C=10, gamma=0.01, kernel=sigmoid, score=0.780, total=   0.6s
[CV] C=10, gamma=0.01, kernel=sigmoid ................................
[CV] .... C=10, gamma=0.01, kernel=sigmoid, score=0.786, total=   0.6s
[CV] C=10, gamma=0.01, kernel=sigmoid ................................
[CV] .... C=10, gamma=0.01, kernel=sigmoid, score=0.776, total=   0.6s


[Parallel(n_jobs=1)]: Done 180 out of 180 | elapsed:  5.1min finished


GridSearchCV(cv=None, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='linear', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': [1, 5, 10], 'gamma': [0.0001, 0.001, 0.01],
                         'kernel': ['linear', 'poly', 'rbf', 'sigmoid']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=3)

In [28]:
#List the best parameters for this dataset
print(grid.best_params_)

{'C': 10, 'gamma': 0.01, 'kernel': 'rbf'}


In [29]:
#List the best score
print(grid.best_score_)

0.8063768115942029


In [33]:
#Analyze model accuracy with test data
from sklearn.metrics import classification_report, confusion_matrix
predictions = grid.predict(X_test_scaled)
print(confusion_matrix(y_test, predictions))
print(classification_report(y_test, predictions))

[[ 186  386   10]
 [  45  497   17]
 [   7    4 1148]]
                precision    recall  f1-score   support

     CANDIDATE       0.78      0.32      0.45       582
     CONFIRMED       0.56      0.89      0.69       559
FALSE POSITIVE       0.98      0.99      0.98      1159

      accuracy                           0.80      2300
     macro avg       0.77      0.73      0.71      2300
  weighted avg       0.83      0.80      0.78      2300

