# Model Results
### SVM Model
* 'C': 1, 'gamma': 0.0001
* Accuracy: 0.7730377805660115
-----
### SVC Model
Accuracy: 0.5253032204098703
-----
### Random Forest

                precision    recall  f1-score   support

     CANDIDATE       0.65      0.64      0.65       562
     CONFIRMED       0.66      0.68      0.67       573
FALSE POSITIVE       0.98      0.98      0.98      1256

     micro avg       0.83      0.83      0.83      2391
     macro avg       0.76      0.76      0.76      2391
  weighted avg       0.83      0.83      0.83      2391

In [38]:
#Import required packages

import numpy as np # linear algebra
import pandas as pd 


import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC, SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier,AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

import os
import warnings

In [26]:
# Read in the cleaned CSV file (See R Notebook)
df = pd.read_csv("cleaned_data.csv")
df = df.drop(columns=['Unnamed: 0'])
df.head()

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_time0bk,koi_duration,ra,dec
0,CONFIRMED,0,0,0,0,9.488036,170.53875,2.9575,291.93423,48.141651
1,CONFIRMED,0,0,0,0,54.418383,162.51384,4.507,291.93423,48.141651
2,FALSE POSITIVE,0,1,0,0,19.89914,175.850252,1.7822,297.00482,48.134129
3,FALSE POSITIVE,0,1,0,0,1.736952,170.307565,2.40641,285.53461,48.28521
4,CONFIRMED,0,0,0,0,2.525592,171.59555,1.6545,288.75488,48.2262


In [27]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
koi_fpflag_nt,9564.0,0.188206,0.390897,0.0,0.0,0.0,0.0,1.0
koi_fpflag_ss,9564.0,0.231598,0.421875,0.0,0.0,0.0,0.0,1.0
koi_fpflag_co,9564.0,0.194898,0.396143,0.0,0.0,0.0,0.0,1.0
koi_fpflag_ec,9564.0,0.120033,0.325018,0.0,0.0,0.0,0.0,1.0
koi_period,9564.0,75.671358,1334.744046,0.241843,2.733684,9.752831,40.715178,129995.7784
koi_time0bk,9564.0,166.183251,67.91896,120.515914,132.761718,137.224595,170.694603,1472.522306
koi_duration,9564.0,5.621606,6.471554,0.052,2.43775,3.7926,6.2765,138.54
ra,9564.0,292.060163,4.766657,279.85272,288.66077,292.261125,295.85916,301.72076
dec,9564.0,43.810433,3.601243,36.577381,40.777173,43.677504,46.714611,52.33601


# SVM Model
### Create a Train Test Split

Use `koi_disposition` for the y values

In [28]:
# Create the test and train data

y = df["koi_disposition"]
X = df.drop(columns=["koi_disposition"])

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

# Use the MinMaxScaler to preprocess the data
X_scaler = MinMaxScaler().fit(X_train)

X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

  return self.partial_fit(X, y)


### Train the SVM Model

In [29]:
# Support vector machine linear classifier
svm_linear = SVC(kernel='linear')
svm_linear.fit(X_train_scaled, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [30]:
print(f"Training Data Score: {svm_linear.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {svm_linear.score(X_test_scaled, y_test)}")

Training Data Score: 0.7747107207583995
Testing Data Score: 0.7586783772480133


### Tune Model Parameters using GridSearchCV

In [34]:
# Create a parameter grid to tune the C and gamma variables 
parameter_grid = {'C': [0.0001, 0.001, 0.01, 0.1, 1, 5, 10],
              'gamma': [0.0001, 0.001, 0.01, 0.1, 1, 5, 10]}

tuned_grid = GridSearchCV(svm_linear, parameter_grid, verbose=7)

tuned_grid.fit(X_train_scaled, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 3 folds for each of 49 candidates, totalling 147 fits
[CV] C=0.0001, gamma=0.0001 ..........................................
[CV] . C=0.0001, gamma=0.0001, score=0.5250836120401338, total=   0.4s
[CV] C=0.0001, gamma=0.0001 ..........................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.6s remaining:    0.0s


[CV] . C=0.0001, gamma=0.0001, score=0.5253032204098703, total=   0.4s
[CV] C=0.0001, gamma=0.0001 ..........................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    1.3s remaining:    0.0s


[CV] . C=0.0001, gamma=0.0001, score=0.5251046025104602, total=   0.5s
[CV] C=0.0001, gamma=0.001 ...........................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    2.1s remaining:    0.0s


[CV] .. C=0.0001, gamma=0.001, score=0.5250836120401338, total=   0.4s
[CV] C=0.0001, gamma=0.001 ...........................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    2.9s remaining:    0.0s


[CV] .. C=0.0001, gamma=0.001, score=0.5253032204098703, total=   0.5s
[CV] C=0.0001, gamma=0.001 ...........................................


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    3.6s remaining:    0.0s


[CV] .. C=0.0001, gamma=0.001, score=0.5251046025104602, total=   0.4s
[CV] C=0.0001, gamma=0.01 ............................................


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    4.4s remaining:    0.0s


[CV] ... C=0.0001, gamma=0.01, score=0.5250836120401338, total=   0.5s
[CV] C=0.0001, gamma=0.01 ............................................
[CV] ... C=0.0001, gamma=0.01, score=0.5253032204098703, total=   0.5s
[CV] C=0.0001, gamma=0.01 ............................................
[CV] ... C=0.0001, gamma=0.01, score=0.5251046025104602, total=   0.4s
[CV] C=0.0001, gamma=0.1 .............................................
[CV] .... C=0.0001, gamma=0.1, score=0.5250836120401338, total=   0.4s
[CV] C=0.0001, gamma=0.1 .............................................
[CV] .... C=0.0001, gamma=0.1, score=0.5253032204098703, total=   0.4s
[CV] C=0.0001, gamma=0.1 .............................................
[CV] .... C=0.0001, gamma=0.1, score=0.5251046025104602, total=   0.5s
[CV] C=0.0001, gamma=1 ...............................................
[CV] ...... C=0.0001, gamma=1, score=0.5250836120401338, total=   0.4s
[CV] C=0.0001, gamma=1 ...............................................
[CV] .

[CV] .... C=0.1, gamma=0.0001, score=0.7687160184023422, total=   0.1s
[CV] C=0.1, gamma=0.0001 .............................................
[CV] .... C=0.1, gamma=0.0001, score=0.7757322175732217, total=   0.2s
[CV] C=0.1, gamma=0.001 ..............................................
[CV] ..... C=0.1, gamma=0.001, score=0.7646321070234113, total=   0.1s
[CV] C=0.1, gamma=0.001 ..............................................
[CV] ..... C=0.1, gamma=0.001, score=0.7687160184023422, total=   0.1s
[CV] C=0.1, gamma=0.001 ..............................................
[CV] ..... C=0.1, gamma=0.001, score=0.7757322175732217, total=   0.1s
[CV] C=0.1, gamma=0.01 ...............................................
[CV] ...... C=0.1, gamma=0.01, score=0.7646321070234113, total=   0.1s
[CV] C=0.1, gamma=0.01 ...............................................
[CV] ...... C=0.1, gamma=0.01, score=0.7687160184023422, total=   0.1s
[CV] C=0.1, gamma=0.01 ...............................................
[CV] .

[CV] ........... C=5, gamma=5, score=0.7736401673640168, total=   0.1s
[CV] C=5, gamma=10 ...................................................
[CV] .......... C=5, gamma=10, score=0.7688127090301003, total=   0.1s
[CV] C=5, gamma=10 ...................................................
[CV] .......... C=5, gamma=10, score=0.7749895441237976, total=   0.1s
[CV] C=5, gamma=10 ...................................................
[CV] .......... C=5, gamma=10, score=0.7736401673640168, total=   0.1s
[CV] C=10, gamma=0.0001 ..............................................
[CV] ..... C=10, gamma=0.0001, score=0.7683946488294314, total=   0.1s
[CV] C=10, gamma=0.0001 ..............................................
[CV] ..... C=10, gamma=0.0001, score=0.7774989544123797, total=   0.1s
[CV] C=10, gamma=0.0001 ..............................................
[CV] ..... C=10, gamma=0.0001, score=0.7673640167364016, total=   0.1s
[CV] C=10, gamma=0.001 ...............................................
[CV] .

[Parallel(n_jobs=1)]: Done 147 out of 147 | elapsed:  1.2min finished


GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'C': [0.0001, 0.001, 0.01, 0.1, 1, 5, 10], 'gamma': [0.0001, 0.001, 0.01, 0.1, 1, 5, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=7)

In [33]:
print(tuned_grid.best_params_)
print(tuned_grid.best_score_)

{'C': 1, 'gamma': 0.0001}
0.7730377805660115


# SVC Model

In [35]:
svc = SVC()
svc.fit(X_train,y_train)
pred_svc =svc.predict(X_test)

print(f"Training Data Score: {svc.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {svc.score(X_test_scaled, y_test)}")



Training Data Score: 0.5251638087271714
Testing Data Score: 0.5253032204098703


# Random Forest

In [44]:
random_forest = RandomForestClassifier(n_estimators=250)
random_forest.fit(X_train, y_train)
random_forest_predictions = random_forest.predict(X_test)
print(classification_report(y_test, random_forest_predictions))

                precision    recall  f1-score   support

     CANDIDATE       0.65      0.64      0.65       562
     CONFIRMED       0.66      0.68      0.67       573
FALSE POSITIVE       0.98      0.98      0.98      1256

     micro avg       0.83      0.83      0.83      2391
     macro avg       0.76      0.76      0.76      2391
  weighted avg       0.83      0.83      0.83      2391

