In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

### Read CSV and Clean

In [2]:
df = pd.read_csv("exoplanet_data.csv")
# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')
# # Drop the null rows
df = df.dropna()
df.koi_disposition.value_counts()

FALSE POSITIVE    3504
CONFIRMED         1800
CANDIDATE         1687
Name: koi_disposition, dtype: int64

In [3]:
df.koi_disposition[df.koi_disposition == 'CANDIDATE'] = 1
df.koi_disposition[df.koi_disposition == 'FALSE POSITIVE'] = 0
df.koi_disposition[df.koi_disposition == 'CONFIRMED'] = 0
df2 = df.koi_disposition.astype(int)
df2
target_names  = ["Candidate", "Not"]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [20]:
data = df.drop("koi_disposition", axis=1)
target = df2
print(f" Data Shape: {data.shape} / Target Shape{target.shape} / Target Dtype: {target.dtype}")

 Data Shape: (6991, 40) / Target Shape(6991,) / Target Dtype: int64


### Feature Selection (using ANOVA F-value)

In [21]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from sklearn.feature_selection import SelectKBest, f_classif, chi2

In [59]:
bestfeatures = SelectKBest(f_classif, k=15)
fit = bestfeatures.fit(data,target)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(data.columns)
#concat two dataframes for better visualization 
featureScores = pd.concat([dfcolumns,dfscores],axis=1)
featureScores.columns = ['Specs','Score']  #naming the dataframe columns
print(featureScores.nlargest(15,'Score'))  #print 10 best features

                Specs       Score
1       koi_fpflag_ss  711.339551
2       koi_fpflag_co  616.002101
0       koi_fpflag_nt  440.274865
3       koi_fpflag_ec  332.735032
26      koi_model_snr  180.475674
16          koi_depth  164.084097
22            koi_teq  140.713909
8    koi_time0bk_err1   70.477548
9    koi_time0bk_err2   70.477548
14  koi_duration_err1   40.971077
15  koi_duration_err2   40.971077
13       koi_duration   34.442112
29     koi_steff_err1   31.835768
30     koi_steff_err2   31.147950
4          koi_period   20.033421


  y = column_or_1d(y, warn=True)


In [61]:
selected_data = data[['koi_fpflag_ss', 'koi_fpflag_co', 'koi_fpflag_nt', 'koi_fpflag_ec', 'koi_model_snr','koi_depth','koi_teq','koi_time0bk_err1', 'koi_time0bk_err2', 'koi_duration_err1','koi_duration_err2','koi_duration','koi_steff_err1','koi_steff_err2','koi_period']].copy()

In [62]:
print(selected_data.shape)
target = df2.values.reshape(-1,1)
print(target.shape)

(6991, 15)
(6991, 1)


### Create a Train-Test Split

In [63]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.utils import to_categorical

In [64]:
X_train, X_test, y_train, y_test = train_test_split(selected_data, target, random_state=42, stratify=target)
X_train.head()

Unnamed: 0,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_nt,koi_fpflag_ec,koi_model_snr,koi_depth,koi_teq,koi_time0bk_err1,koi_time0bk_err2,koi_duration_err1,koi_duration_err2,koi_duration,koi_steff_err1,koi_steff_err2,koi_period
4222,0,1,0,0,37.6,161.1,743,0.00621,-0.00621,0.189,-0.189,13.877,163,-199,15.745559
6119,0,1,0,1,16.7,49.9,932,0.0164,-0.0164,0.852,-0.852,8.724,128,-128,1.891144
3996,1,0,0,0,617.7,165480.0,1289,0.000465,-0.000465,0.0116,-0.0116,3.7357,188,-209,2.900613
374,0,0,0,0,23.4,135.2,1156,0.00232,-0.00232,0.0747,-0.0747,1.8069,108,-97,3.481592
984,0,0,0,0,22.5,776.8,699,0.00409,-0.00409,0.0972,-0.0972,3.6765,107,-131,17.815969


### Pre-processing (using MinMaxScaler)

In [65]:
X_minmax = MinMaxScaler().fit(X_train)
y_minmax = MinMaxScaler().fit(y_train)

X_train_minmax = X_minmax.transform(X_train)
X_test_minmax = X_minmax.transform(X_test)
y_train_minmax = y_minmax.transform(y_train)
y_test_minmax = y_minmax.transform(y_test)

### Train the Model

In [66]:
from sklearn.svm import SVC 
model = SVC(kernel='linear')
model.fit(X_train_minmax, y_train_minmax)
predictions = model.predict(X_test_minmax)

  y = column_or_1d(y, warn=True)


In [67]:
print(f"Training Data Score: {model.score(X_train_minmax, y_train_minmax)}")
print(f"Testing Data Score: {model.score(X_test_minmax, y_test_minmax)}")

Training Data Score: 0.8441731832920084
Testing Data Score: 0.8392448512585813


### HyperParameter Tuner

In [99]:
from sklearn.model_selection import GridSearchCV
param_grid = {'C': [250, 500, 1000],
              'gamma': [0.0000001, 0.000001, 0.00001]}
grid = GridSearchCV(model, param_grid, verbose=5)

In [100]:
grid.fit(X_train_minmax, y_train_minmax)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
  y = column_or_1d(y, warn=True)


Fitting 3 folds for each of 9 candidates, totalling 27 fits
[CV] C=250, gamma=1e-07 ..............................................
[CV] .................. C=250, gamma=1e-07, score=0.894, total=   0.3s
[CV] C=250, gamma=1e-07 ..............................................
[CV] .................. C=250, gamma=1e-07, score=0.886, total=   0.2s
[CV] C=250, gamma=1e-07 ..............................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s remaining:    0.0s
  y = column_or_1d(y, warn=True)
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.5s remaining:    0.0s
  y = column_or_1d(y, warn=True)


[CV] .................. C=250, gamma=1e-07, score=0.879, total=   0.2s
[CV] C=250, gamma=1e-06 ..............................................
[CV] .................. C=250, gamma=1e-06, score=0.894, total=   0.2s
[CV] C=250, gamma=1e-06 ..............................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.7s remaining:    0.0s
  y = column_or_1d(y, warn=True)
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.9s remaining:    0.0s
  y = column_or_1d(y, warn=True)


[CV] .................. C=250, gamma=1e-06, score=0.886, total=   0.2s
[CV] C=250, gamma=1e-06 ..............................................
[CV] .................. C=250, gamma=1e-06, score=0.879, total=   0.2s
[CV] C=250, gamma=1e-05 ..............................................


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV] .................. C=250, gamma=1e-05, score=0.894, total=   0.2s
[CV] C=250, gamma=1e-05 ..............................................
[CV] .................. C=250, gamma=1e-05, score=0.886, total=   0.2s
[CV] C=250, gamma=1e-05 ..............................................


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV] .................. C=250, gamma=1e-05, score=0.879, total=   0.2s
[CV] C=500, gamma=1e-07 ..............................................


  y = column_or_1d(y, warn=True)


[CV] .................. C=500, gamma=1e-07, score=0.894, total=   0.2s
[CV] C=500, gamma=1e-07 ..............................................


  y = column_or_1d(y, warn=True)


[CV] .................. C=500, gamma=1e-07, score=0.887, total=   0.2s
[CV] C=500, gamma=1e-07 ..............................................


  y = column_or_1d(y, warn=True)


[CV] .................. C=500, gamma=1e-07, score=0.879, total=   0.2s
[CV] C=500, gamma=1e-06 ..............................................


  y = column_or_1d(y, warn=True)


[CV] .................. C=500, gamma=1e-06, score=0.894, total=   0.3s
[CV] C=500, gamma=1e-06 ..............................................


  y = column_or_1d(y, warn=True)


[CV] .................. C=500, gamma=1e-06, score=0.887, total=   0.2s
[CV] C=500, gamma=1e-06 ..............................................


  y = column_or_1d(y, warn=True)


[CV] .................. C=500, gamma=1e-06, score=0.879, total=   0.2s
[CV] C=500, gamma=1e-05 ..............................................


  y = column_or_1d(y, warn=True)


[CV] .................. C=500, gamma=1e-05, score=0.894, total=   0.2s
[CV] C=500, gamma=1e-05 ..............................................


  y = column_or_1d(y, warn=True)


[CV] .................. C=500, gamma=1e-05, score=0.887, total=   0.2s
[CV] C=500, gamma=1e-05 ..............................................


  y = column_or_1d(y, warn=True)


[CV] .................. C=500, gamma=1e-05, score=0.879, total=   0.2s
[CV] C=520, gamma=1e-07 ..............................................


  y = column_or_1d(y, warn=True)


[CV] .................. C=520, gamma=1e-07, score=0.895, total=   0.3s
[CV] C=520, gamma=1e-07 ..............................................


  y = column_or_1d(y, warn=True)


[CV] .................. C=520, gamma=1e-07, score=0.887, total=   0.2s
[CV] C=520, gamma=1e-07 ..............................................


  y = column_or_1d(y, warn=True)


[CV] .................. C=520, gamma=1e-07, score=0.879, total=   0.2s
[CV] C=520, gamma=1e-06 ..............................................


  y = column_or_1d(y, warn=True)


[CV] .................. C=520, gamma=1e-06, score=0.895, total=   0.3s
[CV] C=520, gamma=1e-06 ..............................................


  y = column_or_1d(y, warn=True)


[CV] .................. C=520, gamma=1e-06, score=0.887, total=   0.2s
[CV] C=520, gamma=1e-06 ..............................................


  y = column_or_1d(y, warn=True)


[CV] .................. C=520, gamma=1e-06, score=0.879, total=   0.2s
[CV] C=520, gamma=1e-05 ..............................................


  y = column_or_1d(y, warn=True)


[CV] .................. C=520, gamma=1e-05, score=0.895, total=   0.3s
[CV] C=520, gamma=1e-05 ..............................................


  y = column_or_1d(y, warn=True)


[CV] .................. C=520, gamma=1e-05, score=0.887, total=   0.2s
[CV] C=520, gamma=1e-05 ..............................................


  y = column_or_1d(y, warn=True)


[CV] .................. C=520, gamma=1e-05, score=0.879, total=   0.2s


[Parallel(n_jobs=1)]: Done  27 out of  27 | elapsed:    6.0s finished
  y = column_or_1d(y, warn=True)


GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='auto_deprecated', kernel='linear',
                           max_iter=-1, probability=False, random_state=None,
                           shrinking=True, tol=0.001, verbose=False),
             iid='warn', n_jobs=None,
             param_grid={'C': [250, 500, 520], 'gamma': [1e-07, 1e-06, 1e-05]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=5)

In [101]:
print(grid.best_params_)
print(grid.best_score_)

{'C': 500, 'gamma': 1e-07}
0.88670608430288


### Classification Report

In [102]:
from sklearn.metrics import classification_report
predictions = grid.predict(X_test_minmax)
print(classification_report(y_test_minmax, predictions,
                            target_names=target_names))

              precision    recall  f1-score   support

   Candidate       0.90      0.95      0.92      1326
         Not       0.80      0.66      0.73       422

    accuracy                           0.88      1748
   macro avg       0.85      0.80      0.82      1748
weighted avg       0.88      0.88      0.88      1748



### Save Model

In [14]:
import joblib