In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import sklearn
import pandas as pd

In [2]:
df = pd.read_csv("cumulative.csv")
df.head(5)

Unnamed: 0,rowid,kepid,kepoi_name,kepler_name,koi_disposition,koi_pdisposition,koi_score,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,1,10797460,K00752.01,Kepler-227 b,CONFIRMED,CANDIDATE,1.0,0,0,0,...,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,2,10797460,K00752.02,Kepler-227 c,CONFIRMED,CANDIDATE,0.969,0,0,0,...,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
2,3,10811496,K00753.01,,FALSE POSITIVE,FALSE POSITIVE,0.0,0,1,0,...,-176.0,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
3,4,10848459,K00754.01,,FALSE POSITIVE,FALSE POSITIVE,0.0,0,1,0,...,-174.0,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
4,5,10854555,K00755.01,Kepler-664 b,CONFIRMED,CANDIDATE,1.0,0,0,0,...,-211.0,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509


In [3]:
dtypes = pd.DataFrame(df.dtypes)
dtypes

Unnamed: 0,0
rowid,int64
kepid,int64
kepoi_name,object
kepler_name,object
koi_disposition,object
koi_pdisposition,object
koi_score,float64
koi_fpflag_nt,int64
koi_fpflag_ss,int64
koi_fpflag_co,int64


In [4]:
data = df.dropna(axis='columns', how='any')
data.head()

Unnamed: 0,rowid,kepid,kepoi_name,koi_disposition,koi_pdisposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_time0bk,koi_duration,ra,dec
0,1,10797460,K00752.01,CONFIRMED,CANDIDATE,0,0,0,0,9.488036,170.53875,2.9575,291.93423,48.141651
1,2,10797460,K00752.02,CONFIRMED,CANDIDATE,0,0,0,0,54.418383,162.51384,4.507,291.93423,48.141651
2,3,10811496,K00753.01,FALSE POSITIVE,FALSE POSITIVE,0,1,0,0,19.89914,175.850252,1.7822,297.00482,48.134129
3,4,10848459,K00754.01,FALSE POSITIVE,FALSE POSITIVE,0,1,0,0,1.736952,170.307565,2.40641,285.53461,48.28521
4,5,10854555,K00755.01,CONFIRMED,CANDIDATE,0,0,0,0,2.525592,171.59555,1.6545,288.75488,48.2262


In [5]:

pd.set_option('display.max_columns', None)
# df = df.drop(['rowid', 'kepid','kepoi_name', 'kepler_name', 'koi_pdisposition','koi_tce_delivname','koi_period_err1','koi_period_err2',], axis=1)
# df = df.drop(['rowid', 'kepid','kepoi_name', 'kepler_name', 'koi_pdisposition','koi_tce_delivname','koi_teq_err2', 'koi_teq_err1','koi_depth_err1','koi_depth_err2', 'koi_depth','koi_period_err1', 'koi_period_err2', 'koi_time0bk_err1','koi_time0bk_err2','koi_prad_err1','koi_prad_err2'], axis=1)
data = df.dropna(axis='columns', how='any')
data = data.drop(['rowid', 'kepid', 'kepoi_name','koi_pdisposition'], axis=1)
target = df["koi_disposition"]
target_names = target.unique()
data = data.drop("koi_disposition", axis=1)

In [6]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=42)

In [7]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
encoded_y_train = label_encoder.transform(y_train)
encoded_y_test = label_encoder.transform(y_test)

In [8]:
from sklearn.preprocessing import StandardScaler

x_scaler = StandardScaler().fit(X_train)
scaled_x_train = x_scaler.transform(X_train)
scaled_x_test = x_scaler.transform(X_test)


In [9]:
from sklearn.svm import SVC 
model = SVC(kernel='linear')
model.fit(scaled_x_train, encoded_y_train)
predictions = model.predict(scaled_x_test)

In [10]:
model.score(scaled_x_train, encoded_y_train)

0.7837724801338353

In [11]:
model.score(scaled_x_test, encoded_y_test)

0.7728983688833124

In [12]:
from sklearn.metrics import classification_report
print(classification_report(encoded_y_test, predictions,
                           target_names = target_names))

                precision    recall  f1-score   support

     CONFIRMED       0.74      0.17      0.28       567
FALSE POSITIVE       0.52      0.93      0.67       574
     CANDIDATE       0.98      0.97      0.98      1250

      accuracy                           0.77      2391
     macro avg       0.75      0.69      0.64      2391
  weighted avg       0.81      0.77      0.74      2391



In [16]:
from sklearn.model_selection import GridSearchCV
param_grid = {'C': [1, 5, 10, 50],
              'gamma': [0.0001, 0.0005, 0.001, 0.005]}
grid = GridSearchCV(model, param_grid, verbose=3)

In [17]:
grid.fit(scaled_x_train, encoded_y_train)

Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV 1/5] END .................C=1, gamma=0.0001;, score=0.783 total time=   0.8s
[CV 2/5] END .................C=1, gamma=0.0001;, score=0.782 total time=   0.5s
[CV 3/5] END .................C=1, gamma=0.0001;, score=0.786 total time=   0.5s
[CV 4/5] END .................C=1, gamma=0.0001;, score=0.794 total time=   0.5s
[CV 5/5] END .................C=1, gamma=0.0001;, score=0.775 total time=   0.5s
[CV 1/5] END .................C=1, gamma=0.0005;, score=0.783 total time=   0.5s
[CV 2/5] END .................C=1, gamma=0.0005;, score=0.782 total time=   0.5s
[CV 3/5] END .................C=1, gamma=0.0005;, score=0.786 total time=   0.5s
[CV 4/5] END .................C=1, gamma=0.0005;, score=0.794 total time=   0.5s
[CV 5/5] END .................C=1, gamma=0.0005;, score=0.775 total time=   0.5s
[CV 1/5] END ..................C=1, gamma=0.001;, score=0.783 total time=   0.6s
[CV 2/5] END ..................C=1, gamma=0.001;

GridSearchCV(estimator=SVC(kernel='linear'),
             param_grid={'C': [1, 5, 10, 50],
                         'gamma': [0.0001, 0.0005, 0.001, 0.005]},
             verbose=3)

In [15]:
print(grid.best_params_)

{'C': 10, 'gamma': 0.0001}


In [24]:
model = SVC(kernel='linear', C=10, gamma=0.0001)
model.fit(scaled_x_train, encoded_y_train)
predictions = model.predict(scaled_x_test)
print(model.score(scaled_x_test,encoded_y_test))

0.7724801338352154


In [25]:
from sklearn.metrics import classification_report
print(classification_report(encoded_y_test, predictions,
                            target_names=target_names))

                precision    recall  f1-score   support

     CONFIRMED       0.73      0.18      0.29       567
FALSE POSITIVE       0.52      0.92      0.67       574
     CANDIDATE       0.98      0.97      0.98      1250

      accuracy                           0.77      2391
     macro avg       0.74      0.69      0.64      2391
  weighted avg       0.81      0.77      0.74      2391

