In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("exoplanet_data.csv")
# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')
# Drop the null rows
df = df.dropna()
df.head()

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,...,-81,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,FALSE POSITIVE,0,1,0,0,19.89914,1.49e-05,-1.49e-05,175.850252,0.000581,...,-176,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
2,FALSE POSITIVE,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,...,-174,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
3,CONFIRMED,0,0,0,0,2.525592,3.76e-06,-3.76e-06,171.59555,0.00113,...,-211,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509
4,CONFIRMED,0,0,0,0,4.134435,1.05e-05,-1.05e-05,172.97937,0.0019,...,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.22467,15.714


In [3]:
len(df)

6991

In [4]:
X = df[['koi_period', 'koi_teq', 'koi_insol', 'koi_model_snr', 'koi_duration', 'koi_impact', 'koi_depth', 'koi_prad']]
y = df['koi_disposition']
X.head()

Unnamed: 0,koi_period,koi_teq,koi_insol,koi_model_snr,koi_duration,koi_impact,koi_depth,koi_prad
0,54.418383,443,9.11,25.8,4.507,0.586,874.8,2.83
1,19.89914,638,39.3,76.3,1.7822,0.969,10829.0,14.6
2,1.736952,1395,891.96,505.6,2.40641,1.276,8079.2,33.46
3,2.525592,1406,926.16,40.9,1.6545,0.701,603.3,2.75
4,4.134435,1160,427.65,40.2,3.1402,0.762,686.0,2.77


In [5]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [6]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [7]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=7)
knn.fit(X_train_scaled, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=7, p=2,
                     weights='uniform')

In [8]:
print(f"Training Data Score: {knn.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {knn.score(X_test_scaled, y_test)}")

Training Data Score: 0.7297348846080488
Testing Data Score: 0.6636155606407322


In [9]:
predictions = knn.predict(X_test_scaled)

In [10]:
dd = pd.DataFrame({"Prediction": predictions, "Actual":y_test})
dd.head()

Unnamed: 0,Prediction,Actual
4990,FALSE POSITIVE,FALSE POSITIVE
1425,CONFIRMED,CONFIRMED
144,CONFIRMED,CONFIRMED
6589,FALSE POSITIVE,FALSE POSITIVE
4443,FALSE POSITIVE,CANDIDATE


In [11]:
# To Confirm the accuracy of testing data
s = dd["Prediction"].count()
wrong = 0
for i in range(0,s):
    a=dd["Prediction"].iloc[i]
    b=dd["Actual"].iloc[i]
    if a != b:
        wrong = wrong + 1
        
(s - wrong) / s

0.6636155606407322

# Hyperparameter Tuning

In [12]:
from sklearn.model_selection import GridSearchCV
param_grid = {'n_neighbors': np.arange(1,20,2), 'algorithm' : ['auto', 'ball_tree', 'kd_tree', 'brute'], 'p' : [1, 2]}

grid = GridSearchCV(knn, param_grid, verbose = 3)

In [13]:
grid.fit(X_train_scaled, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s


Fitting 3 folds for each of 80 candidates, totalling 240 fits
[CV] algorithm=auto, n_neighbors=1, p=1 ..............................
[CV] .. algorithm=auto, n_neighbors=1, p=1, score=0.607, total=   0.1s
[CV] algorithm=auto, n_neighbors=1, p=1 ..............................
[CV] .. algorithm=auto, n_neighbors=1, p=1, score=0.585, total=   0.1s
[CV] algorithm=auto, n_neighbors=1, p=1 ..............................
[CV] .. algorithm=auto, n_neighbors=1, p=1, score=0.606, total=   0.1s
[CV] algorithm=auto, n_neighbors=1, p=2 ..............................
[CV] .. algorithm=auto, n_neighbors=1, p=2, score=0.604, total=   0.1s
[CV] algorithm=auto, n_neighbors=1, p=2 ..............................
[CV] .. algorithm=auto, n_neighbors=1, p=2, score=0.560, total=   0.1s
[CV] algorithm=auto, n_neighbors=1, p=2 ..............................
[CV] .. algorithm=auto, n_neighbors=1, p=2, score=0.598, total=   0.1s
[CV] algorithm=auto, n_neighbors=3, p=1 ..............................
[CV] .. algorit

[CV] . algorithm=auto, n_neighbors=19, p=2, score=0.651, total=   0.1s
[CV] algorithm=auto, n_neighbors=19, p=2 .............................
[CV] . algorithm=auto, n_neighbors=19, p=2, score=0.657, total=   0.1s
[CV] algorithm=ball_tree, n_neighbors=1, p=1 .........................
[CV]  algorithm=ball_tree, n_neighbors=1, p=1, score=0.607, total=   0.1s
[CV] algorithm=ball_tree, n_neighbors=1, p=1 .........................
[CV]  algorithm=ball_tree, n_neighbors=1, p=1, score=0.585, total=   0.1s
[CV] algorithm=ball_tree, n_neighbors=1, p=1 .........................
[CV]  algorithm=ball_tree, n_neighbors=1, p=1, score=0.606, total=   0.1s
[CV] algorithm=ball_tree, n_neighbors=1, p=2 .........................
[CV]  algorithm=ball_tree, n_neighbors=1, p=2, score=0.604, total=   0.1s
[CV] algorithm=ball_tree, n_neighbors=1, p=2 .........................
[CV]  algorithm=ball_tree, n_neighbors=1, p=2, score=0.560, total=   0.1s
[CV] algorithm=ball_tree, n_neighbors=1, p=2 .................

[CV]  algorithm=ball_tree, n_neighbors=19, p=1, score=0.686, total=   0.1s
[CV] algorithm=ball_tree, n_neighbors=19, p=2 ........................
[CV]  algorithm=ball_tree, n_neighbors=19, p=2, score=0.642, total=   0.1s
[CV] algorithm=ball_tree, n_neighbors=19, p=2 ........................
[CV]  algorithm=ball_tree, n_neighbors=19, p=2, score=0.651, total=   0.1s
[CV] algorithm=ball_tree, n_neighbors=19, p=2 ........................
[CV]  algorithm=ball_tree, n_neighbors=19, p=2, score=0.657, total=   0.1s
[CV] algorithm=kd_tree, n_neighbors=1, p=1 ...........................
[CV]  algorithm=kd_tree, n_neighbors=1, p=1, score=0.607, total=   0.1s
[CV] algorithm=kd_tree, n_neighbors=1, p=1 ...........................
[CV]  algorithm=kd_tree, n_neighbors=1, p=1, score=0.585, total=   0.1s
[CV] algorithm=kd_tree, n_neighbors=1, p=1 ...........................
[CV]  algorithm=kd_tree, n_neighbors=1, p=1, score=0.606, total=   0.1s
[CV] algorithm=kd_tree, n_neighbors=1, p=2 ...............

[CV]  algorithm=kd_tree, n_neighbors=19, p=1, score=0.663, total=   0.1s
[CV] algorithm=kd_tree, n_neighbors=19, p=1 ..........................
[CV]  algorithm=kd_tree, n_neighbors=19, p=1, score=0.669, total=   0.1s
[CV] algorithm=kd_tree, n_neighbors=19, p=1 ..........................
[CV]  algorithm=kd_tree, n_neighbors=19, p=1, score=0.686, total=   0.1s
[CV] algorithm=kd_tree, n_neighbors=19, p=2 ..........................
[CV]  algorithm=kd_tree, n_neighbors=19, p=2, score=0.642, total=   0.1s
[CV] algorithm=kd_tree, n_neighbors=19, p=2 ..........................
[CV]  algorithm=kd_tree, n_neighbors=19, p=2, score=0.651, total=   0.1s
[CV] algorithm=kd_tree, n_neighbors=19, p=2 ..........................
[CV]  algorithm=kd_tree, n_neighbors=19, p=2, score=0.657, total=   0.1s
[CV] algorithm=brute, n_neighbors=1, p=1 .............................
[CV] . algorithm=brute, n_neighbors=1, p=1, score=0.607, total=   0.1s
[CV] algorithm=brute, n_neighbors=1, p=1 ........................

[CV]  algorithm=brute, n_neighbors=17, p=2, score=0.647, total=   0.2s
[CV] algorithm=brute, n_neighbors=17, p=2 ............................
[CV]  algorithm=brute, n_neighbors=17, p=2, score=0.662, total=   0.2s
[CV] algorithm=brute, n_neighbors=19, p=1 ............................
[CV]  algorithm=brute, n_neighbors=19, p=1, score=0.663, total=   0.2s
[CV] algorithm=brute, n_neighbors=19, p=1 ............................
[CV]  algorithm=brute, n_neighbors=19, p=1, score=0.669, total=   0.2s
[CV] algorithm=brute, n_neighbors=19, p=1 ............................
[CV]  algorithm=brute, n_neighbors=19, p=1, score=0.686, total=   0.2s
[CV] algorithm=brute, n_neighbors=19, p=2 ............................
[CV]  algorithm=brute, n_neighbors=19, p=2, score=0.642, total=   0.2s
[CV] algorithm=brute, n_neighbors=19, p=2 ............................
[CV]  algorithm=brute, n_neighbors=19, p=2, score=0.651, total=   0.2s
[CV] algorithm=brute, n_neighbors=19, p=2 ............................
[CV]  

[Parallel(n_jobs=1)]: Done 240 out of 240 | elapsed:   27.1s finished


GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=7, p=2,
                                            weights='uniform'),
             iid='warn', n_jobs=None,
             param_grid={'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
                         'n_neighbors': array([ 1,  3,  5,  7,  9, 11, 13, 15, 17, 19]),
                         'p': [1, 2]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=3)

In [14]:
print(grid.best_params_)
print(grid.best_score_)

{'algorithm': 'auto', 'n_neighbors': 15, 'p': 1}
0.6748045012397482


In [15]:
predictions = grid.predict(X_test_scaled)

In [16]:
target_names = ["False Positive", "Candidate", "Confirmed"]

In [17]:
from sklearn.metrics import classification_report
print(classification_report(y_test, predictions,
                            target_names=target_names))

                precision    recall  f1-score   support

False Positive       0.50      0.45      0.48       420
     Candidate       0.64      0.66      0.65       461
     Confirmed       0.79      0.81      0.80       867

      accuracy                           0.69      1748
     macro avg       0.64      0.64      0.64      1748
  weighted avg       0.68      0.69      0.68      1748



In [18]:
print(f"Training Data Score: {grid.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {grid.score(X_test_scaled, y_test)}")

Training Data Score: 0.720579820713332
Testing Data Score: 0.6859267734553776


In [19]:
gg = pd.DataFrame({"Prediction": predictions, "Actual":y_test})
gg.head()

Unnamed: 0,Prediction,Actual
4990,FALSE POSITIVE,FALSE POSITIVE
1425,CONFIRMED,CONFIRMED
144,CONFIRMED,CONFIRMED
6589,FALSE POSITIVE,FALSE POSITIVE
4443,FALSE POSITIVE,CANDIDATE


In [20]:
# To confirm accuracy
s = gg["Prediction"].count()
wrong = 0
for i in range(0,s):
    a=gg["Prediction"].iloc[i]
    b=gg["Actual"].iloc[i]
    if a != b:
        wrong = wrong + 1
        
(s - wrong) / s

0.6859267734553776