In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn import metrics

### Read CSV and Clean Data
#### N.B. Using the original exoplanets dataset found here: https://www.kaggle.com/nasa/kepler-exoplanet-search-results#cumulative.csv

In [3]:
df = pd.read_csv("cumulative.csv")
df.koi_disposition.value_counts()
y = df["koi_disposition"].apply(lambda x:0 if x == "FALSE POSITIVE" else 1)
y.value_counts()

0    5023
1    4541
Name: koi_disposition, dtype: int64

In [4]:
df.head()

Unnamed: 0,rowid,kepid,kepoi_name,kepler_name,koi_disposition,koi_pdisposition,koi_score,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,1,10797460,K00752.01,Kepler-227 b,CONFIRMED,CANDIDATE,1.0,0,0,0,...,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,2,10797460,K00752.02,Kepler-227 c,CONFIRMED,CANDIDATE,0.969,0,0,0,...,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
2,3,10811496,K00753.01,,FALSE POSITIVE,FALSE POSITIVE,0.0,0,1,0,...,-176.0,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
3,4,10848459,K00754.01,,FALSE POSITIVE,FALSE POSITIVE,0.0,0,1,0,...,-174.0,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
4,5,10854555,K00755.01,Kepler-664 b,CONFIRMED,CANDIDATE,1.0,0,0,0,...,-211.0,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509


### Feature Selection (using ExtraTreesClassifier)
#### N.B. feature importance done separately

In [6]:
X = df[['koi_fpflag_ss', 'koi_fpflag_co', 'koi_fpflag_nt', 'koi_prad', 'koi_fpflag_ec', 'koi_prad_err1']].copy()

In [7]:
X = X.fillna(X.median())

In [8]:
X.head()

Unnamed: 0,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_nt,koi_prad,koi_fpflag_ec,koi_prad_err1
0,0,0,0,2.26,0,0.26
1,0,0,0,2.83,0,0.32
2,1,0,0,14.6,0,3.92
3,1,0,0,33.46,0,8.5
4,0,0,0,2.75,0,0.88


### Create a Train - Test Split (with the selected features)

In [9]:
model = RandomForestClassifier(n_estimators=100, random_state=42, max_depth=6) 

In [10]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)
X_train     

Unnamed: 0,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_nt,koi_prad,koi_fpflag_ec,koi_prad_err1
3564,1,1,0,87.21,0,16.61
5606,0,1,0,18.33,0,5.57
9360,0,0,0,1.43,0,0.33
1764,0,0,0,2.12,0,0.18
719,0,0,0,6.63,0,0.88
...,...,...,...,...,...,...
4862,1,0,0,47.83,0,7.54
584,0,0,0,1.94,0,0.13
5961,0,0,1,12.74,0,6.86
5272,1,0,0,89.85,0,0.00


### MinMaxScaler

In [11]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
# Fit on training set only.
scaler.fit(X_train)
# Apply transform to both the training set and the test set.
X_train = scaler.transform(X_train)
X_test= scaler.transform(X_test)

### Train the Model

In [12]:
model.fit(X_train, y_train)
predictions = model.predict(X_test)

In [14]:
print(f"Training Data Score for Extra Trees: {model.score(X_train, y_train)}")
print(f"Testing Data Score for Extra Trees: {model.score(X_test, y_test)}")

Training Data Score for Extra Trees: 0.9793670709605465
Testing Data Score for Extra Trees: 0.9820158929318277


### Cross-validation

In [16]:
clf = RandomForestClassifier(n_estimators=100, random_state=42, max_depth=6)
scores = cross_val_score(clf, X_test, y_test, cv=5)
print(scores)
print(scores.mean())

[0.9874739  0.98535565 0.9832636  0.97280335 0.9790795 ]
0.9815951992033612


In [17]:
print("Average Cross Validation Score of ExtraTreesClassifier is 98.2%")

Average Cross Validation Score of ExtraTreesClassifier is 98.2%


### Classification Report 1

In [18]:
target_names = ["Candidate", "Not"]
predictions = model.predict(X_test)
print(classification_report(y_test, predictions,
                            target_names=target_names))

              precision    recall  f1-score   support

   Candidate       0.98      0.98      0.98      1256
         Not       0.98      0.98      0.98      1135

    accuracy                           0.98      2391
   macro avg       0.98      0.98      0.98      2391
weighted avg       0.98      0.98      0.98      2391



### Hypertuning Parameters (GridSearchCV)

In [19]:
param_grid = {'max_depth': np.arange(1, 12)}
tree = GridSearchCV(model, param_grid, cv=5)

In [20]:
tree.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=6,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False, random_state=42,
                                              verbose=0, warm_start=False),
             iid='warn', n_jobs=None,
             param_grid={'max_de

In [21]:
print(tree.best_params_)

{'max_depth': 9}


In [23]:
print(f'{tree.best_score_}')

0.9795064826432455


In [24]:
predictions = tree.predict(X_test)

### Classification Report 2

In [25]:
print(classification_report(y_test, predictions,
                            target_names= target_names))

              precision    recall  f1-score   support

   Candidate       0.98      0.98      0.98      1256
         Not       0.98      0.98      0.98      1135

    accuracy                           0.98      2391
   macro avg       0.98      0.98      0.98      2391
weighted avg       0.98      0.98      0.98      2391

