In [None]:
# Update sklearn to prevent version mismatches
# !pip install sklearn --upgrade

In [1]:
# install joblib. This will be used to save your model. 
# Restart your kernel after installing 
# !pip install joblib

In [26]:
import warnings
warnings.simplefilter('ignore')

import numpy as np
import pandas as pd
from sklearn import tree
from sklearn.metrics import classification_report


# Read the CSV and Perform Basic Data Cleaning

In [27]:
df = pd.read_csv("exoplanet_data.csv")
# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')
# Drop the null rows
df = df.dropna()
df.head()

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,...,-81,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,FALSE POSITIVE,0,1,0,0,19.89914,1.49e-05,-1.49e-05,175.850252,0.000581,...,-176,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
2,FALSE POSITIVE,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,...,-174,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
3,CONFIRMED,0,0,0,0,2.525592,3.76e-06,-3.76e-06,171.59555,0.00113,...,-211,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509
4,CONFIRMED,0,0,0,0,4.134435,1.05e-05,-1.05e-05,172.97937,0.0019,...,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.22467,15.714


# Select your features (columns)

In [59]:
# Set features. This will also be used as your x values.
selected_features = df[[
    'koi_fpflag_nt',
    'koi_fpflag_ss',
    'koi_fpflag_co',
    'koi_fpflag_ec',
    'koi_period',
    'koi_time0bk',
    'koi_impact',
    'koi_duration',
    'koi_depth',
    'koi_prad',
    'koi_teq',
    'koi_insol',
    'koi_model_snr',
    'koi_tce_plnt_num',
    'koi_steff',
    'koi_slogg',
    'koi_srad',
    'ra',
    'dec',
    'koi_kepmag'
]]

selected_features.head()

Unnamed: 0,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_time0bk,koi_impact,koi_duration,koi_depth,koi_prad,koi_teq,koi_insol,koi_model_snr,koi_tce_plnt_num,koi_steff,koi_slogg,koi_srad,ra,dec,koi_kepmag
0,0,0,0,0,54.418383,162.51384,0.586,4.507,874.8,2.83,443,9.11,25.8,2,5455,4.467,0.927,291.93423,48.141651,15.347
1,0,1,0,0,19.89914,175.850252,0.969,1.7822,10829.0,14.6,638,39.3,76.3,1,5853,4.544,0.868,297.00482,48.134129,15.436
2,0,1,0,0,1.736952,170.307565,1.276,2.40641,8079.2,33.46,1395,891.96,505.6,1,5805,4.564,0.791,285.53461,48.28521,15.597
3,0,0,0,0,2.525592,171.59555,0.701,1.6545,603.3,2.75,1406,926.16,40.9,1,6031,4.438,1.046,288.75488,48.2262,15.509
4,0,0,0,0,4.134435,172.97937,0.762,3.1402,686.0,2.77,1160,427.65,40.2,2,6046,4.486,0.972,296.28613,48.22467,15.714


# Create a Train Test Split

Use `koi_disposition` for the y values

In [60]:
from sklearn.model_selection import train_test_split

X = selected_features
y = df['koi_disposition'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify = y)

In [61]:
X_train.head()

Unnamed: 0,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_time0bk,koi_impact,koi_duration,koi_depth,koi_prad,koi_teq,koi_insol,koi_model_snr,koi_tce_plnt_num,koi_steff,koi_slogg,koi_srad,ra,dec,koi_kepmag
6080,1,0,0,0,12.496435,132.0358,1.17,84.32,271.7,55.34,1397,899.44,141.5,1,6821,3.805,2.73,289.2308,44.412483,13.054
3001,0,0,0,0,11.615625,131.96843,0.977,2.233,191.2,2.84,905,158.69,8.8,2,5332,4.083,1.453,293.52756,41.111439,15.162
570,0,1,0,0,10.980246,137.137607,0.733,3.74464,50078.0,21.94,821,107.47,1555.4,1,5952,4.462,0.897,282.79764,43.578129,14.212
4897,1,0,0,0,466.90824,136.3731,0.0868,2.64,660.0,2.19,210,0.46,5.4,1,5340,4.456,0.867,297.65436,43.178551,15.202
625,0,1,1,1,1.061933,133.850441,0.713,2.1429,133.6,2.29,2508,9391.15,80.2,1,6134,3.975,1.851,288.90253,44.632992,12.953


In [62]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(5243, 20)
(5243,)
(1748, 20)
(1748,)


# Pre-processing

Scale the data using the MinMaxScaler and perform some feature selection

In [63]:
# Scale your data
from sklearn.preprocessing import MinMaxScaler

X_scaler = MinMaxScaler().fit(X_train)

X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Train the Model



In [66]:
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train, y_train)
acc=clf.score(X_test, y_test)
print(f"accuracy = {acc}")
print(f"Training Data Score: {clf.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {clf.score(X_test_scaled, y_test)}")

accuracy = 0.8672768878718535
Training Data Score: 0.7516688918558078
Testing Data Score: 0.7528604118993135


In [65]:
# Create a random forest classifier
from sklearn.ensemble import RandomForestClassifier
model_2 = RandomForestClassifier(n_estimators=200)
model_2 = model_2.fit(X_train, y_train)
acc=model_2.score(X_test, y_test)
print(f"accuracy = {acc}")
print(f"Training Data Score: {model_2.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {model_2.score(X_test_scaled, y_test)}")

accuracy = 0.9056064073226545
Training Data Score: 0.737745565515926
Testing Data Score: 0.7408466819221968


In [54]:
# Random Forests in sklearn will automatically calculate feature importance
importances = model_2.feature_importances_
importances

array([0.12204548, 0.10332726, 0.12524107, 0.0416541 , 0.04461708,
       0.02632594, 0.0395735 , 0.0320797 , 0.04686217, 0.08316384,
       0.03448311, 0.03903246, 0.1250942 , 0.00760827, 0.0220803 ,
       0.02041404, 0.0224416 , 0.02157726, 0.02133351, 0.02104513])

In [55]:
# We can sort the features by their importance
sorted(zip(clf.feature_importances_, selected_features), reverse=True)

[(0.19068884542343384, 'koi_fpflag_nt'),
 (0.18148609192032017, 'koi_fpflag_co'),
 (0.1736947985372159, 'koi_fpflag_ss'),
 (0.1467487294189566, 'koi_model_snr'),
 (0.03590285984089158, 'koi_impact'),
 (0.0325732158187725, 'koi_fpflag_ec'),
 (0.03023090543318452, 'koi_duration'),
 (0.027665245874056037, 'ra'),
 (0.02611804100053343, 'koi_prad'),
 (0.022049447035755797, 'koi_period'),
 (0.02106938301283799, 'koi_kepmag'),
 (0.020003349268032333, 'dec'),
 (0.019866382201573034, 'koi_time0bk'),
 (0.017687636220241415, 'koi_steff'),
 (0.01390613260887192, 'koi_depth'),
 (0.010904023964867279, 'koi_slogg'),
 (0.010175981988304252, 'koi_srad'),
 (0.007531202678129836, 'koi_tce_plnt_num'),
 (0.0067414949716250415, 'koi_insol'),
 (0.0049562327823964554, 'koi_teq')]

In [56]:
y_test

array(['FALSE POSITIVE', 'FALSE POSITIVE', 'CONFIRMED', ...,
       'FALSE POSITIVE', 'CONFIRMED', 'FALSE POSITIVE'], dtype=object)

In [57]:
print(classification_report(clf.predict(X_test), y_test))

                precision    recall  f1-score   support

     CANDIDATE       0.75      0.71      0.73       444
     CONFIRMED       0.71      0.75      0.73       425
FALSE POSITIVE       0.98      0.98      0.98       879

      accuracy                           0.86      1748
     macro avg       0.81      0.81      0.81      1748
  weighted avg       0.86      0.86      0.86      1748



In [58]:
# save your model by updating "your_name" with your name
# and "your_model" with your model variable
# be sure to turn this in to BCS
# if joblib fails to import, try running the command to install in terminal/git-bash
import joblib
filename = 'zen_model_2.sav'
joblib.dump(model_2, filename)

['zen_model_2.sav']