### Import some dependencies

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os

### Read in and clean up data

In [None]:
exoplanets = pd.read_csv(os.path.join('Resources', 'cumulative.csv'))
pd.set_option('display.max_columns', None)

exoplanets

In [None]:
#Remove rows (if any) where koi_pdisposition is not FALSE POSITIVE or CANDIDATE; koi_disposition has additional categories
exoplanets.koi_pdisposition.unique()
#None found in current file

In [None]:
#Make koi_pdisposition and koi_disposition numerical variables, see if they are the same (no, koi_disposition has more categories)
exoplanets_pdisp_cat = pd.get_dummies(exoplanets, prefix=['koi_pdisposition'], columns=['koi_pdisposition'])

In [None]:
exoplanets_disp_cat = pd.get_dummies(exoplanets_pdisp_cat, prefix=['koi_disposition'], columns=['koi_disposition'])
exoplanets_disp_cat.drop('koi_pdisposition_FALSE POSITIVE', axis=1, inplace=True)

exoplanets_disp_cat

In [None]:
#Drop error columns (although these could be useful in the real world), extra IDs, KOI score, 
#and extra evaluation columns
exoplanets_basic = exoplanets_disp_cat.drop(["rowid", "kepoi_name", "kepler_name", "koi_score", 
                                             "koi_period_err1", "koi_period_err2", "koi_time0bk_err1", 
                                             "koi_time0bk_err2", "koi_impact_err1", "koi_impact_err2", 
                                             "koi_duration_err1", "koi_duration_err2", "koi_depth_err1", 
                                             "koi_depth_err2", "koi_prad_err1", "koi_prad_err2", "koi_teq_err1", 
                                             "koi_teq_err2", "koi_insol_err1", "koi_insol_err2", "koi_tce_plnt_num", 
                                             "koi_tce_delivname","koi_steff_err1", "koi_steff_err2", 
                                             "koi_slogg_err1", "koi_slogg_err2", "koi_srad_err1", "koi_srad_err2", 
                                             "koi_disposition_CANDIDATE", "koi_disposition_CONFIRMED", 
                                             "koi_disposition_FALSE POSITIVE"], axis=1)
exoplanets_basic.rename(columns={'koi_fpflag_nt': 'flag_not_transit_like', 
                   'koi_fpflag_ss': 'flag_stellar_eclipse', 
                   'koi_fpflag_co': 'flag_centroid_offset',
                   'koi_fpflag_ec': 'flag_ephemeris match',                  
                   'koi_period': 'orbital_period',                  
                   'koi_time0bk': 'time_first_trans_detected',
                   'koi_impact': 'star_planet_dist_at_conj',                   
                   'koi_duration': 'trans_duration',                   
                   'koi_depth': 'stellar_flux_loss_at_trans_min',
                   'koi_prad': 'planet_radius',
                   'koi_teq': 'approx_planet_temp',
                   'koi_insol': 'insolation_flux',
                   'koi_model_snr': 'trans_sig_to_noise',
                   'koi_steff': 'stellar_eff_temp',
                   'koi_slogg': 'stellar_surf_gravity',
                   'koi_srad': 'stellar_photosph_rad',
                   'ra': 'sky_location_right_asc',
                   'dec': 'sky_location_declination',                   
                   'koi_kepmag': 'stellar_magnitude'}, inplace=True)
exoplanets_basic.dropna(axis=0)
#No na found by this method...

In [None]:
np.any(np.isnan(exoplanets_basic))

In [None]:
np.all(np.isfinite(exoplanets_basic))

In [None]:
#https://stackoverflow.com/questions/31323499/sklearn-error-valueerror-input-contains-nan-infinity-or-a-value-too-large-for
def clean_dataset(df):
    assert isinstance(df, pd.DataFrame), "df needs to be a pd.DataFrame"
    df.dropna(inplace=True)
    indices_to_keep = ~df.isin([np.nan, np.inf, -np.inf]).any(1)
    return df[indices_to_keep].astype(np.float64)
clean_dataset(exoplanets_basic)
exoplanets_basic


In [None]:
np.any(np.isnan(exoplanets_basic))

In [None]:
np.all(np.isfinite(exoplanets_basic))

## Decision Tree, random state=57

In [None]:
target = exoplanets_basic["koi_pdisposition_CANDIDATE"]
target_names = ["False_positive", "Candidate"]

In [None]:
data = exoplanets_basic.drop(["koi_pdisposition_CANDIDATE", "kepid"], axis=1)
feature_names = data.columns

In [None]:
from sklearn import tree

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=57)

In [None]:
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train, y_train)
clf.score(X_test, y_test)

### Apparently quite accurate, but included "flag" columns, which are scores themselves

## Decision Tree, random state=57, "flag" variables removed

In [None]:
data_deflag = exoplanets_basic.drop(["kepid", 'flag_not_transit_like','flag_centroid_offset','flag_stellar_eclipse','flag_ephemeris match' ], axis=1)
data_deflag

In [None]:
data = data_deflag.drop(["koi_pdisposition_CANDIDATE"], axis=1)
feature_names = data.columns
target = data_deflag["koi_pdisposition_CANDIDATE"]
target_names = ["False_positive", "Candidate"]

In [None]:
X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=57)

In [None]:
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train, y_train)
clf.score(X_test, y_test)

### Score without "flag" columns is lower, but still I guess respectable

# Random Forest Classifier: Some naive parameter adjustments

## Random Forest, all columns, random state = 57, n_estimators = 200

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
target = exoplanets_basic["koi_pdisposition_CANDIDATE"]
target_names = ["False_positive", "Candidate"]
data = exoplanets_basic.drop(["koi_pdisposition_CANDIDATE", "kepid"], axis=1)
feature_names = data.columns

In [None]:
X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=57)

In [None]:
# Create a random forest classifier
rf = RandomForestClassifier(n_estimators=200)
rf = rf.fit(X_train, y_train)
rf.score(X_test, y_test)

In [None]:
# Examine feature importances
sorted(zip(rf.feature_importances_, feature_names), reverse=True)

### Excellent match to Kepler classification - but note 4 of the top 5 predictors are "flag" columns

## Random forest with "flag" variables removed; random state=57

In [None]:
data = data_deflag.drop(["koi_pdisposition_CANDIDATE"], axis=1)
feature_names = data.columns
target = data_deflag["koi_pdisposition_CANDIDATE"]
target_names = ["False_positive", "Candidate"]

In [None]:
#Keep same random state as above
X_train, X_test, y_train, y_test = train_test_split(data_deflag, target, random_state=57)

In [None]:
rf = RandomForestClassifier(n_estimators=200)
rf = rf.fit(X_train, y_train)
rf.score(X_test, y_test)

In [None]:
# Examine feature importances
sorted(zip(rf.feature_importances_, feature_names), reverse=True)

### Even better match (at least in this particular run) when flag variables removed

## Random forest, all columns; random state=57; n_estimators=50

In [1]:
target = exoplanets_basic["koi_pdisposition_CANDIDATE"]
target_names = ["False_positive", "Candidate"]
data = exoplanets_basic.drop(["koi_pdisposition_CANDIDATE", "kepid"], axis=1)
feature_names = data.columns

NameError: name 'exoplanets_basic' is not defined

In [None]:
X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=57)

In [None]:
# Create a random forest classifier
rf = RandomForestClassifier(n_estimators=50)
rf = rf.fit(X_train, y_train)
rf.score(X_test, y_test)

In [None]:
# Examine feature importances
sorted(zip(rf.feature_importances_, feature_names), reverse=True)

### Very slightly lower score than with 200 estimators; relative factor importances quite similar

## Random forest , all columns; random state=57; n_estimators=10

In [None]:
target = exoplanets_basic["koi_pdisposition_CANDIDATE"]
target_names = ["False_positive", "Candidate"]
data = exoplanets_basic.drop(["koi_pdisposition_CANDIDATE", "kepid"], axis=1)
feature_names = data.columns

In [None]:
X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=57)

In [None]:
# Create a random forest classifier
rf = RandomForestClassifier(n_estimators=50)
rf = rf.fit(X_train, y_train)
rf.score(X_test, y_test)

In [None]:
# Examine feature importances
sorted(zip(rf.feature_importances_, feature_names), reverse=True)

### Very slightly lower score than with 200 estimators; relative factor importances quite similar

## Random Forest, all columns, random state = 312, n_estimators = 200

In [None]:
target = exoplanets_basic["koi_pdisposition_CANDIDATE"]
target_names = ["False_positive", "Candidate"]
data = exoplanets_basic.drop(["koi_pdisposition_CANDIDATE", "kepid"], axis=1)
feature_names = data.columns

In [None]:
X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=312)

In [None]:
# Create a random forest classifier
rf = RandomForestClassifier(n_estimators=200)
rf = rf.fit(X_train, y_train)
rf.score(X_test, y_test)

In [None]:
# Examine feature importances
sorted(zip(rf.feature_importances_, feature_names), reverse=True)

### No major changes with choice of random state

# Random Forest Classifier: Additional parameters

Not sure how much this matters given the already high scores, but easy enough to test
Some possibly useful choices from scikit-learn.org (based on limited understanding!):

criterion{“gini”, “entropy”}, default=”gini”
https://towardsdatascience.com/gini-index-vs-information-entropy-7a7e4fed3fcb: :entropy might give sharper delineation


max_features{“auto”, “sqrt”, “log2”}, int or float, default=”auto”
If “auto”, then max_features=sqrt(n_features).
If “log2”, then max_features=log2(n_features).
>>If None, then max_features=n_features.
Note: the search for a split does not stop until at least one valid partition of the node samples is found
Increasing features considered at each step might affect results (and run time of course), assuming not already at max.

oob_scorebool, default=False
https://towardsdatascience.com/what-is-out-of-bag-oob-score-in-random-forest-a7fa23d710
Apparently best for small datasets, which ours is not


## Random Forest, all columns, random state = 57, n_estimators = 200, criterion = "entropy"

In [None]:
target = exoplanets_basic["koi_pdisposition_CANDIDATE"]
target_names = ["False_positive", "Candidate"]
data = exoplanets_basic.drop(["koi_pdisposition_CANDIDATE", "kepid"], axis=1)
feature_names = data.columns

In [None]:
X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=57)

In [None]:
# Create a random forest classifier
rf = RandomForestClassifier(n_estimators=200, criterion='entropy')
rf = rf.fit(X_train, y_train)
rf.score(X_test, y_test)

In [None]:
# Examine feature importances
sorted(zip(rf.feature_importances_, feature_names), reverse=True)

## Random Forest, all columns, random state = 57, n_estimators = 200, max_features='auto'

In [None]:
target = exoplanets_basic["koi_pdisposition_CANDIDATE"]
target_names = ["False_positive", "Candidate"]
data = exoplanets_basic.drop(["koi_pdisposition_CANDIDATE", "kepid"], axis=1)
feature_names = data.columns

In [None]:
X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=57)

In [None]:
# Create a random forest classifier
rf = RandomForestClassifier(n_estimators=200, max_features='auto')
rf = rf.fit(X_train, y_train)
rf.score(X_test, y_test)

In [None]:
# Examine feature importances
sorted(zip(rf.feature_importances_, feature_names), reverse=True)

## Random forest with "flag" variables removed; random state = 57, n_estimators = 200, criterion = "entropy"

In [None]:
data = data_deflag.drop(["koi_pdisposition_CANDIDATE"], axis=1)
feature_names = data.columns
target = data_deflag["koi_pdisposition_CANDIDATE"]
target_names = ["False_positive", "Candidate"]

In [None]:
X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=57)

In [None]:
# Create a random forest classifier
rf = RandomForestClassifier(n_estimators=200, criterion='entropy')
rf = rf.fit(X_train, y_train)
rf.score(X_test, y_test)

In [None]:
# Examine feature importances
sorted(zip(rf.feature_importances_, feature_names), reverse=True)

## Random forest with "flag" variables removed; random state = 57, n_estimators = 200, max_features='auto'

In [None]:
data = data_deflag.drop(["koi_pdisposition_CANDIDATE"], axis=1)
feature_names = data.columns
target = data_deflag["koi_pdisposition_CANDIDATE"]
target_names = ["False_positive", "Candidate"]

In [None]:
X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=57)

In [None]:
# Create a random forest classifier
rf = RandomForestClassifier(n_estimators=200, max_features='auto')
rf = rf.fit(X_train, y_train)
rf.score(X_test, y_test)

In [None]:
# Examine feature importances
sorted(zip(rf.feature_importances_, feature_names), reverse=True)