### Import some dependencies

In [None]:
import pandas as pd
import numpy as np
import os

### Read in and clean up data

In [None]:
# Read Resources/cumulative.csv into a DataFrame (path assembled with os.path.join).
exoplanets = pd.read_csv(os.path.join('Resources', 'cumulative.csv'))
# Global pandas option: never hide columns when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

# Last expression of the cell, so the loaded frame is rendered as the cell output.
exoplanets

In [None]:
#List the distinct koi_pdisposition values to see whether any rows fall outside FALSE POSITIVE / CANDIDATE
#(such rows would need removing; koi_disposition has additional categories). This cell only inspects, it removes nothing.
exoplanets.koi_pdisposition.unique()
#None found in current file

In [None]:
#Make koi_pdisposition and koi_disposition numerical variables, see if they are the same (no, koi_disposition has more categories)
#This cell one-hot encodes koi_pdisposition only; the new columns are named koi_pdisposition_<value>.
exoplanets_pdisp_cat = pd.get_dummies(exoplanets, prefix=['koi_pdisposition'], columns=['koi_pdisposition'])

In [None]:
# One-hot encode koi_disposition too, then discard the FALSE POSITIVE indicator of
# koi_pdisposition; the complementary koi_pdisposition_CANDIDATE column is kept.
exoplanets_disp_cat = (
    pd.get_dummies(exoplanets_pdisp_cat, prefix=['koi_disposition'], columns=['koi_disposition'])
    .drop(columns=['koi_pdisposition_FALSE POSITIVE'])
)

In [None]:
#Drop error columns (although these could be useful in the real world), extra IDs, KOI score,
#and extra evaluation columns, then give the remaining KOI columns readable names.
#drop() and rename() are chained (no inplace=True) so the cell builds exoplanets_basic in one
#expression and can be re-run without depending on earlier in-place edits.
exoplanets_basic = (
    exoplanets_disp_cat
    .drop(columns=["rowid", "kepoi_name", "kepler_name", "koi_score",
                   "koi_period_err1", "koi_period_err2", "koi_time0bk_err1",
                   "koi_time0bk_err2", "koi_impact_err1", "koi_impact_err2",
                   "koi_duration_err1", "koi_duration_err2", "koi_depth_err1",
                   "koi_depth_err2", "koi_prad_err1", "koi_prad_err2", "koi_teq_err1",
                   "koi_teq_err2", "koi_insol_err1", "koi_insol_err2", "koi_tce_plnt_num",
                   "koi_tce_delivname", "koi_steff_err1", "koi_steff_err2",
                   "koi_slogg_err1", "koi_slogg_err2", "koi_srad_err1", "koi_srad_err2",
                   "koi_disposition_CANDIDATE", "koi_disposition_CONFIRMED",
                   "koi_disposition_FALSE POSITIVE"])
    .rename(columns={'koi_fpflag_nt': 'flag_not_transit_like',
                     'koi_fpflag_ss': 'flag_stellar_eclipse',
                     'koi_fpflag_co': 'flag_centroid_offset',
                     'koi_fpflag_ec': 'flag_ephemeris match',
                     'koi_period': 'orbital_period',
                     'koi_time0bk': 'time_first_trans_detected',
                     'koi_impact': 'star_planet_dist_at_conj',
                     'koi_duration': 'trans_duration',
                     'koi_depth': 'stellar_flux_loss_at_trans_min',
                     'koi_prad': 'planet_radius',
                     'koi_teq': 'approx_planet_temp',
                     'koi_insol': 'insolation_flux',
                     'koi_model_snr': 'trans_sig_to_noise',
                     'koi_steff': 'stellar_eff_temp',
                     'koi_slogg': 'stellar_surf_gravity',
                     'koi_srad': 'stellar_photosph_rad',
                     'ra': 'sky_location_right_asc',
                     'dec': 'sky_location_declination',
                     'koi_kepmag': 'stellar_magnitude'})
)
#dropna() returns a new frame; calling it without assigning the result never changed
#exoplanets_basic, so it could not show whether NaNs exist. Count missing values per column
#instead. Rows with NaN are actually removed further down by clean_dataset().
exoplanets_basic.isna().sum()

In [None]:
# True if at least one cell of exoplanets_basic is NaN.
np.any(np.isnan(exoplanets_basic))

In [None]:
# True only if every cell of exoplanets_basic is finite (neither NaN nor +/-inf).
np.all(np.isfinite(exoplanets_basic))

In [None]:
#https://stackoverflow.com/questions/31323499/sklearn-error-valueerror-input-contains-nan-infinity-or-a-value-too-large-for
def clean_dataset(df):
    """Return the rows of ``df`` that contain no NaN or +/-inf, cast to float64.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to clean. NOTE: rows containing NaN are dropped from ``df`` itself
        (in place); existing callers rely on that side effect.

    Returns
    -------
    pd.DataFrame
        A new frame holding only rows without NaN/inf/-inf, with every column
        converted to ``np.float64``.

    Raises
    ------
    AssertionError
        If ``df`` is not a ``pd.DataFrame``.
    """
    assert isinstance(df, pd.DataFrame), "df needs to be a pd.DataFrame"
    # In-place on purpose: the notebook calls this function and keeps using `df`.
    df.dropna(inplace=True)
    # axis must be passed by keyword: pandas 2.x made DataFrame.any() keyword-only,
    # so the former positional `.any(1)` raises TypeError there.
    indices_to_keep = ~df.isin([np.nan, np.inf, -np.inf]).any(axis=1)
    return df[indices_to_keep].astype(np.float64)
# Keep the returned frame: clean_dataset() only drops NaN rows in place, while the
# +/-inf filtering and the float64 cast exist solely in its return value.
exoplanets_basic = clean_dataset(exoplanets_basic)
exoplanets_basic

In [None]:
# Re-check after cleaning: True if any NaN is still present in exoplanets_basic.
np.any(np.isnan(exoplanets_basic))

In [None]:
# Re-check after cleaning: True only if every remaining value is finite.
np.all(np.isfinite(exoplanets_basic))

## First run: Decision Tree, random state=57

In [None]:
# Label column: the get_dummies indicator for koi_pdisposition == CANDIDATE.
target = exoplanets_basic["koi_pdisposition_CANDIDATE"]
# Human-readable class names (not used by the fitting cells shown here).
target_names = ["False_positive", "Candidate"]

In [None]:
# Feature matrix: everything except the label column and the kepid identifier.
data = exoplanets_basic.drop(columns=["koi_pdisposition_CANDIDATE", "kepid"])
feature_names = data.columns

In [None]:
from sklearn import tree

In [None]:
from sklearn.model_selection import train_test_split
# No test_size is given, so scikit-learn's default train/test proportions apply;
# random_state=57 fixes the shuffle so later runs see the same split.
X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=57)

In [None]:
# Fit a single decision tree and report its accuracy on the held-out split.
# random_state is set on the estimator too (not only on the split), otherwise the
# tree can differ between runs and the score is not reproducible.
clf = tree.DecisionTreeClassifier(random_state=57)
clf = clf.fit(X_train, y_train)
clf.score(X_test, y_test)

First run results: clf.score(X_test, y_test) 0.9843478260869565

### Apparently quite accurate, but included "flag" columns, which are scores themselves

## Second run: Decision Tree, random state=57, "flag" variables removed

In [None]:
# Flag-free frame: remove the kepid identifier and the four flag columns.
# The label column koi_pdisposition_CANDIDATE is still present in data_deflag.
data_deflag = exoplanets_basic.drop(
    columns=[
        "kepid",
        'flag_not_transit_like',
        'flag_centroid_offset',
        'flag_stellar_eclipse',
        'flag_ephemeris match',
    ]
)
data_deflag

In [None]:
# Label first, then the feature matrix (flag-free frame minus the label column).
target = data_deflag["koi_pdisposition_CANDIDATE"]
target_names = ["False_positive", "Candidate"]
data = data_deflag.drop(columns=["koi_pdisposition_CANDIDATE"])
feature_names = data.columns

In [None]:
# Same random_state as the first run, so the row shuffle uses the same seed.
X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=57)

In [None]:
# Decision tree on the flag-free features. The estimator is seeded as well as the
# split so that the reported accuracy can be reproduced.
clf = tree.DecisionTreeClassifier(random_state=57)
clf = clf.fit(X_train, y_train)
clf.score(X_test, y_test)

Second run results: clf.score(X_test, y_test) 0.7669565217391304

### Score without the "flag" columns is lower, but still respectable

# Random Forest Classifier: Some naive parameter adjustments

## Random Forest, all columns, random state = 57, n_estimators = 200

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Features: all columns except the label and the kepid identifier.
data = exoplanets_basic.drop(columns=["koi_pdisposition_CANDIDATE", "kepid"])
feature_names = data.columns
# Label: the koi_pdisposition_CANDIDATE indicator column.
target = exoplanets_basic["koi_pdisposition_CANDIDATE"]
target_names = ["False_positive", "Candidate"]

In [None]:
# Split with the same seed as the decision-tree runs so scores are comparable.
X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=57)

In [None]:
# Create a random forest classifier (200 trees) and score it on the held-out split.
# random_state seeds the forest itself; without it the bootstrap samples and feature
# subsets change on every run, so the score and importances are not reproducible.
rf = RandomForestClassifier(n_estimators=200, random_state=57)
rf = rf.fit(X_train, y_train)
rf.score(X_test, y_test)

In [None]:
# Examine feature importances: pair each importance with its column name and list them
# highest first. zip() stops at the shorter input, so feature_names must list exactly
# the columns the model was fitted on.
sorted(zip(rf.feature_importances_, feature_names), reverse=True)

Third run results: rf.score(X_test, y_test) 0.9904347826086957
    
[(0.22733560271085565, 'flag_not_transit_like'),
 (0.19420355973907696, 'flag_centroid_offset'),
 (0.1440095276751463, 'flag_stellar_eclipse'),
 (0.10109978958257873, 'planet_radius'),
 (0.07024011822169293, 'flag_ephemeris match'),
 (0.044640688652287444, 'stellar_flux_loss_at_trans_min'),
 (0.03311771350470804, 'orbital_period'),
 (0.031480649395637335, 'trans_sig_to_noise'),
 (0.030972109742308376, 'insolation_flux'),
 (0.029535382736840466, 'approx_planet_temp'),
 (0.020736500860786416, 'star_planet_dist_at_conj'),
 (0.018509642109181546, 'trans_duration'),
 (0.013508461271531618, 'time_first_trans_detected'),
 (0.008068372003769954, 'stellar_eff_temp'),
 (0.007187249648553146, 'stellar_photosph_rad'),
 (0.00658100063113589, 'sky_location_right_asc'),
 (0.006470822163761753, 'stellar_surf_gravity'),
 (0.006287781229770447, 'stellar_magnitude'),
 (0.006015028120377005, 'sky_location_declination')]

### Excellent match to Kepler classification - but note 4 of the top 5 predictors are "flag" columns

## Random forest with "flag" variables removed; random state=57

In [None]:
# Label from the flag-free frame, then the feature matrix without the label column.
target = data_deflag["koi_pdisposition_CANDIDATE"]
target_names = ["False_positive", "Candidate"]
data = data_deflag.drop(columns=["koi_pdisposition_CANDIDATE"])
feature_names = data.columns

In [None]:
#Keep same random state as above
# Split the feature frame `data`, not `data_deflag`: data_deflag still contains the
# koi_pdisposition_CANDIDATE label column, so splitting it leaks the target into X
# (giving a trivially perfect score) and misaligns feature_importances_ with feature_names.
X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=57)

In [None]:
# Random forest (200 trees) on the current split; the estimator is seeded so the
# score and feature importances are reproducible between runs.
rf = RandomForestClassifier(n_estimators=200, random_state=57)
rf = rf.fit(X_train, y_train)
rf.score(X_test, y_test)

In [None]:
# Examine feature importances, highest first. zip() silently truncates to the shorter
# input, so the importances are only labelled correctly when feature_names matches the
# columns of the frame passed to train_test_split.
sorted(zip(rf.feature_importances_, feature_names), reverse=True)

Fourth run results:
rf.score(X_test, y_test)  1.0

[(0.0756992214210558, 'planet_radius'),
 (0.02594718491892835, 'stellar_flux_loss_at_trans_min'),
 (0.025467720100393297, 'insolation_flux'),
 (0.022833793544298753, 'star_planet_dist_at_conj'),
 (0.02201414443349766, 'orbital_period'),
 (0.020332680056512346, 'approx_planet_temp'),
 (0.019891447901408545, 'trans_sig_to_noise'),
 (0.012684390468460921, 'trans_duration'),
 (0.007709256710770052, 'time_first_trans_detected'),
 (0.0032457954749037054, 'stellar_eff_temp'),
 (0.002960467272649715, 'stellar_photosph_rad'),
 (0.002777782158230879, 'sky_location_right_asc'),
 (0.002364284916553437, 'stellar_surf_gravity'),
 (0.0018585959312780007, 'sky_location_declination'),
 (0.001716789180097814, 'stellar_magnitude')]

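### Apparent even better match when flag variables removed, but this result is not valid

Caution: in the recorded run the train/test split was made from `data_deflag`, which still contained the target column `koi_pdisposition_CANDIDATE` (target leakage), so a perfect score is expected. The importances listed above sum to only about 0.25 because the leaked target column, which is missing from `feature_names`, absorbed the rest. For leak-free scores without the flag variables, see the tenth and eleventh runs (about 0.84).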

## Random forest, all columns; random state=57; n_estimators=50

In [None]:
# Features: all columns except the label and the kepid identifier.
data = exoplanets_basic.drop(columns=["koi_pdisposition_CANDIDATE", "kepid"])
feature_names = data.columns
# Label: the koi_pdisposition_CANDIDATE indicator column.
target = exoplanets_basic["koi_pdisposition_CANDIDATE"]
target_names = ["False_positive", "Candidate"]

In [None]:
# Same seed as the earlier runs so only n_estimators differs from the 200-tree run.
X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=57)

In [None]:
# Create a random forest classifier with 50 trees. The estimator is seeded so that the
# comparison against other n_estimators values is not blurred by run-to-run randomness.
rf = RandomForestClassifier(n_estimators=50, random_state=57)
rf = rf.fit(X_train, y_train)
rf.score(X_test, y_test)

In [None]:
# Examine feature importances: (importance, column name) pairs, largest importance first.
sorted(zip(rf.feature_importances_, feature_names), reverse=True)

Fifth run results:

rf.score(X_test, y_test) 0.9895652173913043

[(0.22478679101692392, 'flag_not_transit_like'),
 (0.18334698737771735, 'flag_centroid_offset'),
 (0.15177305910682304, 'flag_stellar_eclipse'),
 (0.0820154292443887, 'planet_radius'),
 (0.07045560645341226, 'flag_ephemeris match'),
 (0.05070168167163782, 'stellar_flux_loss_at_trans_min'),
 (0.04053729942631173, 'orbital_period'),
 (0.03542118416488744, 'approx_planet_temp'),
 (0.031646608026528, 'trans_sig_to_noise'),
 (0.027659473609593074, 'star_planet_dist_at_conj'),
 (0.02660825118820068, 'insolation_flux'),
 (0.018389226884730783, 'trans_duration'),
 (0.01320545764939473, 'time_first_trans_detected'),
 (0.008415441461114706, 'stellar_eff_temp'),
 (0.008244441513973305, 'stellar_photosph_rad'),
 (0.007365022806077216, 'stellar_surf_gravity'),
 (0.007039589664750343, 'stellar_magnitude'),
 (0.006670640694098431, 'sky_location_right_asc'),
 (0.005717808039436437, 'sky_location_declination')]


### Very slightly lower score than with 200 estimators; relative factor importances quite similar

## Random forest , all columns; random state=57; n_estimators=10

In [None]:
# Features: all columns except the label and the kepid identifier.
data = exoplanets_basic.drop(columns=["koi_pdisposition_CANDIDATE", "kepid"])
feature_names = data.columns
# Label: the koi_pdisposition_CANDIDATE indicator column.
target = exoplanets_basic["koi_pdisposition_CANDIDATE"]
target_names = ["False_positive", "Candidate"]

In [None]:
# Same seed as the earlier runs so the split is not a source of difference.
X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=57)

In [None]:
# Create a random forest classifier with 10 trees, as announced in this section's
# heading (the cell previously repeated n_estimators=50 from the run before it).
# The estimator is seeded so the result is reproducible.
rf = RandomForestClassifier(n_estimators=10, random_state=57)
rf = rf.fit(X_train, y_train)
rf.score(X_test, y_test)

In [None]:
# Examine feature importances: (importance, column name) pairs, largest importance first.
sorted(zip(rf.feature_importances_, feature_names), reverse=True)

Sixth run result:

rf.score(X_test, y_test) 0.9891304347826086

[(0.2259593552429587, 'flag_not_transit_like'),
 (0.20216741873290553, 'flag_centroid_offset'),
 (0.1376671830280171, 'flag_stellar_eclipse'),
 (0.12487004832135692, 'planet_radius'),
 (0.07077921338368869, 'flag_ephemeris match'),
 (0.03676421924755866, 'stellar_flux_loss_at_trans_min'),
 (0.031613473252989584, 'orbital_period'),
 (0.029247794125857682, 'insolation_flux'),
 (0.027746302416463732, 'approx_planet_temp'),
 (0.027118406419649455, 'trans_sig_to_noise'),
 (0.017319409129850877, 'trans_duration'),
 (0.016328628721901377, 'star_planet_dist_at_conj'),
 (0.013032099322464014, 'time_first_trans_detected'),
 (0.007692441245159498, 'stellar_eff_temp'),
 (0.007656213060912029, 'stellar_surf_gravity'),
 (0.006604557561888076, 'stellar_photosph_rad'),
 (0.006000789138146452, 'sky_location_right_asc'),
 (0.005907446209645676, 'sky_location_declination'),
 (0.005525001438586003, 'stellar_magnitude')]


### Very slightly lower score than with 200 estimators; relative factor importances quite similar

## Random Forest, all columns, random state = 312, n_estimators = 200

In [None]:
# Features: all columns except the label and the kepid identifier.
data = exoplanets_basic.drop(columns=["koi_pdisposition_CANDIDATE", "kepid"])
feature_names = data.columns
# Label: the koi_pdisposition_CANDIDATE indicator column.
target = exoplanets_basic["koi_pdisposition_CANDIDATE"]
target_names = ["False_positive", "Candidate"]

In [None]:
# Different seed (312) from the earlier runs, to see how much the split affects the score.
X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=312)

In [None]:
# Create a random forest classifier (200 trees). The forest is seeded with the same
# value as this run's split (312) so the whole run is reproducible.
rf = RandomForestClassifier(n_estimators=200, random_state=312)
rf = rf.fit(X_train, y_train)
rf.score(X_test, y_test)

In [None]:
# Examine feature importances: (importance, column name) pairs, largest importance first.
sorted(zip(rf.feature_importances_, feature_names), reverse=True)

Seventh run results:
    
rf.score(X_test, y_test) 0.9882608695652174

[(0.23798774600917552, 'flag_not_transit_like'),
 (0.19186263353584754, 'flag_centroid_offset'),
 (0.16692955018983197, 'flag_stellar_eclipse'),
 (0.08542584938692699, 'planet_radius'),
 (0.07186881412337937, 'flag_ephemeris match'),
 (0.04271948290966886, 'stellar_flux_loss_at_trans_min'),
 (0.03130938860531501, 'insolation_flux'),
 (0.029941578261044405, 'trans_sig_to_noise'),
 (0.029162560264454945, 'orbital_period'),
 (0.028171703218690346, 'approx_planet_temp'),
 (0.01932885503213915, 'star_planet_dist_at_conj'),
 (0.015147970719536408, 'trans_duration'),
 (0.01031685362043467, 'time_first_trans_detected'),
 (0.007974241401576123, 'stellar_eff_temp'),
 (0.007710574883976139, 'stellar_photosph_rad'),
 (0.006595751769123111, 'stellar_surf_gravity'),
 (0.0061432410779326515, 'sky_location_right_asc'),
 (0.005783169289187998, 'stellar_magnitude'),
 (0.005620035701758849, 'sky_location_declination')]

### No major changes with choice of random state

# Random Forest Classifier: Additional parameters

Not sure how much this matters given the already high scores, but easy enough to test.
Some possibly useful choices from scikit-learn.org (based on limited understanding!):

criterion: {“gini”, “entropy”}, default=”gini”
https://towardsdatascience.com/gini-index-vs-information-entropy-7a7e4fed3fcb: entropy might give sharper delineation


max_features: {“auto”, “sqrt”, “log2”}, int or float, default=”auto”
If “auto”, then max_features=sqrt(n_features).
If “log2”, then max_features=log2(n_features).
>>If None, then max_features=n_features.
Note: the search for a split does not stop until at least one valid partition of the node samples is found
Increasing features considered at each step might affect results (and run time of course), assuming not already at max.

oob_score: bool, default=False
https://towardsdatascience.com/what-is-out-of-bag-oob-score-in-random-forest-a7fa23d710
Apparently best for small datasets, which ours is not


## Random Forest, all columns, random state = 57, n_estimators = 200, criterion = "entropy"

In [None]:
# Features: all columns except the label and the kepid identifier.
data = exoplanets_basic.drop(columns=["koi_pdisposition_CANDIDATE", "kepid"])
feature_names = data.columns
# Label: the koi_pdisposition_CANDIDATE indicator column.
target = exoplanets_basic["koi_pdisposition_CANDIDATE"]
target_names = ["False_positive", "Candidate"]

In [None]:
# Same seed as the baseline runs so only the forest's criterion differs.
X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=57)

In [None]:
# Create a random forest classifier using the entropy split criterion.
# The estimator is seeded so the effect of the criterion is not confounded with
# run-to-run randomness of the forest.
rf = RandomForestClassifier(n_estimators=200, criterion='entropy', random_state=57)
rf = rf.fit(X_train, y_train)
rf.score(X_test, y_test)

In [None]:
# Examine feature importances: (importance, column name) pairs, largest importance first.
sorted(zip(rf.feature_importances_, feature_names), reverse=True)

Eighth run result:
    
rf.score(X_test, y_test) 0.9904347826086957

[(0.22605002898133872, 'flag_not_transit_like'),
 (0.19519889356462308, 'flag_centroid_offset'),
 (0.15197875446292186, 'flag_stellar_eclipse'),
 (0.0838030500923313, 'planet_radius'),
 (0.07276204231577676, 'flag_ephemeris match'),
 (0.04219325150244801, 'stellar_flux_loss_at_trans_min'),
 (0.03658629069697878, 'trans_sig_to_noise'),
 (0.03450634261202376, 'orbital_period'),
 (0.029405794268847978, 'insolation_flux'),
 (0.025164503624640545, 'approx_planet_temp'),
 (0.020040278368036304, 'star_planet_dist_at_conj'),
 (0.018600221393501714, 'trans_duration'),
 (0.013328520799608404, 'time_first_trans_detected'),
 (0.009693009092842604, 'stellar_photosph_rad'),
 (0.009514958668370243, 'stellar_eff_temp'),
 (0.008082442537119995, 'stellar_surf_gravity'),
 (0.007957083474579353, 'sky_location_right_asc'),
 (0.007719477673848345, 'stellar_magnitude'),
 (0.007415055870162336, 'sky_location_declination')]

High score 

## Random Forest, all columns, random state = 57, n_estimators = 200, max_features='auto'

In [None]:
# Features: all columns except the label and the kepid identifier.
data = exoplanets_basic.drop(columns=["koi_pdisposition_CANDIDATE", "kepid"])
feature_names = data.columns
# Label: the koi_pdisposition_CANDIDATE indicator column.
target = exoplanets_basic["koi_pdisposition_CANDIDATE"]
target_names = ["False_positive", "Candidate"]

In [None]:
# Same seed as the baseline runs so only the max_features setting differs.
X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=57)

In [None]:
# Create a random forest classifier with max_features='sqrt'. For classifiers this is
# what the former 'auto' option meant; 'auto' is rejected by current scikit-learn
# releases, so the explicit spelling keeps the cell runnable. The estimator is seeded
# for reproducibility.
rf = RandomForestClassifier(n_estimators=200, max_features='sqrt', random_state=57)
rf = rf.fit(X_train, y_train)
rf.score(X_test, y_test)

In [None]:
# Examine feature importances: (importance, column name) pairs, largest importance first.
sorted(zip(rf.feature_importances_, feature_names), reverse=True)

Ninth run results

rf.score(X_test, y_test) 0.9904347826086957

[(0.22872426067052315, 'flag_not_transit_like'),
 (0.19127100946869177, 'flag_centroid_offset'),
 (0.16299930629129977, 'flag_stellar_eclipse'),
 (0.08718918447587949, 'planet_radius'),
 (0.07243404191102386, 'flag_ephemeris match'),
 (0.0373124679610056, 'trans_sig_to_noise'),
 (0.03577251639510632, 'stellar_flux_loss_at_trans_min'),
 (0.035374110557582196, 'orbital_period'),
 (0.02977581241967809, 'approx_planet_temp'),
 (0.026619083006408518, 'insolation_flux'),
 (0.021057379335761735, 'star_planet_dist_at_conj'),
 (0.01629170482114803, 'trans_duration'),
 (0.013611690283884705, 'time_first_trans_detected'),
 (0.008348201095845754, 'stellar_eff_temp'),
 (0.007934720066254973, 'stellar_photosph_rad'),
 (0.006717208390302393, 'stellar_surf_gravity'),
 (0.006539054804130948, 'sky_location_right_asc'),
 (0.006139290905798041, 'stellar_magnitude'),
 (0.005888957139674693, 'sky_location_declination')]

High score

## Random forest with "flag" variables removed; random state = 57, n_estimators = 200, criterion = "entropy"

In [None]:
# Label from the flag-free frame, then the feature matrix without the label column.
target = data_deflag["koi_pdisposition_CANDIDATE"]
target_names = ["False_positive", "Candidate"]
data = data_deflag.drop(columns=["koi_pdisposition_CANDIDATE"])
feature_names = data.columns

In [None]:
# Split the flag-free feature matrix (label column already removed) with the usual seed.
X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=57)

In [None]:
# Create a random forest classifier using the entropy criterion on the flag-free
# features. The estimator is seeded so the score is reproducible.
rf = RandomForestClassifier(n_estimators=200, criterion='entropy', random_state=57)
rf = rf.fit(X_train, y_train)
rf.score(X_test, y_test)

In [None]:
# Examine feature importances: (importance, column name) pairs, largest importance first.
sorted(zip(rf.feature_importances_, feature_names), reverse=True)

Tenth run results:
    
rf.score(X_test, y_test) 0.8386956521739131

[(0.1379310462689551, 'planet_radius'),
 (0.09258523408807015, 'orbital_period'),
 (0.08848849033703969, 'stellar_flux_loss_at_trans_min'),
 (0.08450236297953793, 'trans_sig_to_noise'),
 (0.07993781905557479, 'trans_duration'),
 (0.07773010302285813, 'star_planet_dist_at_conj'),
 (0.06802682933552859, 'insolation_flux'),
 (0.06287145573790465, 'approx_planet_temp'),
 (0.0522589495016858, 'time_first_trans_detected'),
 (0.04521993048295201, 'sky_location_right_asc'),
 (0.043680510036280915, 'stellar_eff_temp'),
 (0.0434146383951555, 'stellar_surf_gravity'),
 (0.043329969186600946, 'stellar_photosph_rad'),
 (0.04102089876256105, 'sky_location_declination'),
 (0.03900176280929468, 'stellar_magnitude')]


Lower score

## Random forest with "flag" variables removed; random state = 57, n_estimators = 200, max_features='auto'

In [None]:
# Label from the flag-free frame, then the feature matrix without the label column.
target = data_deflag["koi_pdisposition_CANDIDATE"]
target_names = ["False_positive", "Candidate"]
data = data_deflag.drop(columns=["koi_pdisposition_CANDIDATE"])
feature_names = data.columns

In [None]:
# Split the flag-free feature matrix (label column already removed) with the usual seed.
X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=57)

In [None]:
# Create a random forest classifier with max_features='sqrt' on the flag-free features.
# 'sqrt' is what the former 'auto' option meant for classifiers; 'auto' is rejected by
# current scikit-learn releases. The estimator is seeded for reproducibility.
rf = RandomForestClassifier(n_estimators=200, max_features='sqrt', random_state=57)
rf = rf.fit(X_train, y_train)
rf.score(X_test, y_test)

In [None]:
# Examine feature importances: (importance, column name) pairs, largest importance first.
sorted(zip(rf.feature_importances_, feature_names), reverse=True)

Eleventh run result:
    
rf.score(X_test, y_test) 0.8369565217391305

[(0.14776291108077555, 'planet_radius'),
 (0.09561514385817545, 'orbital_period'),
 (0.08686027002282865, 'trans_sig_to_noise'),
 (0.08370918639742753, 'star_planet_dist_at_conj'),
 (0.0809737656439492, 'stellar_flux_loss_at_trans_min'),
 (0.0768699129102505, 'trans_duration'),
 (0.07343482879377032, 'approx_planet_temp'),
 (0.07113635596863883, 'insolation_flux'),
 (0.05086760639057528, 'time_first_trans_detected'),
 (0.041273752166773474, 'stellar_eff_temp'),
 (0.041271588512042455, 'sky_location_right_asc'),
 (0.039728371705770225, 'stellar_photosph_rad'),
 (0.038560686609861984, 'stellar_surf_gravity'),
 (0.036885649669834576, 'sky_location_declination'),
 (0.035049970269326075, 'stellar_magnitude')]

Lower score