In [33]:
import os
import pandas as pd
from sklearn import tree 
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Kepler Exoplanet Search Results Feature Importance

In [34]:
data_path = os.path.join('data','learning_data.csv')

In [35]:
learning_data = pd.read_csv(data_path,index_col=False)
learning_data

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,54.418383,2.479000e-04,-2.479000e-04,162.513840,0.003520,...,-81,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,FALSE POSITIVE,0,1,0,0,19.899140,1.490000e-05,-1.490000e-05,175.850252,0.000581,...,-176,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
2,FALSE POSITIVE,0,1,0,0,1.736952,2.630000e-07,-2.630000e-07,170.307565,0.000115,...,-174,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.285210,15.597
3,CONFIRMED,0,0,0,0,2.525592,3.760000e-06,-3.760000e-06,171.595550,0.001130,...,-211,4.438,0.070,-0.210,1.046,0.334,-0.133,288.75488,48.226200,15.509
4,CONFIRMED,0,0,0,0,4.134435,1.050000e-05,-1.050000e-05,172.979370,0.001900,...,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.224670,15.714
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5299,FALSE POSITIVE,0,1,0,0,21.513523,2.714000e-04,-2.714000e-04,132.335600,0.012200,...,-141,3.508,0.187,-0.153,3.318,0.665,-0.813,287.46786,37.966640,10.630
5300,FALSE POSITIVE,0,0,0,1,8.589871,1.846000e-04,-1.846000e-04,132.016100,0.015700,...,-152,4.296,0.231,-0.189,1.088,0.313,-0.228,298.74921,46.973351,14.478
5301,FALSE POSITIVE,0,1,1,0,0.527699,1.160000e-07,-1.160000e-07,131.705093,0.000170,...,-166,4.529,0.035,-0.196,0.903,0.237,-0.079,297.18875,47.093819,14.082
5302,FALSE POSITIVE,0,0,1,0,0.681402,2.430000e-06,-2.430000e-06,132.181750,0.002850,...,-236,4.447,0.056,-0.224,1.041,0.341,-0.114,294.16489,47.176281,15.385


## Data Preparation

In [58]:
exoplanet_data = learning_data.drop(columns=['koi_disposition',
                                             'koi_tce_plnt_num',
                                             'koi_time0bk',
                                             'koi_time0bk_err1',
exoplanet_labels = learning_data['koi_disposition'].replace({
    'FALSE POSITIVE':0,
    'CONFIRMED':1})

feature_names = exoplanet_data.columns
feature_names

Index(['koi_fpflag_nt', 'koi_fpflag_ss', 'koi_fpflag_co', 'koi_fpflag_ec',
       'koi_period', 'koi_period_err1', 'koi_period_err2', 'koi_impact',
       'koi_impact_err1', 'koi_impact_err2', 'koi_duration',
       'koi_duration_err1', 'koi_duration_err2', 'koi_depth', 'koi_depth_err1',
       'koi_depth_err2', 'koi_prad', 'koi_prad_err1', 'koi_prad_err2',
       'koi_teq', 'koi_insol', 'koi_insol_err1', 'koi_insol_err2',
       'koi_model_snr', 'koi_steff', 'koi_steff_err1', 'koi_steff_err2',
       'koi_slogg', 'koi_slogg_err1', 'koi_slogg_err2', 'koi_srad',
       'koi_srad_err1', 'koi_srad_err2', 'ra', 'dec', 'koi_kepmag'],
      dtype='object')

In [73]:
training_data, testing_data,\
training_labels, testing_labels = train_test_split(exoplanet_data,
                                                   exoplanet_labels,
                                                   random_state = 1)

print(training_data.shape)
print(training_labels.shape)
print(testing_data.shape)
print(testing_labels.shape)

(3978, 36)
(3978,)
(1326, 36)
(1326,)


## Training

In [74]:
feature_scaler = StandardScaler().fit(training_data)

training_data = feature_scaler.transform(training_data)
testing_data = feature_scaler.transform(testing_data)

print(training_data.shape)
print(testing_data.shape)

(3978, 36)
(1326, 36)


In [75]:
classifier = RandomForestClassifier(n_estimators=200)
classifier = classifier.fit(training_data, training_labels)


In [76]:
classifier.score(testing_data, testing_labels)

0.9871794871794872

## Feature Ranking

In [72]:
sorted(zip(classifier.feature_importances_, feature_names), reverse = True)

[(0.1331708593113587, 'koi_fpflag_co'),
 (0.08609280832446181, 'koi_steff_err1'),
 (0.07555562473457464, 'koi_fpflag_ss'),
 (0.07507530782089028, 'koi_fpflag_nt'),
 (0.06663770589272401, 'koi_prad'),
 (0.06497535335217686, 'koi_steff_err2'),
 (0.052386930499095236, 'koi_duration_err1'),
 (0.044120743112326694, 'koi_duration_err2'),
 (0.04303586072425637, 'koi_prad_err2'),
 (0.041463663995138435, 'koi_prad_err1'),
 (0.034820431725956745, 'koi_fpflag_ec'),
 (0.026936830136102437, 'koi_model_snr'),
 (0.025011273581668766, 'koi_period'),
 (0.019488163852711606, 'koi_teq'),
 (0.018810467095489048, 'koi_insol_err1'),
 (0.016737248618677078, 'koi_srad_err1'),
 (0.01630404296470203, 'koi_insol'),
 (0.0152053910353404, 'koi_insol_err2'),
 (0.014862121125237273, 'koi_depth'),
 (0.014160942335183722, 'koi_depth_err1'),
 (0.013456094815180153, 'koi_slogg_err2'),
 (0.013132437635010657, 'koi_period_err2'),
 (0.012334505477930625, 'koi_impact'),
 (0.012043549026256905, 'koi_period_err1'),
 (0.009550

## Feature Importance without False Positive Test Results

In [78]:
#Data Preparation
# Drop columns corresponding to data labels, false positive flags, candidatite ids, observation dates
candidate_exoplanet_data = learning_data.drop(columns=['koi_disposition',
                                             'koi_fpflag_nt', 
                                             'koi_fpflag_ss',
                                             'koi_fpflag_co',
                                             'koi_fpflag_ec',
                                             'koi_tce_plnt_num',
                                             'koi_time0bk',
                                             'koi_time0bk_err1',
                                             'koi_time0bk_err2'])
candidate_exoplanet_labels = learning_data['koi_disposition'].replace({
    'FALSE POSITIVE':0,
    'CONFIRMED':1})
feature_names = exoplanet_data.columns

# Splitting the data for training and testing
training_data, testing_data,\
training_labels, testing_labels = train_test_split(candidate_exoplanet_data,
                                                   candidate_exoplanet_labels,
                                                   random_state = 1)

# Scaling the data
feature_scaler = StandardScaler().fit(training_data)
training_data = feature_scaler.transform(training_data)
testing_data = feature_scaler.transform(testing_data)

# Testing and feature ranking
classifier = RandomForestClassifier(n_estimators=200)
classifier = classifier.fit(training_data, training_labels)

sorted(zip(classifier.feature_importances_, feature_names), reverse = True)

[(0.1086512062713591, 'koi_prad_err2'),
 (0.08466889699184653, 'koi_teq'),
 (0.0789968070570996, 'koi_impact_err2'),
 (0.06987126509400735, 'koi_time0bk_err1'),
 (0.06189124727929439, 'koi_time0bk'),
 (0.058219675069698745, 'koi_duration'),
 (0.050164624251181074, 'koi_duration_err1'),
 (0.03913255545281861, 'koi_prad'),
 (0.0341157953156302, 'koi_depth_err1'),
 (0.031091978376656844, 'koi_fpflag_nt'),
 (0.030715684228049826, 'koi_depth_err2'),
 (0.028949755502304372, 'koi_fpflag_ec'),
 (0.026855116006018514, 'koi_depth'),
 (0.02613295166279397, 'koi_time0bk_err2'),
 (0.02449742132934539, 'koi_duration_err2'),
 (0.02366799283790416, 'koi_tce_plnt_num'),
 (0.02107246162167947, 'koi_impact_err1'),
 (0.01982595480708911, 'koi_fpflag_ss'),
 (0.019086032876247088, 'koi_fpflag_co'),
 (0.018725952521056136, 'koi_impact'),
 (0.018358638883582867, 'koi_insol_err2'),
 (0.0170435182850586, 'koi_period_err2'),
 (0.013834802573751382, 'koi_steff_err1'),
 (0.013224298891565171, 'koi_model_snr'),
 (0