In [1]:
import os
import pandas as pd
from sklearn import tree 
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Kepler Exoplanet Search Results Feature Importance

In [2]:
data_path = os.path.join('data','learning_data.csv')

In [3]:
learning_data = pd.read_csv(data_path,index_col=False)
learning_data

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,54.418383,2.479000e-04,-2.479000e-04,162.513840,0.003520,...,-81,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,FALSE POSITIVE,0,1,0,0,19.899140,1.490000e-05,-1.490000e-05,175.850252,0.000581,...,-176,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
2,FALSE POSITIVE,0,1,0,0,1.736952,2.630000e-07,-2.630000e-07,170.307565,0.000115,...,-174,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.285210,15.597
3,CONFIRMED,0,0,0,0,2.525592,3.760000e-06,-3.760000e-06,171.595550,0.001130,...,-211,4.438,0.070,-0.210,1.046,0.334,-0.133,288.75488,48.226200,15.509
4,CONFIRMED,0,0,0,0,4.134435,1.050000e-05,-1.050000e-05,172.979370,0.001900,...,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.224670,15.714
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5299,FALSE POSITIVE,0,1,0,0,21.513523,2.714000e-04,-2.714000e-04,132.335600,0.012200,...,-141,3.508,0.187,-0.153,3.318,0.665,-0.813,287.46786,37.966640,10.630
5300,FALSE POSITIVE,0,0,0,1,8.589871,1.846000e-04,-1.846000e-04,132.016100,0.015700,...,-152,4.296,0.231,-0.189,1.088,0.313,-0.228,298.74921,46.973351,14.478
5301,FALSE POSITIVE,0,1,1,0,0.527699,1.160000e-07,-1.160000e-07,131.705093,0.000170,...,-166,4.529,0.035,-0.196,0.903,0.237,-0.079,297.18875,47.093819,14.082
5302,FALSE POSITIVE,0,0,1,0,0.681402,2.430000e-06,-2.430000e-06,132.181750,0.002850,...,-236,4.447,0.056,-0.224,1.041,0.341,-0.114,294.16489,47.176281,15.385


## Data Preparation

In [4]:
#Data Preparation
# Drop columns corresponding to data labels, false positive flags, candidatite ids, observation dates
candidate_exoplanet_data = learning_data.copy().drop(columns=['koi_disposition',
                                                              'koi_fpflag_nt', 
                                                              'koi_fpflag_ss',
                                                              'koi_fpflag_co',
                                                              'koi_fpflag_ec',
                                                              'koi_tce_plnt_num',
                                                              'koi_time0bk',
                                                              'koi_time0bk_err1',
                                                              'koi_time0bk_err2'])


candidate_exoplanet_labels = learning_data['koi_disposition'].replace({
    'FALSE POSITIVE':0,
    'CONFIRMED':1})
candidate_feature_names = candidate_exoplanet_data.columns

In [5]:
# Splitting the data for training and testing
candidate_training_data, candidate_testing_data,\
candidate_training_labels, candidate_testing_labels = train_test_split(candidate_exoplanet_data,
                                                   candidate_exoplanet_labels,
                                                   random_state = 1)

In [6]:
# Scaling the data
feature_scaler = StandardScaler().fit(candidate_training_data)
training_data = feature_scaler.transform(candidate_training_data)
testing_data = feature_scaler.transform(candidate_testing_data)

## Training

In [7]:
# Testing and feature ranking
candidate_classifier = RandomForestClassifier(n_estimators=200)
candidate_classifier = candidate_classifier.fit(candidate_training_data, candidate_training_labels)

In [8]:
candidate_classifier.score(candidate_testing_data, candidate_testing_labels)

0.9306184012066365

## Feature Ranking

In [10]:
pd.DataFrame(sorted(zip(candidate_classifier.feature_importances_,
                        candidate_feature_names), reverse = True))\
.rename(columns={0:'Weight',1:'Feature'}).set_index('Feature')

Unnamed: 0_level_0,Weight
Feature,Unnamed: 1_level_1
koi_steff_err1,0.115051
koi_prad,0.081761
koi_steff_err2,0.080127
koi_duration_err1,0.065991
koi_duration_err2,0.064618
koi_prad_err2,0.059246
koi_prad_err1,0.042728
koi_model_snr,0.035243
koi_impact,0.032946
koi_insol_err1,0.031675
