##### Import libraries

In [1]:
import os
import requests
import io
import pandas as pd
from pycaret.classification import *

##### Get the data from the NASA Exoplanet Archive API

In [2]:
url = "https://exoplanetarchive.ipac.caltech.edu/cgi-bin/nstedAPI/nph-nstedAPI?table=cumulative&format=csv"
r = requests.get(url)
r = r.content
df = pd.read_csv(io.StringIO(r.decode('utf-8')))

##### Explore the data

In [3]:
len(df)

9564

In [4]:
df.head()

Unnamed: 0,kepid,kepoi_name,kepler_name,koi_disposition,koi_pdisposition,koi_score,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,...,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra_str,dec_str,koi_kepmag,koi_kepmag_err
0,10797460,K00752.01,Kepler-227 b,CONFIRMED,CANDIDATE,1.0,0,0,0,0,...,4.467,0.064,-0.096,0.927,0.105,-0.061,19h27m44.22s,+48d08m29.9s,15.347,
1,10797460,K00752.02,Kepler-227 c,CONFIRMED,CANDIDATE,0.969,0,0,0,0,...,4.467,0.064,-0.096,0.927,0.105,-0.061,19h27m44.22s,+48d08m29.9s,15.347,
2,10811496,K00753.01,,CANDIDATE,CANDIDATE,0.0,0,0,0,0,...,4.544,0.044,-0.176,0.868,0.233,-0.078,19h48m01.16s,+48d08m02.9s,15.436,
3,10848459,K00754.01,,FALSE POSITIVE,FALSE POSITIVE,0.0,0,1,0,0,...,4.564,0.053,-0.168,0.791,0.201,-0.067,19h02m08.31s,+48d17m06.8s,15.597,
4,10854555,K00755.01,Kepler-664 b,CONFIRMED,CANDIDATE,1.0,0,0,0,0,...,4.438,0.07,-0.21,1.046,0.334,-0.133,19h15m01.17s,+48d13m34.3s,15.509,


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9564 entries, 0 to 9563
Data columns (total 50 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   kepid              9564 non-null   int64  
 1   kepoi_name         9564 non-null   object 
 2   kepler_name        2671 non-null   object 
 3   koi_disposition    9564 non-null   object 
 4   koi_pdisposition   9564 non-null   object 
 5   koi_score          8054 non-null   float64
 6   koi_fpflag_nt      9564 non-null   int64  
 7   koi_fpflag_ss      9564 non-null   int64  
 8   koi_fpflag_co      9564 non-null   int64  
 9   koi_fpflag_ec      9564 non-null   int64  
 10  koi_period         9564 non-null   float64
 11  koi_period_err1    9110 non-null   float64
 12  koi_period_err2    9110 non-null   float64
 13  koi_time0bk        9564 non-null   float64
 14  koi_time0bk_err1   9110 non-null   float64
 15  koi_time0bk_err2   9110 non-null   float64
 16  koi_impact         9201 

##### Clean the data

We can drop the columns that have no data.

In [6]:
df.drop(columns=['kepler_name', 'koi_teq_err1', 'koi_teq_err2', 'koi_kepmag_err'], inplace=True)

Let's drop 'koi_score' for the time being, due to missing data. We can try different imputation methods later.

In [7]:
df.drop(columns=['koi_score'], inplace=True)

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9564 entries, 0 to 9563
Data columns (total 45 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   kepid              9564 non-null   int64  
 1   kepoi_name         9564 non-null   object 
 2   koi_disposition    9564 non-null   object 
 3   koi_pdisposition   9564 non-null   object 
 4   koi_fpflag_nt      9564 non-null   int64  
 5   koi_fpflag_ss      9564 non-null   int64  
 6   koi_fpflag_co      9564 non-null   int64  
 7   koi_fpflag_ec      9564 non-null   int64  
 8   koi_period         9564 non-null   float64
 9   koi_period_err1    9110 non-null   float64
 10  koi_period_err2    9110 non-null   float64
 11  koi_time0bk        9564 non-null   float64
 12  koi_time0bk_err1   9110 non-null   float64
 13  koi_time0bk_err2   9110 non-null   float64
 14  koi_impact         9201 non-null   float64
 15  koi_impact_err1    9110 non-null   float64
 16  koi_impact_err2    9110 

For simplicity, and to get a baseline model, let's remove the remaning rows that have nulls. It should be noted that any rows we choose to remove are rows that our model will not have the opportunity to learn from. In this case, it will only reduce our data by about 8.5%. This is an area to revisit; exploring different ways we can impute missing data.

In [9]:
df.dropna(inplace=True)

In [10]:
df.shape

(8744, 45)

For clarity, let's replace the abbreviated column names with their complete names. Column descriptions can be found at: https://exoplanetarchive.ipac.caltech.edu/docs/program_interfaces.html#defcols

In [11]:
# replace abbreviated column names with complete names for clarity
df = df.rename(columns={'kepid':'KepID',
'kepoi_name':'KOIName',
'kepler_name':'KeplerName',
'koi_disposition':'ExoplanetArchiveDisposition',
'koi_pdisposition':'DispositionUsingKeplerData',
'koi_score':'DispositionScore',
'koi_fpflag_nt':'NotTransit-LikeFalsePositiveFlag',
'koi_fpflag_ss':'koi_fpflag_ss',
'koi_fpflag_co':'CentroidOffsetFalsePositiveFlag',
'koi_fpflag_ec':'EphemerisMatchIndicatesContaminationFalsePositiveFlag',
'koi_period':'OrbitalPeriod[days',
'koi_period_err1':'OrbitalPeriodUpperUnc.[days',
'koi_period_err2':'OrbitalPeriodLowerUnc.[days',
'koi_time0bk':'TransitEpoch[BKJD',
'koi_time0bk_err1':'TransitEpochUpperUnc.[BKJD',
'koi_time0bk_err2':'TransitEpochLowerUnc.[BKJD',
'koi_impact':'ImpactParamete',
'koi_impact_err1':'ImpactParameterUpperUnc',
'koi_impact_err2':'ImpactParameterLowerUnc',
'koi_duration':'TransitDuration[hrs',
'koi_duration_err1':'TransitDurationUpperUnc.[hrs',
'koi_duration_err2':'TransitDurationLowerUnc.[hrs',
'koi_depth':'TransitDepth[ppm',
'koi_depth_err1':'TransitDepthUpperUnc.[ppm',
'koi_depth_err2':'TransitDepthLowerUnc.[ppm',
'koi_prad':'PlanetaryRadius[Earthradii',
'koi_prad_err1':'PlanetaryRadiusUpperUnc.[Earthradii',
'koi_prad_err2':'PlanetaryRadiusLowerUnc.[Earthradii',
'koi_teq':'EquilibriumTemperature[K',
'koi_teq_err1':'EquilibriumTemperatureUpperUnc.[K',
'koi_teq_err2':'EquilibriumTemperatureLowerUnc.[K',
'koi_insol':'InsolationFlux[Earthflux',
'koi_insol_err1':'InsolationFluxUpperUnc.[Earthflux',
'koi_insol_err2':'InsolationFluxLowerUnc.[Earthflux',
'koi_model_snr':'TransitSignal-to-Nois',
'koi_tce_plnt_num':'TCEPlanetNumbe',
'koi_tce_delivname':'TCEDeliver',
'koi_steff':'StellarEffectiveTemperature[K',
'koi_steff_err1':'StellarEffectiveTemperatureUpperUnc.[K',
'koi_steff_err2':'StellarEffectiveTemperatureLowerUnc.[K',
'koi_slogg':'StellarSurfaceGravity[log10(cm/s**2)',
'koi_slogg_err1':'StellarSurfaceGravityUpperUnc.[log10(cm/s**2)',
'koi_slogg_err2':'StellarSurfaceGravityLowerUnc.[log10(cm/s**2)',
'koi_srad':'StellarRadius[Solarradii',
'koi_srad_err1':'StellarRadiusUpperUnc.[Solarradii',
'koi_srad_err2':'StellarRadiusLowerUnc.[Solarradii',
'ra':'RA[decimaldegrees',
'dec':'Dec[decimaldegrees',
'koi_kepmag':'Kepler-band[mag]'
})

In [12]:
# Remove special characters from column names for processing
import re
df = df.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))

In [13]:
df.columns

Index(['KepID', 'KOIName', 'ExoplanetArchiveDisposition',
       'DispositionUsingKeplerData', 'NotTransitLikeFalsePositiveFlag',
       'koi_fpflag_ss', 'CentroidOffsetFalsePositiveFlag',
       'EphemerisMatchIndicatesContaminationFalsePositiveFlag',
       'OrbitalPerioddays', 'OrbitalPeriodUpperUncdays',
       'OrbitalPeriodLowerUncdays', 'TransitEpochBKJD',
       'TransitEpochUpperUncBKJD', 'TransitEpochLowerUncBKJD',
       'ImpactParamete', 'ImpactParameterUpperUnc', 'ImpactParameterLowerUnc',
       'TransitDurationhrs', 'TransitDurationUpperUnchrs',
       'TransitDurationLowerUnchrs', 'TransitDepthppm',
       'TransitDepthUpperUncppm', 'TransitDepthLowerUncppm',
       'PlanetaryRadiusEarthradii', 'PlanetaryRadiusUpperUncEarthradii',
       'PlanetaryRadiusLowerUncEarthradii', 'EquilibriumTemperatureK',
       'InsolationFluxEarthflux', 'InsolationFluxUpperUncEarthflux',
       'InsolationFluxLowerUncEarthflux', 'TransitSignaltoNois',
       'TCEPlanetNumbe', 'TCEDeliver',

##### Creating the Target Variable
Before it was discontinued in 2018 the Kepler telescope discovered thousands of planets outside of our solar system. It accomplished this by looking for small dips in the brightness of a star when a planet transitted in front of it.

The column 'ExoplanetCandidate' indicates whether or not an observation from the Kepler telescope is a potential exoplanet.

Let's create a numeric version of 'ExoplanetCandidate' as our target variable.

In [14]:
df['ExoplanetCandidate'] = df['DispositionUsingKeplerData'].apply(lambda x: 1 if x == 'CANDIDATE' else 0)

We can drop columns that won't be of any use to the model.

In [15]:
df.drop(columns=['KOIName', 'KepID', 'ra_str', 'dec_str', 
                 'ExoplanetArchiveDisposition', 'DispositionUsingKeplerData', 'TCEDeliver'], inplace=True)

##### Explore categorical variables

Next we can check for any extreme class imbalances in the categorical features.

In [16]:
df['NotTransitLikeFalsePositiveFlag'].value_counts()

0      7606
1      1137
465       1
Name: NotTransitLikeFalsePositiveFlag, dtype: int64

In [17]:
df.koi_fpflag_ss.value_counts()

0    6622
1    2122
Name: koi_fpflag_ss, dtype: int64

In [18]:
df.CentroidOffsetFalsePositiveFlag.value_counts()

0    6965
1    1779
Name: CentroidOffsetFalsePositiveFlag, dtype: int64

In [19]:
df.EphemerisMatchIndicatesContaminationFalsePositiveFlag.value_counts()

0    7651
1    1093
Name: EphemerisMatchIndicatesContaminationFalsePositiveFlag, dtype: int64

##### Prepare the data for modeling

To increase the validity of our results, we can sample 5% of the data to be used as unseen data; i.e., this data was "not available" at the time of training ML models.

In [20]:
data = df.sample(frac=0.95, random_state=42)
data_unseen = df.drop(data.index)
data.reset_index(inplace=True, drop=True)
data_unseen.reset_index(inplace=True, drop=True)

Separate out the features and target for both the seen and unseen data.

In [21]:
features = data.drop(columns=['ExoplanetCandidate'])
target = data['ExoplanetCandidate']

unseen_features = data_unseen.drop(columns=['ExoplanetCandidate'])
unseen_target = data_unseen['ExoplanetCandidate']

In [22]:
print('Data for Modeling: ' + str(data.shape))
print('Unseen Data for Predictions: ' + str(data_unseen.shape))

Data for Modeling: (8307, 39)
Unseen Data for Predictions: (437, 39)


##### Train and evaluate models with PyCaret

In [23]:
experiment = setup(data = data, target = 'ExoplanetCandidate', session_id=123)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,ExoplanetCandidate
2,Target type,Binary
3,Original data shape,"(8307, 39)"
4,Transformed data shape,"(8307, 39)"
5,Transformed train set shape,"(5814, 39)"
6,Transformed test set shape,"(2493, 39)"
7,Numeric features,38
8,Preprocess,True
9,Imputation type,simple


##### Test models

The PyCaret library automates the process of testing models and provides us with a leaderboard.

In [24]:
best_model = compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
xgboost,Extreme Gradient Boosting,0.9978,0.9997,0.9977,0.998,0.9978,0.9955,0.9955,1.557
catboost,CatBoost Classifier,0.9976,0.9996,0.9973,0.998,0.9977,0.9952,0.9952,12.651
ada,Ada Boost Classifier,0.9974,0.9997,0.9973,0.9977,0.9975,0.9948,0.9948,1.176
lightgbm,Light Gradient Boosting Machine,0.9972,0.9993,0.9973,0.9973,0.9973,0.9945,0.9945,0.575
gbc,Gradient Boosting Classifier,0.9971,0.9991,0.9963,0.998,0.9972,0.9941,0.9942,3.463
et,Extra Trees Classifier,0.9971,0.9993,0.9963,0.998,0.9972,0.9941,0.9942,0.511
dt,Decision Tree Classifier,0.9969,0.9969,0.9967,0.9973,0.997,0.9938,0.9938,0.581
rf,Random Forest Classifier,0.9928,0.9994,0.988,0.998,0.993,0.9855,0.9856,1.627
qda,Quadratic Discriminant Analysis,0.9245,0.9329,0.9757,0.8905,0.9308,0.8482,0.8532,0.084
ridge,Ridge Classifier,0.9197,0.0,0.9698,0.8866,0.9261,0.8386,0.8431,0.206


Processing:   0%|          | 0/69 [00:00<?, ?it/s]

##### Make predictions using the best model

Now that PyCaret has determined the best model, let's test it. We also have the option of choosing any other model from the list.

In [25]:
predict_model(best_model)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Extreme Gradient Boosting,0.9984,0.9997,0.9984,0.9984,0.9984,0.9968,0.9968


Unnamed: 0,NotTransitLikeFalsePositiveFlag,koi_fpflag_ss,CentroidOffsetFalsePositiveFlag,EphemerisMatchIndicatesContaminationFalsePositiveFlag,OrbitalPerioddays,OrbitalPeriodUpperUncdays,OrbitalPeriodLowerUncdays,TransitEpochBKJD,TransitEpochUpperUncBKJD,TransitEpochLowerUncBKJD,...,StellarSurfaceGravitylog10cms2,StellarSurfaceGravityUpperUnclog10cms2,StellarSurfaceGravityLowerUnclog10cms2,StellarRadiusSolarradii,StellarRadiusUpperUncSolarradii,StellarRadiusLowerUncSolarradii,Keplerbandmag,ExoplanetCandidate,Label,Score
5814,0.0,0.0,1.0,0.0,2.452618,0.000022,-0.000022,133.608795,0.007660,-0.007660,...,4.564,0.046,-0.046,0.751,0.156,-0.065,15.223,0,0,0.9990
5815,0.0,0.0,0.0,0.0,30.928898,0.000325,-0.000325,153.906464,0.007820,-0.007820,...,3.237,0.030,-0.030,5.127,0.468,-1.085,13.907,1,1,0.9998
5816,0.0,1.0,0.0,0.0,223.112747,0.000290,-0.000290,140.947128,0.000892,-0.000892,...,4.389,0.105,-0.195,1.056,0.315,-0.145,14.380,0,0,0.9999
5817,1.0,0.0,0.0,0.0,230.385483,0.002330,-0.002330,268.656982,0.006960,-0.006960,...,3.241,0.273,-0.343,4.331,2.578,-1.338,13.264,0,0,0.9980
5818,1.0,0.0,1.0,1.0,0.933770,0.000008,-0.000008,132.421097,0.009240,-0.009240,...,4.448,0.084,-0.196,0.975,0.304,-0.121,15.318,0,0,1.0000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8302,0.0,0.0,0.0,0.0,1.289590,0.000004,-0.000004,131.890564,0.002540,-0.002540,...,4.578,0.034,-0.136,0.808,0.163,-0.070,15.471,1,1,0.9994
8303,0.0,0.0,0.0,0.0,208.047134,0.009334,-0.009334,321.902100,0.022800,-0.022800,...,4.175,0.309,-0.255,1.200,0.456,-0.354,13.934,1,1,0.9998
8304,0.0,0.0,0.0,0.0,67.876663,0.000797,-0.000797,187.370300,0.010500,-0.010500,...,4.485,0.044,-0.302,0.975,0.415,-0.089,14.953,1,1,0.9995
8305,1.0,0.0,0.0,0.0,3.589667,0.000019,-0.000019,134.566208,0.004280,-0.004280,...,3.515,0.368,-0.092,3.837,0.379,-1.514,13.391,0,0,0.9997


##### Save/Load model

In [26]:
#save_model(best_model, model_name='')

In [27]:
#model = load_model('')

In [28]:
#model.predict(df.tail())