#  Exoplanet Transit Classification (Kepler Mission)
## Data Wrangling Part I
### I. Extracting Kepler Cumulative Table

In [64]:

#standard data wrangling packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import requests

import os
import os.path



In [65]:
caltech_KOI_url = 'https://exoplanetarchive.ipac.caltech.edu/cgi-bin/nstedAPI/nph-nstedAPI?table=cumulative'
KOI_data = pd.read_csv(caltech_KOI_url)

In [66]:
KOI_data.columns

Index(['kepid', 'kepoi_name', 'kepler_name', 'koi_disposition',
       'koi_pdisposition', 'koi_score', 'koi_fpflag_nt', 'koi_fpflag_ss',
       'koi_fpflag_co', 'koi_fpflag_ec', 'koi_period', 'koi_period_err1',
       'koi_period_err2', 'koi_time0bk', 'koi_time0bk_err1',
       'koi_time0bk_err2', 'koi_impact', 'koi_impact_err1', 'koi_impact_err2',
       'koi_duration', 'koi_duration_err1', 'koi_duration_err2', 'koi_depth',
       'koi_depth_err1', 'koi_depth_err2', 'koi_prad', 'koi_prad_err1',
       'koi_prad_err2', 'koi_teq', 'koi_teq_err1', 'koi_teq_err2', 'koi_insol',
       'koi_insol_err1', 'koi_insol_err2', 'koi_model_snr', 'koi_tce_plnt_num',
       'koi_tce_delivname', 'koi_steff', 'koi_steff_err1', 'koi_steff_err2',
       'koi_slogg', 'koi_slogg_err1', 'koi_slogg_err2', 'koi_srad',
       'koi_srad_err1', 'koi_srad_err2', 'ra_str', 'dec_str', 'koi_kepmag',
       'koi_kepmag_err'],
      dtype='object')

#### These columns index Kepler objects of interests with transit crossing events TCEs identified in the light curves observed on a given star. Data column definitions can be found at https://exoplanetarchive.ipac.caltech.edu/docs/API_kepcandidate_columns.html. Many of the planetary and orbital parameters are quantities derived from model fits to the light curve data. The stellar parameters are derived from ground based spectroscopy.

In [67]:
KOI_data.shape

(9564, 50)

In [68]:
KOI_data.head()

Unnamed: 0,kepid,kepoi_name,kepler_name,koi_disposition,koi_pdisposition,koi_score,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,...,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra_str,dec_str,koi_kepmag,koi_kepmag_err
0,10797460,K00752.01,Kepler-227 b,CONFIRMED,CANDIDATE,1.0,0,0,0,0,...,4.467,0.064,-0.096,0.927,0.105,-0.061,19h27m44.22s,+48d08m29.9s,15.347,
1,10797460,K00752.02,Kepler-227 c,CONFIRMED,CANDIDATE,0.969,0,0,0,0,...,4.467,0.064,-0.096,0.927,0.105,-0.061,19h27m44.22s,+48d08m29.9s,15.347,
2,10811496,K00753.01,,CANDIDATE,CANDIDATE,0.0,0,0,0,0,...,4.544,0.044,-0.176,0.868,0.233,-0.078,19h48m01.16s,+48d08m02.9s,15.436,
3,10848459,K00754.01,,FALSE POSITIVE,FALSE POSITIVE,0.0,0,1,0,0,...,4.564,0.053,-0.168,0.791,0.201,-0.067,19h02m08.31s,+48d17m06.8s,15.597,
4,10854555,K00755.01,Kepler-664 b,CONFIRMED,CANDIDATE,1.0,0,0,0,0,...,4.438,0.07,-0.21,1.046,0.334,-0.133,19h15m01.17s,+48d13m34.3s,15.509,


In [69]:
KOI_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9564 entries, 0 to 9563
Data columns (total 50 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   kepid              9564 non-null   int64  
 1   kepoi_name         9564 non-null   object 
 2   kepler_name        2360 non-null   object 
 3   koi_disposition    9564 non-null   object 
 4   koi_pdisposition   9564 non-null   object 
 5   koi_score          8054 non-null   float64
 6   koi_fpflag_nt      9564 non-null   int64  
 7   koi_fpflag_ss      9564 non-null   int64  
 8   koi_fpflag_co      9564 non-null   int64  
 9   koi_fpflag_ec      9564 non-null   int64  
 10  koi_period         9564 non-null   float64
 11  koi_period_err1    9110 non-null   float64
 12  koi_period_err2    9110 non-null   float64
 13  koi_time0bk        9564 non-null   float64
 14  koi_time0bk_err1   9110 non-null   float64
 15  koi_time0bk_err2   9110 non-null   float64
 16  koi_impact         9201 

### Label Identification: Defining the classes

In [70]:
KeplerCPFPcount = {'Kepler Confirmed Planet': (KOI_data['koi_disposition'] == 'CONFIRMED').sum(), 'Kepler False Positives': (KOI_data['koi_disposition'] == 'FALSE POSITIVE').sum()}
print(KeplerCPFPcount)

{'Kepler Confirmed Planet': 2358, 'Kepler False Positives': 4840}


#### Subsetting on confirmed planets and false positives, we see that we have a healthy number in each class for doing binary classification. The Kepler cumulative table is nice enough to have flags indicating what kind of false positive the transit is believed to be.

#### koi_fpflag_nt is a non-transiting oscillatory phenomenon in the light curve. These can come from variable pulsing stars. This also encompasses instrument noise and the like.

#### koi_fpflag_ss flags secondary signal phenomena. Thus we have a signal with a primary and secondary dip in a given period. These are likely due to eclipsing binaries that orbit the star of interest and create periodic dips in the light curve amplitude that can mimic a transiting planet. 

#### koi_fpflag_co is due to a star near to the star of interest and in the field of view with its motion creating a periodic modulation of the light curve. This results in a fluctuation of the centroid of light on the detecting aperture (centroid offset).

### Note: Centroid offsets and secondary signal flags and are not mutually exlusive.



In [71]:
{'Secondary Signal': (KOI_data['koi_fpflag_ss'] == 1).sum(), 'Non Transiting': (KOI_data['koi_fpflag_nt'] == 1).sum(), 'Centroid Offset': (KOI_data['koi_fpflag_co'] == 1).sum()}

{'Secondary Signal': 2226, 'Non Transiting': 1530, 'Centroid Offset': 1889}

Overlap of secondary signal false positives with centroid offset:

In [72]:
((KOI_data['koi_fpflag_ss'] == 1) & (KOI_data['koi_fpflag_co'] == 1)).sum()

683

An interesting point is that there is a small overlap between secondary signal false positives and a transit that has actually been confirmed to be a planet. It turns out that there are a very small fraction of seemingly eclipsing binary star partners that are in fact 'Hot Jupiters'. These are planets that are really large and close to the star thus sending out a high degree of reflected light from the star. This can result in weak secondary transit phenomena. 

In [73]:
((KOI_data['koi_fpflag_ss'] == 1) & (KOI_data['koi_disposition'] == 'CONFIRMED')).sum()

14

#### Let's also take a look at the overlap between non-transiting flags and confirmed planets / secondary transiting phenomena.

In [74]:
((KOI_data['koi_fpflag_nt'] == 1) & (KOI_data['koi_disposition'] == 'CONFIRMED')).sum()

2

In [75]:
((KOI_data['koi_fpflag_nt'] == 1) & (KOI_data['koi_fpflag_ss'] == 1)).sum()

16

This is certainly interesting and probably indicates that members of each class live at/near a classification boundary. Maybe these will be reflected in some of the catalog features and time-series extracted features which we can take a look at when doing EDA.

#### The following has confirmed planets, false positive secondaries and false positive non transiting. We're going to do a three-way classification. We have excluded centroid offset because these are usually determined by looking at pixel imaging of the telescope and not solely from image-averaged light curves.

In [76]:
KOI_intermediate =  KOI_data.loc[(KOI_data['koi_disposition'] == 'CONFIRMED') | ( (KOI_data['koi_disposition'] == 'FALSE POSITIVE') & (KOI_data['koi_fpflag_ss'] == 1) ) | ( (KOI_data['koi_disposition'] == 'FALSE POSITIVE') & (KOI_data['koi_fpflag_nt'] == 1) )]
KOI_intermediate.head()

Unnamed: 0,kepid,kepoi_name,kepler_name,koi_disposition,koi_pdisposition,koi_score,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,...,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra_str,dec_str,koi_kepmag,koi_kepmag_err
0,10797460,K00752.01,Kepler-227 b,CONFIRMED,CANDIDATE,1.0,0,0,0,0,...,4.467,0.064,-0.096,0.927,0.105,-0.061,19h27m44.22s,+48d08m29.9s,15.347,
1,10797460,K00752.02,Kepler-227 c,CONFIRMED,CANDIDATE,0.969,0,0,0,0,...,4.467,0.064,-0.096,0.927,0.105,-0.061,19h27m44.22s,+48d08m29.9s,15.347,
3,10848459,K00754.01,,FALSE POSITIVE,FALSE POSITIVE,0.0,0,1,0,0,...,4.564,0.053,-0.168,0.791,0.201,-0.067,19h02m08.31s,+48d17m06.8s,15.597,
4,10854555,K00755.01,Kepler-664 b,CONFIRMED,CANDIDATE,1.0,0,0,0,0,...,4.438,0.07,-0.21,1.046,0.334,-0.133,19h15m01.17s,+48d13m34.3s,15.509,
5,10872983,K00756.01,Kepler-228 d,CONFIRMED,CANDIDATE,1.0,0,0,0,0,...,4.486,0.054,-0.229,0.972,0.315,-0.105,19h45m08.67s,+48d13m28.8s,15.714,


#### Drop intersection of non-transiting and secondary signal false positives and intersection of confirmed planets and nt flags.

In [77]:
ss_nt_intersection = KOI_intermediate[(KOI_intermediate['koi_fpflag_nt'] == 1) & (KOI_intermediate['koi_fpflag_ss'] == 1)]
ss_nt_intersection.head()

Unnamed: 0,kepid,kepoi_name,kepler_name,koi_disposition,koi_pdisposition,koi_score,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,...,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra_str,dec_str,koi_kepmag,koi_kepmag_err
3751,7515670,K02360.02,,FALSE POSITIVE,FALSE POSITIVE,,1,1,0,0,...,,,,,,,19h11m12.98s,+43d10m08.3s,15.206,
3864,9597729,K04417.01,,FALSE POSITIVE,FALSE POSITIVE,0.0,1,1,1,0,...,4.508,0.039,-0.208,0.929,0.278,-0.093,19h42m11.41s,+46d17m15.7s,14.84,
4545,7889486,K03300.01,,FALSE POSITIVE,FALSE POSITIVE,0.0,1,1,1,1,...,4.527,0.036,-0.204,0.904,0.273,-0.073,19h25m07.91s,+43d36m29.7s,15.752,
4633,8330548,K01132.02,,FALSE POSITIVE,FALSE POSITIVE,,1,1,0,0,...,,,,,,,20h04m04.01s,+44d14m01.8s,14.446,
4780,5816811,K01042.02,,FALSE POSITIVE,FALSE POSITIVE,,1,1,0,0,...,,,,,,,19h53m21.34s,+41d04m24.5s,14.85,


In [78]:
cp_nt_intersection = KOI_intermediate[(KOI_intermediate['koi_disposition'] == 'CONFIRMED') & (KOI_intermediate['koi_fpflag_nt'] == 1)]

In [79]:
KOI_intermediate = KOI_intermediate.drop(index = ss_nt_intersection.index).drop(index = cp_nt_intersection.index)

#### All classes mutually distinct: now construct target labels:
#### 1 = Confirmed Planet, 2 = Secondary Signal False Positive, 3 = Non transiting phenomena

In [80]:
# label encode the three classes

KOI_intermediate['target_label'] = (KOI_intermediate['koi_disposition'] == 'CONFIRMED').astype('int') + 2*KOI_intermediate['koi_fpflag_ss'] + 3*KOI_intermediate['koi_fpflag_nt'] 


In [81]:
KOI_intermediate['target_label'].value_counts()

1       2341
2       2194
3       1525
1396       1
Name: target_label, dtype: int64

There's a wierd value in there, let's check it out. It seems to be just one observation. Looks like an error.

In [82]:
KOI_intermediate[KOI_intermediate['target_label'] == 1396]

Unnamed: 0,kepid,kepoi_name,kepler_name,koi_disposition,koi_pdisposition,koi_score,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,...,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra_str,dec_str,koi_kepmag,koi_kepmag_err,target_label
3008,10934674,K00477.01,Kepler-567 b,CONFIRMED,CANDIDATE,1.0,465,0,0,0,...,0.048,-0.104,0.867,0.134,-0.077,19h50m41.43s,+48d18m08.3s,14.687,,1396


In [83]:
KOI_intermediate.loc[KOI_intermediate['target_label'] == 1396, 'target_label'] = 1

In [84]:
KOI_intermediate.loc[KOI_intermediate['koi_fpflag_nt'] == 465, 'koi_fpflag_nt'] = 0

In [85]:
KOI_intermediate['target_label'].value_counts()

1    2342
2    2194
3    1525
Name: target_label, dtype: int64

### Taking subset of dataframe and data cleanup

In [86]:
KOI_intermediate.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6061 entries, 0 to 9559
Data columns (total 51 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   kepid              6061 non-null   int64  
 1   kepoi_name         6061 non-null   object 
 2   kepler_name        2357 non-null   object 
 3   koi_disposition    6061 non-null   object 
 4   koi_pdisposition   6061 non-null   object 
 5   koi_score          5337 non-null   float64
 6   koi_fpflag_nt      6061 non-null   int64  
 7   koi_fpflag_ss      6061 non-null   int64  
 8   koi_fpflag_co      6061 non-null   int64  
 9   koi_fpflag_ec      6061 non-null   int64  
 10  koi_period         6061 non-null   float64
 11  koi_period_err1    5813 non-null   float64
 12  koi_period_err2    5813 non-null   float64
 13  koi_time0bk        6061 non-null   float64
 14  koi_time0bk_err1   5813 non-null   float64
 15  koi_time0bk_err2   5813 non-null   float64
 16  koi_impact         5865 

#### The kepid and koi_tce_plnt_num are the Kepler Input Catalog ID (corresponding to a given star) and TCE number used to index light curves in the Kepler data validation pipeline. We'll use these indices to extract features from the light curve time series via a custom built class (KOI_class). Thus these are our indices.

#### Feature that is extracted from Kepler's data validation automated statistical testing is the koi_period. Keep this. Also keep koi_depth and koi_duration which are the depth and duration of the primary transit event. Other columns have to do with stellar parameters (e.g. koi_steff, stellar coordinates, etc) not gained directly from light curves as opposed to spectroscopy as well as parameters fit assuming the TCE was a planet. Let's drop all of those.

In [87]:
cols_to_keep = ['kepid', 'koi_tce_plnt_num', 'koi_period', 'koi_depth', 'koi_duration', 'target_label']
KOI_cumulative = KOI_intermediate[cols_to_keep]
KOI_cumulative.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6061 entries, 0 to 9559
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   kepid             6061 non-null   int64  
 1   koi_tce_plnt_num  5882 non-null   float64
 2   koi_period        6061 non-null   float64
 3   koi_depth         5865 non-null   float64
 4   koi_duration      6061 non-null   float64
 5   target_label      6061 non-null   int64  
dtypes: float64(4), int64(2)
memory usage: 331.5 KB


Take care of any remaining NaNs

In [88]:
KOI_cumulative = KOI_cumulative.dropna(how = 'any')
KOI_cumulative.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5707 entries, 0 to 9559
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   kepid             5707 non-null   int64  
 1   koi_tce_plnt_num  5707 non-null   float64
 2   koi_period        5707 non-null   float64
 3   koi_depth         5707 non-null   float64
 4   koi_duration      5707 non-null   float64
 5   target_label      5707 non-null   int64  
dtypes: float64(4), int64(2)
memory usage: 312.1 KB


In [89]:
KOI_cumulative.head()

Unnamed: 0,kepid,koi_tce_plnt_num,koi_period,koi_depth,koi_duration,target_label
0,10797460,1.0,9.488036,615.8,2.9575,1
1,10797460,2.0,54.418383,874.8,4.507,1
3,10848459,1.0,1.736952,8079.2,2.40641,2
4,10854555,1.0,2.525592,603.3,1.6545,1
5,10872983,1.0,11.094321,1517.5,4.5945,1


#### Finish up cleaning on KOI_cumulatve: data types, relabel columns

Still good amount in each class after clean up

In [90]:
KOI_cumulative['target_label'].value_counts()

1    2335
2    2173
3    1199
Name: target_label, dtype: int64

The values for koi_tce_plnt_num are clearly integers...but are interpreted as floats. change dtype to int

In [91]:
KOI_cumulative['koi_tce_plnt_num'].unique()

array([1., 2., 3., 4., 5., 6., 7., 8.])

In [92]:
KOI_cumulative['koi_tce_plnt_num'] = KOI_cumulative['koi_tce_plnt_num'].astype('int')
print(KOI_cumulative['koi_tce_plnt_num'].dtype)
print(KOI_cumulative['koi_tce_plnt_num'].unique())

int32
[1 2 3 4 5 6 7 8]


Rename columns

In [93]:
KOI_cumulative.columns

Index(['kepid', 'koi_tce_plnt_num', 'koi_period', 'koi_depth', 'koi_duration',
       'target_label'],
      dtype='object')

In [94]:
KOI_cumulative = KOI_cumulative.rename(columns={'kepid': 'KIC_ID', 'koi_tce_plnt_num': 'TCE_num', 'koi_period': 'Period', 'koi_depth': 'Depth', 'koi_duration': 'Duration'})

### II. Light Curve Time Series Download

In [95]:
KOI_cumulative = KOI_cumulative.set_index(['KIC_ID', 'TCE_num']).sort_index()
KOI_cumulative.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Period,Depth,Duration,target_label
KIC_ID,TCE_num,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
757450,1,8.884923,16053.4,2.07004,1
1026032,1,8.460439,76412.9,4.73492,2
1161949,1,473.95103,330.6,11.778,3
1293031,1,0.539366,145.0,5.434,3
1297263,1,153.521361,255.2,3.893,3


In [96]:
kic_tcenum_index = KOI_cumulative.index # generates 2-tuple Multindex

In [37]:
from KOIclass import KOIObject # loads custom library to take care of API requests, extracting the data, download, and logging

### WARNING! This next command can take some time as there's about 26 GB of data to be downloaded to the local file system.

#### The download is necessary as the main speed bottleneck in transforming the data and feature creation would be in actually loading it remotely via API requests. 

#### Some KIC ID/TCEs that exist in the kepler input catalog don't have data in the data validation pipeline. These are, when they do happen, almost always non-transiting phenomena. We don't save a file for these offending TCEs and instead log the KIC and TCE in the download.log file.

In [42]:
kic_tcenum_index.map(lambda x: KOIObject(x[0], x[1]).download_data() )

Index([None, None, None, None, None, None, None, None, None, None,
       ...
       None, None, None, None, None, None, None, None, None, None],
      dtype='object', length=5707)

### Remove all KICs with empty data in data validation from Kepler cumulative table.

In [110]:
logfile_path = "..\\data\\external\\DVSeries\\download.log"
emptydvdata_list = []
with open(logfile_path) as f:
    for x in f.readlines():
        kicstring, tcenumstring = x.strip().split(":")[2].split('_')
        emptydvdata_list.append((int(kicstring), int(tcenumstring)))
print(emptydvdata_list[0:5])

[(1161949, 1), (1297263, 1), (1433531, 1), (2163226, 1), (2305311, 1)]


#### There are 365 entries in KOI_cumulative that have no data in the data validation pipeline.

In [116]:
KOI_cumulative.loc[emptydvdata_list].head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Period,Depth,Duration,target_label
KIC_ID,TCE_num,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1161949,1,473.95103,330.6,11.778,3
1297263,1,153.521361,255.2,3.893,3
1433531,1,567.71329,259.4,12.85,3
2163226,1,452.10093,464.0,12.69,3
2305311,1,1.404674,155.5,5.056,3


#### Mostly non-transiting, but there are some secondary eclipses / confirmed planets missing in the data validation pipeline

In [118]:
KOI_cumulative.loc[emptydvdata_list]['target_label'].value_counts()

3    329
2     34
1      2
Name: target_label, dtype: int64

#### Drop all of these from the cumulative table then save to file.

In [120]:
KOI_cumulative = KOI_cumulative.drop(index=emptydvdata_list)

#### There are quite a bit fewer non transiting phenomena in our set than the other two classes...but it may be alright.

In [122]:
KOI_cumulative['target_label'].value_counts()

1    2333
2    2140
3     880
Name: target_label, dtype: int64

In [121]:
KOI_cumulative.to_csv('..\\data\\external\\KOI_cumulative.csv', sep=",")