# Assignment Objective
To help process this data, you will create machine learning models capable of classifying candidate exoplanets from the raw dataset.

# DATA CLEANUP (PREPROCESS DATA)

## 1. Ensure data only contains rows which have data for an exoplanet. 
- Keep only the rows where the 'koi_pdisposition' column equals 'CANDIDATE'.
## 2. Perform feature selection and remove unnecessary features.
## 3. Use MinMaxScaler to scale the numerical data.
## 4. Separate the data into training and testing data.

In [1]:
# Dependencies
import pandas as pd

In [2]:
# Load Resources/cumulative.csv into a pandas DataFrame.
planets = pd.read_csv(filepath_or_buffer="Resources/cumulative.csv")
planets  # Author's note on the displayed frame: 9564 rows x 50 columns

Unnamed: 0,rowid,kepid,kepoi_name,kepler_name,koi_disposition,koi_pdisposition,koi_score,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,1,10797460,K00752.01,Kepler-227 b,CONFIRMED,CANDIDATE,1.000,0,0,0,...,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,2,10797460,K00752.02,Kepler-227 c,CONFIRMED,CANDIDATE,0.969,0,0,0,...,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
2,3,10811496,K00753.01,,FALSE POSITIVE,FALSE POSITIVE,0.000,0,1,0,...,-176.0,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
3,4,10848459,K00754.01,,FALSE POSITIVE,FALSE POSITIVE,0.000,0,1,0,...,-174.0,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.285210,15.597
4,5,10854555,K00755.01,Kepler-664 b,CONFIRMED,CANDIDATE,1.000,0,0,0,...,-211.0,4.438,0.070,-0.210,1.046,0.334,-0.133,288.75488,48.226200,15.509
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9559,9560,10031643,K07984.01,,FALSE POSITIVE,FALSE POSITIVE,0.000,0,0,0,...,-152.0,4.296,0.231,-0.189,1.088,0.313,-0.228,298.74921,46.973351,14.478
9560,9561,10090151,K07985.01,,FALSE POSITIVE,FALSE POSITIVE,0.000,0,1,1,...,-166.0,4.529,0.035,-0.196,0.903,0.237,-0.079,297.18875,47.093819,14.082
9561,9562,10128825,K07986.01,,CANDIDATE,CANDIDATE,0.497,0,0,0,...,-220.0,4.444,0.056,-0.224,1.031,0.341,-0.114,286.50937,47.163219,14.757
9562,9563,10147276,K07987.01,,FALSE POSITIVE,FALSE POSITIVE,0.021,0,0,1,...,-236.0,4.447,0.056,-0.224,1.041,0.341,-0.114,294.16489,47.176281,15.385


## 1. Ensure data only contains rows which have data for an exoplanet.

In [3]:
# Keep only the rows whose 'koi_pdisposition' value is exactly 'CANDIDATE'.
# An equality test is used rather than str.contains so that the match is
# exact (no regex/substring matching) and rows with a missing value are
# excluded instead of producing a NaN mask.
planets = planets[planets['koi_pdisposition'] == 'CANDIDATE']
planets

Unnamed: 0,rowid,kepid,kepoi_name,kepler_name,koi_disposition,koi_pdisposition,koi_score,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,1,10797460,K00752.01,Kepler-227 b,CONFIRMED,CANDIDATE,1.000,0,0,0,...,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,2,10797460,K00752.02,Kepler-227 c,CONFIRMED,CANDIDATE,0.969,0,0,0,...,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
4,5,10854555,K00755.01,Kepler-664 b,CONFIRMED,CANDIDATE,1.000,0,0,0,...,-211.0,4.438,0.070,-0.210,1.046,0.334,-0.133,288.75488,48.226200,15.509
5,6,10872983,K00756.01,Kepler-228 d,CONFIRMED,CANDIDATE,1.000,0,0,0,...,-232.0,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.224670,15.714
6,7,10872983,K00756.02,Kepler-228 c,CONFIRMED,CANDIDATE,1.000,0,0,0,...,-232.0,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.224670,15.714
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9550,9551,8488381,K07888.01,,CANDIDATE,CANDIDATE,0.532,0,0,0,...,-201.0,4.456,0.056,-0.224,1.011,0.329,-0.110,289.20331,44.505138,13.922
9551,9552,8489260,K07889.01,,CANDIDATE,CANDIDATE,0.386,0,0,0,...,-183.0,4.529,0.036,-0.192,0.903,0.251,-0.084,289.57452,44.519939,15.991
9557,9558,11911561,K03875.01,,CANDIDATE,CANDIDATE,1.000,0,0,0,...,-181.0,4.027,0.434,-0.186,1.514,0.426,-0.640,290.14914,50.239178,13.579
9558,9559,8765560,K03891.01,,CANDIDATE,CANDIDATE,1.000,0,0,0,...,-159.0,3.597,0.968,-0.242,2.780,1.089,-2.022,296.15601,44.920090,13.731


In [7]:
# Discard every row whose 'kepler_name' is missing (NaN); rows with a name are kept.
# NOTE(review): the rows displayed after this step all show
# koi_disposition == 'CONFIRMED' -- confirm the remaining data still contains
# more than one class before training a classifier on it.
planets = planets[planets['kepler_name'].notna()]
planets


Unnamed: 0,rowid,kepid,kepoi_name,kepler_name,koi_disposition,koi_pdisposition,koi_score,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,1,10797460,K00752.01,Kepler-227 b,CONFIRMED,CANDIDATE,1.000,0,0,0,...,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,2,10797460,K00752.02,Kepler-227 c,CONFIRMED,CANDIDATE,0.969,0,0,0,...,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
4,5,10854555,K00755.01,Kepler-664 b,CONFIRMED,CANDIDATE,1.000,0,0,0,...,-211.0,4.438,0.070,-0.210,1.046,0.334,-0.133,288.75488,48.226200,15.509
5,6,10872983,K00756.01,Kepler-228 d,CONFIRMED,CANDIDATE,1.000,0,0,0,...,-232.0,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.224670,15.714
6,7,10872983,K00756.02,Kepler-228 c,CONFIRMED,CANDIDATE,1.000,0,0,0,...,-232.0,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.224670,15.714
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9305,9306,8801316,K02956.01,Kepler-1394 b,CONFIRMED,CANDIDATE,1.000,0,0,0,...,-113.0,3.728,0.218,-0.073,2.741,0.324,-0.757,283.45319,45.084141,11.801
9309,9310,7347246,K03014.01,Kepler-1411 b,CONFIRMED,CANDIDATE,0.941,0,0,0,...,-236.0,4.454,0.054,-0.216,1.053,0.357,-0.119,286.36157,42.963921,15.831
9353,9354,8895758,K03106.01,Kepler-1427 b,CONFIRMED,CANDIDATE,0.877,0,0,0,...,-203.0,4.473,0.054,-0.216,1.000,0.322,-0.107,295.34967,45.114552,15.415
9479,9480,7503885,K03417.01,Kepler-1494 b,CONFIRMED,CANDIDATE,1.000,0,0,0,...,-205.0,4.437,0.072,-0.203,1.008,0.319,-0.137,282.65741,43.162521,15.214
