# Assignment Objective
To help process this data, you will create machine learning models capable of classifying candidate exoplanets from the raw dataset.

# DATA CLEANUP (PREPROCESS DATA)

## 1. Ensure data only contains rows which have data for an exoplanet. (X variables to have data to determine predictions)
## 2. Perform feature selection and remove unnecessary features.
## 3. Separate the data into training and testing data.
## 4. Use MinMaxScaler to scale the numerical data.

In [1]:
# Dependencies
import pandas as pd

In [2]:
# Read in csv file as a pandas dataframe
planets = pd.read_csv("Resources/exoplanet.csv")
planets  ## Shape: 9564 rows x 50 columns

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,54.418383,2.479000e-04,-2.479000e-04,162.513840,0.003520,...,-81,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,FALSE POSITIVE,0,1,0,0,19.899140,1.490000e-05,-1.490000e-05,175.850252,0.000581,...,-176,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
2,FALSE POSITIVE,0,1,0,0,1.736952,2.630000e-07,-2.630000e-07,170.307565,0.000115,...,-174,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.285210,15.597
3,CONFIRMED,0,0,0,0,2.525592,3.760000e-06,-3.760000e-06,171.595550,0.001130,...,-211,4.438,0.070,-0.210,1.046,0.334,-0.133,288.75488,48.226200,15.509
4,CONFIRMED,0,0,0,0,4.134435,1.050000e-05,-1.050000e-05,172.979370,0.001900,...,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.224670,15.714
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6986,FALSE POSITIVE,0,0,0,1,8.589871,1.846000e-04,-1.846000e-04,132.016100,0.015700,...,-152,4.296,0.231,-0.189,1.088,0.313,-0.228,298.74921,46.973351,14.478
6987,FALSE POSITIVE,0,1,1,0,0.527699,1.160000e-07,-1.160000e-07,131.705093,0.000170,...,-166,4.529,0.035,-0.196,0.903,0.237,-0.079,297.18875,47.093819,14.082
6988,CANDIDATE,0,0,0,0,1.739849,1.780000e-05,-1.780000e-05,133.001270,0.007690,...,-220,4.444,0.056,-0.224,1.031,0.341,-0.114,286.50937,47.163219,14.757
6989,FALSE POSITIVE,0,0,1,0,0.681402,2.430000e-06,-2.430000e-06,132.181750,0.002850,...,-236,4.447,0.056,-0.224,1.041,0.341,-0.114,294.16489,47.176281,15.385


In [3]:
# Drop all columns with NaN 
## (variables which have NaN will not be used as X variables due to no value)
planets = planets.dropna(axis='columns', how='all') # From 50 to 10 columns

# # Drop all rows with NaN
planets = planets.dropna() # Remains at 9564 rows
planets

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,54.418383,2.479000e-04,-2.479000e-04,162.513840,0.003520,...,-81,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,FALSE POSITIVE,0,1,0,0,19.899140,1.490000e-05,-1.490000e-05,175.850252,0.000581,...,-176,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
2,FALSE POSITIVE,0,1,0,0,1.736952,2.630000e-07,-2.630000e-07,170.307565,0.000115,...,-174,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.285210,15.597
3,CONFIRMED,0,0,0,0,2.525592,3.760000e-06,-3.760000e-06,171.595550,0.001130,...,-211,4.438,0.070,-0.210,1.046,0.334,-0.133,288.75488,48.226200,15.509
4,CONFIRMED,0,0,0,0,4.134435,1.050000e-05,-1.050000e-05,172.979370,0.001900,...,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.224670,15.714
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6986,FALSE POSITIVE,0,0,0,1,8.589871,1.846000e-04,-1.846000e-04,132.016100,0.015700,...,-152,4.296,0.231,-0.189,1.088,0.313,-0.228,298.74921,46.973351,14.478
6987,FALSE POSITIVE,0,1,1,0,0.527699,1.160000e-07,-1.160000e-07,131.705093,0.000170,...,-166,4.529,0.035,-0.196,0.903,0.237,-0.079,297.18875,47.093819,14.082
6988,CANDIDATE,0,0,0,0,1.739849,1.780000e-05,-1.780000e-05,133.001270,0.007690,...,-220,4.444,0.056,-0.224,1.031,0.341,-0.114,286.50937,47.163219,14.757
6989,FALSE POSITIVE,0,0,1,0,0.681402,2.430000e-06,-2.430000e-06,132.181750,0.002850,...,-236,4.447,0.056,-0.224,1.041,0.341,-0.114,294.16489,47.176281,15.385


## 2. Perform feature selection and remove unnecessary features.
Resources on Column variable definitions: https://exoplanetarchive.ipac.caltech.edu/docs/API_kepcandidate_columns.html
### Clean columns

In [4]:
# Drop all columns which are only names or ids 
# planets = planets.drop(columns=['rowid', 'kepid','kepoi_name','koi_pdisposition'])
planets = planets.drop(columns=['koi_period_err2','koi_steff_err2','koi_slogg_err1','koi_slogg_err2','koi_srad_err2'])
planets

# Determine X Variables: koi_fpflag_nt, koi_fpflag_ss, koi_fpflag_co, koi_fpflag_ec, koi_period, koi_time0bk, koi_duration, ra, dec

# Determine Y variable: koi_disposition

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_time0bk,koi_time0bk_err1,koi_time0bk_err2,...,koi_model_snr,koi_tce_plnt_num,koi_steff,koi_steff_err1,koi_slogg,koi_srad,koi_srad_err1,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,54.418383,2.479000e-04,162.513840,0.003520,-0.003520,...,25.8,2,5455,81,4.467,0.927,0.105,291.93423,48.141651,15.347
1,FALSE POSITIVE,0,1,0,0,19.899140,1.490000e-05,175.850252,0.000581,-0.000581,...,76.3,1,5853,158,4.544,0.868,0.233,297.00482,48.134129,15.436
2,FALSE POSITIVE,0,1,0,0,1.736952,2.630000e-07,170.307565,0.000115,-0.000115,...,505.6,1,5805,157,4.564,0.791,0.201,285.53461,48.285210,15.597
3,CONFIRMED,0,0,0,0,2.525592,3.760000e-06,171.595550,0.001130,-0.001130,...,40.9,1,6031,169,4.438,1.046,0.334,288.75488,48.226200,15.509
4,CONFIRMED,0,0,0,0,4.134435,1.050000e-05,172.979370,0.001900,-0.001900,...,40.2,2,6046,189,4.486,0.972,0.315,296.28613,48.224670,15.714
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6986,FALSE POSITIVE,0,0,0,1,8.589871,1.846000e-04,132.016100,0.015700,-0.015700,...,8.4,1,5638,169,4.296,1.088,0.313,298.74921,46.973351,14.478
6987,FALSE POSITIVE,0,1,1,0,0.527699,1.160000e-07,131.705093,0.000170,-0.000170,...,453.3,1,5638,139,4.529,0.903,0.237,297.18875,47.093819,14.082
6988,CANDIDATE,0,0,0,0,1.739849,1.780000e-05,133.001270,0.007690,-0.007690,...,10.6,1,6119,165,4.444,1.031,0.341,286.50937,47.163219,14.757
6989,FALSE POSITIVE,0,0,1,0,0.681402,2.430000e-06,132.181750,0.002850,-0.002850,...,12.3,1,6173,193,4.447,1.041,0.341,294.16489,47.176281,15.385


In [5]:
# Determine unique values 
planets['koi_disposition'].unique() 
# y variables: 'CONFIRMED', 'FALSE POSITIVE', 'CANDIDATE'

array(['CONFIRMED', 'FALSE POSITIVE', 'CANDIDATE'], dtype=object)

In [6]:
# planets = planets.dropna(subset=['koi_disposition'])
planets[planets['koi_disposition']=='FALSE POSITIVE']
# Candidate: 2248 rows
# Confirmed: 2293 rows
# False Positive: 5023 rows

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_time0bk,koi_time0bk_err1,koi_time0bk_err2,...,koi_model_snr,koi_tce_plnt_num,koi_steff,koi_steff_err1,koi_slogg,koi_srad,koi_srad_err1,ra,dec,koi_kepmag
1,FALSE POSITIVE,0,1,0,0,19.899140,1.490000e-05,175.850252,0.000581,-0.000581,...,76.3,1,5853,158,4.544,0.868,0.233,297.00482,48.134129,15.436
2,FALSE POSITIVE,0,1,0,0,1.736952,2.630000e-07,170.307565,0.000115,-0.000115,...,505.6,1,5805,157,4.564,0.791,0.201,285.53461,48.285210,15.597
11,FALSE POSITIVE,0,1,0,0,11.521446,1.980000e-06,170.839688,0.000131,-0.000131,...,622.1,1,5795,155,4.554,0.848,0.224,297.07993,47.597401,15.472
12,FALSE POSITIVE,0,1,0,0,19.221389,1.120000e-06,184.552164,0.000045,-0.000045,...,2317.0,1,6117,182,4.496,0.947,0.308,295.81454,47.690350,15.341
13,FALSE POSITIVE,0,1,0,0,16.469838,1.360000e-05,180.881761,0.000623,-0.000623,...,303.4,1,5152,168,4.517,0.786,0.088,297.15442,47.668701,15.788
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6983,FALSE POSITIVE,0,1,0,0,21.513523,2.714000e-04,132.335600,0.012200,-0.012200,...,171.5,1,5088,128,3.508,3.318,0.665,287.46786,37.966640,10.630
6986,FALSE POSITIVE,0,0,0,1,8.589871,1.846000e-04,132.016100,0.015700,-0.015700,...,8.4,1,5638,169,4.296,1.088,0.313,298.74921,46.973351,14.478
6987,FALSE POSITIVE,0,1,1,0,0.527699,1.160000e-07,131.705093,0.000170,-0.000170,...,453.3,1,5638,139,4.529,0.903,0.237,297.18875,47.093819,14.082
6989,FALSE POSITIVE,0,0,1,0,0.681402,2.430000e-06,132.181750,0.002850,-0.002850,...,12.3,1,6173,193,4.447,1.041,0.341,294.16489,47.176281,15.385


## 3. 
## - Save X and y variables. 
## - Encode y to 0,1,2. 
## - Train Test Split data.


In [7]:
# Assign X (data) and y (target)
X = planets.drop("koi_disposition", axis=1)
y = planets["koi_disposition"]

from sklearn.preprocessing import LabelEncoder

# Step 1: Label-encode data set
label_encoder = LabelEncoder()
label_encoder.fit(y)
encoded_y = label_encoder.transform(y)
encoded_y

print(X.shape, y.shape)
print(X.shape, encoded_y.shape)

(6991, 35) (6991,)
(6991, 35) (6991,)


In [8]:
print(y)

0            CONFIRMED
1       FALSE POSITIVE
2       FALSE POSITIVE
3            CONFIRMED
4            CONFIRMED
             ...      
6986    FALSE POSITIVE
6987    FALSE POSITIVE
6988         CANDIDATE
6989    FALSE POSITIVE
6990    FALSE POSITIVE
Name: koi_disposition, Length: 6991, dtype: object


In [9]:
print(encoded_y)

[1 2 2 ... 0 2 2]


In [10]:
for label, original_class in zip(encoded_y, y):
    print('Original Class: ' + str(original_class))
    print('Encoded Label: ' + str(label))
    print('-' * 12)

# 0: CANDIDATE
# 1: CONFIRMED
# 2: FALSE POSITIVE

Original Class: CONFIRMED
Encoded Label: 1
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 1
------------
Original Class: CONFIRMED
Encoded Label: 1
------------
Original Class: CONFIRMED
Encoded Label: 1
------------
Original Class: CONFIRMED
Encoded Label: 1
------------
Original Class: CONFIRMED
Encoded Label: 1
------------
Original Class: CONFIRMED
Encoded Label: 1
------------
Original Class: CONFIRMED
Encoded Label: 1
------------
Original Class: CONFIRMED
Encoded Label: 1
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 1
------------
Original Class: CONFIRMED
Encoded Label: 1
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: CO

In [11]:
# Dependencies (to split data)
from sklearn.model_selection import train_test_split

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, encoded_y, random_state=42)

## 4. Use MinMaxScaler to scale the numerical data.

In [12]:
# Perform MinMaxScaler to X_train and X_test
# Categorical columns do not need to be scaled. i.e. y_train and y_test
from sklearn.preprocessing import MinMaxScaler

# Attain X_scaler using x_train data
X_scaler = MinMaxScaler().fit(X_train)

# Save all variables using X_scaler
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

## Save variables for use in model1, model2, model3

In [13]:
%store X_train_scaled
%store X_test_scaled
%store y_train
%store y_test

Stored 'X_train_scaled' (ndarray)
Stored 'X_test_scaled' (ndarray)
Stored 'y_train' (ndarray)
Stored 'y_test' (ndarray)


In [14]:
# Print variable
y_train

array([0, 2, 2, ..., 2, 2, 2])