# Assignment Objective
To help process this data, you will create machine learning models capable of classifying candidate exoplanets from the raw dataset.

# DATA CLEANUP (PREPROCESS DATA)

## 1. Ensure data only contains rows which have data for an exoplanet. (X variables to have data to determine predictions)
## 2. Perform feature selection and remove unnecessary features.
## 3. Separate the data into training and testing data.
## 4. Use MinMaxScaler to scale the numerical data.

In [29]:
# Dependencies
import pandas as pd

In [30]:
# Read in csv file as a pandas dataframe
planets = pd.read_csv("Resources/cumulative.csv")
planets  ## Shape: 9564 rows x 50 columns

Unnamed: 0,rowid,kepid,kepoi_name,kepler_name,koi_disposition,koi_pdisposition,koi_score,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,1,10797460,K00752.01,Kepler-227 b,CONFIRMED,CANDIDATE,1.000,0,0,0,...,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,2,10797460,K00752.02,Kepler-227 c,CONFIRMED,CANDIDATE,0.969,0,0,0,...,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
2,3,10811496,K00753.01,,FALSE POSITIVE,FALSE POSITIVE,0.000,0,1,0,...,-176.0,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
3,4,10848459,K00754.01,,FALSE POSITIVE,FALSE POSITIVE,0.000,0,1,0,...,-174.0,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.285210,15.597
4,5,10854555,K00755.01,Kepler-664 b,CONFIRMED,CANDIDATE,1.000,0,0,0,...,-211.0,4.438,0.070,-0.210,1.046,0.334,-0.133,288.75488,48.226200,15.509
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9559,9560,10031643,K07984.01,,FALSE POSITIVE,FALSE POSITIVE,0.000,0,0,0,...,-152.0,4.296,0.231,-0.189,1.088,0.313,-0.228,298.74921,46.973351,14.478
9560,9561,10090151,K07985.01,,FALSE POSITIVE,FALSE POSITIVE,0.000,0,1,1,...,-166.0,4.529,0.035,-0.196,0.903,0.237,-0.079,297.18875,47.093819,14.082
9561,9562,10128825,K07986.01,,CANDIDATE,CANDIDATE,0.497,0,0,0,...,-220.0,4.444,0.056,-0.224,1.031,0.341,-0.114,286.50937,47.163219,14.757
9562,9563,10147276,K07987.01,,FALSE POSITIVE,FALSE POSITIVE,0.021,0,0,1,...,-236.0,4.447,0.056,-0.224,1.041,0.341,-0.114,294.16489,47.176281,15.385


In [31]:
# Drop all columns with NaN 
## (variables which have NaN will not be used as X variables due to no value)
planets = planets.dropna(axis='columns') # From 50 to 14 columns

# # Drop all rows with NaN
planets = planets.dropna() # Remains at 9564 rows
planets

Unnamed: 0,rowid,kepid,kepoi_name,koi_disposition,koi_pdisposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_time0bk,koi_duration,ra,dec
0,1,10797460,K00752.01,CONFIRMED,CANDIDATE,0,0,0,0,9.488036,170.538750,2.95750,291.93423,48.141651
1,2,10797460,K00752.02,CONFIRMED,CANDIDATE,0,0,0,0,54.418383,162.513840,4.50700,291.93423,48.141651
2,3,10811496,K00753.01,FALSE POSITIVE,FALSE POSITIVE,0,1,0,0,19.899140,175.850252,1.78220,297.00482,48.134129
3,4,10848459,K00754.01,FALSE POSITIVE,FALSE POSITIVE,0,1,0,0,1.736952,170.307565,2.40641,285.53461,48.285210
4,5,10854555,K00755.01,CONFIRMED,CANDIDATE,0,0,0,0,2.525592,171.595550,1.65450,288.75488,48.226200
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9559,9560,10031643,K07984.01,FALSE POSITIVE,FALSE POSITIVE,0,0,0,1,8.589871,132.016100,4.80600,298.74921,46.973351
9560,9561,10090151,K07985.01,FALSE POSITIVE,FALSE POSITIVE,0,1,1,0,0.527699,131.705093,3.22210,297.18875,47.093819
9561,9562,10128825,K07986.01,CANDIDATE,CANDIDATE,0,0,0,0,1.739849,133.001270,3.11400,286.50937,47.163219
9562,9563,10147276,K07987.01,FALSE POSITIVE,FALSE POSITIVE,0,0,1,0,0.681402,132.181750,0.86500,294.16489,47.176281


## 2. Perform feature selection and remove unnecessary features.
Resources on Column variable definitions: https://exoplanetarchive.ipac.caltech.edu/docs/API_kepcandidate_columns.html
### Clean columns

In [32]:
# Drop all columns which are only names or ids 
planets = planets.drop(columns=['rowid', 'kepid','kepoi_name','koi_pdisposition'])
planets

# Determine X Variables: koi_fpflag_nt, koi_fpflag_ss, koi_fpflag_co, koi_fpflag_ec, koi_period, koi_time0bk, koi_duration, ra, dec

# Determine Y variable: koi_disposition

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_time0bk,koi_duration,ra,dec
0,CONFIRMED,0,0,0,0,9.488036,170.538750,2.95750,291.93423,48.141651
1,CONFIRMED,0,0,0,0,54.418383,162.513840,4.50700,291.93423,48.141651
2,FALSE POSITIVE,0,1,0,0,19.899140,175.850252,1.78220,297.00482,48.134129
3,FALSE POSITIVE,0,1,0,0,1.736952,170.307565,2.40641,285.53461,48.285210
4,CONFIRMED,0,0,0,0,2.525592,171.595550,1.65450,288.75488,48.226200
...,...,...,...,...,...,...,...,...,...,...
9559,FALSE POSITIVE,0,0,0,1,8.589871,132.016100,4.80600,298.74921,46.973351
9560,FALSE POSITIVE,0,1,1,0,0.527699,131.705093,3.22210,297.18875,47.093819
9561,CANDIDATE,0,0,0,0,1.739849,133.001270,3.11400,286.50937,47.163219
9562,FALSE POSITIVE,0,0,1,0,0.681402,132.181750,0.86500,294.16489,47.176281


In [33]:
# Determine unique values 
planets['koi_disposition'].unique() 
# y variables: 'CONFIRMED', 'FALSE POSITIVE', 'CANDIDATE'

array(['CONFIRMED', 'FALSE POSITIVE', 'CANDIDATE'], dtype=object)

In [34]:
# planets = planets.dropna(subset=['koi_disposition'])
planets[planets['koi_disposition']=='FALSE POSITIVE']
# Candidate: 2248 rows
# Confirmed: 2293 rows
# False Positive: 5023 rows

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_time0bk,koi_duration,ra,dec
2,FALSE POSITIVE,0,1,0,0,19.899140,175.850252,1.78220,297.00482,48.134129
3,FALSE POSITIVE,0,1,0,0,1.736952,170.307565,2.40641,285.53461,48.285210
8,FALSE POSITIVE,0,1,1,0,7.361790,132.250530,5.02200,298.86435,42.151569
14,FALSE POSITIVE,0,1,0,0,11.521446,170.839688,3.63990,297.07993,47.597401
15,FALSE POSITIVE,0,1,0,0,19.403938,172.484253,12.21550,289.25821,47.635319
...,...,...,...,...,...,...,...,...,...,...
9556,FALSE POSITIVE,0,1,0,0,23.627035,150.036200,11.48100,299.21881,44.181862
9559,FALSE POSITIVE,0,0,0,1,8.589871,132.016100,4.80600,298.74921,46.973351
9560,FALSE POSITIVE,0,1,1,0,0.527699,131.705093,3.22210,297.18875,47.093819
9562,FALSE POSITIVE,0,0,1,0,0.681402,132.181750,0.86500,294.16489,47.176281


## 3. 
## - Save X and y variables. 
## - Encode y to 0,1,2. 
## - Train Test Split data.


In [35]:
# Assign X (data) and y (target)
X = planets.drop("koi_disposition", axis=1)
y = planets["koi_disposition"]

from sklearn.preprocessing import LabelEncoder

# Step 1: Label-encode data set
label_encoder = LabelEncoder()
label_encoder.fit(y)
encoded_y = label_encoder.transform(y)
encoded_y

print(X.shape, y.shape)
print(X.shape, encoded_y.shape)

(9564, 9) (9564,)
(9564, 9) (9564,)


In [36]:
print(y)

0            CONFIRMED
1            CONFIRMED
2       FALSE POSITIVE
3       FALSE POSITIVE
4            CONFIRMED
             ...      
9559    FALSE POSITIVE
9560    FALSE POSITIVE
9561         CANDIDATE
9562    FALSE POSITIVE
9563    FALSE POSITIVE
Name: koi_disposition, Length: 9564, dtype: object


In [37]:
print(encoded_y)

[1 1 2 ... 0 2 2]


In [38]:
for label, original_class in zip(encoded_y, y):
    print('Original Class: ' + str(original_class))
    print('Encoded Label: ' + str(label))
    print('-' * 12)

# 0: CANDIDATE
# 1: CONFIRMED
# 2: FALSE POSITIVE

Original Class: CONFIRMED
Encoded Label: 1
------------
Original Class: CONFIRMED
Encoded Label: 1
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 1
------------
Original Class: CONFIRMED
Encoded Label: 1
------------
Original Class: CONFIRMED
Encoded Label: 1
------------
Original Class: CONFIRMED
Encoded Label: 1
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 1
------------
Original Class: CONFIRMED
Encoded Label: 1
------------
Original Class: CONFIRMED
Encoded Label: 1
------------
Original Class: CONFIRMED
Encoded Label: 1
------------
Original Class: CONFIRMED
Encoded Label: 1
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: FA

In [39]:
# Dependencies (to split data)
from sklearn.model_selection import train_test_split

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, encoded_y, random_state=1)

## 4. Use MinMaxScaler to scale the numerical data.

In [40]:
# Perform MinMaxScaler to X_train and X_test
# Categorical columns do not need to be scaled. i.e. y_train and y_test
from sklearn.preprocessing import MinMaxScaler

# Attain X_scaler using x_train data
X_scaler = MinMaxScaler().fit(X_train)

# Save all variables using X_scaler
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

## Save variables for use in model1 and model2

In [41]:
%store X_train_scaled
%store X_test_scaled
%store y_train
%store y_test

Stored 'X_train_scaled' (ndarray)
Stored 'X_test_scaled' (ndarray)
Stored 'y_train' (ndarray)
Stored 'y_test' (ndarray)


In [42]:
# Print variable
y_train

array([2, 2, 2, ..., 1, 2, 2])