In [1]:
import pandas as pd

In [2]:
from numpy.random import seed

# Fix NumPy's global random state so NumPy-driven randomness repeats between runs.
RANDOM_SEED = 42
seed(RANDOM_SEED)

# Read the CSV and Perform Basic Data Cleaning

In [3]:
# Load the raw table, then clean it in one pipeline:
#   1. discard columns that contain nothing but nulls,
#   2. discard every row that still has at least one null.
df = (
    pd.read_csv("exoplanet_data.csv")
    .dropna(axis='columns', how='all')
    .dropna()
)


In [4]:
# Summarise the cleaned frame: row count, column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6991 entries, 0 to 6990
Data columns (total 41 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   koi_disposition    6991 non-null   object 
 1   koi_fpflag_nt      6991 non-null   int64  
 2   koi_fpflag_ss      6991 non-null   int64  
 3   koi_fpflag_co      6991 non-null   int64  
 4   koi_fpflag_ec      6991 non-null   int64  
 5   koi_period         6991 non-null   float64
 6   koi_period_err1    6991 non-null   float64
 7   koi_period_err2    6991 non-null   float64
 8   koi_time0bk        6991 non-null   float64
 9   koi_time0bk_err1   6991 non-null   float64
 10  koi_time0bk_err2   6991 non-null   float64
 11  koi_impact         6991 non-null   float64
 12  koi_impact_err1    6991 non-null   float64
 13  koi_impact_err2    6991 non-null   float64
 14  koi_duration       6991 non-null   float64
 15  koi_duration_err1  6991 non-null   float64
 16  koi_duration_err2  6991 

# Select your features (columns)

In [5]:
# Columns used as model inputs (the X values): the four false-positive flags
# followed by the koi_*/ra/dec measurement columns listed below.
feature_columns = [
    'koi_fpflag_nt', 'koi_fpflag_ss', 'koi_fpflag_co', 'koi_fpflag_ec',
    'koi_period', 'koi_time0bk', 'koi_impact', 'koi_duration',
    'koi_depth', 'koi_prad', 'koi_teq', 'koi_insol',
    'koi_model_snr', 'koi_steff', 'koi_slogg', 'koi_srad',
    'ra', 'dec', 'koi_kepmag',
]
selected_features = df[feature_columns]

In [6]:
# Confirm the feature frame holds only the selected columns and that none contain nulls.
selected_features.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6991 entries, 0 to 6990
Data columns (total 19 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   koi_fpflag_nt  6991 non-null   int64  
 1   koi_fpflag_ss  6991 non-null   int64  
 2   koi_fpflag_co  6991 non-null   int64  
 3   koi_fpflag_ec  6991 non-null   int64  
 4   koi_period     6991 non-null   float64
 5   koi_time0bk    6991 non-null   float64
 6   koi_impact     6991 non-null   float64
 7   koi_duration   6991 non-null   float64
 8   koi_depth      6991 non-null   float64
 9   koi_prad       6991 non-null   float64
 10  koi_teq        6991 non-null   int64  
 11  koi_insol      6991 non-null   float64
 12  koi_model_snr  6991 non-null   float64
 13  koi_steff      6991 non-null   int64  
 14  koi_slogg      6991 non-null   float64
 15  koi_srad       6991 non-null   float64
 16  ra             6991 non-null   float64
 17  dec            6991 non-null   float64
 18  koi_kepm

# Create a Train Test Split

Use `koi_disposition` for the y values

In [7]:
# Labels (the y values): the disposition string recorded for each row.
target = df.loc[:, 'koi_disposition']
target.head()

0         CONFIRMED
1    FALSE POSITIVE
2    FALSE POSITIVE
3         CONFIRMED
4         CONFIRMED
Name: koi_disposition, dtype: object

In [8]:
from sklearn.model_selection import train_test_split

# Split features and labels into train/test partitions; the fixed random_state
# makes the partition identical on every run.
X_train, X_test, y_train, y_test = train_test_split(
    selected_features,
    target,
    random_state=42,
)

In [9]:
# Peek at the first training rows (the index shows the rows were shuffled by the split).
X_train.head()

Unnamed: 0,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_time0bk,koi_impact,koi_duration,koi_depth,koi_prad,koi_teq,koi_insol,koi_model_snr,koi_steff,koi_slogg,koi_srad,ra,dec,koi_kepmag
6122,0,0,0,0,6.768901,133.07724,0.15,3.616,123.1,1.24,1017,253.3,10.8,5737,4.327,1.125,294.40472,39.351681,14.725
6370,0,1,0,1,0.733726,132.02005,0.291,2.309,114.6,0.86,1867,2891.64,13.8,5855,4.578,0.797,284.50391,42.46386,15.77
2879,1,0,0,0,7.652707,134.46038,0.97,79.8969,641.1,3.21,989,226.81,254.3,6328,4.481,0.963,295.50211,38.98354,13.099
107,0,0,0,0,7.953547,174.66224,0.3,2.6312,875.4,2.25,696,55.37,38.4,4768,4.536,0.779,291.15878,40.750271,15.66
29,0,0,0,0,4.959319,172.258529,0.831,2.22739,9802.0,12.21,1103,349.4,696.5,5712,4.359,1.082,292.16705,48.727589,15.263


# Pre-processing

Scale the features using the MinMaxScaler (fitted on the training split only), then label-encode and one-hot encode the target

In [10]:
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

# Fit the scaler on the training features only, then apply that same fitted
# transformation to both splits so the test data never influences the scaling.
X_scaler = MinMaxScaler()
X_scaler.fit(X_train)

X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [11]:
# One-hot encoding, step 1: map the string labels to integer codes.
from tensorflow.keras.utils import to_categorical

# The encoder learns its label set from the training labels; the test labels
# are then converted with that same fitted mapping.
label_encoder = LabelEncoder()
encoded_y_train = label_encoder.fit_transform(y_train)
encoded_y_test = label_encoder.transform(y_test)


In [12]:
# Step 2: convert the integer-encoded labels to one-hot vectors.
# The width is pinned to the number of classes the encoder learned. Without
# num_classes, to_categorical infers the width separately from each array
# (largest label + 1), so a split that happens to lack the highest-indexed
# class would come out narrower than the model's output layer.
n_classes = len(label_encoder.classes_)
y_train_categorical = to_categorical(encoded_y_train, num_classes=n_classes)
y_test_categorical = to_categorical(encoded_y_test, num_classes=n_classes)

In [13]:
# Show the labels the encoder learned, in the order it stores them.
print(label_encoder.classes_)

['CANDIDATE' 'CONFIRMED' 'FALSE POSITIVE']


# Train the Model



In [14]:
import tensorflow as tf
from tensorflow.keras.models import Sequential

# The NumPy seed set at the top of the notebook does not reach TensorFlow's
# own random generator, which Keras uses to initialise layer weights.
# Seed it as well so repeated runs start from the same weights.
# NOTE(review): some GPU kernels may still be non-deterministic.
tf.random.set_seed(42)

model = Sequential()

In [15]:
from tensorflow.keras.layers import Dense

# Derive the layer sizes from the prepared arrays instead of hard-coding them,
# so the network stays in sync if the feature list or label set changes.
n_inputs = X_train_scaled.shape[1]        # one input per scaled feature column
n_outputs = y_train_categorical.shape[1]  # one output per one-hot class column

# One hidden ReLU layer, then a softmax layer producing per-class probabilities.
# Note: re-running this cell without recreating `model` appends the layers again.
model.add(Dense(units=10, activation='relu', input_dim=n_inputs))
model.add(Dense(units=n_outputs, activation='softmax'))

In [16]:
# Print the layer-by-layer architecture with output shapes and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 10)                200       
_________________________________________________________________
dense_1 (Dense)              (None, 3)                 33        
Total params: 233
Trainable params: 233
Non-trainable params: 0
_________________________________________________________________


In [17]:
# Configure training: categorical cross-entropy to match the one-hot labels,
# the Adam optimizer, and accuracy reported alongside the loss.
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [18]:
# Train on the scaled training features against the one-hot labels for
# 100 epochs, reshuffling each epoch and logging one line per epoch.
model.fit(
    x=X_train_scaled,
    y=y_train_categorical,
    epochs=100,
    shuffle=True,
    verbose=2,
)

Epoch 1/100
164/164 - 0s - loss: 0.9313 - accuracy: 0.4455
Epoch 2/100
164/164 - 0s - loss: 0.6898 - accuracy: 0.7032
Epoch 3/100
164/164 - 0s - loss: 0.5466 - accuracy: 0.7685
Epoch 4/100
164/164 - 0s - loss: 0.4675 - accuracy: 0.7801
Epoch 5/100
164/164 - 0s - loss: 0.4287 - accuracy: 0.7870
Epoch 6/100
164/164 - 0s - loss: 0.4089 - accuracy: 0.7906
Epoch 7/100
164/164 - 0s - loss: 0.3972 - accuracy: 0.7940
Epoch 8/100
164/164 - 0s - loss: 0.3905 - accuracy: 0.7919
Epoch 9/100
164/164 - 0s - loss: 0.3858 - accuracy: 0.7938
Epoch 10/100
164/164 - 0s - loss: 0.3827 - accuracy: 0.7927
Epoch 11/100
164/164 - 0s - loss: 0.3800 - accuracy: 0.7955
Epoch 12/100
164/164 - 0s - loss: 0.3772 - accuracy: 0.7982
Epoch 13/100
164/164 - 0s - loss: 0.3752 - accuracy: 0.7990
Epoch 14/100
164/164 - 0s - loss: 0.3742 - accuracy: 0.7948
Epoch 15/100
164/164 - 0s - loss: 0.3721 - accuracy: 0.7980
Epoch 16/100
164/164 - 0s - loss: 0.3707 - accuracy: 0.7982
Epoch 17/100
164/164 - 0s - loss: 0.3694 - accura

<tensorflow.python.keras.callbacks.History at 0x188ef8c6d00>

In [20]:
# Score the trained model on the held-out test split; with the compile settings
# used here, evaluate() returns the loss followed by the accuracy.
test_metrics = model.evaluate(X_test_scaled, y_test_categorical, verbose=2)
model_loss, model_accuracy = test_metrics
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

55/55 - 0s - loss: 0.3721 - accuracy: 0.8072
Loss: 0.3721422553062439, Accuracy: 0.807208240032196
