### Import some dependencies

In [None]:
# Fix NumPy's global random state so NumPy-driven randomness repeats on every run.
# NOTE(review): only NumPy is seeded in this cell; TensorFlow/Keras is not seeded here,
# so network training may still vary between runs -- confirm whether that is intended.
import numpy as np
from numpy.random import seed

np.random.seed(1)
# Peek at the first three draws produced under this seed
np.random.rand(3)

In [None]:
#%matplotlib inline
#import matplotlib.pyplot as plt
#from sklearn.neighbors import KNeighborsClassifier
#why the above?
import pandas as pd
#import numpy as np
import os

In [None]:
import tensorflow
# Display the version string reported by the Keras API bundled with TensorFlow
tensorflow.keras.__version__

### Read in and clean up data

In [None]:
# Lift pandas' column display limit so every column is rendered
pd.set_option('display.max_columns', None)
# Load the cumulative table from the Resources folder next to the notebook
exoplanets = pd.read_csv(filepath_or_buffer=os.path.join('Resources', 'cumulative.csv'))
exoplanets

In [None]:
# List the distinct values of koi_pdisposition to check whether any rows hold something
# other than FALSE POSITIVE or CANDIDATE (koi_disposition has additional categories).
# This cell only inspects the values; it does not remove any rows.
exoplanets.koi_pdisposition.unique()
# Author's observation: no other values were found in the current file

In [None]:
# One-hot encode koi_pdisposition into indicator columns prefixed "koi_pdisposition".
# (koi_disposition is encoded in the next cell; per the author it has more categories.)
exoplanets_pdisp_cat = pd.get_dummies(
    data=exoplanets,
    columns=['koi_pdisposition'],
    prefix=['koi_pdisposition'],
)

In [None]:
# One-hot encode koi_disposition, then discard the FALSE POSITIVE indicator of
# koi_pdisposition so only the koi_pdisposition_CANDIDATE indicator remains for that variable.
exoplanets_disp_cat = pd.get_dummies(
    data=exoplanets_pdisp_cat,
    columns=['koi_disposition'],
    prefix=['koi_disposition'],
).drop(columns='koi_pdisposition_FALSE POSITIVE')

exoplanets_disp_cat

In [None]:
# Build the modelling table: drop the measurement-error columns (although these could be
# useful in the real world), extra IDs, the KOI score, and extra evaluation columns
# (including the koi_disposition indicator columns), then give the remaining columns
# readable names.
exoplanets_basic = (
    exoplanets_disp_cat
    .drop(columns=[
        "rowid", "kepoi_name", "kepler_name", "koi_score",
        "koi_period_err1", "koi_period_err2",
        "koi_time0bk_err1", "koi_time0bk_err2",
        "koi_impact_err1", "koi_impact_err2",
        "koi_duration_err1", "koi_duration_err2",
        "koi_depth_err1", "koi_depth_err2",
        "koi_prad_err1", "koi_prad_err2",
        "koi_teq_err1", "koi_teq_err2",
        "koi_insol_err1", "koi_insol_err2",
        "koi_tce_plnt_num", "koi_tce_delivname",
        "koi_steff_err1", "koi_steff_err2",
        "koi_slogg_err1", "koi_slogg_err2",
        "koi_srad_err1", "koi_srad_err2",
        "koi_disposition_CANDIDATE", "koi_disposition_CONFIRMED",
        "koi_disposition_FALSE POSITIVE",
    ])
    .rename(columns={
        'koi_fpflag_nt': 'flag_not_transit_like',
        'koi_fpflag_ss': 'flag_stellar_eclipse',
        'koi_fpflag_co': 'flag_centroid_offset',
        # NOTE: this name contains a space; a later cell drops the column by this exact name
        'koi_fpflag_ec': 'flag_ephemeris match',
        'koi_period': 'orbital_period',
        'koi_time0bk': 'time_first_trans_detected',
        'koi_impact': 'star_planet_dist_at_conj',
        'koi_duration': 'trans_duration',
        'koi_depth': 'stellar_flux_loss_at_trans_min',
        'koi_prad': 'planet_radius',
        'koi_teq': 'approx_planet_temp',
        'koi_insol': 'insolation_flux',
        'koi_model_snr': 'trans_sig_to_noise',
        'koi_steff': 'stellar_eff_temp',
        'koi_slogg': 'stellar_surf_gravity',
        'koi_srad': 'stellar_photosph_rad',
        'ra': 'sky_location_right_asc',
        'dec': 'sky_location_declination',
        'koi_kepmag': 'stellar_magnitude',
    })
)
# DataFrame.dropna() returns a new frame and leaves exoplanets_basic unchanged, so rendering
# its result cannot show whether anything was missing. Report the count explicitly instead;
# incomplete rows are actually removed later, inside clean_dataset().
rows_with_nan = int(exoplanets_basic.isna().any(axis=1).sum())
print(f"{rows_with_nan} of {len(exoplanets_basic)} rows contain at least one NaN")

In [None]:
# Diagnostic: is there any NaN anywhere in exoplanets_basic?
np.any(np.isnan(exoplanets_basic))

In [1]:
# Diagnostic: is every value in exoplanets_basic finite (neither NaN nor +/-inf)?
# Requires the import cell at the top of the notebook to have run first (np must be defined).
np.all(np.isfinite(exoplanets_basic))

NameError: name 'np' is not defined

In [None]:
#https://stackoverflow.com/questions/31323499/sklearn-error-valueerror-input-contains-nan-infinity-or-a-value-too-large-for
def clean_dataset(df):
    """Remove rows containing NaN or +/-inf and return the remainder as float64.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to clean. Side effect: rows containing NaN are dropped from ``df``
        itself (``dropna(inplace=True)``). This is kept because the notebook calls
        this function without using the return value.

    Returns
    -------
    pd.DataFrame
        A new frame holding only the rows with no NaN / +inf / -inf, with every
        column cast to ``np.float64``.

    Raises
    ------
    AssertionError
        If ``df`` is not a ``pd.DataFrame``.
    """
    assert isinstance(df, pd.DataFrame), "df needs to be a pd.DataFrame"
    df.dropna(inplace=True)
    # Pass axis by keyword: pandas 2.x made DataFrame.any's arguments keyword-only,
    # so the positional form .any(1) raises TypeError there.
    indices_to_keep = ~df.isin([np.nan, np.inf, -np.inf]).any(axis=1)
    return df[indices_to_keep].astype(np.float64)
# clean_dataset returns the cleaned frame (rows with NaN or +/-inf removed, every column cast
# to float64). Store that result instead of discarding it so later cells use the cleaned data.
exoplanets_basic = clean_dataset(exoplanets_basic)
exoplanets_basic
# Note: clean_dataset also calls df.dropna(inplace=True) on the frame it is given, so NaN rows
# are removed from that frame itself. That in-place removal is why the NaN / finite diagnostics
# below can give different answers from the same diagnostics run before this cell.

In [None]:
# Repeat the NaN diagnostic now that clean_dataset has run
np.any(np.isnan(exoplanets_basic))

In [None]:
# Repeat the finiteness diagnostic now that clean_dataset has run
np.all(np.isfinite(exoplanets_basic))

## First run

In [None]:
# Target: the CANDIDATE indicator column; features: every other column of exoplanets_basic
y = exoplanets_basic["koi_pdisposition_CANDIDATE"]
X = exoplanets_basic.drop(columns="koi_pdisposition_CANDIDATE")
print(X.shape, y.shape)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from tensorflow.keras.utils import to_categorical

In [None]:
# Split features and target into train/test subsets; random_state pins the shuffle
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [None]:
# Fit the min-max scaler on the training features only, then apply it to both subsets
X_scaler = MinMaxScaler()
X_train_scaled = X_scaler.fit_transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [None]:
# Step 1: integer-encode the labels; the encoder learns its classes from the training labels
label_encoder = LabelEncoder()
encoded_y_train = label_encoder.fit_transform(y_train)
encoded_y_test = label_encoder.transform(y_test)

In [None]:
# Step 2: Convert encoded labels to one-hot-encoding
# The resulting arrays are the targets passed to model.fit / model.evaluate below.
y_train_categorical = to_categorical(encoded_y_train)
y_test_categorical = to_categorical(encoded_y_test)

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

In [None]:
# Create model and add layers
model = Sequential()
# Declare the input once, sized from the scaled training matrix (one entry per feature
# column). The previous keras.Input(shape=(9200,)) line referenced a name that is never
# imported and contradicted the input_dim=20 given to the first Dense layer.
model.add(tensorflow.keras.Input(shape=(X_train_scaled.shape[1],)))
# Two hidden ReLU layers of 100 units each
model.add(Dense(units=100, activation='relu'))
model.add(Dense(units=100, activation='relu'))
# Two output units with softmax, matching the two-column one-hot targets
model.add(Dense(units=2, activation='softmax'))

In [None]:
# Compile the model (training itself happens in a later cell)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Print the layer-by-layer summary of the compiled model
model.summary()

In [None]:
# Train for 60 epochs on the scaled training features and one-hot training labels
model.fit(x=X_train_scaled, y=y_train_categorical, epochs=60, shuffle=True, verbose=2)

In [None]:
# Score the trained network on the scaled test set; the result unpacks to loss and accuracy
model_loss, model_accuracy = model.evaluate(x=X_test_scaled, y=y_test_categorical, verbose=2)
print(f"Normal Neural Network 1 - Loss: {model_loss}, Accuracy: {model_accuracy}")

### Impressively accurate - but includes Kepler evaluations in "flag" columns. What happens if these are removed?

## Second run: Same parameters, but remove "flag" columns

In [None]:
# Remove kepid and the four "flag" columns from the modelling table
data_deflag = exoplanets_basic.drop(
    columns=[
        "kepid",
        'flag_not_transit_like',
        'flag_centroid_offset',
        'flag_stellar_eclipse',
        'flag_ephemeris match',
    ]
)
target_names = ["False_positive", "Candidate"]
target = data_deflag["koi_pdisposition_CANDIDATE"]
# NOTE(review): feature_names still includes the target column koi_pdisposition_CANDIDATE
feature_names = data_deflag.columns
data_deflag

In [None]:
# Target: the CANDIDATE indicator column; features: every other column of data_deflag
y = data_deflag["koi_pdisposition_CANDIDATE"]
X = data_deflag.drop(columns="koi_pdisposition_CANDIDATE")
print(X.shape, y.shape)

In [None]:
# Split features and target into train/test subsets; random_state pins the shuffle
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [None]:
# Fit the min-max scaler on the training features only, then apply it to both subsets
X_scaler = MinMaxScaler()
X_train_scaled = X_scaler.fit_transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [None]:
# Step 1: integer-encode the labels; the encoder learns its classes from the training labels
label_encoder = LabelEncoder()
encoded_y_train = label_encoder.fit_transform(y_train)
encoded_y_test = label_encoder.transform(y_test)

In [None]:
# Step 2: Convert encoded labels to one-hot-encoding
# The resulting arrays are the targets passed to model.fit / model.evaluate below.
y_train_categorical = to_categorical(encoded_y_train)
y_test_categorical = to_categorical(encoded_y_test)

In [None]:
# Build the network in one expression: 15 inputs, two hidden ReLU layers of 100 units,
# and a 2-unit softmax output
model = Sequential([
    Dense(units=100, activation='relu', input_dim=15),
    Dense(units=100, activation='relu'),
    Dense(units=2, activation='softmax'),
])

In [None]:
# Compile the model (training itself happens in a later cell)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Print the layer-by-layer summary of the compiled model
model.summary()

In [None]:
# Train for 60 epochs on the scaled training features and one-hot training labels
model.fit(x=X_train_scaled, y=y_train_categorical, epochs=60, shuffle=True, verbose=2)

In [None]:
# Score the trained network on the scaled test set; the result unpacks to loss and accuracy
model_loss, model_accuracy = model.evaluate(x=X_test_scaled, y=y_test_categorical, verbose=2)
print(f"Normal Neural Network 2 - Loss: {model_loss}, Accuracy: {model_accuracy}")

### Removing "flag" columns reduces accuracy

## Third run: "Flag" parameters removed, add one more layer

In [None]:
# Target: the CANDIDATE indicator column; features: every other column of data_deflag
y = data_deflag["koi_pdisposition_CANDIDATE"]
X = data_deflag.drop(columns="koi_pdisposition_CANDIDATE")

# Train/test split with a pinned random_state
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# Min-max scaling: fitted on the training features only, applied to both subsets
X_scaler = MinMaxScaler()
X_train_scaled = X_scaler.fit_transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Integer-encode the labels (classes learned from the training labels) ...
label_encoder = LabelEncoder()
encoded_y_train = label_encoder.fit_transform(y_train)
encoded_y_test = label_encoder.transform(y_test)

# ... then one-hot encode them for the softmax output layer
y_train_categorical = to_categorical(encoded_y_train)
y_test_categorical = to_categorical(encoded_y_test)

In [None]:
# Build the network in one expression: 15 inputs, three hidden ReLU layers of 100 units,
# and a 2-unit softmax output
model = Sequential([
    Dense(units=100, activation='relu', input_dim=15),
    Dense(units=100, activation='relu'),
    Dense(units=100, activation='relu'),
    Dense(units=2, activation='softmax'),
])

# Compile the model and print its summary (training happens in the next cell)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

In [None]:
# Train for 60 epochs on the scaled training features and one-hot training labels
model.fit(x=X_train_scaled, y=y_train_categorical, epochs=60, shuffle=True, verbose=2)

In [None]:
# Score the trained network on the scaled test set; the result unpacks to loss and accuracy
model_loss, model_accuracy = model.evaluate(
    x=X_test_scaled,
    y=y_test_categorical,
    verbose=2,
)
print(f"Normal Neural Network 3 - Loss: {model_loss}, Accuracy: {model_accuracy}")

### No noticeable gain in accuracy with added layer

## Fourth run: same as third, but with different random starting state

In [None]:
# Target: the CANDIDATE indicator column; features: every other column of data_deflag
y = data_deflag["koi_pdisposition_CANDIDATE"]
X = data_deflag.drop(columns="koi_pdisposition_CANDIDATE")

# Train/test split; this run uses random_state=18 to obtain a different partition
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=18)

# Min-max scaling: fitted on the training features only, applied to both subsets
X_scaler = MinMaxScaler()
X_train_scaled = X_scaler.fit_transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Integer-encode the labels (classes learned from the training labels) ...
label_encoder = LabelEncoder()
encoded_y_train = label_encoder.fit_transform(y_train)
encoded_y_test = label_encoder.transform(y_test)

# ... then one-hot encode them for the softmax output layer
y_train_categorical = to_categorical(encoded_y_train)
y_test_categorical = to_categorical(encoded_y_test)

In [None]:
# Build the network in one expression: 15 inputs, three hidden ReLU layers of 100 units,
# and a 2-unit softmax output
model = Sequential([
    Dense(units=100, activation='relu', input_dim=15),
    Dense(units=100, activation='relu'),
    Dense(units=100, activation='relu'),
    Dense(units=2, activation='softmax'),
])

# Compile the model and print its summary (training happens in the next cell)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

In [None]:
# Train for 60 epochs on the scaled training features and one-hot training labels
model.fit(x=X_train_scaled, y=y_train_categorical, epochs=60, shuffle=True, verbose=2)

In [None]:
# Score the trained network on the scaled test set; the result unpacks to loss and accuracy
model_loss, model_accuracy = model.evaluate(
    x=X_test_scaled,
    y=y_test_categorical,
    verbose=2,
)
print(f"Normal Neural Network 4 - Loss: {model_loss}, Accuracy: {model_accuracy}")

### Accuracy comparable with data split in third run

## Fifth run: same as third, but with twice the units

In [None]:
# Target: the CANDIDATE indicator column; features: every other column of data_deflag
y = data_deflag["koi_pdisposition_CANDIDATE"]
X = data_deflag.drop(columns="koi_pdisposition_CANDIDATE")

# Train/test split with a pinned random_state
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# Min-max scaling: fitted on the training features only, applied to both subsets
X_scaler = MinMaxScaler()
X_train_scaled = X_scaler.fit_transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Integer-encode the labels (classes learned from the training labels) ...
label_encoder = LabelEncoder()
encoded_y_train = label_encoder.fit_transform(y_train)
encoded_y_test = label_encoder.transform(y_test)

# ... then one-hot encode them for the softmax output layer
y_train_categorical = to_categorical(encoded_y_train)
y_test_categorical = to_categorical(encoded_y_test)

In [None]:
# Build the network in one expression: 15 inputs, three hidden ReLU layers of 200 units
# (twice the width of the third run), and a 2-unit softmax output
model = Sequential([
    Dense(units=200, activation='relu', input_dim=15),
    Dense(units=200, activation='relu'),
    Dense(units=200, activation='relu'),
    Dense(units=2, activation='softmax'),
])

# Compile the model and print its summary (training happens in the next cell)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

In [None]:
# Train for 60 epochs on the scaled training features and one-hot training labels
model.fit(x=X_train_scaled, y=y_train_categorical, epochs=60, shuffle=True, verbose=2)

In [None]:
# Score the trained network on the scaled test set; the result unpacks to loss and accuracy
model_loss, model_accuracy = model.evaluate(
    x=X_test_scaled,
    y=y_test_categorical,
    verbose=2,
)
print(f"Normal Neural Network 5 - Loss: {model_loss}, Accuracy: {model_accuracy}")

### No change in accuracy

## Sixth run: same as third, but with twice the epochs

In [None]:
# Target: the CANDIDATE indicator column; features: every other column of data_deflag
y = data_deflag["koi_pdisposition_CANDIDATE"]
X = data_deflag.drop(columns="koi_pdisposition_CANDIDATE")

# Train/test split with a pinned random_state
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# Min-max scaling: fitted on the training features only, applied to both subsets
X_scaler = MinMaxScaler()
X_train_scaled = X_scaler.fit_transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Integer-encode the labels (classes learned from the training labels) ...
label_encoder = LabelEncoder()
encoded_y_train = label_encoder.fit_transform(y_train)
encoded_y_test = label_encoder.transform(y_test)

# ... then one-hot encode them for the softmax output layer
y_train_categorical = to_categorical(encoded_y_train)
y_test_categorical = to_categorical(encoded_y_test)

In [None]:
# Build the network in one expression: 15 inputs, three hidden ReLU layers of 100 units,
# and a 2-unit softmax output
model = Sequential([
    Dense(units=100, activation='relu', input_dim=15),
    Dense(units=100, activation='relu'),
    Dense(units=100, activation='relu'),
    Dense(units=2, activation='softmax'),
])

# Compile the model and print its summary (training happens in the next cell)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

In [None]:
# Train for 120 epochs (twice the third run) on the scaled training data
model.fit(x=X_train_scaled, y=y_train_categorical, epochs=120, shuffle=True, verbose=2)

In [None]:
# Score the trained network on the scaled test set; the result unpacks to loss and accuracy
model_loss, model_accuracy = model.evaluate(
    x=X_test_scaled,
    y=y_test_categorical,
    verbose=2,
)
print(f"Normal Neural Network 6 - Loss: {model_loss}, Accuracy: {model_accuracy}")

### No noticeable gain in accuracy with added epochs

## Seventh run: same as third, but try the "Nadam" optimizer

### (Just experimenting; I don't yet understand how these optimizers differ)

In [None]:
# Target: the CANDIDATE indicator column; features: every other column of data_deflag
y = data_deflag["koi_pdisposition_CANDIDATE"]
X = data_deflag.drop(columns="koi_pdisposition_CANDIDATE")

# Train/test split with a pinned random_state
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# Min-max scaling: fitted on the training features only, applied to both subsets
X_scaler = MinMaxScaler()
X_train_scaled = X_scaler.fit_transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Integer-encode the labels (classes learned from the training labels) ...
label_encoder = LabelEncoder()
encoded_y_train = label_encoder.fit_transform(y_train)
encoded_y_test = label_encoder.transform(y_test)

# ... then one-hot encode them for the softmax output layer
y_train_categorical = to_categorical(encoded_y_train)
y_test_categorical = to_categorical(encoded_y_test)

In [None]:
# Build the network in one expression: 15 inputs, three hidden ReLU layers of 100 units,
# and a 2-unit softmax output
model = Sequential([
    Dense(units=100, activation='relu', input_dim=15),
    Dense(units=100, activation='relu'),
    Dense(units=100, activation='relu'),
    Dense(units=2, activation='softmax'),
])

# Compile with the 'nadam' optimizer and print the summary (training happens in the next cell)
model.compile(loss='categorical_crossentropy', optimizer='nadam', metrics=['accuracy'])
model.summary()

In [None]:
# Train for 60 epochs on the scaled training features and one-hot training labels
model.fit(x=X_train_scaled, y=y_train_categorical, epochs=60, shuffle=True, verbose=2)

In [None]:
# Score the trained network on the scaled test set; the result unpacks to loss and accuracy
model_loss, model_accuracy = model.evaluate(
    x=X_test_scaled,
    y=y_test_categorical,
    verbose=2,
)
print(f"Normal Neural Network 7 - Loss: {model_loss}, Accuracy: {model_accuracy}")

In [None]:
## Eighth run: same as third, but try the "Adamax" optimizer

In [None]:
# Target: the CANDIDATE indicator column; features: every other column of data_deflag
y = data_deflag["koi_pdisposition_CANDIDATE"]
X = data_deflag.drop(columns="koi_pdisposition_CANDIDATE")

# Train/test split with a pinned random_state
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# Min-max scaling: fitted on the training features only, applied to both subsets
X_scaler = MinMaxScaler()
X_train_scaled = X_scaler.fit_transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Integer-encode the labels (classes learned from the training labels) ...
label_encoder = LabelEncoder()
encoded_y_train = label_encoder.fit_transform(y_train)
encoded_y_test = label_encoder.transform(y_test)

# ... then one-hot encode them for the softmax output layer
y_train_categorical = to_categorical(encoded_y_train)
y_test_categorical = to_categorical(encoded_y_test)

In [None]:
# Build the network in one expression: 15 inputs, three hidden ReLU layers of 100 units,
# and a 2-unit softmax output
model = Sequential([
    Dense(units=100, activation='relu', input_dim=15),
    Dense(units=100, activation='relu'),
    Dense(units=100, activation='relu'),
    Dense(units=2, activation='softmax'),
])

# Compile with the 'adamax' optimizer and print the summary (training happens in the next cell)
model.compile(loss='categorical_crossentropy', optimizer='adamax', metrics=['accuracy'])
model.summary()

In [None]:
# Train for 60 epochs on the scaled training features and one-hot training labels
model.fit(x=X_train_scaled, y=y_train_categorical, epochs=60, shuffle=True, verbose=2)

In [None]:
# Score the trained network on the scaled test set; the result unpacks to loss and accuracy
model_loss, model_accuracy = model.evaluate(
    x=X_test_scaled,
    y=y_test_categorical,
    verbose=2,
)
print(f"Normal Neural Network 8 - Loss: {model_loss}, Accuracy: {model_accuracy}")