In [1]:
# Update sklearn to prevent version mismatches
!pip install sklearn --upgrade
# install joblib. This will be used to save your model. 
# Restart your kernel after installing 
!pip install joblib



# Read the CSV and Perform Basic Data Cleaning

In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [3]:
df = pd.read_csv("exoplanet_data.csv")
# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')
# Drop the null rows
df = df.dropna()
df.head()

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,...,-81,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,FALSE POSITIVE,0,1,0,0,19.89914,1.49e-05,-1.49e-05,175.850252,0.000581,...,-176,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
2,FALSE POSITIVE,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,...,-174,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
3,CONFIRMED,0,0,0,0,2.525592,3.76e-06,-3.76e-06,171.59555,0.00113,...,-211,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509
4,CONFIRMED,0,0,0,0,4.134435,1.05e-05,-1.05e-05,172.97937,0.0019,...,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.22467,15.714


In [4]:
df.shape

(6991, 41)

# Create a Train Test Split

Use `koi_disposition` for the y values

In [5]:
X = df.drop("koi_disposition", axis=1)
y = df["koi_disposition"] #y.values.reshape(-1, 1)
print(X.shape, y.shape)

(6991, 40) (6991,)


In [6]:
# Use train_test_split to create training and testing data
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [7]:
X_train.head()

Unnamed: 0,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,koi_time0bk_err2,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
3563,0,0,0,0,10.548413,5.47e-05,-5.47e-05,139.06402,0.00411,-0.00411,...,-133,4.387,0.066,-0.123,1.092,0.181,-0.097,298.09543,44.737061,13.204
4099,0,0,0,0,24.754385,0.0001365,-0.0001365,140.20732,0.00446,-0.00446,...,-144,4.519,0.078,-0.052,0.804,0.056,-0.076,295.73535,42.576248,15.514
5460,0,0,0,0,1.057336,1.23e-07,-1.23e-07,131.792007,9.6e-05,-9.6e-05,...,-140,4.594,0.054,-0.027,0.683,0.054,-0.06,292.18417,49.31004,15.414
1091,0,0,0,0,201.118319,0.001461,-0.001461,187.56986,0.00529,-0.00529,...,-112,4.447,0.072,-0.108,0.954,0.135,-0.083,283.11377,48.13139,13.328
5999,0,0,0,0,91.649983,0.003181,-0.003181,175.7156,0.0286,-0.0286,...,-233,4.145,0.164,-0.164,1.608,0.905,-0.383,294.93198,39.81242,12.964


# Pre-processing

Scale the data using the MinMaxScaler and perform some feature selection

In [8]:
# Scale your data
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

X_scaler = MinMaxScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [9]:
# Step 1: Label-encode data set
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
encoded_y_train = label_encoder.transform(y_train)
encoded_y_test = label_encoder.transform(y_test)

In [10]:
encoded_y_train.shape

(5243,)

In [11]:
from tensorflow.keras.utils import to_categorical

# One-hot encoding
# YOUR CODE HERE
# Step 2: Convert encoded labels to one-hot-encoding
y_train_categorical = to_categorical(encoded_y_train)
y_test_categorical = to_categorical(encoded_y_test)

In [12]:
y_train_categorical

array([[1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       ...,
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 1., 0.]], dtype=float32)

In [13]:
for label, original_class in zip(y_train_categorical, y):
    print('Original Class: ' + str(original_class))
    print('Encoded Label: ' + str(label))
    print('-' * 12)

Original Class: CONFIRMED
Encoded Label: [1. 0. 0.]
------------
Original Class: FALSE POSITIVE
Encoded Label: [0. 1. 0.]
------------
Original Class: FALSE POSITIVE
Encoded Label: [1. 0. 0.]
------------
Original Class: CONFIRMED
Encoded Label: [0. 1. 0.]
------------
Original Class: CONFIRMED
Encoded Label: [1. 0. 0.]
------------
Original Class: CONFIRMED
Encoded Label: [0. 1. 0.]
------------
Original Class: CONFIRMED
Encoded Label: [0. 0. 1.]
------------
Original Class: CONFIRMED
Encoded Label: [0. 1. 0.]
------------
Original Class: CONFIRMED
Encoded Label: [0. 1. 0.]
------------
Original Class: CONFIRMED
Encoded Label: [0. 1. 0.]
------------
Original Class: CONFIRMED
Encoded Label: [0. 0. 1.]
------------
Original Class: FALSE POSITIVE
Encoded Label: [1. 0. 0.]
------------
Original Class: FALSE POSITIVE
Encoded Label: [0. 0. 1.]
------------
Original Class: FALSE POSITIVE
Encoded Label: [1. 0. 0.]
------------
Original Class: CONFIRMED
Encoded Label: [0. 0. 1.]
------------


In [14]:
print(X_train_scaled.shape, y_train_categorical.shape)
df_y_train = pd.DataFrame(y_train_categorical)
df_y_train

(5243, 40) (5243, 3)


Unnamed: 0,0,1,2
0,1.0,0.0,0.0
1,0.0,1.0,0.0
2,1.0,0.0,0.0
3,0.0,1.0,0.0
4,1.0,0.0,0.0
...,...,...,...
5238,0.0,1.0,0.0
5239,0.0,0.0,1.0
5240,0.0,0.0,1.0
5241,0.0,1.0,0.0


# Select your features (columns)

In [15]:
from sklearn.feature_selection import VarianceThreshold
selector = VarianceThreshold()
selector.fit_transform(X_train_scaled)
selector.fit_transform(y_train_categorical)
print(X_train_scaled.shape, y_train_categorical.shape)

(5243, 40) (5243, 3)


In [16]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
select = SelectKBest(chi2, k=7)
train_new_xy = select.fit_transform(X_train_scaled, y_train_categorical)
filter = select.get_support()
cols = df.columns.array
features = cols[1:]
print(features)
print(features[filter])
X_train_scaled.shape


<PandasArray>
[    'koi_fpflag_nt',     'koi_fpflag_ss',     'koi_fpflag_co',
     'koi_fpflag_ec',        'koi_period',   'koi_period_err1',
   'koi_period_err2',       'koi_time0bk',  'koi_time0bk_err1',
  'koi_time0bk_err2',        'koi_impact',   'koi_impact_err1',
   'koi_impact_err2',      'koi_duration', 'koi_duration_err1',
 'koi_duration_err2',         'koi_depth',    'koi_depth_err1',
    'koi_depth_err2',          'koi_prad',     'koi_prad_err1',
     'koi_prad_err2',           'koi_teq',         'koi_insol',
    'koi_insol_err1',    'koi_insol_err2',     'koi_model_snr',
  'koi_tce_plnt_num',         'koi_steff',    'koi_steff_err1',
    'koi_steff_err2',         'koi_slogg',    'koi_slogg_err1',
    'koi_slogg_err2',          'koi_srad',     'koi_srad_err1',
     'koi_srad_err2',                'ra',               'dec',
        'koi_kepmag']
Length: 40, dtype: object
<PandasArray>
[   'koi_fpflag_nt',    'koi_fpflag_ss',    'koi_fpflag_co',
    'koi_fpflag_ec',        'ko

(5243, 40)

In [17]:
#In X keep only cols 
# koi_fpflag_nt is 0,koi_fpflag_ss is 1, koi_fpflag_co is 2, koi_fpflag_ec is 3, 
# koi_depth' is 16, 'koi_model_snr is 26',koi_tce_plnt_num is 27
X_train_featured = X_train_scaled[:, [0,1,2,3,16,26,27]]
X_test_featured = X_test_scaled[:, [0,1,2,3,16,26,27]]
X_train_featured_df = pd.DataFrame(X_train_featured)
X_train_featured_df 

Unnamed: 0,0,1,2,3,4,5,6
0,0.0,0.0,0.0,0.0,0.000062,0.001116,0.142857
1,0.0,0.0,0.0,0.0,0.000380,0.001812,0.000000
2,0.0,0.0,0.0,0.0,0.030706,0.052402,0.000000
3,0.0,0.0,0.0,0.0,0.000375,0.003656,0.000000
4,0.0,0.0,0.0,0.0,0.000121,0.000784,0.142857
...,...,...,...,...,...,...,...
5238,0.0,0.0,0.0,0.0,0.000357,0.002651,0.000000
5239,1.0,0.0,0.0,0.0,0.000069,0.004429,0.000000
5240,0.0,0.0,1.0,0.0,0.006409,0.031194,0.000000
5241,0.0,0.0,0.0,0.0,0.000512,0.002773,0.142857


In [18]:
print(X_train_featured.shape, y_train_categorical.shape)
print(X_test_featured.shape, y_test_categorical.shape)

(5243, 7) (5243, 3)
(1748, 7) (1748, 3)


In [19]:
X_train_featured.shape
y_train_categorical.shape

(5243, 3)

# Train the Model

In [20]:
# Created a normal neural network with 7 inputs, 4 hidden nodes, and 3 outputs
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

model = Sequential()
model.add(Dense(units=7, activation='relu', input_dim=7))
model.add(Dense(units=100, activation='relu'))
model.add(Dense(units=100, activation='relu'))
model.add(Dense(units=100, activation='relu'))
model.add(Dense(units=100, activation='relu'))
model.add(Dense(units=3, activation='softmax'))

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 7)                 56        
                                                                 
 dense_1 (Dense)             (None, 100)               800       
                                                                 
 dense_2 (Dense)             (None, 100)               10100     
                                                                 
 dense_3 (Dense)             (None, 100)               10100     
                                                                 
 dense_4 (Dense)             (None, 100)               10100     
                                                                 
 dense_5 (Dense)             (None, 3)                 303       
                                                                 
Total params: 31,459
Trainable params: 31,459
Non-traina

In [21]:
# Compile the model
# YOUR CODE HERE
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [22]:
y_train_categorical.reshape(-1, 1)
y_test_categorical.reshape(-1, 1)

print(X_train_featured.shape, y_train_categorical.shape)
print(X_test_featured.shape, y_test_categorical.shape)

(5243, 7) (5243, 3)
(1748, 7) (1748, 3)


In [23]:
# Fit the model to the training data
# YOUR CODE HERE

model.fit(
    X_train_featured,
    y_train_categorical,
    epochs=100,
    shuffle=True,
    verbose=2
)

Epoch 1/100
164/164 - 0s - loss: 0.5233 - accuracy: 0.7021 - 447ms/epoch - 3ms/step
Epoch 2/100
164/164 - 0s - loss: 0.4056 - accuracy: 0.7572 - 108ms/epoch - 659us/step
Epoch 3/100
164/164 - 0s - loss: 0.4051 - accuracy: 0.7610 - 114ms/epoch - 696us/step
Epoch 4/100
164/164 - 0s - loss: 0.4036 - accuracy: 0.7513 - 113ms/epoch - 687us/step
Epoch 5/100
164/164 - 0s - loss: 0.4009 - accuracy: 0.7591 - 114ms/epoch - 695us/step
Epoch 6/100
164/164 - 0s - loss: 0.4013 - accuracy: 0.7637 - 111ms/epoch - 675us/step
Epoch 7/100
164/164 - 0s - loss: 0.4007 - accuracy: 0.7604 - 108ms/epoch - 656us/step
Epoch 8/100
164/164 - 0s - loss: 0.3995 - accuracy: 0.7601 - 112ms/epoch - 683us/step
Epoch 9/100
164/164 - 0s - loss: 0.4014 - accuracy: 0.7606 - 106ms/epoch - 647us/step
Epoch 10/100
164/164 - 0s - loss: 0.3971 - accuracy: 0.7639 - 125ms/epoch - 761us/step
Epoch 11/100
164/164 - 0s - loss: 0.3975 - accuracy: 0.7667 - 106ms/epoch - 648us/step
Epoch 12/100
164/164 - 0s - loss: 0.3984 - accuracy: 0

<keras.callbacks.History at 0x1c77ceda1c0>

In [24]:
model_loss, model_accuracy = model.evaluate(X_train_featured, y_train_categorical, verbose=2)
print(f"Neural Network - Loss: {model_loss}, Accuracy: {model_accuracy}")

164/164 - 0s - loss: 0.3049 - accuracy: 0.8655 - 279ms/epoch - 2ms/step
Neural Network - Loss: 0.3048975169658661, Accuracy: 0.8655350208282471


In [25]:
print(f"Training Data Score: {model.evaluate(X_train_featured, y_train_categorical)}")
print(f"Testing Data Score: {model.evaluate(X_test_featured, y_test_categorical)}")

Training Data Score: [0.3048975169658661, 0.8655350208282471]
Testing Data Score: [0.29324230551719666, 0.8735697865486145]


# Hyperparameter Tuning

Use `GridSearchCV` to tune the model's parameters

In [26]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

df = pd.read_csv("exoplanet_data.csv")
# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')
# Drop the null rows
df = df.dropna()
df.head()

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,...,-81,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,FALSE POSITIVE,0,1,0,0,19.89914,1.49e-05,-1.49e-05,175.850252,0.000581,...,-176,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
2,FALSE POSITIVE,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,...,-174,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
3,CONFIRMED,0,0,0,0,2.525592,3.76e-06,-3.76e-06,171.59555,0.00113,...,-211,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509
4,CONFIRMED,0,0,0,0,4.134435,1.05e-05,-1.05e-05,172.97937,0.0019,...,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.22467,15.714


In [27]:
# Drop extra y column koi_disposition
X_data = df.drop(columns=['koi_disposition'], axis=1)
y_target = df['koi_disposition']
print(X_data.shape,y_target.shape)

(6991, 40) (6991,)


In [28]:
#In X_data keep only featured cols 
# koi_fpflag_nt is 0,koi_fpflag_ss is 1, koi_fpflag_co is 2, koi_fpflag_ec is 3, 
# koi_depth' is 16, 'koi_model_snr is 26',koi_tce_plnt_num is 27

X_data_featured = X_data.iloc[:,[0,1,2,3,16,26,27]]
print(X_data_featured.shape,y_target.shape)

(6991, 7) (6991,)


In [29]:
# Split data and target into training and testing
from sklearn.model_selection import train_test_split
X_data_train, X_data_test, y_target_train, y_target_test = train_test_split(X_data_featured,y_target,test_size=0.4,random_state=42)
print(X_data_train.shape,y_target_train.shape)
print(X_data_test.shape,y_target_test.shape)

(4194, 7) (4194,)
(2797, 7) (2797,)


In [30]:
y_target_train

368     FALSE POSITIVE
1937         CONFIRMED
1151         CONFIRMED
5610    FALSE POSITIVE
3395    FALSE POSITIVE
             ...      
3772    FALSE POSITIVE
5191         CANDIDATE
5226    FALSE POSITIVE
5390    FALSE POSITIVE
860     FALSE POSITIVE
Name: koi_disposition, Length: 4194, dtype: object

In [31]:
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

X_scaler = MinMaxScaler().fit(X_data_train)
X_data_train_scaled = X_scaler.transform(X_data_train)
X_data_test_scaled = X_scaler.transform(X_data_test)

In [32]:
# Step 1: Label-encode data set
label_encoder = LabelEncoder()
label_encoder.fit(y_target_train)
y_target_train_encoded = label_encoder.transform(y_target_train)
y_target_test_encoded = label_encoder.transform(y_target_test)

In [71]:
y_target_train_encoded

array([2, 1, 1, ..., 2, 2, 2])

In [73]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

In [77]:

model_params = {
    'n_neighbors' : {
        'model' : KNeighborsClassifier(),
        'params' : {
           'n_neighbors': [3,5,11,19],
           'weights': ['uniform','distance']#,
           #'metric':['eulidean','manhattan']
        }
    },
    'svm' : {
        'model' : svm.SVC(),
        'params' : {
            'C' : [1,5,10,50],
            'kernel': ['rbf','linear'],
            'gamma': [0.0001, 0.0005, 0.001, 0.005]
        }
    },
    'random_forest': {
        'model' : RandomForestClassifier(),
        'params' : {
            'n_estimators': [1,5,10,50,80],
            'max_features': ['auto','sqrt'],
            'max_depth': [2,4],
            'min_samples_split': [2,5],
            'min_samples_leaf': [1,2],
            'bootstrap': [True,False]
        }
    },
    'logistic_regression' : {
    'model' : LogisticRegression(solver='liblinear',multi_class='auto'),
    'params' : { 'C': [1,5,10,50]
    }
  }
}

In [78]:
from sklearn.model_selection import GridSearchCV
scores= []
for model_name, mp in model_params.items():
    clf = GridSearchCV(mp['model'], mp['params'], cv=5, return_train_score=False)
    clf.fit(X_data_train_scaled, y_target_train_encoded)
    scores.append({
        'model': model_name,
        'best_score': clf.best_score_,
        'best_params': clf.best_params_})

In [79]:
df_score = pd.DataFrame(scores,columns=['model','best_score','best_params'])
df_score

Unnamed: 0,model,best_score,best_params
0,n_neighbors,0.869577,"{'n_neighbors': 19, 'weights': 'distance'}"
1,svm,0.797331,"{'C': 50, 'gamma': 0.0001, 'kernel': 'linear'}"
2,random_forest,0.87029,"{'bootstrap': True, 'max_depth': 4, 'max_featu..."
3,logistic_regression,0.782072,{'C': 50}


In [47]:
# Make predictions with the hypertuned model
predictions_clf = clf.predict(X_data_test_scaled)

from sklearn.metrics import classification_report
target_names = ["CANDIDATE 0","CONFIRMED 1","FALSE POSITIVE 2"] 
print(classification_report(y_target_test_encoded, predictions_clf, target_names=target_names))

                  precision    recall  f1-score   support

     CANDIDATE 0       0.54      0.89      0.67       672
     CONFIRMED 1       0.79      0.32      0.45       764
FALSE POSITIVE 2       0.98      1.00      0.99      1361

        accuracy                           0.79      2797
       macro avg       0.77      0.74      0.70      2797
    weighted avg       0.82      0.79      0.77      2797



# Save the Model

In [43]:
# save your model by updating "your_name" with your name
# and "your_model" with your model variable
# be sure to turn this in to BCS
# if joblib fails to import, try running the command to install in terminal/git-bash
import joblib
filename = 'antoinetteboyle.sav'
joblib.dump(model, filename)

INFO:tensorflow:Assets written to: ram://317417ff-7b78-45d6-abc7-93d5acd7b4cc/assets


['antoinetteboyle.sav']