In [25]:
import joblib
import numpy as np
import pandas as pd
import tensorflow as tf
from keras import Sequential
from keras.layers import Dense
from sklearn.decomposition import PCA
from sklearn.metrics import (
    balanced_accuracy_score,
    classification_report,
    cohen_kappa_score,
    confusion_matrix,
    f1_score,
)
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
from sklearn.preprocessing import StandardScaler
from tensorflow import keras

In [26]:
def saveModel(model, filename):
    """Write ``model`` to ``filename`` by delegating to the model's own ``save`` method.

    Args:
        model: Any object exposing ``save(path)``.
        filename: Destination path handed unchanged to ``model.save``.
    """
    model.save(filename)

def loadModel(filename):
    """Load a model that was written by ``saveModel``.

    ``saveModel`` persists through the Keras ``model.save`` API, so the file has
    to be read back with the matching Keras loader. ``joblib.load`` expects a
    joblib/pickle payload and cannot open such a file.

    Args:
        filename: Path of the saved model file.

    Returns:
        The model object returned by ``keras.models.load_model``.
    """
    return keras.models.load_model(filename)

In [27]:
# Read the cleaned Kepler table and drop the "Unnamed: 0" column
# (presumably a leftover index column from an earlier CSV export -- confirm).
df = pd.read_csv("../data/[CLEANED]kepler-data.csv").drop(columns=["Unnamed: 0"])

In [28]:
# Every column present in the loaded frame.
ALL_COLUMNS = df.columns
# Columns whose name contains the substring "err".
ERROR_COLUMNS = [col for col in ALL_COLUMNS if "err" in col]
# Columns kept out of the feature matrix: the names listed here plus every
# "err" column. "koi_disposition" is the label used to build y.
EXCLUDE = ["rowid", "kepid", "kepoi_name", "koi_score", "koi_disposition", "koi_pdisposition", "koi_tce_delivname", "koi_tce_plnt_num"] + ERROR_COLUMNS
# Filter in the frame's own column order. A set difference would return the
# names in an order that changes between interpreter sessions (string hashes
# are randomised), making the feature layout of X non-reproducible.
_excluded = set(EXCLUDE)
TO_USE = [col for col in ALL_COLUMNS if col not in _excluded]

In [29]:
# Drop every row whose disposition is "CANDIDATE"; the remaining rows are used for modelling.
subset_df = df.loc[df["koi_disposition"].ne("CANDIDATE")]

In [30]:
# Feature matrix: the selected columns as a NumPy array.
X = subset_df[TO_USE].to_numpy()
# Binary target: 1 where the disposition equals 'CONFIRMED', 0 otherwise.
y = subset_df["koi_disposition"].eq("CONFIRMED").astype(int).to_numpy()

In [31]:
# Keep the fitted transformers in named variables: the same scaling and
# projection must be applied to any new data before it is fed to the network,
# which is impossible if the fitted objects are thrown away.
#
# NOTE(review): both transformers are fitted on the complete data set, i.e.
# before the train/test split and before cross-validation, so held-out rows
# influence the scaling statistics and the principal axes. Fit them on the
# training rows only if a leakage-free estimate is required.
# NOTE(review): this cell overwrites X, so running it twice transforms the
# already-transformed matrix again.
scaler = StandardScaler()
pca = PCA()
X = pca.fit_transform(scaler.fit_transform(X))

In [32]:
# Stratified hold-out split: 33% of the rows form the test set, seed fixed at 0.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    stratify=y,
    random_state=0,
)

In [33]:
def createModel():
    """Build and compile a fresh, untrained binary classifier.

    The network stacks three ReLU ``Dense`` layers (256, 128 and 128 units)
    followed by a single sigmoid unit. It is compiled with the Adam optimizer,
    binary cross-entropy loss and accuracy as the tracked metric.

    Returns:
        The compiled ``Sequential`` model.
    """
    hidden_widths = (256, 128, 128)
    layers = [Dense(width, activation='relu') for width in hidden_widths]
    layers.append(Dense(1, activation='sigmoid'))

    network = Sequential(layers)
    network.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return network

In [34]:
def performance(test, pred):
    """Print a set of classification metrics for binary predictions.

    Printed, in order: F1 score, Cohen's kappa, balanced accuracy, the
    confusion matrix and the scikit-learn classification report.

    Args:
        test: Ground-truth labels.
        pred: Predicted labels. An ``(n, 1)`` column (as produced by
            thresholding the output of a single-unit network) is accepted and
            flattened.

    Returns:
        None. The metrics are only printed.
    """
    # Flatten both inputs so the metric functions always receive 1-D label vectors.
    test = np.ravel(test)
    pred = np.ravel(pred)

    conf_matrix = confusion_matrix(test, pred)
    f1 = f1_score(test, pred)
    report = classification_report(test, pred)
    balanced_accuracy = balanced_accuracy_score(test, pred)
    kappa = cohen_kappa_score(test, pred)

    print(f"F1 Score: {f1}")
    print(f"Kappa Score: {kappa}")
    # This is the balanced accuracy (mean per-class recall), not plain accuracy,
    # so it is labelled as such to avoid confusion with the report below.
    print(f"Balanced Accuracy Score: {balanced_accuracy}")
    print(f"Confusion Matrix:\n{conf_matrix}")
    print(report)

In [35]:
def trainEvaluate(model, fold, X_train, y_train, X_test, y_test):
    """Fit ``model`` on one fold and report its F1 score on the held-out part.

    Args:
        model: Object exposing ``fit`` and ``predict``.
        fold: Fold number, used only in the printed message.
        X_train, y_train: Data the model is fitted on (20 epochs, silent).
        X_test, y_test: Data the F1 score is computed on.

    Returns:
        The F1 score of the thresholded predictions against ``y_test``.
    """
    model.fit(X_train, y_train, epochs=20, verbose=0)
    # Turn the model output into class labels with a 0.5 cut-off.
    predicted_labels = model.predict(X_test) >= 0.5
    fold_f1 = f1_score(y_test, predicted_labels)
    print(f"F1 Score in fold {fold} = {fold_f1}")
    return fold_f1


def _crossValidateWith(splitter, label, K):
    """Train a fresh model on every fold of ``splitter`` and print the mean F1 score."""
    f1_scores = [
        trainEvaluate(createModel(), fold, X[train], y[train], X[test], y[test])
        for fold, (train, test) in enumerate(splitter.split(X, y), start=1)
    ]
    print(f"Average {label}{K}-Fold F1 Score = {np.mean(f1_scores)}")


def crossValidation(K=10):
    """Run K-fold cross-validation twice: plain ``KFold``, then ``StratifiedKFold``.

    Both splitters shuffle with ``random_state=0`` and operate on the
    module-level ``X`` and ``y`` arrays. A new model is built for every fold,
    so no weights carry over between folds. Per-fold scores are printed by
    ``trainEvaluate``; each pass ends with its average F1 score.

    Args:
        K: Number of folds for both passes.

    Returns:
        None. Results are only printed.
    """
    _crossValidateWith(KFold(n_splits=K, shuffle=True, random_state=0), "", K)
    # Blank line between the two reports.
    print()
    _crossValidateWith(
        StratifiedKFold(n_splits=K, shuffle=True, random_state=0), "Stratified ", K
    )

In [36]:
# Run both 10-fold passes (plain, then stratified); per-fold and average F1 scores are printed.
crossValidation(K=10)

F1 Score in fold 1 = 0.9728506787330317
F1 Score in fold 2 = 0.974477958236659
F1 Score in fold 3 = 0.9590909090909091
F1 Score in fold 4 = 0.9799554565701558
F1 Score in fold 5 = 0.9655172413793104
F1 Score in fold 6 = 0.9682875264270614
F1 Score in fold 7 = 0.9774127310061601
F1 Score in fold 8 = 0.9720430107526882
F1 Score in fold 9 = 0.9776876267748479
F1 Score in fold 10 = 0.968609865470852
Average 10-Fold F1 Score = 0.9715933004441675

F1 Score in fold 1 = 0.973568281938326
F1 Score in fold 2 = 0.9804772234273318
F1 Score in fold 3 = 0.957683741648107
F1 Score in fold 4 = 0.9650655021834061
F1 Score in fold 5 = 0.9715536105032824
F1 Score in fold 6 = 0.9826086956521739
F1 Score in fold 7 = 0.986842105263158
F1 Score in fold 8 = 0.9649122807017544
F1 Score in fold 9 = 0.9826086956521739
F1 Score in fold 10 = 0.9711751662971175
Average Stratified 10-Fold F1 Score = 0.973649530326683


In [37]:
# Fit a fresh network on the hold-out training split, then label the test rows
# by thresholding the network output at 0.5.
model = createModel()
model.fit(X_train, y_train, epochs=20, verbose=0)
pred = model.predict(X_test) >= 0.5

In [38]:
# Print the hold-out metrics for the thresholded test-set predictions.
performance(y_test, pred)

F1 Score: 0.9807308970099669
Kappa Score: 0.9713003065737467
Accuracy Score: 0.9845098404420438
Confusion Matrix:
[[1523   11]
 [  18  738]]
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      1534
           1       0.99      0.98      0.98       756

    accuracy                           0.99      2290
   macro avg       0.99      0.98      0.99      2290
weighted avg       0.99      0.99      0.99      2290



In [39]:
# Persist the network trained on the hold-out training split.
# NOTE(review): only the network is written here; the scaling/PCA step applied
# to X earlier in the notebook is not saved, so new data cannot be preprocessed
# the same way from this file alone.
saveModel(model, "../model/nn-model.h5")