In [16]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
from sklearn.decomposition import PCA
from tensorflow import keras 
from keras import Sequential
from keras.layers import Dense
import joblib
from sklearn.metrics import confusion_matrix, f1_score, classification_report, balanced_accuracy_score, cohen_kappa_score

In [17]:
def saveModel(model, filename):
    """Persist ``model`` to ``filename`` by delegating to ``model.save``.

    The parent directory of ``filename`` is created first when it is missing,
    because ``model.save`` is not assumed to create intermediate directories.

    Parameters
    ----------
    model : object
        Anything exposing a ``save(filename)`` method.
    filename : str or os.PathLike
        Destination path handed unchanged to ``model.save``.

    Returns
    -------
    None
    """
    import os  # local import so the helper stays self-contained

    parent = os.path.dirname(os.fspath(filename))
    if parent:  # a bare file name has no directory component to create
        os.makedirs(parent, exist_ok=True)
    model.save(filename)

def loadModel(filename):
    """Load a model that was written by ``saveModel``.

    ``saveModel`` delegates to ``model.save``; for the Keras networks built in
    this notebook that produces a Keras model file, not a joblib pickle, so the
    matching Keras loader is used here instead of ``joblib.load``.

    Parameters
    ----------
    filename : str or os.PathLike
        Path of the saved Keras model.

    Returns
    -------
    The restored Keras model.
    """
    return keras.models.load_model(filename)

In [18]:
# Read the cleaned Kepler table and discard the "Unnamed: 0" column
# (presumably a saved index column) in the same expression, so `df` is
# produced in a single step without in-place mutation.
df = pd.read_csv("../data/[CLEANED]kepler-data.csv").drop(columns=["Unnamed: 0"])

In [19]:
# Every column present in the loaded table.
ALL_COLUMNS = df.columns
# Columns whose name contains "err"; currently only referenced by the
# commented-out extension at the end of the EXCLUDE line.
ERROR_COLUMNS = [col for col in ALL_COLUMNS if "err" in col]
# Identifier, label and bookkeeping columns that must not be used as features.
EXCLUDE = ["rowid", "kepid", "kepoi_name", "koi_score", "koi_disposition", "koi_pdisposition", "koi_tce_delivname", "koi_tce_plnt_num"] #+ ERROR_COLUMNS
# Feature columns, kept in the table's own column order. A set difference
# would yield an arbitrary order that can change between kernel sessions,
# making the feature matrix layout non-reproducible.
TO_USE = [col for col in ALL_COLUMNS if col not in EXCLUDE]

In [20]:
# Keep every row whose disposition is anything other than "CANDIDATE".
subset_df = df.loc[df["koi_disposition"].ne("CANDIDATE")]

In [21]:
# Feature matrix: the selected columns as a plain NumPy array.
X = subset_df.loc[:, TO_USE].to_numpy()
# Binary target: 1 where the disposition is "CONFIRMED", 0 otherwise.
y = (subset_df["koi_disposition"] == "CONFIRMED").astype(int).to_numpy()

In [22]:
# Keep the fitted transformers in named variables instead of discarding them:
# the same scaling and projection have to be re-applied (via `.transform`) to
# any new data given to the trained network, so they must remain available
# for reuse and for persisting next to the model.
# NOTE(review): both transformers are fitted on the full X before the
# train/test split and before the cross-validation folds are drawn, so
# held-out rows influence the preprocessing; consider fitting on training
# rows only.
scaler = StandardScaler()
# Fixed seed so the projection is reproducible if a randomized SVD solver is selected.
pca = PCA(n_components=30, random_state=0)
X = pca.fit_transform(scaler.fit_transform(X))

In [23]:
# Stratified hold-out split: 33% of the rows form the test set, fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    stratify=y,
    random_state=0,
)

In [24]:
def createModel():
    """Build and compile a fresh feed-forward binary classifier.

    The network stacks three ReLU hidden layers (256, 128 and 128 units)
    followed by a single sigmoid output unit. It is compiled with the Adam
    optimizer, binary cross-entropy loss and accuracy as the tracked metric.

    Returns
    -------
    A newly created, compiled (untrained) ``Sequential`` model.
    """
    hidden_widths = (256, 128, 128)
    stack = [Dense(width, activation='relu') for width in hidden_widths]
    stack.append(Dense(1, activation='sigmoid'))

    network = Sequential(stack)
    network.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return network

In [25]:
def performance(test, pred):
    """Print classification metrics for predictions against the true labels.

    Printed, in order: F1 score, Cohen's kappa, balanced accuracy, the
    confusion matrix and scikit-learn's text classification report.

    Parameters
    ----------
    test : array-like
        True labels.
    pred : array-like
        Predicted labels.

    Returns
    -------
    None
    """
    f1 = f1_score(test, pred)
    kappa = cohen_kappa_score(test, pred)
    balanced_accuracy = balanced_accuracy_score(test, pred)
    conf_matrix = confusion_matrix(test, pred)
    report = classification_report(test, pred)

    print(f"F1 Score: {f1}")
    print(f"Kappa Score: {kappa}")
    # This value comes from balanced_accuracy_score, which differs from plain
    # accuracy on imbalanced classes, so the label says so explicitly.
    print(f"Balanced Accuracy Score: {balanced_accuracy}")
    print(f"Confusion Matrix:\n{conf_matrix}")
    print(report)

In [26]:
def trainEvaluate(model, fold, X_train, y_train, X_test, y_test, epochs=20, threshold=0.5):
    """Fit ``model`` on one split and report its F1 score on the held-out part.

    Parameters
    ----------
    model : compiled Keras model
        Trained in place by this call.
    fold : int
        Fold number, used only in the printed message.
    X_train, y_train : array-like
        Training features and labels.
    X_test, y_test : array-like
        Held-out features and labels.
    epochs : int, default 20
        Number of training epochs passed to ``model.fit``.
    threshold : float, default 0.5
        Model outputs greater than or equal to this value count as class 1.

    Returns
    -------
    float
        F1 score of the thresholded predictions on the held-out data.
    """
    model.fit(X_train, y_train, epochs=epochs, verbose=0)
    # Silence predict as well, so only the per-fold summary line is printed.
    scores = model.predict(X_test, verbose=0)
    # Flatten the model output to one dimension before thresholding so the
    # predictions have the same layout as the label vector.
    pred = np.ravel(scores) >= threshold
    f1 = f1_score(y_test, pred)
    print(f"F1 Score in fold {fold} = {f1}")
    return f1


def _runFolds(splitter):
    """Train and score a fresh model on every split produced by ``splitter``.

    Uses the notebook-level ``X`` and ``y`` arrays. Returns the list of
    per-fold F1 scores in fold order.
    """
    f1_scores = []
    for fold, (train, test) in enumerate(splitter.split(X, y), start=1):
        # A new network per fold so no trained weights carry over between folds.
        model = createModel()
        f1_scores.append(
            trainEvaluate(model, fold, X[train], y[train], X[test], y[test])
        )
    return f1_scores


def crossValidation(K=10):
    """Run K-fold and stratified K-fold cross-validation and print F1 scores.

    Both passes shuffle with ``random_state=0``, operate on the notebook-level
    ``X`` and ``y`` arrays, and print one line per fold followed by the mean
    F1 score of that pass.

    Parameters
    ----------
    K : int, default 10
        Number of folds used by both splitters.

    Returns
    -------
    None
    """
    plain_scores = _runFolds(KFold(n_splits=K, shuffle=True, random_state=0))
    print(f"Average {K}-Fold F1 Score = {np.mean(plain_scores)}\n")

    stratified_scores = _runFolds(StratifiedKFold(n_splits=K, shuffle=True, random_state=0))
    print(f"Average Stratified {K}-Fold F1 Score = {np.mean(stratified_scores)}")

In [27]:
# Run the plain and the stratified 10-fold evaluation; per-fold and average
# F1 scores are printed by the function itself.
crossValidation(K=10)

F1 Score in fold 1 = 0.979498861047836
F1 Score in fold 2 = 0.985981308411215
F1 Score in fold 3 = 0.9755011135857461
F1 Score in fold 4 = 0.9756097560975608
F1 Score in fold 5 = 0.9837587006960558
F1 Score in fold 6 = 0.9794238683127573
F1 Score in fold 7 = 0.9854469854469856
F1 Score in fold 8 = 0.9892933618843683
F1 Score in fold 9 = 0.9897330595482547
F1 Score in fold 10 = 0.9841986455981941
Average 10-Fold F1 Score = 0.9828445660628976

F1 Score in fold 1 = 0.9827586206896551
F1 Score in fold 2 = 0.989010989010989
F1 Score in fold 3 = 0.9730941704035875
F1 Score in fold 4 = 0.973568281938326
F1 Score in fold 5 = 0.9823788546255506
F1 Score in fold 6 = 0.9847494553376905
F1 Score in fold 7 = 0.9956140350877193
F1 Score in fold 8 = 0.9868995633187773
F1 Score in fold 9 = 0.982532751091703
F1 Score in fold 10 = 0.9823008849557523
Average Stratified 10-Fold F1 Score = 0.9832907606459751


In [28]:
# Fit a fresh network on the hold-out training split, then threshold its
# sigmoid outputs for the test split at 0.5 to obtain boolean predictions.
model = createModel()
model.fit(X_train, y_train, verbose=0, epochs=20)
pred = model.predict(X_test) >= 0.5

In [29]:
# Print the hold-out metrics (F1, kappa, balanced accuracy, confusion matrix
# and classification report) for the thresholded test predictions.
performance(y_test, pred)

F1 Score: 0.9880159786950733
Kappa Score: 0.9821684111803456
Accuracy Score: 0.9894369597759428
Confusion Matrix:
[[1530    4]
 [  14  742]]
              precision    recall  f1-score   support

           0       0.99      1.00      0.99      1534
           1       0.99      0.98      0.99       756

    accuracy                           0.99      2290
   macro avg       0.99      0.99      0.99      2290
weighted avg       0.99      0.99      0.99      2290



In [30]:
# Persist the network trained on the hold-out training split.
# NOTE(review): only the network is written here; the StandardScaler / PCA
# fitted earlier in the notebook are not saved by this cell — confirm they are
# stored somewhere before using this file for inference on new data.
saveModel(model, "../model/nn-model-error.h5")