In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
from sklearn.decomposition import PCA
from tensorflow import keras 
from keras import Sequential
from keras.layers import Dense
import joblib
from sklearn.metrics import confusion_matrix, f1_score, classification_report, balanced_accuracy_score, cohen_kappa_score
from keras.wrappers.scikit_learn import KerasClassifier
import eli5
from eli5.sklearn import PermutationImportance



In [2]:
def saveModel(model, filename):
    """Persist ``model`` to ``filename`` by delegating to the model's own ``save`` method.

    Args:
        model: Any object exposing ``save(path)``; in this notebook, the
            compiled Keras network.
        filename: Destination path handed straight to ``model.save``.

    Returns:
        None.
    """
    model.save(filename)

def loadModel(filename):
    """Load a model previously written by ``saveModel``.

    ``saveModel`` persists through the Keras ``model.save`` API, so the file
    has to be read back with the Keras loader. ``joblib.load`` cannot read
    that format.

    Args:
        filename: Path of the saved Keras model (e.g. the ``.h5`` file
            produced by ``saveModel``).

    Returns:
        The restored Keras model.
    """
    # Imported locally from the same package the notebook builds the network
    # with (``from keras import Sequential``), so save and load agree.
    from keras.models import load_model

    return load_model(filename)

In [3]:
# Read the cleaned Kepler table and discard the "Unnamed: 0" column
# (presumably a leftover index column from an earlier to_csv; verify).
df = pd.read_csv("../data/[CLEANED]kepler-data.csv").drop(columns=["Unnamed: 0"])

In [4]:
ALL_COLUMNS = df.columns
# Columns whose name contains "err"; only used if appended to EXCLUDE below.
ERROR_COLUMNS = [col for col in ALL_COLUMNS if "err" in col]
# Columns excluded from the feature matrix (this includes the target,
# koi_disposition). Append ERROR_COLUMNS to also drop the "err" columns.
EXCLUDE = ["rowid", "kepid", "kepoi_name", "koi_score", "koi_disposition", "koi_pdisposition", "koi_tce_delivname", "koi_tce_plnt_num"] #+ ERROR_COLUMNS
# Filter in DataFrame column order rather than through a set difference:
# set iteration order over strings changes between interpreter runs, which
# made the feature-column order (and so the layout of X) non-reproducible.
TO_USE = [col for col in ALL_COLUMNS if col not in EXCLUDE]

In [5]:
# Drop every row whose koi_disposition is "CANDIDATE"; all other rows are kept.
subset_df = df.loc[df["koi_disposition"].ne("CANDIDATE")]

In [6]:
# Feature matrix: the selected columns as a plain NumPy array.
X = subset_df[TO_USE].to_numpy()
# Binary target: 1 where the disposition is "CONFIRMED", 0 for everything else.
y = subset_df["koi_disposition"].eq("CONFIRMED").astype(int).to_numpy()

In [7]:
# Keep the fitted transformers in named variables: the network is trained on
# the standardised, PCA-projected features, so the same fitted scaler and PCA
# are needed to prepare any new data for the saved model.
scaler = StandardScaler()
pca = PCA(n_components=30)
# Standardise every feature, then project onto 30 principal components.
# NOTE(review): both transformers are fitted on all rows, before the
# train/test split and the cross-validation folds are drawn, so held-out rows
# influence the preprocessing (data leakage). Consider fitting on training
# rows only, e.g. via a Pipeline.
X = pca.fit_transform(scaler.fit_transform(X))

In [8]:
# Stratified hold-out split: 33% of the rows go to the test set, with a fixed
# seed so the partition is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=0,
    stratify=y,
)

In [9]:
def createModel():
    """Build and compile the binary classifier network.

    The network is a stack of three ReLU dense layers (256, 128 and 128
    units) followed by a single sigmoid output unit. It is compiled with the
    Adam optimizer, binary cross-entropy loss and accuracy as the tracked
    metric.

    Returns:
        A freshly constructed, compiled (untrained) ``Sequential`` model.
    """
    model = Sequential()
    # Hidden layers, in order.
    for width in (256, 128, 128):
        model.add(Dense(width, activation="relu"))
    # Single sigmoid unit: output is a value in [0, 1].
    model.add(Dense(1, activation="sigmoid"))
    model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
    return model

In [10]:
def performance(test, pred):
    """Print a set of classification metrics for binary predictions.

    Computes and prints the F1 score, Cohen's kappa, balanced accuracy,
    the confusion matrix and the scikit-learn classification report.

    Args:
        test: Ground-truth labels.
        pred: Predicted labels, aligned with ``test``.

    Returns:
        None. All results are written to stdout.
    """
    conf_matrix = confusion_matrix(test, pred)
    f1 = f1_score(test, pred)
    report = classification_report(test, pred)
    balanced_accuracy = balanced_accuracy_score(test, pred)
    kappa = cohen_kappa_score(test, pred)
    print(f"F1 Score: {f1}")
    print(f"Kappa Score: {kappa}")
    # This value comes from balanced_accuracy_score, not plain accuracy, so it
    # is labelled as such. The report below already prints plain accuracy.
    print(f"Balanced Accuracy Score: {balanced_accuracy}")
    print(f"Confusion Matrix:\n{conf_matrix}")
    print(report)

In [11]:
def trainEvaluate(model, fold, X_train, y_train, X_test, y_test):
    """Fit ``model`` on one fold and report its F1 score on the held-out part.

    Args:
        model: Compiled model exposing ``fit`` and ``predict``.
        fold: Fold number, used only in the printed message.
        X_train, y_train: Training features and labels for this fold.
        X_test, y_test: Held-out features and labels for this fold.

    Returns:
        The F1 score of the thresholded predictions on the held-out data.
    """
    model.fit(X_train, y_train, epochs=20, verbose=0)
    # Turn the predicted scores into boolean labels with a 0.5 cut-off.
    predicted_labels = model.predict(X_test) >= 0.5
    f1 = f1_score(y_test, predicted_labels)
    print(f"F1 Score in fold {fold} = {f1}")
    return f1


def crossValidation(K=10):
    """Run K-fold and stratified K-fold cross-validation and print F1 scores.

    Works on the module-level ``X`` and ``y``. For each splitter a brand-new
    model is built and trained per fold; the per-fold F1 scores are printed by
    ``trainEvaluate`` and their mean is printed after each pass.

    Args:
        K: Number of folds for both splitters.

    Returns:
        None. All results are written to stdout.
    """

    def scoreFolds(splitter):
        # One fresh model per fold; folds are numbered from 1 in the output.
        return [
            trainEvaluate(createModel(), fold_no, X[train], y[train], X[test], y[test])
            for fold_no, (train, test) in enumerate(splitter.split(X, y), start=1)
        ]

    # Pass 1: plain shuffled K-fold.
    f1_scores = scoreFolds(KFold(n_splits=K, shuffle=True, random_state=0))
    print(f"Average {K}-Fold F1 Score = {np.mean(f1_scores)}\n")

    # Pass 2: stratified shuffled K-fold.
    f1_scores = scoreFolds(StratifiedKFold(n_splits=K, shuffle=True, random_state=0))
    print(f"Average Stratified {K}-Fold F1 Score = {np.mean(f1_scores)}")

In [12]:
# Run both cross-validation passes (KFold, then StratifiedKFold) with 10 folds
# on the module-level X and y.
crossValidation(K=10)

F1 Score in fold 1 = 0.9662921348314607
F1 Score in fold 2 = 0.9834515366430261
F1 Score in fold 3 = 0.9777777777777777
F1 Score in fold 4 = 0.9776785714285714
F1 Score in fold 5 = 0.9836065573770492
F1 Score in fold 6 = 0.9876543209876544
F1 Score in fold 7 = 0.9834024896265561
F1 Score in fold 8 = 0.9892933618843683
F1 Score in fold 9 = 0.9877049180327868
F1 Score in fold 10 = 0.9819819819819819
Average 10-Fold F1 Score = 0.9818843650571232

F1 Score in fold 1 = 0.9827586206896551
F1 Score in fold 2 = 0.986784140969163
F1 Score in fold 3 = 0.96875
F1 Score in fold 4 = 0.9695652173913043
F1 Score in fold 5 = 0.9823788546255506
F1 Score in fold 6 = 0.982532751091703
F1 Score in fold 7 = 0.9956140350877193
F1 Score in fold 8 = 0.9721627408993576
F1 Score in fold 9 = 0.986784140969163
F1 Score in fold 10 = 0.9823008849557523
Average Stratified 10-Fold F1 Score = 0.9809631386679369


In [13]:
# Train a fresh network on the training split, then turn its scores for the
# test split into boolean labels with a 0.5 cut-off.
model = createModel()
model.fit(X_train, y_train, epochs=20, verbose=0)
pred = model.predict(X_test) >= 0.5

In [14]:
# Print the hold-out metrics for the thresholded test-set predictions.
performance(y_test, pred)

F1 Score: 0.9866844207723036
Kappa Score: 0.9801871235337174
Accuracy Score: 0.9884496388733677
Confusion Matrix:
[[1529    5]
 [  15  741]]
              precision    recall  f1-score   support

           0       0.99      1.00      0.99      1534
           1       0.99      0.98      0.99       756

    accuracy                           0.99      2290
   macro avg       0.99      0.99      0.99      2290
weighted avg       0.99      0.99      0.99      2290



In [15]:
# Persist the network trained on the training split to disk.
saveModel(model, "../model/nn-model-error.h5")

In [16]:
# Wrap the network builder in a scikit-learn compatible estimator and fit it
# on the full preprocessed data set. Note: this rebinds `model`.
model = KerasClassifier(build_fn=createModel, epochs=20, verbose=0)
model.fit(X, y)
# Permutation importance, measured on the same X and y the estimator was
# fitted on.
perm = PermutationImportance(model, random_state=0)
perm.fit(X, y)
# The features are PCA components, so they are labelled by 1-based index.
eli5.show_weights(
    perm,
    feature_names=[f"Component {i+1}" for i in range(X.shape[1])],
)

Weight,Feature
0.1000  ± 0.0040,Component 4
0.0714  ± 0.0066,Component 8
0.0410  ± 0.0031,Component 1
0.0283  ± 0.0043,Component 9
0.0283  ± 0.0014,Component 17
0.0271  ± 0.0018,Component 6
0.0260  ± 0.0019,Component 14
0.0245  ± 0.0026,Component 2
0.0117  ± 0.0011,Component 23
0.0106  ± 0.0020,Component 10
