# KOI Model Training
*By Adam Zheng*
> Github Repository: https://github.com/adz888/Exoplanet-Prediction-Model

### Read in KOI Data

In [42]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
%matplotlib inline

koi = pd.read_csv('model_training/kepler_objects_of_interest.csv')
koi.drop(columns = ['koi_fpflag_nt', 'koi_fpflag_ss', 'koi_fpflag_co', 'koi_fpflag_ec'], inplace=True)
koi.head()

Unnamed: 0,kepid,kepoi_name,koi_disposition,koi_period,koi_time0bk,koi_impact,koi_duration,koi_depth,koi_prad,koi_teq,koi_model_snr,koi_steff,koi_srad,ra,dec,koi_kepmag
0,10797460,K00752.01,CONFIRMED,9.488036,170.53875,0.146,2.9575,615.8,2.26,793.0,35.8,5455.0,0.927,291.93423,48.141651,15.347
1,10797460,K00752.02,CONFIRMED,54.418383,162.51384,0.586,4.507,874.8,2.83,443.0,25.8,5455.0,0.927,291.93423,48.141651,15.347
2,10811496,K00753.01,CANDIDATE,19.89914,175.850252,0.969,1.7822,10829.0,14.6,638.0,76.3,5853.0,0.868,297.00482,48.134129,15.436
3,10848459,K00754.01,FALSE POSITIVE,1.736952,170.307565,1.276,2.40641,8079.2,33.46,1395.0,505.6,5805.0,0.791,285.53461,48.28521,15.597
4,10854555,K00755.01,CONFIRMED,2.525592,171.59555,0.701,1.6545,603.3,2.75,1406.0,40.9,6031.0,1.046,288.75488,48.2262,15.509


#### All objects with null data values and some irrelevant columns were removed prior to downloading from website as csv (as a result, this data set contains 9200 out of the 9564 total objects stored in the Cumulative KOI Dataset)
##### Source: https://exoplanetarchive.ipac.caltech.edu/cgi-bin/TblView/nph-tblView?app=ExoTbls&config=cumulative

In [43]:
koi.isnull().sum() 

kepid              0
kepoi_name         0
koi_disposition    0
koi_period         0
koi_time0bk        0
koi_impact         0
koi_duration       0
koi_depth          0
koi_prad           0
koi_teq            0
koi_model_snr      0
koi_steff          0
koi_srad           0
ra                 0
dec                0
koi_kepmag         0
dtype: int64

#### Saving a dataframe of KOI candidates with only the columns 'kepid' and 'kepoi_name'

In [44]:
koi_candidate_ids = koi.loc[koi['koi_disposition'] == 'CANDIDATE'].copy()[['kepid', 'kepoi_name']]
koi_candidate_ids.reset_index(inplace=True)
koi_candidate_ids.drop(columns = ['index'], inplace=True)

#### Dropping unnecessary id/name columns

In [45]:
koi.drop(columns = ['kepid', 'kepoi_name'], inplace=True)

#### Separating out the KOI Candidates

In [46]:
koi_candidates = koi.loc[koi['koi_disposition'] == 'CANDIDATE'].copy()

In [47]:
koi_candidates.drop(columns = ['koi_disposition'], inplace=True)

In [48]:
koi_features = koi.loc[koi['koi_disposition'] != 'CANDIDATE'].copy()

#### Separating out labels & features

In [49]:
koi_labels = koi_features['koi_disposition'].to_frame()

In [50]:
koi_features.drop(columns = ['koi_disposition'], inplace=True)

#### Splitting data into train, validation, and test set

In [51]:
from sklearn.model_selection import train_test_split

In [52]:
train_features, test_features, train_labels, test_labels = train_test_split(koi_features, koi_labels, test_size=0.4, random_state=25)
eval_features, test_features, eval_labels, test_labels = train_test_split(test_features, test_labels, test_size=0.5, random_state=25)

#### Prior to training the models, my predictions as to which models will perform best overall are as follows:
##### From best to worst. . .
1. Multilayer Perceptron
2. Gradient Boosted Trees 
3. Random Forest
4. Logistic Regression

## Logistic Regression

In [13]:
import joblib
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.exceptions import ConvergenceWarning
import warnings
import time

# Ignores certain warnings
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=DeprecationWarning)
warnings.filterwarnings('ignore', category=ConvergenceWarning)

In [14]:
def print_results(results): # same function used for the training of all 5 models
    print(f'BEST PARAMS: {results.best_params_}\n')

    means = results.cv_results_['mean_test_score']
    stds = results.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, results.cv_results_['params']):
        print(f'{mean:.3f} (+/-{std*2:.3f}) for {params}')

In [154]:
lr = LogisticRegression()
parameters = {
    'C': [0.1, 1, 10, 100],
    'max_iter': [100, 10000]
}

start_time = time.time()
cv = GridSearchCV(lr, parameters, cv=5)
cv.fit(train_features, train_labels.values.ravel())

print_results(cv)
print(f'--- {time.time() - start_time} seconds ---')

BEST PARAMS: {'C': 0.1, 'max_iter': 10000}

0.840 (+/-0.020) for {'C': 0.1, 'max_iter': 100}
0.843 (+/-0.029) for {'C': 0.1, 'max_iter': 10000}
0.840 (+/-0.025) for {'C': 1, 'max_iter': 100}
0.843 (+/-0.028) for {'C': 1, 'max_iter': 10000}
0.842 (+/-0.025) for {'C': 10, 'max_iter': 100}
0.840 (+/-0.028) for {'C': 10, 'max_iter': 10000}
0.842 (+/-0.022) for {'C': 100, 'max_iter': 100}
0.841 (+/-0.026) for {'C': 100, 'max_iter': 10000}
--- 11.654513120651245 seconds ---


In [39]:
joblib.dump(cv.best_estimator_, 'model_training/LR_koi_model.pkl')

['model_training/LR_koi_model.pkl']

## Multilayer Perceptron

In [156]:
import joblib
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.exceptions import ConvergenceWarning
import warnings
import time

# Ignores certain warnings
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=DeprecationWarning)
# warnings.filterwarnings('ignore', category=ConvergenceWarning)

In [157]:
mlp = MLPClassifier()
parameters = {
    'hidden_layer_sizes': [(50,), (100,)],
    'activation': ['relu', 'tanh', 'logistic'], #['relu', 'tanh', 'logistic'],
    'learning_rate': ['constant', 'invscaling', 'adaptive'] #['constant' = 34 sec, 'invscaling' = 30 sec, 'adaptive' = 36 sec]
}

start_time = time.time()
cv = GridSearchCV(mlp, parameters, cv=5)
cv.fit(train_features, train_labels.values.ravel())

print_results(cv)
print(f'--- {time.time() - start_time} seconds ---')

BEST PARAMS: {'activation': 'logistic', 'hidden_layer_sizes': (50,), 'learning_rate': 'adaptive'}

0.840 (+/-0.041) for {'activation': 'relu', 'hidden_layer_sizes': (50,), 'learning_rate': 'constant'}
0.807 (+/-0.116) for {'activation': 'relu', 'hidden_layer_sizes': (50,), 'learning_rate': 'invscaling'}
0.807 (+/-0.057) for {'activation': 'relu', 'hidden_layer_sizes': (50,), 'learning_rate': 'adaptive'}
0.797 (+/-0.091) for {'activation': 'relu', 'hidden_layer_sizes': (100,), 'learning_rate': 'constant'}
0.818 (+/-0.060) for {'activation': 'relu', 'hidden_layer_sizes': (100,), 'learning_rate': 'invscaling'}
0.774 (+/-0.087) for {'activation': 'relu', 'hidden_layer_sizes': (100,), 'learning_rate': 'adaptive'}
0.805 (+/-0.032) for {'activation': 'tanh', 'hidden_layer_sizes': (50,), 'learning_rate': 'constant'}
0.788 (+/-0.034) for {'activation': 'tanh', 'hidden_layer_sizes': (50,), 'learning_rate': 'invscaling'}
0.801 (+/-0.031) for {'activation': 'tanh', 'hidden_layer_sizes': (50,), 'le

In [40]:
joblib.dump(cv.best_estimator_, 'model_training/MLP_koi_model.pkl')

['model_training/MLP_koi_model.pkl']

## Random Forest

In [159]:
import joblib
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.exceptions import ConvergenceWarning
import warnings
import time

# Ignores certain warnings
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=DeprecationWarning)
# warnings.filterwarnings('ignore', category=ConvergenceWarning)

In [160]:
rf = RandomForestClassifier()
parameters = {
    'n_estimators': [5, 50, 250],
    'max_depth': [2, 4, 8, 16, 32, None] # None will just not limit the depth
}

start_time = time.time()
cv = GridSearchCV(rf, parameters, cv=5)
cv.fit(train_features, train_labels.values.ravel())

print_results(cv)
print(f'--- {time.time() - start_time} seconds ---')

BEST PARAMS: {'max_depth': 32, 'n_estimators': 250}

0.834 (+/-0.048) for {'max_depth': 2, 'n_estimators': 5}
0.849 (+/-0.042) for {'max_depth': 2, 'n_estimators': 50}
0.858 (+/-0.060) for {'max_depth': 2, 'n_estimators': 250}
0.858 (+/-0.037) for {'max_depth': 4, 'n_estimators': 5}
0.883 (+/-0.025) for {'max_depth': 4, 'n_estimators': 50}
0.890 (+/-0.017) for {'max_depth': 4, 'n_estimators': 250}
0.897 (+/-0.019) for {'max_depth': 8, 'n_estimators': 5}
0.910 (+/-0.023) for {'max_depth': 8, 'n_estimators': 50}
0.911 (+/-0.021) for {'max_depth': 8, 'n_estimators': 250}
0.903 (+/-0.031) for {'max_depth': 16, 'n_estimators': 5}
0.919 (+/-0.023) for {'max_depth': 16, 'n_estimators': 50}
0.922 (+/-0.024) for {'max_depth': 16, 'n_estimators': 250}
0.901 (+/-0.024) for {'max_depth': 32, 'n_estimators': 5}
0.917 (+/-0.018) for {'max_depth': 32, 'n_estimators': 50}
0.923 (+/-0.021) for {'max_depth': 32, 'n_estimators': 250}
0.900 (+/-0.032) for {'max_depth': None, 'n_estimators': 5}
0.919 (+/-0

In [38]:
joblib.dump(cv.best_estimator_, 'model_training/RF_koi_model.pkl')

['model_training/RF_koi_model.pkl']

## Gradient Boosted Trees

In [15]:
import joblib
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.exceptions import ConvergenceWarning
import warnings
import time

# Ignores certain warnings
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=DeprecationWarning)
# warnings.filterwarnings('ignore', category=ConvergenceWarning)

In [16]:
gb = GradientBoostingClassifier()
parameters = {
    'n_estimators': [100, 500, 1000],
    'max_depth': [5, 7, 9],
    'learning_rate': [0.01, 0.1, 1]
}

start_time = time.time()
cv = GridSearchCV(gb, parameters, cv=5)
cv.fit(train_features, train_labels.values.ravel())

print_results(cv)
print(f'--- {time.time() - start_time} seconds ---')

BEST PARAMS: {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 1000}

0.899 (+/-0.018) for {'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 100}
0.925 (+/-0.017) for {'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 500}
0.925 (+/-0.020) for {'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 1000}
0.908 (+/-0.025) for {'learning_rate': 0.01, 'max_depth': 7, 'n_estimators': 100}
0.920 (+/-0.023) for {'learning_rate': 0.01, 'max_depth': 7, 'n_estimators': 500}
0.926 (+/-0.020) for {'learning_rate': 0.01, 'max_depth': 7, 'n_estimators': 1000}
0.905 (+/-0.024) for {'learning_rate': 0.01, 'max_depth': 9, 'n_estimators': 100}
0.918 (+/-0.019) for {'learning_rate': 0.01, 'max_depth': 9, 'n_estimators': 500}
0.926 (+/-0.018) for {'learning_rate': 0.01, 'max_depth': 9, 'n_estimators': 1000}
0.926 (+/-0.025) for {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100}
0.929 (+/-0.023) for {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 500}
0.928 (+/-0.025) for {

In [37]:
joblib.dump(cv.best_estimator_, 'model_training/GB_koi_model.pkl')

['model_training/GB_koi_model.pkl']

# Model Evaluation

In [18]:
import joblib
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score
from time import time

### Read in models from files

In [19]:
models = {}

for mdl in ['LR', 'MLP', 'RF', 'GB']:
    models[mdl] = joblib.load(f'{mdl}_koi_model.pkl')

### Evalatuating models based on accuracy, precision, recall, and predict time

In [34]:
def evaluate_model(name, model, features, labels):
    start = time()
    pred = model.predict(features)
    end = time()
    accuracy = round(accuracy_score(labels, pred), 4)
    precision = round(precision_score(labels, pred, pos_label = "CONFIRMED"), 4)
    recall = round(recall_score(labels, pred, pos_label = "CONFIRMED"), 4)
    print(f'{name} -- Accuracy: {accuracy} / Precision: {precision} / Recall: {recall} / Predict Time: {(end - start)*1000:.3f} ms')

In [35]:
print("FOR REFERENCE:")
print("Accuracy = proportion of objects that were classified correctly")
print("Precision = proportion of predicted exoplanets that were actually confirmed exoplanets")
print("Recall = proportion of confirmed exoplanets that were predicted as exoplanets\n")
print("EVALUATION RESULTS:")
for name, mdl in models.items():
    evaluate_model(name, mdl, eval_features, eval_labels)

FOR REFERENCE:
Accuracy = proportion of objects that were classified correctly
Precision = proportion of predicted exoplanets that were actually confirmed exoplanets
Recall = proportion of confirmed exoplanets that were predicted as exoplanets

EVALUATION RESULTS:
LR -- Accuracy: 0.8393 / Precision: 0.7483 / Recall: 0.8406 / Predict Time: 4.695 ms
MLP -- Accuracy: 0.8759 / Precision: 0.7946 / Recall: 0.888 / Predict Time: 4.000 ms
RF -- Accuracy: 0.9166 / Precision: 0.8845 / Recall: 0.8861 / Predict Time: 54.021 ms
GB -- Accuracy: 0.9276 / Precision: 0.8879 / Recall: 0.9165 / Predict Time: 47.967 ms


### Final Performance Ranking

1. Gradient Boosted Trees - 1st in accuracy, 1st in precision, 1st in recall, 3rd in predict time
2. Random Forest - 2nd in accuracy, 2nd in precision, 3rd in recall, 4th in predict time
3. Multilayer Perceptron - 3rd in accuracy, 3rd in precision, 3rd in recall, 2nd in predict time
4. Logistic Regression - 4th in accuracy, 4th in precision, 4th in recall, 1st in predict time

### Evaluating best model on test set

In [36]:
evaluate_model('GB', models['GB'], test_features, test_labels)

GB -- Accuracy: 0.9152 / Precision: 0.8867 / Recall: 0.8883 / Predict Time: 49.531 ms


# Conclusion
#### The gradient-boosted trees model was our best model. When tested on data it had never encountered in training. . .
- It correctly classified 91.52% of all objects
- Of the objects it classified as exoplanets, 88.67% were actually confirmed exoplanets
- It correctly classified 88.83% of the confirmed exoplanets as being exoplanets

# Predictions for KOI candidates using our best model

In [30]:
GB_model = joblib.load('model_training/GB_koi_model.pkl')
pred = GB_model.predict(koi_candidates)
print(f'Number of KOI candidates predicted to be exoplanets: {np.count_nonzero(pred == "CONFIRMED")}')
print(f'Number of KOI candidates predicted to be false positives: {np.count_nonzero(pred == "FALSE POSITIVE")}')
predictions = pd.DataFrame(pred, columns = ['Exoplanet Prediction'])
koi_candidate_ids.rename(columns={"kepid": "Kepler ID", "kepoi_name": "KOI Name"}, inplace=True)
candidate_predictions = pd.concat([koi_candidate_ids, predictions], axis=1)
candidate_predictions.to_csv('koi_candidate_predictions.csv', index=False)
candidate_predictions.to_excel('koi_candidate_predictions.xlsx', sheet_name='PREDICTIONS', index=False)
print("\nThe predictions have been exported as a: \n- koi_candidate_predictions.csv \n- koi_candidate_predictions.xlsx")

Number of KOI candidates predicted to be exoplanets: 584
Number of KOI candidates predicted to be false positives: 1366

The predictions have been exported as a: 
- koi_candidate_predictions.csv 
- koi_candidate_predictions.xlsx
