<a href="https://colab.research.google.com/github/aditeyabaral/kepler-exoplanet-analysis/blob/master/src/Prediction_ML.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importing Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, balanced_accuracy_score, classification_report, cohen_kappa_score, f1_score
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier

# Loading Dataframe

In [2]:
df = pd.read_csv("../data/[CLEANED]kepler-data.csv")
df.drop(columns = ["Unnamed: 0"], inplace=True)
print(df.shape)
df.head()

(9110, 47)


Unnamed: 0,rowid,kepid,kepoi_name,koi_disposition,koi_pdisposition,koi_score,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,1,10797460,K00752.01,CONFIRMED,CANDIDATE,1.0,0,0,0,0,...,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,2,10797460,K00752.02,CONFIRMED,CANDIDATE,0.969,0,0,0,0,...,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
2,3,10811496,K00753.01,FALSE POSITIVE,FALSE POSITIVE,0.0,0,1,0,0,...,-176.0,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
3,4,10848459,K00754.01,FALSE POSITIVE,FALSE POSITIVE,0.0,0,1,0,0,...,-174.0,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
4,5,10854555,K00755.01,CONFIRMED,CANDIDATE,1.0,0,0,0,0,...,-211.0,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509


# Selecting Columns for Prediction

In [3]:
ALL_COLUMNS = df.columns
ERROR_COLUMNS = [col for col in ALL_COLUMNS if "err" in col]
EXCLUDE = ["rowid", "kepid", "kepoi_name", "koi_score", "koi_disposition", "koi_pdisposition", "koi_tce_delivname", "koi_tce_plnt_num"] + ERROR_COLUMNS
TO_USE = list(set(ALL_COLUMNS) - set(EXCLUDE))

In [4]:
print(len(TO_USE))
df[TO_USE].head()

19


Unnamed: 0,koi_slogg,koi_period,ra,koi_prad,koi_model_snr,koi_fpflag_ec,koi_srad,koi_teq,koi_impact,koi_fpflag_co,koi_insol,koi_fpflag_ss,koi_steff,dec,koi_fpflag_nt,koi_time0bk,koi_depth,koi_duration,koi_kepmag
0,4.467,9.488036,291.93423,2.26,35.8,0,0.927,793.0,0.146,0,93.59,0,5455.0,48.141651,0,170.53875,615.8,2.9575,15.347
1,4.467,54.418383,291.93423,2.83,25.8,0,0.927,443.0,0.586,0,9.11,0,5455.0,48.141651,0,162.51384,874.8,4.507,15.347
2,4.544,19.89914,297.00482,14.6,76.3,0,0.868,638.0,0.969,0,39.3,1,5853.0,48.134129,0,175.850252,10829.0,1.7822,15.436
3,4.564,1.736952,285.53461,33.46,505.6,0,0.791,1395.0,1.276,0,891.96,1,5805.0,48.28521,0,170.307565,8079.2,2.40641,15.597
4,4.438,2.525592,288.75488,2.75,40.9,0,1.046,1406.0,0.701,0,926.16,0,6031.0,48.2262,0,171.59555,603.3,1.6545,15.509


# Extracting Dataframe Subset

We extract only those rows which are either positive or negative examples i.e. the `disposition value` is *not* `CANDIDATE`

In [5]:
subset_df = df[df["koi_disposition"] != "CANDIDATE"]
print(subset_df.shape)
subset_df.head()

(6939, 47)


Unnamed: 0,rowid,kepid,kepoi_name,koi_disposition,koi_pdisposition,koi_score,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,1,10797460,K00752.01,CONFIRMED,CANDIDATE,1.0,0,0,0,0,...,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,2,10797460,K00752.02,CONFIRMED,CANDIDATE,0.969,0,0,0,0,...,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
2,3,10811496,K00753.01,FALSE POSITIVE,FALSE POSITIVE,0.0,0,1,0,0,...,-176.0,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
3,4,10848459,K00754.01,FALSE POSITIVE,FALSE POSITIVE,0.0,0,1,0,0,...,-174.0,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
4,5,10854555,K00755.01,CONFIRMED,CANDIDATE,1.0,0,0,0,0,...,-211.0,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509


# Obtaining X and y

We also need to encode the categorical labels
```
CONFIRMED - 1
FALSE POSITIVE - 0
```

In [6]:
X = subset_df[TO_USE].values
y = subset_df["koi_disposition"].apply(lambda x: x=='CONFIRMED').astype(int).values

In [7]:
subset_df["koi_disposition"].value_counts()

FALSE POSITIVE    4647
CONFIRMED         2292
Name: koi_disposition, dtype: int64

# PCA Visualisation

In [None]:
x = StandardScaler().fit_transform(X)
pca = PCA(n_components=2)
principalComponents = pca.fit_transform(x)
principalDf = pd.DataFrame(data = principalComponents, columns = ['principal component 1', 'principal component 2'])
principalDf["TARGET"] = y
finalDf = principalDf
fig = plt.figure(figsize = (8,8))
ax = fig.add_subplot(1,1,1) 
ax.set_xlabel('Principal Component 1', fontsize = 15)
ax.set_ylabel('Principal Component 2', fontsize = 15)
ax.set_title('2 component PCA', fontsize = 20)
targets = [1, 0]
# print(finalDf)

colors = ['r', 'g', 'b']
for target, color in zip(targets,colors):
    indicesToKeep = finalDf['TARGET'] == target
    ax.scatter(finalDf.loc[indicesToKeep, 'principal component 1']
               , finalDf.loc[indicesToKeep, 'principal component 2']
               , c = color
               , s = 50)
ax.legend(targets)
ax.grid()
plt.show()

# Splitting into Train and Test sets

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)

In [9]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((4649, 19), (2290, 19), (4649,), (2290,))

# Decision Tree

In [None]:
# baseline model
classifier = DecisionTreeClassifier(random_state=0)
classifier.fit(X_train, y_train)

In [None]:
pred = classifier.predict(X_test)
conf_matrix = confusion_matrix(y_test, pred)
f1 = f1_score(y_test, pred)
report = classification_report(y_test, pred)
accuracy = balanced_accuracy_score(y_test, pred)
kappa = cohen_kappa_score(y_test, pred)

In [None]:
conf_matrix

In [None]:
print(report)

In [None]:
print(f"F1 Score: {f1}")
print(f"Kappa Score: {kappa}")
print(f"Accuracy Score: {accuracy}")

In [None]:
#tree.plot_tree(classifier, filled=True)

## Hyperparameter Tuning

In [10]:
parameters = {
    "criterion":["gini", "entropy"],
    "splitter":["best", "random"],
    "max_depth":[None] + np.linspace(1, 32, 32, endpoint=True).tolist(),
    "min_samples_split":np.linspace(0.1, 10, 100, endpoint=True),
    "min_samples_leaf":np.linspace(0.1, 1.0, 10, endpoint=True),
    "max_features":[None] + list(range(1,X_train.shape[1])),
    "min_weight_fraction_leaf":np.linspace(0.1, 1.0, 10, endpoint=True),
    "random_state":[0],
    "min_impurity_decrease":np.linspace(0, 10, 110, endpoint=True),
    "class_weight":[None, "balanced"],
    "max_leaf_nodes":[None] + np.linspace(1, 100, 100, endpoint=True).tolist(),
    "ccp_alpha":np.linspace(0, 10, 11, endpoint=True)
}

scores = ["f1", "balanced_accuracy"]

In [12]:
classifier = GridSearchCV(
    DecisionTreeClassifier(),
    param_grid=parameters,
    scoring=scores,
    return_train_score=True,
    refit=False,
    verbose=2,
    n_jobs=-1
)

classifier.fit(X_train, y_train)

MemoryError: 