In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
import pandas as pd
import numpy as np
import os

In [None]:
# Read cumulative.csv from the Resources folder into a DataFrame.
data_file = os.path.join('Resources', 'cumulative.csv')
exoplanets = pd.read_csv(data_file)

# Remove the cap on displayed columns so wide frames render in full.
pd.set_option('display.max_columns', None)
exoplanets

At least for now: keeping all columns in case of later interest, but removing derived columns from predictions

Evaluations for comparison: koi_pdisposition (use only rows where value is FALSE POSITIVE or CANDIDATE, since others (mostly?) have incomplete data); koi_score

ID to use: kepoi_name
ID columns to ignore: kepid, kepler_name, rowid
Evaluations to ignore (for now): koi_disposition, koi_vet_stat, koi_vet_date, koi_disp_prov, koi_comment, koi_parm_prov, koi_time0 (same info as koi_time0bk), koi_fittype (describes analytical methods - might be related to other variables, but secondary), koi_limbdark_mod (describes analytical method)
koi_ldm_coeff1, koi_ldm_coeff2, koi_ldm_coeff3, koi_ldm_coeff4 (fitted parameters...)
koi_tce_plnt_num, koi_tce_delivname, koi_quarters
koi_trans_mod, koi_model_dof
koi_datalink_dvr, koi_datalink_dvs	
koi_ror, koi_prad ... are calculated, but kept in for now

In [None]:
# List the distinct koi_pdisposition labels. Any row whose label is not
# FALSE POSITIVE or CANDIDATE would have to be removed (koi_disposition has
# additional categories).
exoplanets["koi_pdisposition"].unique()
# None found in current file

In [None]:
# One-hot encode koi_pdisposition so it can be compared numerically with
# koi_disposition (checked: the two are not the same).
exoplanets_pdisp_cat = pd.get_dummies(
    exoplanets,
    columns=['koi_pdisposition'],
    prefix=['koi_pdisposition'],
)
exoplanets_pdisp_cat

In [None]:
# One-hot encode koi_disposition as well, then discard the
# koi_pdisposition_FALSE POSITIVE indicator column.
exoplanets_disp_cat = pd.get_dummies(
    exoplanets_pdisp_cat,
    columns=['koi_disposition'],
    prefix=['koi_disposition'],
).drop(columns=['koi_pdisposition_FALSE POSITIVE'])

exoplanets_disp_cat

**Remember to drop koi_pdisposition_CANDIDATE**
For a basic set: X = exoplanets_disp_cat[["koi_fpflag_nt", "koi_fpflag_ss", "koi_fpflag_co", "koi_fpflag_ec", "koi_period", "koi_time0bk", "koi_impact","koi_duration", "koi_depth", "koi_prad", "koi_teq", "koi_insol", "koi_model_snr", "koi_steff", "koi_slogg", "koi_srad", "ra", "dec", "koi_kepmag"]]

For basic set, drop ("rowid", "kepid", "kepoi_name", "kepler_name", "koi_score", "koi_period_err1", "koi_period_err2", "koi_time0bk_err1", "koi_time0bk_err2", "koi_impact_err1", "koi_impact_err2", "koi_duration_err1", "koi_duration_err2", "koi_depth_err1", "koi_depth_err2", "koi_prad_err1", "koi_prad_err2", "koi_teq_err1", "koi_teq_err2", "koi_insol_err1", "koi_insol_err2", "koi_tce_plnt_num", "koi_tce_delivname","koi_steff_err1", "koi_steff_err2", "koi_slogg_err1", "koi_slogg_err2", "koi_srad_err1", "koi_srad_err2", "koi_disposition_CANDIDATE", "koi_disposition_CONFIRMED", "koi_disposition_FALSE POSITIVE")

y = exoplanets_disp_cat[["koi_pdisposition_CANDIDATE"]]

In [None]:
# Build the "basic" modelling table from exoplanets_disp_cat by removing the
# identifier columns, koi_score, the *_err1/*_err2 columns, the TCE
# bookkeeping columns and the koi_disposition dummies.
# "kepid" is dropped too: the notebook's notes list it among the ID columns to
# ignore, and leaving it in would feed an identifier to the model as a feature.
columns_to_drop = [
    "rowid", "kepid", "kepoi_name", "kepler_name", "koi_score",
    "koi_period_err1", "koi_period_err2",
    "koi_time0bk_err1", "koi_time0bk_err2",
    "koi_impact_err1", "koi_impact_err2",
    "koi_duration_err1", "koi_duration_err2",
    "koi_depth_err1", "koi_depth_err2",
    "koi_prad_err1", "koi_prad_err2",
    "koi_teq_err1", "koi_teq_err2",
    "koi_insol_err1", "koi_insol_err2",
    "koi_tce_plnt_num", "koi_tce_delivname",
    "koi_steff_err1", "koi_steff_err2",
    "koi_slogg_err1", "koi_slogg_err2",
    "koi_srad_err1", "koi_srad_err2",
    "koi_disposition_CANDIDATE", "koi_disposition_CONFIRMED",
    "koi_disposition_FALSE POSITIVE",
]

# dropna returns a new frame rather than modifying in place, so its result
# must be assigned; otherwise rows with missing values stay in the table and
# scikit-learn estimators reject them ("Input contains NaN"), see
# https://stackoverflow.com/questions/31323499/sklearn-error-valueerror-input-contains-nan-infinity-or-a-value-too-large-for
exoplanets_basic = (
    exoplanets_disp_cat
    .drop(columns=columns_to_drop)
    .dropna(axis=0)
)




In [None]:
# True if any cell of exoplanets_basic is NaN.
np.isnan(exoplanets_basic).to_numpy().any()

In [None]:
# True only if every cell of exoplanets_basic is finite (no NaN or +/-inf).
np.isfinite(exoplanets_basic).to_numpy().all()

In [None]:
# Display exoplanets_basic for a visual check before splitting it into X and y.
exoplanets_basic

In [None]:
# Class labels and the target vector (the koi_pdisposition_CANDIDATE column).
target_names = ["negative", "positive"]
y = exoplanets_basic.loc[:, "koi_pdisposition_CANDIDATE"]

In [None]:
# Feature matrix: every column of exoplanets_basic except the target column.
X = exoplanets_basic.drop(columns=["koi_pdisposition_CANDIDATE"])
X

## KNN

In [None]:
from sklearn.model_selection import train_test_split

# Split features and target into training and testing parts; the fixed
# random_state keeps the split reproducible between runs.
split_parts = train_test_split(X, y, random_state=312)
X_train, X_test, y_train, y_test = split_parts

In [None]:
from sklearn.preprocessing import StandardScaler

# Create a StandardScaler and fit it on the training features only.
X_scaler = StandardScaler()
X_scaler.fit(X_train)


In [None]:
# Apply the scaler fitted on the training split to both feature splits.
X_train_scaled, X_test_scaled = (
    X_scaler.transform(features) for features in (X_train, X_test)
)
X_test_scaled

In [None]:
# Loop through different k values to see which has the highest accuracy.
# Only odd values are tried so that a two-class neighbour vote cannot tie.
k_values = range(1, 20, 2)
train_scores = []
test_scores = []
for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train_scaled, y_train)
    train_score = knn.score(X_train_scaled, y_train)
    test_score = knn.score(X_test_scaled, y_test)
    train_scores.append(train_score)
    test_scores.append(test_score)
    print(f"k: {k}, Train/Test Score: {train_score:.3f}/{test_score:.3f}")

# Both curves share one set of axes, so each gets a legend entry and the
# y-axis carries a label that applies to training and testing accuracy alike.
plt.plot(k_values, train_scores, marker='o', label="Training accuracy")
plt.plot(k_values, test_scores, marker="x", label="Testing accuracy")
plt.xlabel("k neighbors")
plt.ylabel("Accuracy score")
plt.legend()
plt.show()

In [None]:
# k = 13 seems to be the best choice for this dataset, so refit with that
# value and report the accuracy on the held-out test split.
knn = KNeighborsClassifier(n_neighbors=13).fit(X_train_scaled, y_train)
test_accuracy = knn.score(X_test_scaled, y_test)
print('k=13 Test Acc: %.3f' % test_accuracy)