In [2]:
#import libs

import numpy as np
import pandas as pd
from sklearn.model_selection import (train_test_split,cross_val_score)
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (accuracy_score,classification_report,confusion_matrix)
from sklearn.preprocessing import StandardScaler
import joblib

In [3]:
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)

In [4]:
#data ingestion
df=pd.read_csv("https://raw.githubusercontent.com/abhi6174/ExoSeek/3d2c305aa9b9ca3536a0d10ed5e373167ae99182/dataset/kepler_exoplanet_detection.csv")
df.head()
df.info

<bound method DataFrame.info of       rowid     kepid kepoi_name   kepler_name koi_disposition  \
0         1  10797460  K00752.01  Kepler-227 b       CONFIRMED   
1         2  10797460  K00752.02  Kepler-227 c       CONFIRMED   
2         3  10811496  K00753.01           NaN  FALSE POSITIVE   
3         4  10848459  K00754.01           NaN  FALSE POSITIVE   
4         5  10854555  K00755.01  Kepler-664 b       CONFIRMED   
...     ...       ...        ...           ...             ...   
9559   9560  10031643  K07984.01           NaN  FALSE POSITIVE   
9560   9561  10090151  K07985.01           NaN  FALSE POSITIVE   
9561   9562  10128825  K07986.01           NaN       CANDIDATE   
9562   9563  10147276  K07987.01           NaN  FALSE POSITIVE   
9563   9564  10156110  K07989.01           NaN  FALSE POSITIVE   

     koi_pdisposition  koi_score  koi_fpflag_nt  koi_fpflag_ss  koi_fpflag_co  \
0           CANDIDATE      1.000              0              0              0   
1           C

In [5]:
#data preprocessing
df=df.drop(columns=['rowid','kepid','kepoi_name','kepler_name','koi_tce_delivname','koi_pdisposition','koi_score'])

threshold = 0.5  # remove columns with more than 90% missing
df = df.loc[:, df.isnull().mean() < threshold]

df = df.dropna()

# remove candidate part
df=df[df['koi_disposition']!= 'CANDIDATE']

# convert koi_diposition labels for binary classification
df['koi_disposition'] = df['koi_disposition'].map({'CONFIRMED':1,'FALSE POSITIVE':0})



df.head()

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,koi_time0bk_err2,koi_impact,koi_impact_err1,koi_impact_err2,koi_duration,koi_duration_err1,koi_duration_err2,koi_depth,koi_depth_err1,koi_depth_err2,koi_prad,koi_prad_err1,koi_prad_err2,koi_teq,koi_insol,koi_insol_err1,koi_insol_err2,koi_model_snr,koi_tce_plnt_num,koi_steff,koi_steff_err1,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,1,0,0,0,0,9.488036,2.775e-05,-2.775e-05,170.53875,0.00216,-0.00216,0.146,0.318,-0.146,2.9575,0.0819,-0.0819,615.8,19.5,-19.5,2.26,0.26,-0.15,793.0,93.59,29.45,-16.65,35.8,1.0,5455.0,81.0,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,1,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,-0.00352,0.586,0.059,-0.443,4.507,0.116,-0.116,874.8,35.5,-35.5,2.83,0.32,-0.19,443.0,9.11,2.87,-1.62,25.8,2.0,5455.0,81.0,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
2,0,0,1,0,0,19.89914,1.494e-05,-1.494e-05,175.850252,0.000581,-0.000581,0.969,5.126,-0.077,1.7822,0.0341,-0.0341,10829.0,171.0,-171.0,14.6,3.92,-1.31,638.0,39.3,31.04,-10.49,76.3,1.0,5853.0,158.0,-176.0,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
3,0,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,-0.000115,1.276,0.115,-0.092,2.40641,0.00537,-0.00537,8079.2,12.8,-12.8,33.46,8.5,-2.83,1395.0,891.96,668.95,-230.35,505.6,1.0,5805.0,157.0,-174.0,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
4,1,0,0,0,0,2.525592,3.761e-06,-3.761e-06,171.59555,0.00113,-0.00113,0.701,0.235,-0.478,1.6545,0.042,-0.042,603.3,16.9,-16.9,2.75,0.88,-0.35,1406.0,926.16,874.33,-314.24,40.9,1.0,6031.0,169.0,-211.0,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509


In [6]:
df.describe()

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,koi_time0bk_err2,koi_impact,koi_impact_err1,koi_impact_err2,koi_duration,koi_duration_err1,koi_duration_err2,koi_depth,koi_depth_err1,koi_depth_err2,koi_prad,koi_prad_err1,koi_prad_err2,koi_teq,koi_insol,koi_insol_err1,koi_insol_err2,koi_model_snr,koi_tce_plnt_num,koi_steff,koi_steff_err1,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
count,6630.0,6630.0,6630.0,6630.0,6630.0,6630.0,6630.0,6630.0,6630.0,6630.0,6630.0,6630.0,6630.0,6630.0,6630.0,6630.0,6630.0,6630.0,6630.0,6630.0,6630.0,6630.0,6630.0,6630.0,6630.0,6630.0,6630.0,6630.0,6630.0,6630.0,6630.0,6630.0,6630.0,6630.0,6630.0,6630.0,6630.0,6630.0,6630.0,6630.0,6630.0
mean,0.342685,0.20724,0.313273,0.265309,0.165008,52.741045,0.001724292,-0.001724292,162.96264,0.008086,-0.008086,0.77531,2.087718,-0.320919,5.767242,0.291021,-0.291021,30470.29,145.654525,-145.654525,131.908041,21.418834,-43.607919,1153.382051,7882.761,3768.8,-3923.682,339.31368,1.228356,5717.105732,145.652338,-163.943741,4.303693,0.122494,-0.139513,1.747915,0.361208,-0.397904,292.166046,43.795307,14.248079
std,0.474643,0.405359,0.463859,0.441531,0.371215,116.11676,0.007239338,0.007239338,63.790106,0.01939,0.01939,3.46303,9.679288,1.182864,6.825624,0.621836,0.621836,92699.58,4818.958428,4818.958428,3614.592079,448.003347,1403.706405,881.486772,159376.3,53345.56,75414.97,911.646722,0.65064,829.184596,47.535893,76.07776,0.443971,0.134255,0.080347,5.592639,0.984761,1.796756,4.751499,3.599927,1.358264
min,0.0,0.0,0.0,0.0,0.0,0.299698,1.1e-08,-0.1568,120.515914,9e-06,-0.569,0.0,0.0,-37.53,0.167,0.0,-20.2,4.5,0.0,-388600.0,0.14,0.0,-77180.0,92.0,0.02,0.0,-5362422.0,1.6,1.0,2661.0,0.0,-1762.0,0.047,0.0,-1.007,0.116,0.0,-103.825,279.85272,36.577381,6.966
25%,0.0,0.0,0.0,0.0,0.0,2.181772,3.3865e-06,-0.00014605,132.540047,0.000834,-0.008068,0.224,0.036,-0.432,2.54165,0.0363,-0.279,186.225,8.9,-50.875,1.52,0.25,-2.9575,572.0,25.2925,12.7025,-413.4375,15.5,1.0,5302.25,107.0,-198.0,4.204,0.044,-0.195,0.826,0.125,-0.256,288.858303,40.734306,13.43725
50%,0.0,0.0,0.0,0.0,0.0,7.96739,2.0325e-05,-2.0325e-05,136.42939,0.003,-0.003,0.58,0.18,-0.175,3.892,0.107,-0.107,513.4,19.4,-19.4,2.7,0.6,-0.37,934.0,180.135,97.415,-54.915,32.0,1.0,5766.5,157.0,-161.0,4.4365,0.072,-0.128,0.999,0.248,-0.112,292.34729,43.65724,14.4975
75%,1.0,0.0,1.0,1.0,0.0,29.21587,0.00014605,-3.3865e-06,170.256968,0.008068,-0.000834,0.917,0.38175,-0.037,6.295623,0.279,-0.0363,2829.7,50.875,-8.9,25.4175,4.8,-0.15,1498.5,1190.72,793.8625,-7.2725,126.475,1.0,6123.0,175.0,-115.0,4.545,0.15,-0.087,1.369,0.357,-0.069,295.984557,46.722487,15.292
max,1.0,1.0,1.0,1.0,1.0,1071.232624,0.1568,-1.1e-08,1472.522306,0.569,-9e-06,100.806,85.54,0.0,138.54,20.2,0.0,1541400.0,388600.0,0.0,200346.0,21640.0,0.0,14667.0,10947550.0,3617133.0,0.0,9054.7,8.0,15896.0,676.0,0.0,5.283,1.472,0.0,180.013,33.091,0.0,301.72076,52.33601,19.065


In [7]:
#data normalisation
X=df.drop(columns=['koi_disposition'])
y=df['koi_disposition']

#splitting the data
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42,stratify=y)
scaler=StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

#convert back to df
X_train_scaled = pd.DataFrame(X_train_scaled,columns = X.columns)
X_test_scaled = pd.DataFrame(X_test_scaled,columns = X.columns)
X_train_scaled.head()


Unnamed: 0,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,koi_time0bk_err2,koi_impact,koi_impact_err1,koi_impact_err2,koi_duration,koi_duration_err1,koi_duration_err2,koi_depth,koi_depth_err1,koi_depth_err2,koi_prad,koi_prad_err1,koi_prad_err2,koi_teq,koi_insol,koi_insol_err1,koi_insol_err2,koi_model_snr,koi_tce_plnt_num,koi_steff,koi_steff_err1,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,-0.508001,-0.679975,-0.600872,-0.444175,-0.369869,-0.233423,0.233423,-0.459578,-0.240758,0.240758,-0.225151,-0.170101,0.27238,-0.302418,-0.311862,0.311862,-0.320219,-0.021554,0.021554,-0.033966,-0.052874,0.029099,-0.508941,-0.046095,-0.065417,0.047686,-0.352484,-0.344979,-0.554398,0.800136,-0.255716,0.620096,-0.692346,0.345205,-0.161548,-0.231524,0.17428,0.007582,-0.018071,1.222339
1,-0.508001,1.470643,-0.600872,-0.444175,-0.44802,-0.239231,0.239231,-0.484162,-0.416375,0.416375,0.001856,-0.211125,0.250613,-0.585416,-0.499522,0.499522,1.677554,-0.030173,0.030173,-0.026309,-0.048553,0.027882,0.109076,-0.043053,-0.06343,0.046282,0.787366,-0.344979,-1.333318,-0.144203,0.325829,0.939785,-0.55513,1.375961,-0.210585,-0.34149,0.187273,-1.104653,0.925293,1.285216
2,-0.508001,1.470643,-0.600872,-0.444175,-0.443048,-0.239014,0.239014,-0.477712,-0.372301,0.372301,0.156127,-0.206818,0.21927,1.062445,-0.505101,0.505101,-0.267172,-0.020532,0.020532,-0.023886,-0.02343,0.02684,0.561676,-0.036194,-0.044104,0.042228,0.007193,-0.344979,0.463814,0.443386,-0.229869,0.463652,-0.585623,-0.305153,-0.142728,-0.135045,0.162326,1.145355,1.090584,-0.388783
3,-0.508001,-0.679975,-0.600872,-0.444175,-0.382542,-0.234204,0.234204,-0.445726,-0.234519,0.234519,0.012708,-0.201792,-0.209959,-0.415892,-0.311862,0.311862,-0.327536,-0.027108,0.027108,-0.034123,-0.052534,0.029025,-0.284939,-0.045509,-0.063774,0.047155,-0.356265,-0.344979,0.125631,0.233533,-0.359101,0.275466,-0.25783,-0.587384,-0.131159,-0.087324,0.128024,0.531883,1.472039,0.649792
4,-0.508001,1.470643,1.664248,-0.444175,-0.448347,-0.239115,0.239115,-0.492057,-0.364502,0.364502,0.145569,-0.166306,0.153101,-0.270044,-0.430726,0.430726,-0.249601,-0.018693,0.018693,-0.023932,-0.02167,0.026383,0.953394,-0.024548,-0.014648,0.033424,-0.297782,-0.344979,0.36004,0.905062,-0.87603,0.425108,-0.478899,-0.40332,-0.135994,-0.10911,0.151931,1.841521,0.040873,1.245271


In [8]:
#Model training

model = RandomForestClassifier(random_state=42)
model.fit(X_train_scaled,y_train)

#predict
y_pred = model.predict(X_test_scaled)

#Evaluation
print(f"\n Accuracy  : {accuracy_score(y_test,y_pred)}")
print("\n Classification report : ",classification_report(y_test,y_pred))
print(f"\n Confusion matrix   : {confusion_matrix(y_test,y_pred)}")



 Accuracy  : 0.9826546003016591

 Classification report :                precision    recall  f1-score   support

           0       0.97      1.00      0.99       872
           1       1.00      0.95      0.97       454

    accuracy                           0.98      1326
   macro avg       0.99      0.97      0.98      1326
weighted avg       0.98      0.98      0.98      1326


 Confusion matrix   : [[872   0]
 [ 23 431]]


In [9]:
scores = cross_val_score(model, X_train_scaled, y_train, cv=5, scoring='accuracy')
print("Cross-validation accuracy:", scores)
print("Mean:", scores.mean(), "Std:", scores.std())

Cross-validation accuracy: [0.98868992 0.98586239 0.98114986 0.98114986 0.98962264]
Mean: 0.9852949335799265 Std: 0.003603894005743478


In [10]:
df[['koi_fpflag_co','koi_disposition']].corr()

Unnamed: 0,koi_fpflag_co,koi_disposition
koi_fpflag_co,1.0,-0.429576
koi_disposition,-0.429576,1.0


In [15]:
#feature importance

importances=pd.Series(model.feature_importances_,index=X.columns)
top_features = importances.sort_values(ascending=False)



['koi_fpflag_co', 'koi_steff_err2', 'koi_steff_err1', 'koi_fpflag_ss', 'koi_prad', 'koi_prad_err2', 'koi_prad_err1']


In [12]:
#save model
joblib.dump(model,'Exoplanet_prediction_model.pkl')

['Exoplanet_prediction_model.pkl']

In [34]:
#feature reduction
n=16
selected_features=top_features.head(n).index.tolist()
X_selected=X[selected_features]
X_selected.head()

X_train,X_test,y_train,y_test=train_test_split(X_selected,y,test_size=0.2,random_state=42,stratify=y)

scaler=StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

#convert back to df
X_train_scaled = pd.DataFrame(X_train_scaled,columns = X_selected.columns)
X_test_scaled = pd.DataFrame(X_test_scaled,columns = X_selected.columns)
X_train_scaled.head()

#Model training

model = RandomForestClassifier(random_state=42)
model.fit(X_train_scaled,y_train)

#predict
y_pred = model.predict(X_test_scaled)

#Evaluation
print(f"\n Accuracy  : {accuracy_score(y_test,y_pred)}")
print("\n Classification report : ",classification_report(y_test,y_pred))
print(f"\n Confusion matrix   : {confusion_matrix(y_test,y_pred)}")



 Accuracy  : 0.9894419306184012

 Classification report :                precision    recall  f1-score   support

           0       0.98      1.00      0.99       872
           1       1.00      0.97      0.98       454

    accuracy                           0.99      1326
   macro avg       0.99      0.98      0.99      1326
weighted avg       0.99      0.99      0.99      1326


 Confusion matrix   : [[872   0]
 [ 14 440]]
