In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.
import kagglehub
# Fetch the 'arashnic/exoplanets' dataset through kagglehub and keep the value it returns.
# presumably that value is a local directory path (going by the variable name) — TODO confirm
arashnic_exoplanets_path = kagglehub.dataset_download('arashnic/exoplanets')

# Reached only if the download call above returned without raising.
print('Data source import complete.')


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Walk the Kaggle input directory and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = (os.path.join(dirname, filename) for filename in filenames)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Sklearn Packages
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler, MinMaxScaler, RobustScaler
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.ensemble import  RandomForestClassifier
from sklearn.metrics import *
import lightgbm as lgb
import xgboost as xgb
# Sklearn Evaluation Metrics
from sklearn import metrics


# Never truncate the column list when a DataFrame is displayed
pd.set_option('display.max_columns', None)

import warnings
# Suppress DeprecationWarning and FutureWarning messages from here on
for _ignored_category in (DeprecationWarning, FutureWarning):
    warnings.filterwarnings("ignore", category=_ignored_category)


def _safe_ratio(numerator, denominator):
    """Return numerator / denominator as a float, or 0.0 when the denominator is zero."""
    return float(numerator) / float(denominator) if denominator else 0.0


def make_confusion_matrix(cf,
                          group_names=None,
                          categories='auto',
                          count=True,
                          percent=True,
                          cbar=True,
                          xyticks=True,
                          xyplotlabels=True,
                          sum_stats=True,
                          figsize=None,
                          cmap='Blues',
                          title=None):
    """Draw a confusion matrix as an annotated seaborn heatmap on a new figure.

    Parameters
    ----------
    cf : array-like
        Square confusion matrix (anything ``np.asarray`` accepts).
    group_names : sequence of str, optional
        One label per cell, in row-major order. Ignored unless its length
        equals the number of cells.
    categories : list of str or 'auto'
        Tick labels passed to ``sns.heatmap`` for both axes.
    count : bool
        Show the raw count in each cell.
    percent : bool
        Show each cell as a percentage of the matrix total.
    cbar : bool
        Show the colour bar.
    xyticks : bool
        Show the category tick labels.
    xyplotlabels : bool
        Label the axes 'True label' / 'Predicted label'.
    sum_stats : bool
        Append accuracy (and, for 2x2 matrices, precision, recall and F1)
        below the x-axis label.
    figsize : tuple, optional
        Figure size; defaults to Matplotlib's ``figure.figsize`` rcParam.
    cmap : str
        Colormap name passed to ``sns.heatmap``.
    title : str, optional
        Figure title.

    Returns
    -------
    None
        The heatmap is drawn on a newly created Matplotlib figure.

    Notes
    -----
    Every ratio whose denominator is zero (empty matrix, no predicted
    positives, no actual positives) is reported as 0 instead of NaN.
    """
    cf = np.asarray(cf)
    total = cf.sum()

    # CODE TO GENERATE TEXT INSIDE EACH SQUARE
    blanks = [''] * cf.size

    # `is not None` keeps array-like group_names from raising on truth-testing
    if group_names is not None and len(group_names) == cf.size:
        group_labels = ["{}\n".format(value) for value in group_names]
    else:
        group_labels = blanks

    if count:
        group_counts = ["{0:0.0f}\n".format(value) for value in cf.flatten()]
    else:
        group_counts = blanks

    if percent:
        group_percentages = ["{0:.2%}".format(_safe_ratio(value, total)) for value in cf.flatten()]
    else:
        group_percentages = blanks

    box_labels = [f"{v1}{v2}{v3}".strip() for v1, v2, v3 in zip(group_labels, group_counts, group_percentages)]
    box_labels = np.asarray(box_labels).reshape(cf.shape)

    # CODE TO GENERATE SUMMARY STATISTICS & TEXT FOR SUMMARY STATS
    if sum_stats:
        # Accuracy is sum of diagonal divided by total observations
        accuracy = _safe_ratio(np.trace(cf), total)

        # If it is a binary confusion matrix, show some more stats
        if len(cf) == 2:
            # Rows are true labels, columns are predictions (class 1 = positive)
            precision = _safe_ratio(cf[1, 1], cf[:, 1].sum())
            recall = _safe_ratio(cf[1, 1], cf[1, :].sum())
            # Named `f1` so the star-imported sklearn `f1_score` is not shadowed
            f1 = _safe_ratio(2 * precision * recall, precision + recall)
            stats_text = "\n\nAccuracy={:0.3f}\nPrecision={:0.3f}\nRecall={:0.3f}\nF1 Score={:0.3f}".format(
                accuracy, precision, recall, f1)
        else:
            stats_text = "\n\nAccuracy={:0.3f}".format(accuracy)
    else:
        stats_text = ""

    # SET FIGURE PARAMETERS ACCORDING TO OTHER ARGUMENTS
    if figsize is None:
        # Get default figure size if not set
        figsize = plt.rcParams.get('figure.figsize')

    if not xyticks:
        # Do not show categories if xyticks is False
        categories = False

    # MAKE THE HEATMAP VISUALIZATION
    plt.figure(figsize=figsize)
    sns.heatmap(cf, annot=box_labels, fmt="", cmap=cmap, cbar=cbar,
                xticklabels=categories, yticklabels=categories)

    if xyplotlabels:
        plt.ylabel('True label')
        plt.xlabel('Predicted label' + stats_text)
    else:
        plt.xlabel(stats_text)

    if title:
        plt.title(title)

import os

# On Kaggle the dataset is mounted under ../input; elsewhere fall back to the
# directory returned by the kagglehub download in the first cell.
kaggle_csv_path = '../input/exoplanets/exoplanets.csv'
if os.path.exists(kaggle_csv_path):
    csv_path = kaggle_csv_path
else:
    csv_path = os.path.join(arashnic_exoplanets_path, 'exoplanets.csv')

df = pd.read_csv(csv_path)

# Print the shape of the dataset
print(df.shape)

# Select top of the dataset
df.head()

In [None]:
# Raw KOI column name -> label used by the rest of the notebook.
# NOTE(review): a few labels look truncated ('ImpactParamete', 'TransitSignal-to-Nois',
# 'TCEPlanetNumbe', 'TCEDeliver') and 'koi_fpflag_ss' maps to itself. They are kept
# verbatim because a later cell drops 'TCEDeliver' and 'koi_fpflag_ss' by these exact names.
readable_column_names = {
    # identifiers, dispositions and false-positive flags
    'kepid': 'KepID',
    'kepoi_name': 'KOIName',
    'kepler_name': 'KeplerName',
    'koi_disposition': 'ExoplanetArchiveDisposition',
    'koi_pdisposition': 'DispositionUsingKeplerData',
    'koi_score': 'DispositionScore',
    'koi_fpflag_nt': 'NotTransit-LikeFalsePositiveFlag',
    'koi_fpflag_ss': 'koi_fpflag_ss',
    'koi_fpflag_co': 'CentroidOffsetFalsePositiveFlag',
    'koi_fpflag_ec': 'EphemerisMatchIndicatesContaminationFalsePositiveFlag',
    # transit / planet measurements with their upper and lower uncertainties
    'koi_period': 'OrbitalPeriod_days',
    'koi_period_err1': 'OrbitalPeriodUpperUnc_days',
    'koi_period_err2': 'OrbitalPeriodLowerUnc_days',
    'koi_time0bk': 'TransitEpoch_BKJD',
    'koi_time0bk_err1': 'TransitEpochUpperUnc_BKJD',
    'koi_time0bk_err2': 'TransitEpochLowerUnc_BKJD',
    'koi_impact': 'ImpactParamete',
    'koi_impact_err1': 'ImpactParameterUpperUnc',
    'koi_impact_err2': 'ImpactParameterLowerUnc',
    'koi_duration': 'TransitDuration_hrs',
    'koi_duration_err1': 'TransitDurationUpperUnc_hrs',
    'koi_duration_err2': 'TransitDurationLowerUnc_hrs',
    'koi_depth': 'TransitDepth_ppm',
    'koi_depth_err1': 'TransitDepthUpperUnc_ppm',
    'koi_depth_err2': 'TransitDepthLowerUnc_ppm',
    'koi_prad': 'PlanetaryRadius_Earthradii',
    'koi_prad_err1': 'PlanetaryRadiusUpperUnc_Earthradii',
    'koi_prad_err2': 'PlanetaryRadiusLowerUnc_Earthradii',
    'koi_teq': 'EquilibriumTemperatureK',
    'koi_teq_err1': 'EquilibriumTemperatureUpperUncK',
    'koi_teq_err2': 'EquilibriumTemperatureLowerUncK',
    'koi_insol': 'InsolationFlux_Earthflux',
    'koi_insol_err1': 'InsolationFluxUpperUnc_Earthflux',
    'koi_insol_err2': 'InsolationFluxLowerUnc_Earthflux',
    'koi_model_snr': 'TransitSignal-to-Nois',
    'koi_tce_plnt_num': 'TCEPlanetNumbe',
    'koi_tce_delivname': 'TCEDeliver',
    # stellar parameters with their upper and lower uncertainties
    'koi_steff': 'StellarEffectiveTemperatureK',
    'koi_steff_err1': 'StellarEffectiveTemperatureUpperUncK',
    'koi_steff_err2': 'StellarEffectiveTemperatureLowerUncK',
    'koi_slogg': 'StellarSurfaceGravity_log10(cm/s**2)',
    'koi_slogg_err1': 'StellarSurfaceGravityUpperUnc_log10(cm/s**2)',
    'koi_slogg_err2': 'StellarSurfaceGravityLowerUnc_log10(cm/s**2)',
    'koi_srad': 'StellarRadius_Solarradii',
    'koi_srad_err1': 'StellarRadiusUpperUnc_Solarradii',
    'koi_srad_err2': 'StellarRadiusLowerUnc_Solarradii',
    # sky position and magnitude
    'ra': 'RA_decimaldegrees',
    'dec': 'Dec_decimaldegrees',
    'koi_kepmag': 'Kepler-band_mag',
}

df = df.rename(columns=readable_column_names)

# Transposed preview so every renamed column is visible as a row
df.head().T

In [None]:
# Number of missing values in each column
df.isna().sum()

>Let’s choose the target we want to predict. There are two options: Exoplanet Archive Disposition and Disposition Using Kepler Data. Since we are focusing on data collected by the Kepler mission, let’s use the second option. The models also need a numeric target, so let’s transform the disposition labels into a binary feature using a lambda.

In [None]:
# Binary target: 1 when the Kepler-data disposition is 'CANDIDATE', otherwise 0
df['ExoplanetCandidate'] = (df['DispositionUsingKeplerData'] == 'CANDIDATE').astype('int64')

# Three-level target from the archive disposition: CONFIRMED -> 2, CANDIDATE -> 1, anything else -> 0
archive_disposition_codes = {'CONFIRMED': 2, 'CANDIDATE': 1}
df['ExoplanetConfirmed'] = (
    df['ExoplanetArchiveDisposition']
    .map(archive_disposition_codes)
    .fillna(0)
    .astype('int64')
)


In [None]:
# Pass the column by keyword: seaborn >= 0.12 binds the first positional argument
# to `data`, so a bare Series would no longer be counted per class.
sns.countplot(x='ExoplanetCandidate', data=df)

In [None]:
# Pass the column by keyword: seaborn >= 0.12 binds the first positional argument
# to `data`, so a bare Series would no longer be counted per class.
sns.countplot(x='ExoplanetConfirmed', data=df)

In [None]:
# Columns removed from the frame before the split: names/identifiers, the two raw
# disposition labels, the four false-positive flags, the TCE delivery name and the
# equilibrium-temperature uncertainties.
columns_to_drop = [
    'KeplerName',
    'KOIName',
    'EquilibriumTemperatureUpperUncK',
    'KepID',
    'ExoplanetArchiveDisposition',
    'DispositionUsingKeplerData',
    'NotTransit-LikeFalsePositiveFlag',
    'koi_fpflag_ss',
    'CentroidOffsetFalsePositiveFlag',
    'EphemerisMatchIndicatesContaminationFalsePositiveFlag',
    'TCEDeliver',
    'EquilibriumTemperatureLowerUncK',
]
df.drop(columns=columns_to_drop, inplace=True)

In [None]:
# Discard every row that has at least one missing value, then report what is left
df.dropna(axis=0, how='any', inplace=True)
df.shape

In [None]:
def clean_dataset(df):
    """Return a float64 copy of ``df`` without rows that contain NaN or +/-inf.

    Parameters
    ----------
    df : pd.DataFrame
        Frame whose columns can all be cast to float64.

    Returns
    -------
    pd.DataFrame
        New frame holding only the fully finite rows, cast to ``np.float64``.
        The original index labels of the kept rows are preserved and the
        input frame is left unmodified.

    Raises
    ------
    AssertionError
        If ``df`` is not a ``pd.DataFrame``.
    """
    assert isinstance(df, pd.DataFrame), "df needs to be a pd.DataFrame"
    # Work on a NaN-free copy instead of mutating the caller's frame
    without_nan = df.dropna()
    # `axis` must be passed by keyword: pandas >= 2.0 rejects a positional axis here
    has_infinite = without_nan.isin([np.inf, -np.inf]).any(axis=1)
    return without_nan.loc[~has_infinite].astype(np.float64)

# Keep the cleaned frame: the function returns its result, so discarding it
# would leave the inf filtering and the float64 cast without effect.
df = clean_dataset(df)
df.shape

In [None]:
# Hold out 15% of the rows for the final check. The fixed seed makes the split (and
# every score computed from it) reproducible; stratifying on the binary target keeps
# the class ratio the same in both parts, matching the StratifiedKFold used later.
train, test = train_test_split(
    df,
    test_size=.15,
    random_state=42,
    stratify=df['ExoplanetCandidate'],
)

# The three-level target is not a feature for the binary task, so drop it everywhere
data = df.drop(columns=['ExoplanetConfirmed'])
train = train.drop(columns=['ExoplanetConfirmed'])
test = test.drop(columns=['ExoplanetConfirmed'])

# pop() removes the label column from the feature frames and returns it
# NOTE(review): 'DispositionScore' stays among the features; it may encode the
# disposition itself — confirm it is not leaking the target.
target_trn = train.pop('ExoplanetCandidate')
target_tst = test.pop('ExoplanetCandidate')


In [None]:
# Shapes of the train features, train labels, test features and test labels, in that order
tuple(part.shape for part in (train, target_trn, test, target_tst))

In [None]:
# Print the column dtypes and non-null counts of the training features
train.info()

In [None]:
# 3-fold stratified cross-validation of a random forest.
#   oof_rf  : out-of-fold probability of class 1 for every training row
#   pred_rf : hold-out probability of class 1, averaged over the fold models
score_auc = []
score_recall = []
oof_rf = np.zeros(len(train))
pred_rf = np.zeros(len(test))

folds = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

for fold_, (train_ind, val_ind) in enumerate(folds.split(train, target_trn)):
    print('fold:', fold_, '  - Starting ...')
    trn_data, val_data = train.iloc[train_ind], train.iloc[val_ind]
    y_train, y_val = target_trn.iloc[train_ind], target_trn.iloc[val_ind]

    rf = RandomForestClassifier(n_estimators=150, max_depth=5, criterion='gini',
                                max_features=0.8, n_jobs=-1, random_state=32)
    rf.fit(trn_data, y_train)
    oof_rf[val_ind] = rf.predict_proba(val_data)[:, 1]

    # Score each fold once and reuse the values for both the log and the summary.
    # The >= 0.5 cut-off matches the one used to build the confusion matrix.
    fold_auc = roc_auc_score(y_val, oof_rf[val_ind])
    fold_recall = recall_score(y_val, np.where(oof_rf[val_ind] >= 0.5, 1, 0))
    print('val auc:', fold_auc)
    print('val recall:', fold_recall)

    score_auc.append(fold_auc)
    score_recall.append(fold_recall)

    # Each fold model contributes an equal share of the hold-out prediction
    pred_rf += rf.predict_proba(test)[:, 1] / folds.n_splits

print(' Model auc: -------> ', np.mean(score_auc))
print(' Model recall: -------> ', np.mean(score_recall))

In [None]:
# Turn the random-forest out-of-fold probabilities into hard 0/1 predictions
oof_nn_rd = np.where(oof_rf >= 0.5, 1, 0)
cf_matrix = confusion_matrix(target_trn, oof_nn_rd)

labels = ['True Neg','False Pos','False Neg','True Pos']
categories = ['Zero', 'One']

# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and later
# removed the old names, so pick whichever spelling this installation provides.
poster_style = 'seaborn-v0_8-poster' if 'seaborn-v0_8-poster' in plt.style.available else 'seaborn-poster'
plt.style.use(poster_style)
sns.set(font_scale=1.4)
make_confusion_matrix(cf_matrix,
                      group_names=labels,
                      categories=categories,
                      cmap='vlag', figsize=(9, 6))


In [None]:
# ROC AUC of the fold-averaged random-forest probabilities on the hold-out set
roc_auc_score(y_true=target_tst, y_score=pred_rf)

In [None]:
xgb_params = {
    'objective': 'binary:logistic',
    'max_depth': 5,
    'learning_rate': 0.01,
    'booster': 'gbtree',
    'eval_metric': 'auc',
    'max_leaves': 10,
    'colsample_bytree': 0.8,  # feature fraction
    'subsample': 0.7,  # bagging fraction
    'lambda': 2,
    'alpha': 3,
}

# 5-fold stratified cross-validation of a gradient-boosted tree model.
#   oof_xgb     : out-of-fold probability for every training row
#   pred_xgb    : hold-out probability averaged over the fold models
#   importances : per-fold gain importance of every feature the model used
xgb_scores = []
oof_xgb = np.zeros(len(train))
pred_xgb = np.zeros(len(test))
importance_frames = []

folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=4242)

# The hold-out matrix is identical for every fold, so build it once
test_matrix = xgb.DMatrix(test)

for fold_, (train_ind, val_ind) in enumerate(folds.split(train, target_trn)):
    print('fold : ----------------------------------------', fold_)
    trn_data = xgb.DMatrix(data=train.iloc[train_ind], label=target_trn.iloc[train_ind])
    val_data = xgb.DMatrix(data=train.iloc[val_ind], label=target_trn.iloc[val_ind])

    # Early stopping monitors the last entry of `evals`, i.e. the validation fold
    xgb_model = xgb.train(xgb_params, trn_data, num_boost_round=1000,
                          evals=[(trn_data, 'train'), (val_data, 'test')],
                          verbose_eval=100, early_stopping_rounds=100)

    # `ntree_limit` / `best_ntree_limit` were removed in XGBoost 2.0; the same
    # "trees up to the best round" selection is expressed with `iteration_range`.
    best_rounds = (0, xgb_model.best_iteration + 1)
    oof_xgb[val_ind] = xgb_model.predict(val_data, iteration_range=best_rounds)

    fold_auc = roc_auc_score(target_trn.iloc[val_ind], oof_xgb[val_ind])
    print(fold_auc)
    xgb_scores.append(fold_auc)

    importance_score = xgb_model.get_score(importance_type='gain')
    importance_frame = pd.DataFrame({'Importance': list(importance_score.values()),
                                     'Feature': list(importance_score.keys())})
    importance_frame['fold'] = fold_ + 1
    importance_frames.append(importance_frame)

    pred_xgb += xgb_model.predict(test_matrix, iteration_range=best_rounds) / folds.n_splits

# Concatenate once at the end instead of growing a frame inside the loop
importances = pd.concat(importance_frames, axis=0, sort=False)

print('model auc:------------------>', np.mean(xgb_scores))

In [None]:
# ROC AUC of the fold-averaged XGBoost probabilities on the hold-out set
roc_auc_score(y_true=target_tst, y_score=pred_xgb)

In [None]:
# Average each feature's gain over the folds, then plot the features from most to least important
mean_gain = importances.groupby('Feature', as_index=False)['Importance'].mean()
ranked_gain = mean_gain.sort_values('Importance', ascending=False)

sns.set(font_scale=1.2)
plt.figure(figsize=(17, 33))
sns.barplot(data=ranked_gain, x='Importance', y='Feature', palette='icefire')

In [None]:
# Turn the XGBoost out-of-fold probabilities into hard 0/1 predictions
oof_nn_rd = np.where(oof_xgb >= 0.5, 1, 0)
cf_matrix = confusion_matrix(target_trn, oof_nn_rd)

labels = ['True Neg','False Pos','False Neg','True Pos']
categories = ['Zero', 'One']

# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and later
# removed the old names, so pick whichever spelling this installation provides.
poster_style = 'seaborn-v0_8-poster' if 'seaborn-v0_8-poster' in plt.style.available else 'seaborn-poster'
plt.style.use(poster_style)
sns.set(font_scale=1.4)
make_confusion_matrix(cf_matrix,
                      group_names=labels,
                      categories=categories,
                      cmap='vlag', figsize=(9, 6))


In [None]:
# Disabled experiment: logistic-regression cross-validation, kept as a string so it does not run.
# The out-of-fold predictions inside it now come from `lr` (they were taken from the
# random forest `rf`), so the block is correct if it is re-enabled.
"""
from sklearn.linear_model import LogisticRegression
score_auc = []
score_recall = []
oof_lr = np.zeros(len(train))
pred_lr = np.zeros(len(test))

folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for fold_ , (train_ind, val_ind) in enumerate(folds.split(train, target_trn)):
    print('fold:', fold_, '  - Starting ...')
    trn_data, val_data = train.iloc[train_ind], train.iloc[val_ind]
    y_train, y_val = target_trn.iloc[train_ind], target_trn.iloc[val_ind]

    lr = LogisticRegression(C=1, max_iter=400, class_weight='balanced', random_state=32)
    lr.fit(trn_data, y_train)
    oof_lr[val_ind] = lr.predict_proba(val_data)[:, 1]
    y = lr.predict_proba(trn_data)[:, 1]
    print('val auc:' , roc_auc_score(y_val, oof_lr[val_ind]))
    print('val recall:' , recall_score(y_val, np.where(oof_lr[val_ind] >= 0.5, 1, 0)))

    score_auc.append(roc_auc_score(y_val, oof_lr[val_ind]))
    score_recall.append(recall_score(y_val, np.where(oof_lr[val_ind] >= 0.5, 1, 0)))

    pred_lr += lr.predict_proba(test)[:, 1]/folds.n_splits

print(' Model auc: -------> ', np.mean(score_auc))
print(' Model recall: -------> ', np.mean(score_recall))"""