# Preprocessing

This notebook covers the preprocessing for the cleaned data. The preprocessing will include PCA, and feature selection. It will have various pipelines that can be used to train different sets of processed data to compare the results of a machine learning model. 

Import Libraries

In [98]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score 
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, roc_auc_score
import xgboost as xgb

import warnings

warnings.filterwarnings('ignore')

Before we begin preprocessing, we will run some simple models to determine a baseline. This baseline can then be used as a benchmark after completing other preprocessing steps.

### Importing Cleaned Data

In [110]:
data = pd.read_csv('..\data\kois_cleaned.csv', index_col=0)
kois = data.copy()
kois.head()

Unnamed: 0_level_0,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,koi_time0bk_err2,...,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag,koi_disposition_encoded
rowid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,0,0,9.488036,2.775e-05,-2.775e-05,170.53875,0.00216,-0.00216,...,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347,1
2,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,-0.00352,...,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347,1
3,0,1,0,0,19.89914,1.494e-05,-1.494e-05,175.850252,0.000581,-0.000581,...,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436,2
4,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,-0.000115,...,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597,2
5,0,0,0,0,2.525592,3.761e-06,-3.761e-06,171.59555,0.00113,-0.00113,...,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509,1


In [100]:
#exclude error columns with .describe()
kois.loc[:, ~kois.columns.str.contains('_err')].describe()

Unnamed: 0,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_time0bk,koi_impact,koi_duration,koi_depth,koi_prad,koi_teq,koi_insol,koi_model_snr,koi_steff,koi_slogg,koi_srad,ra,dec,koi_kepmag,koi_disposition_encoded
count,9007.0,9007.0,9007.0,9007.0,9007.0,9007.0,9007.0,9007.0,9007.0,9007.0,9007.0,9007.0,9007.0,9007.0,9007.0,9007.0,9007.0,9007.0,9007.0,9007.0
mean,0.167203,0.235039,0.197402,0.122238,75.093189,165.792699,0.722475,5.63654,22934.44,102.880978,1077.611302,7325.554,262.226624,5704.450094,4.310499,1.699245,292.047428,43.828852,14.268715,1.265238
std,0.373178,0.424047,0.39806,0.327579,1375.035264,67.589891,3.296483,6.409275,80785.73,3109.455176,843.714627,158437.7,803.003738,797.67067,0.430133,5.622511,4.760429,3.598752,1.348588,0.821739
min,0.0,0.0,0.0,0.0,0.241843,120.515914,0.0,0.167,4.5,0.14,25.0,0.0,1.6,2661.0,0.047,0.109,279.85272,36.577381,6.966,0.0
25%,0.0,0.0,0.0,0.0,2.741232,132.773544,0.196,2.454245,160.3,1.4,537.0,19.605,12.1,5301.0,4.215,0.827,288.66077,40.810164,13.469,1.0
50%,0.0,0.0,0.0,0.0,9.449426,137.15752,0.534,3.82353,418.3,2.38,875.0,138.34,23.0,5761.0,4.44,0.997,292.24902,43.720329,14.534,2.0
75%,0.0,0.0,0.0,0.0,37.597864,170.54512,0.886,6.308,1413.55,14.375,1372.0,835.915,76.65,6116.0,4.544,1.347,295.8564,46.722311,15.319,2.0
max,1.0,1.0,1.0,1.0,129995.7784,1472.522306,100.806,138.54,1541400.0,200346.0,14667.0,10947550.0,9054.7,15896.0,5.364,180.013,301.72076,52.33601,19.065,2.0


In [101]:
print(kois.shape)

(9007, 40)


### Splitting Data

In [102]:
#separate our target variable from the rest of the data
y = kois['koi_disposition_encoded']
X = kois.drop(['koi_disposition_encoded'], axis=1)

In [103]:
#print the percentage of values in each class
print(f"Percent of Total Dispositions for Each Target Variable: \n{y.value_counts(normalize=True)}")
print(f"\nTarget Variable Descriptive Statistics:\n{y.describe()}")


Percent of Total Dispositions for Each Target Variable: 
koi_disposition_encoded
2    0.505385
1    0.254469
0    0.240147
Name: proportion, dtype: float64

Target Variable Descriptive Statistics:
count    9007.000000
mean        1.265238
std         0.821739
min         0.000000
25%         1.000000
50%         2.000000
75%         2.000000
max         2.000000
Name: koi_disposition_encoded, dtype: float64


In [104]:
#split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) #use a test size of 20%

### Instantiating Models

In [105]:
#Decision tree classifier
dtc = DecisionTreeClassifier()

#Random forest classifier
rfc = RandomForestClassifier()

#Logistic regression classifier
logreg = LogisticRegression()

#Support vector machine classifier
svc = SVC()

#K-nearest neighbors classifier
knn = KNeighborsClassifier()

#XGBoost classifier
xgbc = xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')

### Train the Models

In [106]:
#train Decision tree classifier
dtc.fit(X_train, y_train)

#train Random forest classifier
rfc.fit(X_train, y_train)

#train Logistic regression classifier
logreg.fit(X_train, y_train)

#train Support vector machine classifier
svc.fit(X_train, y_train)

#train K-nearest neighbors classifier
knn.fit(X_train, y_train)

#train XGBoost classifier
xgbc.fit(X_train, y_train)

### Check Model Performance

In [107]:
#Decision tree classifier score
dtc_score = dtc.score(X_test, y_test)

#Random forest classifier score
rfc_score = rfc.score(X_test, y_test)

#Logistic regression classifier score
logreg_score = logreg.score(X_test, y_test)

#Support vector machine classifier score
svc_score = svc.score(X_test, y_test)

#K-nearest neighbors classifier score
knn_score = knn.score(X_test, y_test)

#XGBoost classifier score
xgbc_score = xgbc.score(X_test, y_test)

In [108]:
#Print the scores

# Create a dictionary with the model names and their scores
model_scores = {
    "Decision Tree": dtc_score,
    "Random Forest": rfc_score,
    "Logistic Regression": logreg_score,
    "SVM": svc_score,
    "KNN": knn_score,
    "XGBoost": xgbc_score
}

# Convert the dictionary to a DataFrame
scores_df = pd.DataFrame(list(model_scores.items()), columns=['Model', 'Accuracy Score'])

# Sort the DataFrame by 'Accuracy Score' in descending order
scores_df = scores_df.sort_values(by='Accuracy Score', ascending=False)

# Display the DataFrame
print(scores_df)

base_report = classification_report(y_test, xgbc.predict(X_test))

print(f"\n{classification_report(y_test, dtc.predict(X_test))}")


                 Model  Accuracy Score
5              XGBoost        0.893452
1        Random Forest        0.891787
0        Decision Tree        0.849612
4                  KNN        0.623751
2  Logistic Regression        0.529967
3                  SVM        0.490566

              precision    recall  f1-score   support

           0       0.72      0.71      0.72       452
           1       0.73      0.76      0.74       466
           2       0.98      0.97      0.98       884

    accuracy                           0.85      1802
   macro avg       0.81      0.81      0.81      1802
weighted avg       0.85      0.85      0.85      1802



Now that we have the above scores, we have a benchmark to work with. We also know that the median is 2 for the target variable. And if you chose the median for every prediction you would be right approximately 50% of the time. Looking at the scores from our models, we can now see which models had the greatest gains over simply choosing the median every time. 

We can also see which models performed worse than our simplest guess. 

Before we cast judgment on the performance of these models, we have to remember that some are more sensitive to feature scaling than others. For example, Logistic Regression, SVM, and KNN all benefit greatly from feature scaling, while Decision Trees, Random Forest, and XGBoost all are typically invariant to feature scaling. 

What this tells us is that in order to utilize some of these models properly we should perform some sort of feature scaling on the data.

### Feature Scaling

For SVM and Logistic Regression, we will use the Z-score normalization or `StandardScaler()` from the Scikit Learn library. 

For KNN, we will use Min-Max scaling (normalization) or the `MinMaxScaler()` also from Scikit Learn. 

In [78]:
#fit the scalers to the train and test sets separately and transform both
scaler = StandardScaler()

X_train_stndrd = scaler.fit_transform(X_train)
X_test_stndrd = scaler.transform(X_test)

minmax = MinMaxScaler()

X_train_minmax = minmax.fit_transform(X_train)
X_test_minmax = minmax.transform(X_test)

### Retrain Models

We can now use the scaled data to retrain our models. We can compare it with our tree models without scaled data, as well as pass the scaled data through the tree models to evaluate performance. These scores can act as our new benchmarks.

In [79]:
#training our tree and ensemble models on the scaled data
#train Decision tree classifier
dtc.fit(X_train_stndrd, y_train)

#train Random forest classifier
rfc.fit(X_train_stndrd, y_train)

#train XGBoost classifier
xgbc.fit(X_train_stndrd, y_train)

In [80]:
#Training the remaining models on their preferred scaling method
#train Logistic regression classifier
logreg.fit(X_train_stndrd, y_train)

#train Support vector machine classifier
svc.fit(X_train_stndrd, y_train)

#train K-nearest neighbors classifier
knn.fit(X_train_minmax, y_train)

In [81]:
#Decision tree classifier score
dtc_score = dtc.score(X_test_stndrd, y_test)

#Random forest classifier score
rfc_score = rfc.score(X_test_stndrd, y_test)

#Logistic regression classifier score
logreg_score = logreg.score(X_test_stndrd, y_test)

#Support vector machine classifier score
svc_score = svc.score(X_test_stndrd, y_test)

#K-nearest neighbors classifier score
knn_score = knn.score(X_test_minmax, y_test)

#XGBoost classifier score
xgbc_score = xgbc.score(X_test_stndrd, y_test)

In [82]:
#Print the scores

# Create a dictionary with the model names and their scores
model_scores_scaled = {
    "Decision Tree": dtc_score,
    "Random Forest": rfc_score,
    "Logistic Regression": logreg_score,
    "SVM": svc_score,
    "KNN": knn_score,
    "XGBoost": xgbc_score
}

# Convert the dictionary to a DataFrame
scores_scaled = pd.DataFrame(list(model_scores_scaled.items()), columns=['Model-Scaling', 'Accuracy Score'])

# Sort the DataFrame by 'Accuracy Score' in descending order
scores_scaled = scores_scaled.sort_values(by='Accuracy Score', ascending=False)

# Display the DataFrame for both sets of scores
print(scores_df)
print(scores_scaled)

print(f"\n Classification Report using StandardScaler: \n{classification_report(y_test, dtc.predict(X_test_stndrd))}")
print(f"\n Classification Report using MinMaxScaler: \n{classification_report(y_test, knn.predict(X_test_minmax))}")

                 Model  Accuracy Score
5              XGBoost        0.893452
1        Random Forest        0.887902
0        Decision Tree        0.851276
4                  KNN        0.623751
2  Logistic Regression        0.529967
3                  SVM        0.490566
         Model-Scaling  Accuracy Score
5              XGBoost        0.893452
1        Random Forest        0.892342
2  Logistic Regression        0.876249
3                  SVM        0.869589
0        Decision Tree        0.846837
4                  KNN        0.809101

 Classification Report using StandardScaler: 
              precision    recall  f1-score   support

           0       0.72      0.71      0.71       452
           1       0.72      0.75      0.74       466
           2       0.98      0.97      0.97       884

    accuracy                           0.85      1802
   macro avg       0.81      0.81      0.81      1802
weighted avg       0.85      0.85      0.85      1802


 Classification Report us

We can see huge improvements in the accuracy just by applying scaling to our data for the the models that are typically sensitive to it. Our ensemble models maintained their performance rankings with only a minor change in the random forest performance and no change in the performance of XGBoost. 

Logistic Regression had the largest gain of 0.35. Every model is now performing in the 80% territory which is fairly strong when compared to our absolute base benchmark of always choosing the median. 

### Outliers

From the data cleaning and EDA notebook, we saw that there were potential outliers, not only in the data, but also in the error values. We opted to leave the outliers in due to a lack of domain knowledge. But as part of our preprocessing we can not remove some of those extreme values and compare performance.

We will start off with very conservative outlier removal, to see if this improves our performance. Since we saw vast improvements from our scaling, we will also scale the data once outliers have been removed. From what we saw in our data exploration notebook, most of the variables were left skewed, with their most extreme values being greater than the mean. Eliminating outliers outside of IQR + 3 * $\sigma$ can help remove these extreme values. 

In [83]:
# Calculate the Z-score for each feature in the DataFrame
z_scores = (kois - kois.mean()) / kois.std()

# Identify rows where any feature has a Z-score greater than 4
outliers = (z_scores > 3)

# Filter those rows from the DataFrame
kois_no_outliers = kois[~outliers.any(axis=1)]

In [84]:
#print the shape of our original data and our new dataset with no outliers
kois_no_outliers.shape

(7418, 40)

We can see from the above that we have eliminated about 1050 values that were categorized as outliers. We can now split our new data set, and apply our feature scaling.

In [85]:
#separate our target variable from the rest of the data
y_no_outliers = kois_no_outliers['koi_disposition_encoded']
X_not_outliers = kois_no_outliers.drop(['koi_disposition_encoded'], axis=1)

In [86]:
#split the data into training and testing sets
#NO suffix is for no outliers
X_train_NO, X_test_NO, y_train_NO, y_test_NO = train_test_split(X_not_outliers, y_no_outliers, test_size=0.2, random_state=42) #use a test size of 20%

Scale the data without outliers

In [87]:
#fit the scalers to the train and test sets separately and transform both

#standard scaling
X_train_stndrd_NO = scaler.fit_transform(X_train_NO)
X_test_stndrd_NO = scaler.transform(X_test_NO)

#minmax scaling
X_train_minmax_NO = minmax.fit_transform(X_train_NO)
X_test_minmax_NO = minmax.transform(X_test_NO)

In [88]:
#training our tree and ensemble models on the scaled data with outliers removed
#train Decision tree classifier
dtc.fit(X_train_stndrd_NO, y_train_NO)

#train Random forest classifier
rfc.fit(X_train_stndrd_NO, y_train_NO)

#train XGBoost classifier
xgbc.fit(X_train_minmax_NO, y_train_NO)

In [89]:
#Training the remaining models on their preferred scaling method with outliers removed
#train Logistic regression classifier
logreg.fit(X_train_stndrd_NO, y_train_NO)

#train Support vector machine classifier
svc.fit(X_train_stndrd_NO, y_train_NO)

#train K-nearest neighbors classifier
knn.fit(X_train_stndrd_NO, y_train_NO)

Performance of models with outliers removed, and scaled data

In [90]:
#Decision tree classifier score
dtc_score = dtc.score(X_test_stndrd_NO, y_test_NO)

#Random forest classifier score
rfc_score = rfc.score(X_test_stndrd_NO, y_test_NO)

#Logistic regression classifier score
logreg_score = logreg.score(X_test_stndrd_NO, y_test_NO)

#Support vector machine classifier score
svc_score = svc.score(X_test_stndrd_NO, y_test_NO)

#K-nearest neighbors classifier score
knn_score = knn.score(X_test_minmax_NO, y_test_NO)

#XGBoost classifier score
xgbc_score = xgbc.score(X_test_stndrd_NO, y_test_NO)

In [93]:
#Print the scores

# Create a dictionary with the model names and their scores
model_scores_scaled_NO = {
    "Decision Tree": dtc_score,
    "Random Forest": rfc_score,
    "Logistic Regression": logreg_score,
    "SVM": svc_score,
    "KNN": knn_score,
    "XGBoost": xgbc_score
}

# Convert the dictionary to a DataFrame
scores_scaled_NO = pd.DataFrame(list(model_scores_scaled_NO.items()), columns=['Model-Scaling-No Outliers', 'Accuracy Score'])

# Sort the DataFrame by 'Accuracy Score' in descending order
scores_scaled_NO = scores_scaled_NO.sort_values(by='Accuracy Score', ascending=False)

# Display the DataFrame for both sets of scores
print(scores_df)
print(scores_scaled)
print(scores_scaled_NO)

print(classification_report(y_test_NO, xgbc.predict(X_test_stndrd_NO)))

                 Model  Accuracy Score
5              XGBoost        0.893452
1        Random Forest        0.887902
0        Decision Tree        0.851276
4                  KNN        0.623751
2  Logistic Regression        0.529967
3                  SVM        0.490566
         Model-Scaling  Accuracy Score
5              XGBoost        0.893452
1        Random Forest        0.892342
2  Logistic Regression        0.876249
3                  SVM        0.869589
0        Decision Tree        0.846837
4                  KNN        0.809101
  Model-Scaling-No Outliers  Accuracy Score
1             Random Forest        0.884097
2       Logistic Regression        0.882075
3                       SVM        0.879380
0             Decision Tree        0.828841
5                   XGBoost        0.671833
4                       KNN        0.598383
              precision    recall  f1-score   support

           0       0.45      0.92      0.60       366
           1       1.00      0.02    

We saw drastic performance loss in some of our models, and even some performance loss in our top performaning models. This indicates that the so-called outliers, can not simply be filtered out using the methods that were employed. We will continue with the data as is, and look at other possibilities for preprocessing that may improve performance. 

### Logarithmic Scaling

As we saw in our data exploration, as well as when viewing the descriptive statistics of our cleaned dataset, there was a very wide range of values for many of the variables. Despite the scaling, these extreme values can still negatively impact our model performance. Another method for dealing with extreme values is log scaling. 

Log scaling also makes sense in the context of our data as it is measurements on an astronomical scale, where magnitudes can very logarithmically.

As there are no built in log-scalers, we can create our own. 

In [45]:
#Import relevant libraries
from sklearn.base import BaseEstimator, TransformerMixin

#Create a class to scale the data using the log function from numpy
class LogScaler(BaseEstimator, TransformerMixin):
    def __init__(self, epsilon=1e-5):
        self.epsilon = epsilon  # Small constant to handle zero values

    def fit(self, X, y=None):
        return self

    #ned to take the absolute value of the data to avoid errors
    def transform(self, X):
        return np.log1p(np.abs(X + self.epsilon))

In [46]:
#apply the log scaler to the data
log_scaler = LogScaler(epsilon=1e-5)

X_train_log = log_scaler.fit_transform(X_train)
X_test_log = log_scaler.transform(X_test)

We can train our models without our Standard or MinMax Scalers to compare performance:

In [47]:
#train Decision tree classifier
dtc.fit(X_train_log, y_train)

#train Random forest classifier
rfc.fit(X_train_log, y_train)

#train Logistic regression classifier
logreg.fit(X_train_log, y_train)

#train Support vector machine classifier
svc.fit(X_train_log, y_train)

#train K-nearest neighbors classifier
knn.fit(X_train_log, y_train)

#train XGBoost classifier
xgbc.fit(X_train_log, y_train)

In [48]:
#Decision tree classifier score
dtc_score = dtc.score(X_test_log, y_test)

#Random forest classifier score
rfc_score = rfc.score(X_test_log, y_test)

#Logistic regression classifier score
logreg_score = logreg.score(X_test_log, y_test)

#Support vector machine classifier score
svc_score = svc.score(X_test_log, y_test)

#K-nearest neighbors classifier score
knn_score = knn.score(X_test_log, y_test)

#XGBoost classifier score
xgbc_score = xgbc.score(X_test_log, y_test)

Compare with our scaled data using normal transformers:

In [50]:
#Print the scores

# Create a dictionary with the model names and their scores
model_scores_log = {
    "Decision Tree": dtc_score,
    "Random Forest": rfc_score,
    "Logistic Regression": logreg_score,
    "SVM": svc_score,
    "KNN": knn_score,
    "XGBoost": xgbc_score
}

# Convert the dictionary to a DataFrame
scores_log = pd.DataFrame(list(model_scores_log.items()), columns=['Model-Log-Scaling', 'Accuracy Score'])

# Sort the DataFrame by 'Accuracy Score' in descending order
scores_log = scores_log.sort_values(by='Accuracy Score', ascending=False)

# Display the DataFrame for both sets of scores
print(scores_scaled)
print(scores_log)

print(f"\n Classification Report for XGBoost: \n{classification_report(y_test, xgbc.predict(X_test_log))}")
print(f"\n Classification Report for Random Forest: \n{classification_report(y_test, rfc.predict(X_test_minmax))}")
print(f"\n Classification Report for Logistic Regression: \n{classification_report(y_test, logreg.predict(X_test_stndrd))}")
print(f"\n Classification Report for SVM: \n{classification_report(y_test, svc.predict(X_test_minmax))}")
print(f"\n Classification Report for Decision Tree: \n{classification_report(y_test, dtc.predict(X_test_minmax))}")
print(f"\n Classification Report for KNN: \n{classification_report(y_test, knn.predict(X_test_minmax))}")

         Model-Scaling  Accuracy Score
5              XGBoost        0.893452
1        Random Forest        0.890677
2  Logistic Regression        0.876249
3                  SVM        0.869589
0        Decision Tree        0.847392
4                  KNN        0.809101
     Model-Log-Scaling  Accuracy Score
5              XGBoost        0.896781
2  Logistic Regression        0.894562
1        Random Forest        0.891232
3                  SVM        0.865705
0        Decision Tree        0.846282
4                  KNN        0.798557

 Classification Report for XGBoost: 
              precision    recall  f1-score   support

           0       0.82      0.77      0.79       452
           1       0.80      0.83      0.82       466
           2       0.99      0.99      0.99       884

    accuracy                           0.90      1802
   macro avg       0.87      0.87      0.87      1802
weighted avg       0.90      0.90      0.90      1802


 Classification Report for Randome

Looking at the output from our `classification_report`, we see some obvious issues with the performance of our models. XGBoost out of the box is doing okay, but still favours the majority class. 

Random Forest had precision, recall and f1-score for 1 (a CONFIRMED exoplanet) of zero. Meaning it may have only ever predicted 0 and 2 for all test values. 

Logistic Regression has okay scores for class 2 (FALSE POSITIVE), but very poor scores for our CONFIRMED class. 

These types of patterns of poor performance are similar for the remaining models.

### Feature Selection

As a next step, we are going to perorm feature selection to see if we can out do any of the above scores. 

The most basic feature selection method we can apply, is simply removing columns we suspect may negatively impact model performance. Essentially we can complete this, then compare performance. 

To start off, we can remove the error columns, which constitute about 2/3rds of the columns. 

In [94]:
#remove error columns from kois
kois_no_err = kois.loc[:, ~kois.columns.str.contains('_err')]
print(kois_no_err.shape)
kois_no_err.head()

(9007, 20)


Unnamed: 0_level_0,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_time0bk,koi_impact,koi_duration,koi_depth,koi_prad,koi_teq,koi_insol,koi_model_snr,koi_steff,koi_slogg,koi_srad,ra,dec,koi_kepmag,koi_disposition_encoded
rowid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,0,0,0,0,9.488036,170.53875,0.146,2.9575,615.8,2.26,793.0,93.59,35.8,5455.0,4.467,0.927,291.93423,48.141651,15.347,1
2,0,0,0,0,54.418383,162.51384,0.586,4.507,874.8,2.83,443.0,9.11,25.8,5455.0,4.467,0.927,291.93423,48.141651,15.347,1
3,0,1,0,0,19.89914,175.850252,0.969,1.7822,10829.0,14.6,638.0,39.3,76.3,5853.0,4.544,0.868,297.00482,48.134129,15.436,2
4,0,1,0,0,1.736952,170.307565,1.276,2.40641,8079.2,33.46,1395.0,891.96,505.6,5805.0,4.564,0.791,285.53461,48.28521,15.597,2
5,0,0,0,0,2.525592,171.59555,0.701,1.6545,603.3,2.75,1406.0,926.16,40.9,6031.0,4.438,1.046,288.75488,48.2262,15.509,1


In [95]:
#separate our target variable from the rest of the data
y_no_err = kois_no_err['koi_disposition_encoded']
X_no_err = kois_no_err.drop(['koi_disposition_encoded'], axis=1)

#split the data into training and testing sets
X_train_no_err, X_test_no_err, y_train_no_err, y_test_no_err = train_test_split(X_no_err, y_no_err, test_size=0.2, random_state=42) #use a test size of 20%

We will stick to XGBoost moving forward as it is has been our best performing model.

In [109]:
#train XGBoost classifier
xgbc.fit(X_train_no_err, y_train_no_err)

#XGBoost classifier score
xgbc_score = xgbc.score(X_test_no_err, y_test_no_err)

#classification report for base model and XGBoost
print(f"\n Base Classification Report: \n{base_report}")
print(f"\n Classification Report for XGBoost: \n{classification_report(y_test_no_err, xgbc.predict(X_test_no_err))}")


 Base Classification Report: 
              precision    recall  f1-score   support

           0       0.81      0.77      0.79       452
           1       0.80      0.82      0.81       466
           2       0.99      0.99      0.99       884

    accuracy                           0.89      1802
   macro avg       0.86      0.86      0.86      1802
weighted avg       0.89      0.89      0.89      1802


 Classification Report for XGBoost: 
              precision    recall  f1-score   support

           0       0.83      0.79      0.81       452
           1       0.81      0.84      0.83       466
           2       0.99      0.99      0.99       884

    accuracy                           0.90      1802
   macro avg       0.88      0.88      0.88      1802
weighted avg       0.90      0.90      0.90      1802



We can see when comparing to the base report, that we see some slight performance improvements by dropping the error columns. 

We can apply a couple more methods before we make a decision as to the best approach for feature selection. 

In [131]:
#import feature selection tools
from sklearn.feature_selection import SelectKBest, f_classif

#apply feature selection to the data
fs = SelectKBest(score_func=f_classif, k=15)

#fit the feature selector to the training data
X_selected = fs.fit_transform(X_train, y_train)
#transform the test data
X_test_selected = fs.transform(X_test)

#check the shape of the selected data
print(X_selected.shape)

(7205, 15)


In [132]:
#train the XGBoost model using the subset of features
xgbc.fit(X_selected, y_train)

#XGBoost classifier score
xgbc_score_kbest = xgbc.score(X_test_selected, y_test)

#classification report for base model and XGBoost
xgboost_clf_report = classification_report(y_test, xgbc.predict(X_test_selected))

print(f"\n Base Classification Report: \n{base_report}")
print(f"\n Classification Report for XGBoost: \n{xgboost_clf_report}")


 Base Classification Report: 
              precision    recall  f1-score   support

           0       0.81      0.77      0.79       452
           1       0.80      0.82      0.81       466
           2       0.99      0.99      0.99       884

    accuracy                           0.89      1802
   macro avg       0.86      0.86      0.86      1802
weighted avg       0.89      0.89      0.89      1802


 Classification Report for XGBoost: 
              precision    recall  f1-score   support

           0       0.81      0.79      0.80       452
           1       0.81      0.82      0.81       466
           2       0.99      0.99      0.99       884

    accuracy                           0.90      1802
   macro avg       0.87      0.87      0.87      1802
weighted avg       0.90      0.90      0.90      1802

