# Dependecies

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.feature_selection import RFECV
from sklearn.ensemble import RandomForestClassifier
import joblib

# Process Data

### Data Cleanup

In [2]:
data = pd.read_csv("../Resources/exoplanet_data.csv")

# Drop null columns
data = data.dropna(axis='columns', how='all')

# Drop null rows
data = data.dropna()

# Convert dtypes of int64 to float64
for column, content in data.items():
    if data[column].dtype == 'int64':
        data = data.astype({column: 'float64'})

### Pre-prossessing

In [3]:
# Assign data to X and y
X = data.drop("koi_disposition", axis=1)
y = data["koi_disposition"]

# Split data into training and testing groups
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

In [4]:
# Scale X values
X_scaler = MinMaxScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Build the Model

### Train the Model

In [5]:
model_1 = RandomForestClassifier(n_estimators=200)
model_1.fit(X_train_scaled, y_train)

model_1_training_score = round(model_1.score(X_train_scaled, y_train)*100,3)
base_accuracy = round(model_1.score(X_test_scaled, y_test)*100,3)

print(f"Training Data Score: {model_1_training_score} %")
print(f"Testing Data Score: {base_accuracy} %")

Training Data Score: 100.0 %
Testing Data Score: 89.588 %


### Select Features

In [6]:
# Determine which features ought to be kept
feature_names = X.columns.tolist()
preSelected_features = sorted(zip(model_1.feature_importances_, feature_names), reverse=True)
ranked_features = pd.DataFrame(preSelected_features, columns=['Score', 'Feature'])
ranked_features = ranked_features.set_index('Feature')
ranked_features

Unnamed: 0_level_0,Score
Feature,Unnamed: 1_level_1
koi_fpflag_co,0.109365
koi_fpflag_nt,0.091183
koi_fpflag_ss,0.070236
koi_model_snr,0.058444
koi_prad,0.050818
koi_fpflag_ec,0.033547
koi_prad_err1,0.032845
koi_steff_err1,0.03284
koi_duration_err1,0.031482
koi_duration_err2,0.029662


In [7]:
# Remove features with Score < 0.011
selected_features = []
for tup in preSelected_features:
    if tup[0] > 0.01:
        selected_features.append(tup[1])

In [8]:
# Use new data for all subsequent models
## Assign new data to X 
X_train_select = X_train[selected_features]
X_test_select = X_test[selected_features]

X_scaler = MinMaxScaler().fit(X_train_select)
X_train_scaled = X_scaler.transform(X_train_select)
X_test_scaled = X_scaler.transform(X_test_select)

## Train new model
model_2 = RandomForestClassifier(n_estimators=200)
model_2.fit(X_train_scaled, y_train)

model_2_training_score = round(model_2.score(X_train_scaled, y_train)*100,3)
select_features_accuracy = round(model_2.score(X_test_scaled, y_test)*100,3)

print(f"Training Data Score: {model_2_training_score} %")
print(f"Testing Data Score: {select_features_accuracy} %")

Training Data Score: 100.0 %
Testing Data Score: 90.389 %


### Model Tuning

In [9]:
# Create the RandomSearchCV model
model_3 = RandomForestClassifier(random_state=42)

param_grid = {
    'n_estimators': [200, 600, 1200, 1400],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth': [14, 15, 16, 17, 18, None]
}
grid = GridSearchCV(model_3, param_grid, cv=5, verbose=3, n_jobs=-1)

# Train the model with GridSearch
_ = grid.fit(X_train_scaled, y_train)

Fitting 5 folds for each of 72 candidates, totalling 360 fits


### Train Tuned Model

In [10]:
# Tuned parameters
max_features = grid.best_params_['max_features']
n_estimators = grid.best_params_['n_estimators']
max_depth = grid.best_params_['max_depth']
criterion = 'entropy'

# Tuned model
tuned_model = RandomForestClassifier(max_features=max_features, n_estimators=n_estimators, 
                                     criterion=criterion, max_depth=max_depth, random_state=42)
tuned_model.fit(X_train_scaled, y_train)

model_3_training_score = round(tuned_model.score(X_train_scaled, y_train)*100,3)
tuned_accuracy = round(tuned_model.score(X_test_scaled, y_test)*100,3)

print(f"Training Data Score: {model_3_training_score} %")
print(f"Testing Data Score: {tuned_accuracy} %")

Training Data Score: 97.788 %
Testing Data Score: 90.446 %


# Model Predictions and Evaluations

### Predicitions

In [11]:
predictions = tuned_model.predict(X_test_scaled)
classifications = y_test.unique().tolist()

prediction_actual = {
    'Actual': y_test,
    'Prediction': predictions
}

PA_df = pd.DataFrame(prediction_actual)
PA_df = PA_df.set_index('Actual').reset_index()
PA_df.head(15)

Unnamed: 0,Actual,Prediction
0,CANDIDATE,CANDIDATE
1,FALSE POSITIVE,FALSE POSITIVE
2,FALSE POSITIVE,FALSE POSITIVE
3,FALSE POSITIVE,FALSE POSITIVE
4,CANDIDATE,CANDIDATE
5,FALSE POSITIVE,FALSE POSITIVE
6,CANDIDATE,CANDIDATE
7,FALSE POSITIVE,FALSE POSITIVE
8,FALSE POSITIVE,FALSE POSITIVE
9,FALSE POSITIVE,FALSE POSITIVE


### Evaluations

In [12]:
evaluations = {'': ['Base Model', 'Select Features Model', 'Tuned Model'],
               'Accuracy': [f"{base_accuracy}%", f"{select_features_accuracy}%", f"{tuned_accuracy}%"]}

evaluations_df = pd.DataFrame(evaluations)
evaluations_df = evaluations_df.set_index('')

evaluations_df.to_csv('../Resources/RandomForestClassifier_eval.csv')
evaluations_df

Unnamed: 0,Accuracy
,
Base Model,89.588%
Select Features Model,90.389%
Tuned Model,90.446%


# Save the Model

In [13]:
filename = '../Models/BestModel_RandomForest.sav'
_ = joblib.dump(tuned_model, filename)