In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
import joblib

In [None]:
# Load the raw dataset.
# NOTE(review): the file name is spelled "exploanet" -- confirm it matches the file on disk.
data = pd.read_csv("Resources/exploanet_data.csv")

# Drop columns that contain no values at all
data = data.dropna(axis='columns', how='all')

# Drop every remaining row that has at least one missing value
data = data.dropna()

# Convert every int64 column to float64 in a single astype call
int_columns = data.select_dtypes(include='int64').columns
data = data.astype({column: 'float64' for column in int_columns})

In [None]:
# The target is the disposition column; every other column is a feature
Y = data["koi_disposition"]
X = data.drop(columns=["koi_disposition"])

# Split into training and testing groups, stratified on the target
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    stratify=Y,
    random_state=42,
)

In [None]:
# Fit the min-max scaler on the training features only, then apply the
# same fitted scaler to both the training and the testing features
X_scaler = MinMaxScaler()
X_train_scaled = X_scaler.fit_transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [None]:
# Base model: a random forest trained on every scaled feature.
# A random forest is used because the feature-ranking step reads
# `model_1.feature_importances_`, and RandomForestClassifier is the
# estimator this notebook imports. The seed keeps re-runs reproducible.
model_1 = RandomForestClassifier(n_estimators=200, random_state=42)
model_1.fit(X_train_scaled, Y_train)

# Accuracy as a percentage, rounded to three decimals
model_1_training_score = round(model_1.score(X_train_scaled, Y_train)*100,3)
base_accuracy = round(model_1.score(X_test_scaled, Y_test)*100,3)

print(f"Training Data Score: {model_1_training_score} %")
print(f"Testing Data score: {base_accuracy} %")

In [None]:
# Rank the features by the importance score the base model assigned them
feature_names = X.columns.tolist()

# (score, name) pairs, highest score first
preSelected_features = list(zip(model_1.feature_importances_, feature_names))
preSelected_features.sort(reverse=True)

# Same ranking as a table indexed by feature name
ranked_features = (
    pd.DataFrame(preSelected_features, columns=['Score', 'Feature'])
    .set_index('Feature')
)
ranked_features

In [None]:
# Keep only the features whose score is greater than 0.01
selected_features = [
    feature
    for score, feature in preSelected_features
    if score > 0.01
]

In [None]:
# Use the reduced feature set for all subsequent models
X_train_select = X_train[selected_features]
X_test_select = X_test[selected_features]

# Re-fit the scaler on the selected training columns only.
# This rebinds X_scaler / X_train_scaled / X_test_scaled, so every later
# cell works on the reduced, rescaled data.
X_scaler = MinMaxScaler().fit(X_train_select)
X_train_scaled = X_scaler.transform(X_train_select)
X_test_scaled = X_scaler.transform(X_test_select)

# Train new model (seeded so that re-running the cell gives the same scores)
model_2 = RandomForestClassifier(n_estimators=200, random_state=42)
model_2.fit(X_train_scaled, Y_train)

# Accuracy as a percentage, rounded to three decimals
model_2_training_score = round(model_2.score(X_train_scaled, Y_train)*100,3)
select_features_accuracy = round(model_2.score(X_test_scaled, Y_test)*100,3)

print(f"Training Data Score: {model_2_training_score} %")
print(f"Testing Data Score: {select_features_accuracy} %")

In [None]:
# Create the GridSearchCV model
model_3 = RandomForestClassifier(random_state=42)

# Hyper-parameter grid to search exhaustively.
# 'auto' is left out of max_features: for classifiers it was an alias of
# 'sqrt' (so it only duplicated work) and it was removed in scikit-learn 1.3.
param_grid = {
    'n_estimators': [200, 600, 1200, 1400],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [14, 15, 16, 17, 18, None]
}
# 5-fold cross-validation, using all available cores
grid = GridSearchCV(model_3, param_grid, cv=5, verbose=3, n_jobs=-1)

# Train the model with GridSearch (assigned to _ to suppress the cell's repr output)
_ = grid.fit(X_train_scaled, Y_train)

In [None]:
# Tuned parameters, taken from the best candidate found by the grid search
best_params = grid.best_params_
max_features = best_params['max_features']
n_estimators = best_params['n_estimators']
max_depth = best_params['max_depth']
# criterion is set by hand; it is not read from the grid search results
criterion = 'entropy'

# Tuned model
tuned_model = RandomForestClassifier(
    max_features=max_features,
    n_estimators=n_estimators,
    criterion=criterion,
    max_depth=max_depth,
    random_state=42,
)
tuned_model.fit(X_train_scaled, Y_train)

# Accuracy as a percentage, rounded to three decimals
model_3_training_score = round(tuned_model.score(X_train_scaled, Y_train)*100,3)
tuned_accuracy = round(tuned_model.score(X_test_scaled, Y_test)*100,3)

print(f"Training Data Score: {model_3_training_score} %")
print(f"Testing Data Score: {tuned_accuracy} %")

In [None]:
# Predict a label for every test row with the tuned model
predictions = tuned_model.predict(X_test_scaled)
classifications = Y_test.unique().tolist()

# Pair each actual label with the model's prediction
prediction_actual = {'Actual': Y_test, 'Prediction': predictions}

# Moving 'Actual' into the index and back out again discards the index
# inherited from Y_test and leaves a fresh 0..n-1 index
PA_df = (
    pd.DataFrame(prediction_actual)
    .set_index('Actual')
    .reset_index()
)
PA_df.head(15)

In [None]:
# Summarise the test accuracy of the three models, formatted as percentages
evaluations = {
    '': ['Base Model', 'Select Features Model', 'Tuned Model'],
    'Accuracy': [
        f"{score}%"
        for score in (base_accuracy, select_features_accuracy, tuned_accuracy)
    ],
}

# Index the table by model name (the index column has an empty-string label)
evaluations_df = pd.DataFrame(evaluations).set_index('')

# Save the summary next to the other resources, then display it
evaluations_df.to_csv('Resources/RandomForestClassifier_eval.csv')
evaluations_df

In [None]:
# Persist the tuned model to disk with joblib.
# The return value is bound to _ so the cell does not echo it.
filename = 'Models/BestModel_RandomForest.sav'
_ = joblib.dump(tuned_model, filename=filename)