## Random Forest Model

In [None]:
# Libraries
import pandas as pd

### Read the CSV and Basic Data Cleaning

In [None]:
# Load the raw exoplanet table, then discard every column that is entirely
# null, followed by every row that still contains a missing value.
df = (
    pd.read_csv("../Resources/exoplanet_data.csv")
    .dropna(axis='columns', how='all')
    .dropna()
)
df.head()

### Select features

In [None]:
# One-hot encode the column to predict; every other column is left as-is.
target_column = 'koi_disposition'
data_multiclass = pd.get_dummies(df, columns=[target_column])
data_multiclass.head()

In [None]:
# Feature matrix (used as the X values).
# We choose some parameters related to the light curve produced when a planet
# passes in front of a star, and its period.
feature_columns = [
    'koi_fpflag_ss',
    'koi_fpflag_co',
    'koi_fpflag_nt',
    'koi_period',
    'koi_time0bk',
    'koi_depth',
]
selected_features = data_multiclass[feature_columns]

In [None]:
# Target (the y values): the three one-hot disposition columns, one per class.
# NOTE(review): a multi-column indicator target appears to make scikit-learn
# fit this as a multilabel problem (each column predicted independently) --
# confirm that is intended rather than a single-column multiclass target.
disposition_columns = [
    'koi_disposition_CANDIDATE',
    'koi_disposition_CONFIRMED',
    'koi_disposition_FALSE POSITIVE',
]
y = data_multiclass[disposition_columns]

### Create a Train Test Split

In [None]:
from sklearn.model_selection import train_test_split

# Split features and targets into train/test partitions; the fixed seed keeps
# the partition identical between runs.
split_parts = train_test_split(selected_features, y, random_state=1)
X_train, X_test, y_train, y_test = split_parts

### Pre-processing

Scale the features with `StandardScaler`, fitting the scaler on the training split only.

In [None]:
# Fit a standard scaler on the training features only.
from sklearn.preprocessing import StandardScaler

X_scaler = StandardScaler()
X_scaler.fit(X_train)

In [None]:
# Apply the already-fitted scaler to both partitions.
X_train_scaled, X_test_scaled = (
    X_scaler.transform(partition) for partition in (X_train, X_test)
)

### Train the Model

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Baseline random forest with 200 trees, trained on the scaled training data.
# A fixed random_state (same seed as the train/test split) makes the fitted
# trees, and therefore the score and feature importances below, reproducible
# across Restart & Run All.
rf = RandomForestClassifier(n_estimators=200, random_state=1).fit(
    X_train_scaled, y_train
)

### Evaluate the Model

In [None]:
# Score the fitted baseline model on the held-out, scaled test partition.
rf.score(X=X_test_scaled, y=y_test)

In [None]:
# Features ranked by how much they influenced the classification
# (highest importance first), as (importance, column name) pairs.
importance_pairs = zip(rf.feature_importances_, selected_features.columns)
sorted(importance_pairs, reverse=True)

### Hyperparameter Tuning

Use `GridSearchCV` to tune the model's parameters

In [None]:
# Define the hyperparameter grid for GridSearchCV.
from sklearn.model_selection import GridSearchCV

# 'auto' is intentionally not listed for max_features: it was deprecated in
# scikit-learn 1.1 and removed in 1.3 (fits using it fail there), and on older
# versions it is identical to 'sqrt' for classifiers, so it only duplicated work.
param_grid = {
    'n_estimators': [100, 500],
    'max_features': ['sqrt', 'log2'],
}

In [None]:
# Run the grid search over `param_grid`, using `rf` as the base estimator;
# verbose=3 prints progress for each fit.
grid = GridSearchCV(estimator=rf, param_grid=param_grid, verbose=3)
grid.fit(X_train_scaled, y_train)

In [None]:
# Report the winning parameter combination and its cross-validated score,
# one per line.
print(grid.best_params_, grid.best_score_, sep="\n")

### Save the Model

In [None]:
# Persist the trained model to disk with joblib.
# if joblib fails to import, try running the command to install in terminal/git-bash
from pathlib import Path

import joblib

filename = 'save_models/anan.sav'

# joblib.dump does not create missing directories, so make sure the target
# folder exists before writing.
Path(filename).parent.mkdir(parents=True, exist_ok=True)

# Save the estimator selected by the grid search (with GridSearchCV's default
# refit=True it is retrained on the whole training set) rather than the
# untuned baseline `rf`.
joblib.dump(grid.best_estimator_, filename)