In [52]:
# Update sklearn to prevent version mismatches
!pip install sklearn --upgrade

Requirement already up-to-date: sklearn in /Users/annmcnamara/tableauApp/virt/lib/python3.7/site-packages (0.0)


In [53]:
# install joblib. This will be used to save your model. 
# Restart your kernel after installing 
!pip install joblib



In [54]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression

from sklearn.feature_selection import RFECV

%matplotlib inline

# Read the CSV and Perform Basic Data Cleaning

In [55]:
df = pd.read_csv("exoplanet_data.csv")
# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')
y = df['koi_disposition']
df = df.drop(columns=['koi_disposition'])
# Drop the null rows
df.head()

Unnamed: 0,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,koi_time0bk_err2,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,-0.00352,...,-81,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,0,1,0,0,19.89914,1.49e-05,-1.49e-05,175.850252,0.000581,-0.000581,...,-176,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
2,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,-0.000115,...,-174,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
3,0,0,0,0,2.525592,3.76e-06,-3.76e-06,171.59555,0.00113,-0.00113,...,-211,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509
4,0,0,0,0,4.134435,1.05e-05,-1.05e-05,172.97937,0.0019,-0.0019,...,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.22467,15.714


In [56]:
df.columns

Index(['koi_fpflag_nt', 'koi_fpflag_ss', 'koi_fpflag_co', 'koi_fpflag_ec',
       'koi_period', 'koi_period_err1', 'koi_period_err2', 'koi_time0bk',
       'koi_time0bk_err1', 'koi_time0bk_err2', 'koi_impact', 'koi_impact_err1',
       'koi_impact_err2', 'koi_duration', 'koi_duration_err1',
       'koi_duration_err2', 'koi_depth', 'koi_depth_err1', 'koi_depth_err2',
       'koi_prad', 'koi_prad_err1', 'koi_prad_err2', 'koi_teq', 'koi_insol',
       'koi_insol_err1', 'koi_insol_err2', 'koi_model_snr', 'koi_tce_plnt_num',
       'koi_steff', 'koi_steff_err1', 'koi_steff_err2', 'koi_slogg',
       'koi_slogg_err1', 'koi_slogg_err2', 'koi_srad', 'koi_srad_err1',
       'koi_srad_err2', 'ra', 'dec', 'koi_kepmag'],
      dtype='object')

A full description of the columns here: https://exoplanetarchive.ipac.caltech.edu/docs/API_kepcandidate_columns.html



In [57]:
# Set features. This will also be used as your x values.
# selected_features = df[['koi_kepmag', 'koi_period', 
#                         'koi_duration', 'ra', 'dec', 'koi_slogg', 'koi_prad', 'koi_insol',
#                         'koi_srad', 'koi_steff', 'koi_teq', 'koi_depth',]]



# Create a Train Test Split

Use `koi_disposition` for the y values

In [58]:
from sklearn.model_selection import train_test_split

In [59]:
X = df  # assuming all X data is numerical so x_dummies here would not change data.

In [60]:
X

Unnamed: 0,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,koi_time0bk_err2,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,0,0,0,0,54.418383,2.479000e-04,-2.479000e-04,162.513840,0.003520,-0.003520,...,-81,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,0,1,0,0,19.899140,1.490000e-05,-1.490000e-05,175.850252,0.000581,-0.000581,...,-176,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
2,0,1,0,0,1.736952,2.630000e-07,-2.630000e-07,170.307565,0.000115,-0.000115,...,-174,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.285210,15.597
3,0,0,0,0,2.525592,3.760000e-06,-3.760000e-06,171.595550,0.001130,-0.001130,...,-211,4.438,0.070,-0.210,1.046,0.334,-0.133,288.75488,48.226200,15.509
4,0,0,0,0,4.134435,1.050000e-05,-1.050000e-05,172.979370,0.001900,-0.001900,...,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.224670,15.714
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6986,0,0,0,1,8.589871,1.846000e-04,-1.846000e-04,132.016100,0.015700,-0.015700,...,-152,4.296,0.231,-0.189,1.088,0.313,-0.228,298.74921,46.973351,14.478
6987,0,1,1,0,0.527699,1.160000e-07,-1.160000e-07,131.705093,0.000170,-0.000170,...,-166,4.529,0.035,-0.196,0.903,0.237,-0.079,297.18875,47.093819,14.082
6988,0,0,0,0,1.739849,1.780000e-05,-1.780000e-05,133.001270,0.007690,-0.007690,...,-220,4.444,0.056,-0.224,1.031,0.341,-0.114,286.50937,47.163219,14.757
6989,0,0,1,0,0.681402,2.430000e-06,-2.430000e-06,132.181750,0.002850,-0.002850,...,-236,4.447,0.056,-0.224,1.041,0.341,-0.114,294.16489,47.176281,15.385


In [61]:
y

0            CONFIRMED
1       FALSE POSITIVE
2       FALSE POSITIVE
3            CONFIRMED
4            CONFIRMED
             ...      
6986    FALSE POSITIVE
6987    FALSE POSITIVE
6988         CANDIDATE
6989    FALSE POSITIVE
6990    FALSE POSITIVE
Name: koi_disposition, Length: 6991, dtype: object

In [62]:
#Split data into training and testing data

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify = y)

In [63]:
X_train.head()

Unnamed: 0,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,koi_time0bk_err2,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
4002,0,0,1,0,99.673478,0.0003463,-0.0003463,219.33483,0.0023,-0.0023,...,-148,4.777,0.04,-0.027,0.492,0.026,-0.027,293.05801,45.248821,15.801
4246,0,1,0,0,0.592244,9e-08,-9e-08,131.654831,0.000124,-0.000124,...,-146,4.664,0.056,-0.032,0.591,0.045,-0.045,290.28094,45.46426,15.653
548,0,1,1,0,9.991625,5.36e-06,-5.36e-06,137.447816,0.000445,-0.000445,...,-176,4.338,0.153,-0.187,1.096,0.309,-0.206,301.04239,45.022888,14.039
3953,0,1,0,0,178.41299,3.1e-05,-3.1e-05,218.225235,0.000127,-0.000127,...,-134,4.346,0.084,-0.126,1.148,0.202,-0.124,288.32785,38.627621,13.944
2362,0,0,0,0,45.294223,5.6e-05,-5.6e-05,138.678725,0.000987,-0.000987,...,-68,4.347,0.03,-0.03,1.044,0.057,-0.042,285.67938,50.241299,10.961


# Pre-processing

Scale the data using the MinMaxScaler and perform some feature selection

In [64]:
# Scale or Normalize your data. Use MinMaxScaler i

from sklearn.preprocessing import MinMaxScaler
X_scaler = MinMaxScaler().fit(X_train)


In [65]:
# Transform the training and testing data using the X_scaler and y_scaler models

X_train_scaled = X_scaler.transform(X_train)
X_test_scaled  = X_scaler.transform(X_test)
# dont scale y values as they are catagories - scaling would not work

# Train the Model


In [66]:
model = LogisticRegression(max_iter = 1000)  #gives a warning without max_iter
model.fit(X_train_scaled, y_train)

model_training_score = round(model.score(X_train_scaled, y_train)*100,2)
base_accuracy        = round(model.score(X_test_scaled, y_test)  *100,2)

print(f"Training Data Score: {model_training_score} %")
print(f"Testing Data Score : {base_accuracy} %")

Training Data Score: 85.03 %
Testing Data Score : 85.07 %


# Hyperparameter Tuning

Use `GridSearchCV` to tune the model's parameters

In [67]:
# Create the RANDOM FOREST CLASSIFIER  Model
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=100)

In [68]:
rf = rf.fit(X_train_scaled, y_train)

In [69]:
rf.score(X_test_scaled, y_test)

0.8953089244851259

In [40]:
# Random Forests in sklearn will automatically calculate feature importance
importances = rf.feature_importances_
importances

array([0.09746036, 0.05915372, 0.10316696, 0.03590394, 0.02223312,
       0.01908639, 0.01939675, 0.01431682, 0.02184875, 0.02211263,
       0.01689058, 0.01147822, 0.01168642, 0.02623634, 0.03217698,
       0.03687702, 0.01717773, 0.0138828 , 0.01410481, 0.05826939,
       0.02776361, 0.03275025, 0.01751941, 0.01422499, 0.01752585,
       0.01211494, 0.05380129, 0.00345359, 0.00956558, 0.0307613 ,
       0.03389958, 0.01027652, 0.00831051, 0.0096209 , 0.01007168,
       0.01115486, 0.00862527, 0.01234622, 0.01188302, 0.01087092])

In [71]:
# Check which features the model thinks are most important
ranked_features = sorted(zip(rf.feature_importances_, df.columns), reverse=True)
#ranked_features

[(0.10655373188293113, 'koi_fpflag_co'),
 (0.10327002454292562, 'koi_fpflag_nt'),
 (0.0698731000377088, 'koi_fpflag_ss'),
 (0.05942773663497868, 'koi_model_snr'),
 (0.044528415674509284, 'koi_prad'),
 (0.035841410353988475, 'koi_fpflag_ec'),
 (0.03343811327278438, 'koi_prad_err1'),
 (0.03200636727825291, 'koi_duration_err1'),
 (0.03115353503986388, 'koi_duration_err2'),
 (0.03033935623017623, 'koi_steff_err1'),
 (0.028506887089842625, 'koi_prad_err2'),
 (0.025644635590576975, 'koi_time0bk_err1'),
 (0.025059777404088945, 'koi_duration'),
 (0.024375166405103094, 'koi_steff_err2'),
 (0.022414188298802804, 'koi_period'),
 (0.02135763319234444, 'koi_period_err1'),
 (0.020794542791340347, 'koi_insol_err1'),
 (0.02051959287092106, 'koi_time0bk_err2'),
 (0.01946943561900689, 'koi_depth'),
 (0.01767009033181218, 'koi_period_err2'),
 (0.01760483198152616, 'koi_impact'),
 (0.016112420252513734, 'koi_teq'),
 (0.014108402118777858, 'koi_depth_err1'),
 (0.013949203022197805, 'koi_depth_err2'),
 (0.0

In [89]:
# Remove features with Score < 0.015
selected_features = []
#print(len(ranked_features))

for i in range(len(ranked_features)):
    if ranked_features[i][0] > 0.015:
        selected_features.append(ranked_features[i][1])
#print(len(selected_features))

selected_features

['koi_fpflag_co',
 'koi_fpflag_nt',
 'koi_fpflag_ss',
 'koi_model_snr',
 'koi_prad',
 'koi_fpflag_ec',
 'koi_prad_err1',
 'koi_duration_err1',
 'koi_duration_err2',
 'koi_steff_err1',
 'koi_prad_err2',
 'koi_time0bk_err1',
 'koi_duration',
 'koi_steff_err2',
 'koi_period',
 'koi_period_err1',
 'koi_insol_err1',
 'koi_time0bk_err2',
 'koi_depth',
 'koi_period_err2',
 'koi_impact',
 'koi_teq']

In [90]:
## Assign new data to X 
X_train_select = X_train[selected_features]
X_test_select  = X_test[selected_features]

X_scaler       = MinMaxScaler().fit(X_train_select)

X_train_scaled = X_scaler.transform(X_train_select)
X_test_scaled  = X_scaler.transform(X_test_select)

## Train new model
model_2 = RandomForestClassifier(n_estimators=200)
model_2.fit(X_train_scaled, y_train)

model_2_training_score   = round(model_2.score(X_train_scaled, y_train)*100,2)
select_features_accuracy = round(model_2.score(X_test_scaled, y_test)*100,2)

print(f"Training Data Score: {model_2_training_score} %")
print(f"Testing Data Score: {select_features_accuracy} %")

Training Data Score: 100.0 %
Testing Data Score: 88.96 %


## Hypertuning
While model parameters are learned during training — such as the slope and intercept in a linear regression — hyperparameters must be set by the data scientist before training. In the case of a random forest, hyperparameters include the number of decision trees in the forest and the number of features considered by each tree when splitting a node. 

In [33]:
from pprint import pprint

In [34]:
# Look at parameters used by our current forest
print('Parameters currently in use:\n')
pprint(rf.get_params())

Parameters currently in use:

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}


#### Parameters
There are a lot of parameters to tune - this notebook will focus on the following:

* n_estimators = number of trees in the forest
* max_features = max number of features considered for splitting a node
* max_depth = max number of levels in each decision tree
* min_samples_split = min number of data points placed in a node before the node is split
* min_samples_leaf = min number of data points allowed in a leaf node
* bootstrap = method for sampling data points (with or without replacement)

In [35]:
from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
pprint(random_grid)

{'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}


On each iteration, the algorithm will choose a difference combination of the features. Altogether, there are 2 * 12 * 2 * 3 * 3 * 10 = 4320 settings! However, the benefit of a random search is that we are not trying every combination, but selecting at random to sample a wide range of values.

### Random Search Training
Now, we instantiate the random search and fit it like any Scikit-Learn model

In [93]:
# Create the RandomSearchCV model
# This takes a few minutes...
from sklearn.model_selection import GridSearchCV

random_forest = RandomForestClassifier(random_state=42)

param_grid = {
    'n_estimators': [200, 600, 1200, 1400],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth': [14, 15, 16, 17, 18, None]
}
grid = GridSearchCV(random_forest, param_grid, cv=5, verbose=3, n_jobs=-1)

# Train the model with GridSearch
_ = grid.fit(X_train_scaled, y_train)

Fitting 5 folds for each of 72 candidates, totalling 360 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:  5.5min
[Parallel(n_jobs=-1)]: Done 280 tasks      | elapsed: 13.7min
[Parallel(n_jobs=-1)]: Done 360 out of 360 | elapsed: 17.9min finished


## Train the Model
Now GridSearch gave the best parameters train the tuned model. 

In [94]:
# List the best parameters for this dataset
print(grid.best_params_)
print(grid.best_score_)

{'max_depth': 18, 'max_features': 'auto', 'n_estimators': 1400}
0.9000602172916409


In [95]:
# Tuned parameters

max_features = grid.best_params_['max_features']
n_estimators = grid.best_params_['n_estimators']
max_depth    = grid.best_params_['max_depth']
criterion    = 'entropy'

# Tuned model
tuned_model = RandomForestClassifier(max_features=max_features, n_estimators=n_estimators, 
                                     criterion=criterion, max_depth=max_depth, random_state=42)
tuned_model.fit(X_train_scaled, y_train)

random_forest_training_score = round(tuned_model.score(X_train_scaled, y_train)*100,2)
tuned_accuracy = round(tuned_model.score(X_test_scaled, y_test)*100,2)

print(f"Training Data Score: {random_forest_training_score} %")
print(f"Testing Data Score: {tuned_accuracy} %")

Training Data Score: 99.01 %
Testing Data Score: 89.36 %


In [99]:
# Calculate classification report
from sklearn.metrics import classification_report
predictions = tuned_model.predict(X_test_scaled)
print(classification_report(y_test, predictions,
                            target_names=['Candidate', 'Confirmed', 'False Positive']))


                precision    recall  f1-score   support

     Candidate       0.84      0.73      0.78       422
     Confirmed       0.78      0.85      0.81       450
False Positive       0.98      1.00      0.99       876

      accuracy                           0.89      1748
     macro avg       0.87      0.86      0.86      1748
  weighted avg       0.89      0.89      0.89      1748



# Save the Model

In [100]:
# save your model by updating "your_name" with your name
# and "your_model" with your model variable
# be sure to turn this in to BCS
# if joblib fails to import, try running the command to install in terminal/git-bash
import joblib
filename = 'AM_Model_RandomForest.sav'
joblib.dump(tuned_model, filename)

['AM_Model_RandomForest.sav']