# Exoplanet Exploration -  Machine Learning Assignment

__Summary__

Create machine learning models capable of classifying candidate exoplanets from the raw dataset.

The algorithms used were:

- Support Vector Machines (SVM) with Linear and Gaussian kernels 

- Random Forests

Also, a hyper-parameter tuning with the `meta-estimator` function GridSearchCV() was used.


In [14]:
# Dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as pl
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC 
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

### Read the CSV and perform basic data cleaning

In [2]:
# Read the file and create a data frame
df = pd.read_csv("cumulative.csv")
# Remove unnecessary columns
df = df.drop(columns=["rowid", "kepid", "kepoi_name", "kepler_name", "koi_pdisposition", "koi_score", "koi_tce_delivname"])
# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')
# Drop the null rows
df = df.dropna()
# View results
df.head()

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,9.488036,2.775e-05,-2.775e-05,170.53875,0.00216,...,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,CONFIRMED,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,...,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
2,FALSE POSITIVE,0,1,0,0,19.89914,1.494e-05,-1.494e-05,175.850252,0.000581,...,-176.0,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
3,FALSE POSITIVE,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,...,-174.0,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
4,CONFIRMED,0,0,0,0,2.525592,3.761e-06,-3.761e-06,171.59555,0.00113,...,-211.0,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509


In [3]:
# Print data frame info to see the type of each column
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8744 entries, 0 to 9563
Data columns (total 41 columns):
koi_disposition      8744 non-null object
koi_fpflag_nt        8744 non-null int64
koi_fpflag_ss        8744 non-null int64
koi_fpflag_co        8744 non-null int64
koi_fpflag_ec        8744 non-null int64
koi_period           8744 non-null float64
koi_period_err1      8744 non-null float64
koi_period_err2      8744 non-null float64
koi_time0bk          8744 non-null float64
koi_time0bk_err1     8744 non-null float64
koi_time0bk_err2     8744 non-null float64
koi_impact           8744 non-null float64
koi_impact_err1      8744 non-null float64
koi_impact_err2      8744 non-null float64
koi_duration         8744 non-null float64
koi_duration_err1    8744 non-null float64
koi_duration_err2    8744 non-null float64
koi_depth            8744 non-null float64
koi_depth_err1       8744 non-null float64
koi_depth_err2       8744 non-null float64
koi_prad             8744 non-null float64

### Create a Train - Test Split

In [3]:
# Use `koi_disposition` for the y values
# koi_disposition. The pipeline flag that designates the most probable physical explanation of the KOI. 
# Typical values are FALSE POSITIVE, NOT DISPOSITIONED, and CANDIDATE, according to the documentation,
# but I used value_counts to see which values are present in the data set
target = df["koi_disposition"]
target_names = ["FALSE POSITIVE", "CONFIRMED", "CANDIDATE"]
df["koi_disposition"].value_counts()

FALSE POSITIVE    4358
CONFIRMED         2272
CANDIDATE         2114
Name: koi_disposition, dtype: int64

In [4]:
# Create features
data = df.drop("koi_disposition", axis=1)
feature_names = data.columns
data.head()

Unnamed: 0,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,koi_time0bk_err2,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,0,0,0,0,9.488036,2.775e-05,-2.775e-05,170.53875,0.00216,-0.00216,...,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,-0.00352,...,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
2,0,1,0,0,19.89914,1.494e-05,-1.494e-05,175.850252,0.000581,-0.000581,...,-176.0,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
3,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,-0.000115,...,-174.0,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
4,0,0,0,0,2.525592,3.761e-06,-3.761e-06,171.59555,0.00113,-0.00113,...,-211.0,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509


In [5]:
# Split data into training and testing
X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=42)

In [7]:
# View the shape of the training set
X_train.shape

(6558, 40)

In [8]:
# View the results of the training set
X_train.head()

Unnamed: 0,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,koi_time0bk_err2,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
8017,0,1,1,0,0.806277,4.947e-06,-4.947e-06,131.78567,0.00672,-0.00672,...,-184.0,4.471,0.054,-0.229,0.996,0.324,-0.108,290.81723,38.53912,13.614
1233,0,1,1,0,3.582077,4.318e-06,-4.318e-06,355.515064,0.000864,-0.000864,...,-235.0,4.422,0.09,-0.195,0.993,0.283,-0.131,296.07822,43.13694,15.193
2592,0,0,0,0,5.060923,2.616e-05,-2.616e-05,134.47316,0.00473,-0.00473,...,-112.0,4.492,0.048,-0.112,0.911,0.121,-0.06,289.91742,40.828606,13.346
4770,0,1,0,1,8.480304,3.32e-07,-3.32e-07,135.854534,3.1e-05,-3.1e-05,...,-169.0,3.946,0.195,-0.105,2.21,0.375,-0.563,298.8002,46.665539,7.631
6632,0,0,0,1,4.994716,4.495e-05,-4.495e-05,136.1833,0.0095,-0.0095,...,-194.0,3.706,0.32,-0.08,2.83,0.458,-1.068,282.58215,46.81551,13.352


### Pre-processing

In [6]:
# Scale the data using the MinMaxScaler
# MinMaxScaler rescales the data set such that all feature values are in the range [0, 1].
# However, this scaling compress all inliers in the narrow range [0, 0.005] for the transformed number of data.
# MinMaxScaler is very sensitive to the presence of outliers.
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

  return self.partial_fit(X, y)


In [10]:
# Print the scaling of the training data
X_train_scaled
#print(X_train_scaled.shape)

array([[0.        , 1.        , 1.        , ..., 0.50165877, 0.1151529 ,
        0.59014647],
       [0.        , 1.        , 1.        , ..., 0.74236463, 0.4100286 ,
        0.73031514],
       [0.        , 0.        , 0.        , ..., 0.4604898 , 0.26198635,
        0.56635597],
       ...,
       [0.        , 1.        , 1.        , ..., 0.59210368, 0.21362786,
        0.71193964],
       [0.        , 0.        , 0.        , ..., 0.30793754, 0.14966214,
        0.57292499],
       [0.        , 1.        , 0.        , ..., 0.74359035, 0.8834343 ,
        0.58295606]])

In [11]:
# View the mean and the standard deviation of the scaled training data set
print("Mean : %s " % X_train_scaled.mean(axis=0))
print("Standard Deviation : %s " % X_train_scaled.std(axis=0))

Mean : [1.55077768e-01 2.41994511e-01 2.03568161e-01 1.24733150e-01
 5.28786383e-02 1.18744388e-02 9.88125561e-01 3.24237496e-02
 1.64351648e-02 9.83564835e-01 7.06523445e-03 2.50172967e-02
 9.91222998e-01 3.87784860e-02 1.59828644e-02 9.84017136e-01
 2.51301146e-02 1.67731795e-03 9.98322682e-01 5.28083969e-04
 7.19977547e-04 9.99530764e-01 6.87270363e-02 7.68075333e-04
 9.54842296e-04 9.99222261e-01 2.94727443e-02 3.45270771e-02
 2.29481527e-01 2.13299038e-01 9.07025104e-01 8.01709639e-01
 8.22639665e-02 8.83357804e-01 8.99116393e-03 1.06161225e-02
 9.96248097e-01 5.61297390e-01 4.54656934e-01 6.47720819e-01] 
Standard Deviation : [0.3619788  0.42829098 0.40265142 0.33041609 0.11153083 0.04583673
 0.04583673 0.04961769 0.03842203 0.03842203 0.03050101 0.12025122
 0.03022716 0.04701797 0.03240192 0.03240192 0.08632736 0.0137291
 0.0137291  0.01674574 0.01663228 0.01746772 0.05871533 0.01669954
 0.01512141 0.01824802 0.09034696 0.09400901 0.05927387 0.06917693
 0.04062109 0.08142376 0.0

In [12]:
# Print the scaling of the testing data
X_test_scaled

array([[0.        , 1.        , 1.        , ..., 0.50601949, 0.46088351,
        0.55082113],
       [0.        , 0.        , 0.        , ..., 0.30913719, 0.34116743,
        0.75552597],
       [0.        , 0.        , 0.        , ..., 0.53539106, 0.71001706,
        0.70945406],
       ...,
       [0.        , 0.        , 0.        , ..., 0.16941497, 0.5689087 ,
        0.78455393],
       [1.        , 0.        , 0.        , ..., 0.33893883, 0.7762052 ,
        0.55676875],
       [0.        , 0.        , 0.        , ..., 0.68123182, 0.43567514,
        0.59928984]])

### Training and hyper- parameter tuning

#### Linear Support Vector Machine  - First algorithm

In [7]:
# Support vector machine classifier
# First try linear kernel to see the scores
model_lin = SVC(kernel='linear', class_weight='balanced')   
model_lin.fit(X_train_scaled, y_train)

SVC(C=1.0, cache_size=200, class_weight='balanced', coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [8]:
# View the scores
print(f"SVM Linear - Training Data Score: {model_lin.score(X_train_scaled, y_train)}")
print(f"SVM Linear - Testing Data Score: {model_lin.score(X_test_scaled, y_test)}")

SVM Linear - Training Data Score: 0.8533089356511131
SVM Linear - Testing Data Score: 0.8444647758462946


In [9]:
# Use `GridSearchCV` to tune the `C` and `gamma` parameters
# Create the GridSearch estimator along with a parameter object containing the values to adjust
param_grid = {'C': [1, 5, 10, 50],
              'gamma': [0.0001, 0.0005, 0.001, 0.005]}
grid_lin = GridSearchCV(model_lin, param_grid, verbose=3)

In [10]:
# Fit the model using the grid search estimator. This will take the SVC model and try each combination of parameters
grid_lin.fit(X_train_scaled, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 3 folds for each of 16 candidates, totalling 48 fits
[CV] C=1, gamma=0.0001 ...............................................
[CV] ...... C=1, gamma=0.0001, score=0.8473491773308958, total=   0.5s
[CV] C=1, gamma=0.0001 ...............................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.9s remaining:    0.0s


[CV] ...... C=1, gamma=0.0001, score=0.8462242562929062, total=   0.5s
[CV] C=1, gamma=0.0001 ...............................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    1.8s remaining:    0.0s


[CV] ....... C=1, gamma=0.0001, score=0.851258581235698, total=   0.7s
[CV] C=1, gamma=0.0005 ...............................................
[CV] ...... C=1, gamma=0.0005, score=0.8473491773308958, total=   0.7s
[CV] C=1, gamma=0.0005 ...............................................
[CV] ...... C=1, gamma=0.0005, score=0.8462242562929062, total=   0.5s
[CV] C=1, gamma=0.0005 ...............................................
[CV] ....... C=1, gamma=0.0005, score=0.851258581235698, total=   0.5s
[CV] C=1, gamma=0.001 ................................................
[CV] ....... C=1, gamma=0.001, score=0.8473491773308958, total=   0.5s
[CV] C=1, gamma=0.001 ................................................
[CV] ....... C=1, gamma=0.001, score=0.8462242562929062, total=   0.5s
[CV] C=1, gamma=0.001 ................................................
[CV] ........ C=1, gamma=0.001, score=0.851258581235698, total=   0.8s
[CV] C=1, gamma=0.005 ................................................
[CV] .

[Parallel(n_jobs=1)]: Done  48 out of  48 | elapsed:   57.0s finished


GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=SVC(C=1.0, cache_size=200, class_weight='balanced', coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'C': [1, 5, 10, 50], 'gamma': [0.0001, 0.0005, 0.001, 0.005]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=3)

In [11]:
# Print the best parameters and score for this data set
print(f"SVM Linear - Best Training Parameters: {grid_lin.best_params_}")
print(f"SVM Linear - Best Training Score: {grid_lin.best_score_}")

SVM Linear - Best Training Parameters: {'C': 50, 'gamma': 0.0001}
SVM Linear - Best Training Score: 0.8827386398292162


In [12]:
# Make predictions with the hypertuned model
predictions_lin = grid_lin.predict(X_test_scaled)

In [13]:
# Calculate classification report
print(classification_report(y_test, predictions_lin,
                            target_names=target_names))

# The f1-score gives you the harmonic mean of precision and recall. 
# The scores corresponding to every class will tell you the accuracy 
# of the classifier in classifying the data points in that particular 
# class compared to all other classes.
# The support is the number of samples of the true response that lie in that class.

                precision    recall  f1-score   support

FALSE POSITIVE       0.84      0.66      0.74       523
     CONFIRMED       0.75      0.86      0.80       594
     CANDIDATE       0.98      1.00      0.99      1069

     micro avg       0.88      0.88      0.88      2186
     macro avg       0.86      0.84      0.84      2186
  weighted avg       0.88      0.88      0.88      2186



#### Radial Basis Function - Support Vector Machine - Second algorithm

In [44]:
# Gaussian kernel because we had three output classes
model_rbf = SVC(kernel='rbf', gamma='scale')   
model_rbf.fit(X_train_scaled, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [45]:
# View the scores
print(f"SVM Radial Basis Function - Training Data Score: {model_rbf.score(X_train_scaled, y_train)}")
print(f"SVM Radial Basis Function - Testing Data Score: {model_rbf.score(X_test_scaled, y_test)}")

SVM Radial Basis Function - Training Data Score: 0.8325709057639524
SVM Radial Basis Function - Testing Data Score: 0.807868252516011


In [48]:
# Use `GridSearchCV` to tune the `C` and `gamma` parameters
grid_rbf = GridSearchCV(model_rbf, param_grid, verbose=3)

In [49]:
# Fit the model using the grid search estimator. This will take the SVC model and try each combination of parameters
grid_rbf.fit(X_train_scaled, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 3 folds for each of 16 candidates, totalling 48 fits
[CV] C=1, gamma=0.0001 ...............................................
[CV] ...... C=1, gamma=0.0001, score=0.5013711151736746, total=   2.4s
[CV] C=1, gamma=0.0001 ...............................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    3.5s remaining:    0.0s


[CV] ...... C=1, gamma=0.0001, score=0.5016018306636155, total=   2.5s
[CV] C=1, gamma=0.0001 ...............................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    7.1s remaining:    0.0s


[CV] ...... C=1, gamma=0.0001, score=0.5016018306636155, total=   2.3s
[CV] C=1, gamma=0.0005 ...............................................
[CV] ...... C=1, gamma=0.0005, score=0.5013711151736746, total=   2.3s
[CV] C=1, gamma=0.0005 ...............................................
[CV] ...... C=1, gamma=0.0005, score=0.5016018306636155, total=   2.5s
[CV] C=1, gamma=0.0005 ...............................................
[CV] ...... C=1, gamma=0.0005, score=0.5016018306636155, total=   2.3s
[CV] C=1, gamma=0.001 ................................................
[CV] ....... C=1, gamma=0.001, score=0.5013711151736746, total=   2.3s
[CV] C=1, gamma=0.001 ................................................
[CV] ....... C=1, gamma=0.001, score=0.5016018306636155, total=   2.7s
[CV] C=1, gamma=0.001 ................................................
[CV] ....... C=1, gamma=0.001, score=0.5016018306636155, total=   2.5s
[CV] C=1, gamma=0.005 ................................................
[CV] .

[Parallel(n_jobs=1)]: Done  48 out of  48 | elapsed:  2.4min finished


GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'C': [1, 5, 10, 50], 'gamma': [0.0001, 0.0005, 0.001, 0.005]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=3)

In [83]:
# Print the best parameters and score for this data set
print(f"SVM Radial Basis Function - Best Training Parameters: {grid_rbf.best_params_}")
print(f"SVM Radial Basis Function - Best Training Score: {grid_rbf.best_score_}")

SVM Radial Basis Function - Best Training Parameters: {'C': 50, 'gamma': 0.005}
SVM Radial Basis Function - Best Training Score: 0.8388228118328759


In [51]:
# Make predictions with the hypertuned model
predictions_rbf = grid_rbf.predict(X_test_scaled)

In [52]:
# Calculate classification report
print(classification_report(y_test, predictions_rbf,
                            target_names=target_names))

                precision    recall  f1-score   support

FALSE POSITIVE       0.70      0.54      0.61       523
     CONFIRMED       0.67      0.78      0.72       594
     CANDIDATE       0.98      1.00      0.99      1069

     micro avg       0.83      0.83      0.83      2186
     macro avg       0.78      0.77      0.77      2186
  weighted avg       0.83      0.83      0.82      2186



#### Random Forest Classification - Third algorithm

In [75]:
# Trying another classification to see which features are more important
rf = RandomForestClassifier(n_estimators=200)
rf.fit(X_train_scaled, y_train)
print(rf.feature_importances_)  # one value per feature

[0.09748275 0.06908554 0.11411909 0.03915132 0.0204243  0.01852027
 0.01814358 0.01399443 0.02528279 0.02236811 0.01744319 0.01125409
 0.01038127 0.02404738 0.03379978 0.02792812 0.0217225  0.01289029
 0.01263593 0.04536258 0.0376951  0.03331188 0.01455254 0.01454527
 0.01783791 0.01101116 0.05823571 0.00297803 0.00964056 0.02656834
 0.02617504 0.0090284  0.00857205 0.0106931  0.00926901 0.01141579
 0.00809619 0.01283917 0.01128213 0.01021529]


In [82]:
# List name feature and its importance
for feat, importance in zip(data.columns, rf.feature_importances_):
    print ('Feature: {f}, Importance: {i}'.format(f=feat, i=importance))

Feature: koi_fpflag_nt, Importance: 0.09748274732279168
Feature: koi_fpflag_ss, Importance: 0.06908554290116906
Feature: koi_fpflag_co, Importance: 0.11411908507572308
Feature: koi_fpflag_ec, Importance: 0.03915131721862272
Feature: koi_period, Importance: 0.020424300183261725
Feature: koi_period_err1, Importance: 0.018520272260697464
Feature: koi_period_err2, Importance: 0.018143580549395615
Feature: koi_time0bk, Importance: 0.013994429410855465
Feature: koi_time0bk_err1, Importance: 0.02528279300378118
Feature: koi_time0bk_err2, Importance: 0.02236810947471344
Feature: koi_impact, Importance: 0.017443186112309616
Feature: koi_impact_err1, Importance: 0.011254085152382435
Feature: koi_impact_err2, Importance: 0.010381274510559719
Feature: koi_duration, Importance: 0.024047380636213006
Feature: koi_duration_err1, Importance: 0.033799782669245944
Feature: koi_duration_err2, Importance: 0.027928119033274676
Feature: koi_depth, Importance: 0.021722503109936092
Feature: koi_depth_err1, Imp

In [69]:
# Use GridSearchCV with the random forest created before
parameters = {'max_features':['sqrt', 'log2', 10],
              'max_depth':[5, 7, 9]}

clf_grid = GridSearchCV(rf, parameters, cv=3, n_jobs=-1)
clf_grid.fit(X_train_scaled, y_train)

GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'max_features': ['sqrt', 'log2', 10], 'max_depth': [5, 7, 9]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [70]:
# View the score for training data set
clf_grid.score(X_train_scaled, y_train)

0.9149130832570905

In [71]:
# View the score for testing data set
clf_grid.score(X_test_scaled, y_test)

0.8924977127172918

In [73]:
# Convert the results to a Pandas data frame and make a short view of the results
cvrf_results = pd.DataFrame(clf_grid.cv_results_)
cvrf_results_tiny = cvrf_results[['params', 'mean_test_score', 'std_test_score', 'rank_test_score', 'mean_train_score', 'std_train_score']]
cvrf_results_tiny.sort_values(by='mean_test_score', ascending=False).head()



Unnamed: 0,params,mean_test_score,std_test_score,rank_test_score,mean_train_score,std_train_score
8,"{'max_depth': 9, 'max_features': 10}",0.89448,0.006631,1,0.924977,0.002762
5,"{'max_depth': 7, 'max_features': 10}",0.885941,0.008144,2,0.900198,0.002991
6,"{'max_depth': 9, 'max_features': 'sqrt'}",0.884416,0.008984,3,0.921316,0.003482
7,"{'max_depth': 9, 'max_features': 'log2'}",0.881519,0.007669,4,0.920097,0.004042
3,"{'max_depth': 7, 'max_features': 'sqrt'}",0.877097,0.006734,5,0.8973,0.004436
