# Exoplanet Exploration -  Machine Learning Assignment

__Summary__

Create machine learning models capable of classifying candidate exoplanets from the raw dataset.

The algorithms used were:

- Support Vector Machines (SVM) with Linear and Gaussian kernels 

- Random Forests

Also, a hyper-parameter tuning with the `meta-estimator` function GridSearchCV() was used.


In [1]:
# Dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as pl
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC 
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score

### Read the CSV and perform basic data cleaning

In [2]:
# Read the file and create a data frame
df = pd.read_csv("cumulative.csv")
# Remove unnecessary columns
df = df.drop(columns=["rowid", "kepid", "kepoi_name", "kepler_name", "koi_pdisposition", "koi_score", "koi_tce_delivname"])
# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')
# Drop the null rows
df = df.dropna()
# View results
df.head()

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,9.488036,2.775e-05,-2.775e-05,170.53875,0.00216,...,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,CONFIRMED,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,...,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
2,FALSE POSITIVE,0,1,0,0,19.89914,1.494e-05,-1.494e-05,175.850252,0.000581,...,-176.0,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
3,FALSE POSITIVE,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,...,-174.0,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
4,CONFIRMED,0,0,0,0,2.525592,3.761e-06,-3.761e-06,171.59555,0.00113,...,-211.0,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509


In [3]:
# Print data frame info to see the type of each column
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8744 entries, 0 to 9563
Data columns (total 41 columns):
koi_disposition      8744 non-null object
koi_fpflag_nt        8744 non-null int64
koi_fpflag_ss        8744 non-null int64
koi_fpflag_co        8744 non-null int64
koi_fpflag_ec        8744 non-null int64
koi_period           8744 non-null float64
koi_period_err1      8744 non-null float64
koi_period_err2      8744 non-null float64
koi_time0bk          8744 non-null float64
koi_time0bk_err1     8744 non-null float64
koi_time0bk_err2     8744 non-null float64
koi_impact           8744 non-null float64
koi_impact_err1      8744 non-null float64
koi_impact_err2      8744 non-null float64
koi_duration         8744 non-null float64
koi_duration_err1    8744 non-null float64
koi_duration_err2    8744 non-null float64
koi_depth            8744 non-null float64
koi_depth_err1       8744 non-null float64
koi_depth_err2       8744 non-null float64
koi_prad             8744 non-null float64

### Create a Train - Test Split

In [3]:
# Use `koi_disposition` for the y values
# koi_disposition. The pipeline flag that designates the most probable physical explanation of the KOI. 
# Typical values are FALSE POSITIVE, NOT DISPOSITIONED, and CANDIDATE, according to the documentation,
# but I used value_counts to see which values are present in the data set
target = df["koi_disposition"]
target_names = ["FALSE POSITIVE", "CONFIRMED", "CANDIDATE"]
df["koi_disposition"].value_counts()

FALSE POSITIVE    4358
CONFIRMED         2272
CANDIDATE         2114
Name: koi_disposition, dtype: int64

In [33]:
# https://scikit-learn.org/stable/modules/preprocessing_targets.html#preprocessing-targets
# 5.9. Transforming the prediction target (y)
# These are transformers that are not intended to be used on features, only on supervised learning targets. 
#    5.9.2. Label encoding
#    LabelEncoder is a utility class to help normalize labels such that they contain only values between 0 and n_classes-1.
# Use LabelEncoder for y values
le = LabelEncoder()
target_le = le.fit_transform(target)

In [34]:
# Print to confirm the classes in the LabelEncoder
list(le.classes_)

['CANDIDATE', 'CONFIRMED', 'FALSE POSITIVE']

In [35]:
# Example of transformation
le.transform(["CONFIRMED", "CONFIRMED", "FALSE POSITIVE"])

array([1, 1, 2], dtype=int64)

In [36]:
# Example of how to convert the numbers into labels
list(le.inverse_transform([2, 2, 1]))

['FALSE POSITIVE', 'FALSE POSITIVE', 'CONFIRMED']

In [37]:
# View the encoded values for target
print(target_le)

[1 1 2 ... 0 2 2]


In [4]:
# Create features
data = df.drop("koi_disposition", axis=1)
feature_names = data.columns
data.head()

Unnamed: 0,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,koi_time0bk_err2,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,0,0,0,0,9.488036,2.775e-05,-2.775e-05,170.53875,0.00216,-0.00216,...,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,-0.00352,...,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
2,0,1,0,0,19.89914,1.494e-05,-1.494e-05,175.850252,0.000581,-0.000581,...,-176.0,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
3,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,-0.000115,...,-174.0,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
4,0,0,0,0,2.525592,3.761e-06,-3.761e-06,171.59555,0.00113,-0.00113,...,-211.0,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509


In [38]:
# Split data into training and testing
X_train, X_test, y_train, y_test = train_test_split(data, target_le, random_state=42)

In [7]:
# View the shape of the training set
X_train.shape

(6558, 40)

In [39]:
# View the results of the training set
X_train.head()

Unnamed: 0,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,koi_time0bk_err2,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
8017,0,1,1,0,0.806277,4.947e-06,-4.947e-06,131.78567,0.00672,-0.00672,...,-184.0,4.471,0.054,-0.229,0.996,0.324,-0.108,290.81723,38.53912,13.614
1233,0,1,1,0,3.582077,4.318e-06,-4.318e-06,355.515064,0.000864,-0.000864,...,-235.0,4.422,0.09,-0.195,0.993,0.283,-0.131,296.07822,43.13694,15.193
2592,0,0,0,0,5.060923,2.616e-05,-2.616e-05,134.47316,0.00473,-0.00473,...,-112.0,4.492,0.048,-0.112,0.911,0.121,-0.06,289.91742,40.828606,13.346
4770,0,1,0,1,8.480304,3.32e-07,-3.32e-07,135.854534,3.1e-05,-3.1e-05,...,-169.0,3.946,0.195,-0.105,2.21,0.375,-0.563,298.8002,46.665539,7.631
6632,0,0,0,1,4.994716,4.495e-05,-4.495e-05,136.1833,0.0095,-0.0095,...,-194.0,3.706,0.32,-0.08,2.83,0.458,-1.068,282.58215,46.81551,13.352


### Pre-processing

In [40]:
# Scale the data using the MinMaxScaler
# MinMaxScaler rescales the data set such that all feature values are in the range [0, 1].
# However, this scaling compress all inliers in the narrow range [0, 0.005] for the transformed number of data.
# MinMaxScaler is very sensitive to the presence of outliers.
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

  return self.partial_fit(X, y)


In [10]:
# Print the scaling of the training data
X_train_scaled
#print(X_train_scaled.shape)

array([[0.        , 1.        , 1.        , ..., 0.50165877, 0.1151529 ,
        0.59014647],
       [0.        , 1.        , 1.        , ..., 0.74236463, 0.4100286 ,
        0.73031514],
       [0.        , 0.        , 0.        , ..., 0.4604898 , 0.26198635,
        0.56635597],
       ...,
       [0.        , 1.        , 1.        , ..., 0.59210368, 0.21362786,
        0.71193964],
       [0.        , 0.        , 0.        , ..., 0.30793754, 0.14966214,
        0.57292499],
       [0.        , 1.        , 0.        , ..., 0.74359035, 0.8834343 ,
        0.58295606]])

In [41]:
# View the mean and the standard deviation of the scaled training data set
print("Mean : %s " % X_train_scaled.mean(axis=0))
print("Standard Deviation : %s " % X_train_scaled.std(axis=0))

Mean : [1.55077768e-01 2.41994511e-01 2.03568161e-01 1.24733150e-01
 5.28786383e-02 1.18744388e-02 9.88125561e-01 3.24237496e-02
 1.64351648e-02 9.83564835e-01 7.06523445e-03 2.50172967e-02
 9.91222998e-01 3.87784860e-02 1.59828644e-02 9.84017136e-01
 2.51301146e-02 1.67731795e-03 9.98322682e-01 5.28083969e-04
 7.19977547e-04 9.99530764e-01 6.87270363e-02 7.68075333e-04
 9.54842296e-04 9.99222261e-01 2.94727443e-02 3.45270771e-02
 2.29481527e-01 2.13299038e-01 9.07025104e-01 8.01709639e-01
 8.22639665e-02 8.83357804e-01 8.99116393e-03 1.06161225e-02
 9.96248097e-01 5.61297390e-01 4.54656934e-01 6.47720819e-01] 
Standard Deviation : [0.3619788  0.42829098 0.40265142 0.33041609 0.11153083 0.04583673
 0.04583673 0.04961769 0.03842203 0.03842203 0.03050101 0.12025122
 0.03022716 0.04701797 0.03240192 0.03240192 0.08632736 0.0137291
 0.0137291  0.01674574 0.01663228 0.01746772 0.05871533 0.01669954
 0.01512141 0.01824802 0.09034696 0.09400901 0.05927387 0.06917693
 0.04062109 0.08142376 0.0

In [42]:
# Print the scaling of the testing data
X_test_scaled

array([[0.        , 1.        , 1.        , ..., 0.50601949, 0.46088351,
        0.55082113],
       [0.        , 0.        , 0.        , ..., 0.30913719, 0.34116743,
        0.75552597],
       [0.        , 0.        , 0.        , ..., 0.53539106, 0.71001706,
        0.70945406],
       ...,
       [0.        , 0.        , 0.        , ..., 0.16941497, 0.5689087 ,
        0.78455393],
       [1.        , 0.        , 0.        , ..., 0.33893883, 0.7762052 ,
        0.55676875],
       [0.        , 0.        , 0.        , ..., 0.68123182, 0.43567514,
        0.59928984]])

### Training and hyper- parameter tuning

#### Linear Support Vector Machine  - First algorithm

In [43]:
# Support vector machine classifier
# First try linear kernel to see the scores
model_lin = SVC(kernel='linear', class_weight='balanced')   
model_lin.fit(X_train_scaled, y_train)

SVC(C=1.0, cache_size=200, class_weight='balanced', coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [44]:
# View the scores
print(f"SVM Linear - Training Data Score: {model_lin.score(X_train_scaled, y_train)}")
print(f"SVM Linear - Testing Data Score: {model_lin.score(X_test_scaled, y_test)}")

SVM Linear - Training Data Score: 0.8533089356511131
SVM Linear - Testing Data Score: 0.8444647758462946


In [45]:
# Use `GridSearchCV` to tune the `C` and `gamma` parameters
# Create the GridSearch estimator along with a parameter object containing the values to adjust
param_grid = {'C': [1, 5, 10, 50],
              'gamma': [0.0001, 0.0005, 0.001, 0.005]}
grid_lin = GridSearchCV(model_lin, param_grid, verbose=3)

In [46]:
# Fit the model using the grid search estimator. This will take the SVC model and try each combination of parameters
grid_lin.fit(X_train_scaled, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 3 folds for each of 16 candidates, totalling 48 fits
[CV] C=1, gamma=0.0001 ...............................................
[CV] ...... C=1, gamma=0.0001, score=0.8473491773308958, total=   0.9s
[CV] C=1, gamma=0.0001 ...............................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.2s remaining:    0.0s


[CV] ...... C=1, gamma=0.0001, score=0.8462242562929062, total=   0.6s
[CV] C=1, gamma=0.0001 ...............................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    2.3s remaining:    0.0s


[CV] ....... C=1, gamma=0.0001, score=0.851258581235698, total=   0.5s
[CV] C=1, gamma=0.0005 ...............................................
[CV] ...... C=1, gamma=0.0005, score=0.8473491773308958, total=   0.5s
[CV] C=1, gamma=0.0005 ...............................................
[CV] ...... C=1, gamma=0.0005, score=0.8462242562929062, total=   0.4s
[CV] C=1, gamma=0.0005 ...............................................
[CV] ....... C=1, gamma=0.0005, score=0.851258581235698, total=   0.6s
[CV] C=1, gamma=0.001 ................................................
[CV] ....... C=1, gamma=0.001, score=0.8473491773308958, total=   0.6s
[CV] C=1, gamma=0.001 ................................................
[CV] ....... C=1, gamma=0.001, score=0.8462242562929062, total=   0.4s
[CV] C=1, gamma=0.001 ................................................
[CV] ........ C=1, gamma=0.001, score=0.851258581235698, total=   0.7s
[CV] C=1, gamma=0.005 ................................................
[CV] .

[Parallel(n_jobs=1)]: Done  48 out of  48 | elapsed:   51.5s finished


GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=SVC(C=1.0, cache_size=200, class_weight='balanced', coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'C': [1, 5, 10, 50], 'gamma': [0.0001, 0.0005, 0.001, 0.005]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=3)

In [47]:
# Print the best parameters and score for this data set
print(f"SVM Linear - Best Training Parameters: {grid_lin.best_params_}")
print(f"SVM Linear - Best Training Score: {grid_lin.best_score_}")

SVM Linear - Best Training Parameters: {'C': 50, 'gamma': 0.0001}
SVM Linear - Best Training Score: 0.8827386398292162


In [48]:
# Make predictions with the hypertuned model
predictions_lin = grid_lin.predict(X_test_scaled)

In [49]:
# Calculate classification report
print(classification_report(y_test, predictions_lin,
                            target_names=target_names))

# The f1-score gives you the harmonic mean of precision and recall. 
# The scores corresponding to every class will tell you the accuracy 
# of the classifier in classifying the data points in that particular 
# class compared to all other classes.
# The support is the number of samples of the true response that lie in that class.

                precision    recall  f1-score   support

FALSE POSITIVE       0.84      0.66      0.74       523
     CONFIRMED       0.75      0.86      0.80       594
     CANDIDATE       0.98      1.00      0.99      1069

     micro avg       0.88      0.88      0.88      2186
     macro avg       0.86      0.84      0.84      2186
  weighted avg       0.88      0.88      0.88      2186



#### Radial Basis Function - Support Vector Machine - Second algorithm

In [50]:
# Gaussian kernel because we had three output classes
model_rbf = SVC(kernel='rbf', gamma='scale')   
model_rbf.fit(X_train_scaled, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [51]:
# View the scores
print(f"SVM Radial Basis Function - Training Data Score: {model_rbf.score(X_train_scaled, y_train)}")
print(f"SVM Radial Basis Function - Testing Data Score: {model_rbf.score(X_test_scaled, y_test)}")

SVM Radial Basis Function - Training Data Score: 0.8325709057639524
SVM Radial Basis Function - Testing Data Score: 0.807868252516011


In [52]:
# Use `GridSearchCV` to tune the `C` and `gamma` parameters
grid_rbf = GridSearchCV(model_rbf, param_grid, verbose=3)

In [53]:
# Fit the model using the grid search estimator. This will take the SVC model and try each combination of parameters
grid_rbf.fit(X_train_scaled, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 3 folds for each of 16 candidates, totalling 48 fits
[CV] C=1, gamma=0.0001 ...............................................
[CV] ...... C=1, gamma=0.0001, score=0.5013711151736746, total=   2.4s
[CV] C=1, gamma=0.0001 ...............................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    3.7s remaining:    0.0s


[CV] ...... C=1, gamma=0.0001, score=0.5016018306636155, total=   2.7s
[CV] C=1, gamma=0.0001 ...............................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    7.6s remaining:    0.0s


[CV] ...... C=1, gamma=0.0001, score=0.5016018306636155, total=   2.6s
[CV] C=1, gamma=0.0005 ...............................................
[CV] ...... C=1, gamma=0.0005, score=0.5013711151736746, total=   2.4s
[CV] C=1, gamma=0.0005 ...............................................
[CV] ...... C=1, gamma=0.0005, score=0.5016018306636155, total=   2.4s
[CV] C=1, gamma=0.0005 ...............................................
[CV] ...... C=1, gamma=0.0005, score=0.5016018306636155, total=   2.5s
[CV] C=1, gamma=0.001 ................................................
[CV] ....... C=1, gamma=0.001, score=0.5013711151736746, total=   2.3s
[CV] C=1, gamma=0.001 ................................................
[CV] ....... C=1, gamma=0.001, score=0.5016018306636155, total=   2.4s
[CV] C=1, gamma=0.001 ................................................
[CV] ....... C=1, gamma=0.001, score=0.5016018306636155, total=   2.5s
[CV] C=1, gamma=0.005 ................................................
[CV] .

[Parallel(n_jobs=1)]: Done  48 out of  48 | elapsed:  2.3min finished


GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'C': [1, 5, 10, 50], 'gamma': [0.0001, 0.0005, 0.001, 0.005]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=3)

In [54]:
# Print the best parameters and score for this data set
print(f"SVM Radial Basis Function - Best Training Parameters: {grid_rbf.best_params_}")
print(f"SVM Radial Basis Function - Best Training Score: {grid_rbf.best_score_}")

SVM Radial Basis Function - Best Training Parameters: {'C': 50, 'gamma': 0.005}
SVM Radial Basis Function - Best Training Score: 0.8388228118328759


In [55]:
# Make predictions with the hypertuned model
predictions_rbf = grid_rbf.predict(X_test_scaled)

In [56]:
# Calculate classification report
print(classification_report(y_test, predictions_rbf,
                            target_names=target_names))

                precision    recall  f1-score   support

FALSE POSITIVE       0.70      0.54      0.61       523
     CONFIRMED       0.67      0.78      0.72       594
     CANDIDATE       0.98      1.00      0.99      1069

     micro avg       0.83      0.83      0.83      2186
     macro avg       0.78      0.77      0.77      2186
  weighted avg       0.83      0.83      0.82      2186



#### Random Forest Classification - Third algorithm

In [91]:
# Trying another classification to see which features are more important
rf = RandomForestClassifier(n_estimators=200, random_state=42)
rf.fit(X_train_scaled, y_train)
print(rf.feature_importances_)  # one value per feature

[0.09456841 0.06697704 0.10917494 0.04000138 0.02189963 0.02027562
 0.01643269 0.01333596 0.02290214 0.02526901 0.01778774 0.011788
 0.01052751 0.02667801 0.02924951 0.03509888 0.02051898 0.01387978
 0.01299809 0.04810826 0.03578601 0.02701432 0.01632337 0.01447329
 0.01706526 0.01183873 0.05673382 0.002907   0.00942557 0.0291508
 0.02827311 0.00892698 0.0089915  0.01115869 0.00847752 0.01261306
 0.00860304 0.01302271 0.0113275  0.01041615]


In [92]:
# List name feature and its importance
sorted(zip(rf.feature_importances_, feature_names), reverse=True)

[(0.10917494033151681, 'koi_fpflag_co'),
 (0.09456840810216313, 'koi_fpflag_nt'),
 (0.06697703960594788, 'koi_fpflag_ss'),
 (0.05673382463013196, 'koi_model_snr'),
 (0.04810826077520824, 'koi_prad'),
 (0.04000138325309776, 'koi_fpflag_ec'),
 (0.035786008422816734, 'koi_prad_err1'),
 (0.03509887576308981, 'koi_duration_err2'),
 (0.029249509485369137, 'koi_duration_err1'),
 (0.029150795846914906, 'koi_steff_err1'),
 (0.0282731071854581, 'koi_steff_err2'),
 (0.027014316599332507, 'koi_prad_err2'),
 (0.026678011378806873, 'koi_duration'),
 (0.02526900923370279, 'koi_time0bk_err2'),
 (0.02290214072748138, 'koi_time0bk_err1'),
 (0.02189962696942649, 'koi_period'),
 (0.020518977165241724, 'koi_depth'),
 (0.020275620290759213, 'koi_period_err1'),
 (0.0177877402690931, 'koi_impact'),
 (0.01706525836798413, 'koi_insol_err1'),
 (0.016432685198388898, 'koi_period_err2'),
 (0.016323372242003073, 'koi_teq'),
 (0.014473288976000113, 'koi_insol'),
 (0.013879781014868914, 'koi_depth_err1'),
 (0.0133359

In [93]:
# Use GridSearchCV with the random forest created before
parameters = {'max_features':['sqrt', 'log2', 10],
              'max_depth':[5, 7, 9]}

clf_grid = GridSearchCV(rf, parameters, cv=3, n_jobs=-1)
clf_grid.fit(X_train_scaled, y_train)

GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=None,
            oob_score=False, random_state=42, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'max_features': ['sqrt', 'log2', 10], 'max_depth': [5, 7, 9]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [94]:
# View the score for training data set
print('Training Data Score: {}'.format(clf_grid.score(X_train_scaled, y_train)))

Training Data Score: 0.9175053369929856


In [95]:
# View the score for testing data set
print('Testing Data Score: {}'.format(clf_grid.score(X_test_scaled, y_test)))

Testing Data Score: 0.8902104300091491


In [96]:
# Convert the results to a Pandas data frame and make a short view of the results
cvrf_results = pd.DataFrame(clf_grid.cv_results_)
cvrf_results_tiny = cvrf_results[['params', 'mean_test_score', 'std_test_score', 'rank_test_score', 'mean_train_score', 'std_train_score']]
cvrf_results_tiny.sort_values(by='mean_test_score', ascending=False).head()



Unnamed: 0,params,mean_test_score,std_test_score,rank_test_score,mean_train_score,std_train_score
8,"{'max_depth': 9, 'max_features': 10}",0.89204,0.008033,1,0.923299,0.002656
5,"{'max_depth': 7, 'max_features': 10}",0.885941,0.008144,2,0.900579,0.003199
6,"{'max_depth': 9, 'max_features': 'sqrt'}",0.883196,0.007377,3,0.920173,0.003159
7,"{'max_depth': 9, 'max_features': 'log2'}",0.879841,0.00748,4,0.919487,0.002779
2,"{'max_depth': 5, 'max_features': 10}",0.876029,0.009909,5,0.883348,0.003257


#### Random Forest Classification - Comparison of Accuracy between using 40 features (all) vs. 8 features (threshold=0.03)

In [133]:
# Create a selector object that will use the random forest classifier to identify
# features that have an importance of more than 0.03
sfm = SelectFromModel(rf, threshold=0.03)

# Train the selector
sfm.fit(X_train_scaled, y_train)

SelectFromModel(estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=None,
            oob_score=False, random_state=42, verbose=0, warm_start=False),
        max_features=None, norm_order=1, prefit=False, threshold=0.03)

In [134]:
# Print the names of the most important features
for feature_list_index in sfm.get_support(indices=True):
    print(feature_names[feature_list_index])

koi_fpflag_nt
koi_fpflag_ss
koi_fpflag_co
koi_fpflag_ec
koi_duration_err2
koi_prad
koi_prad_err1
koi_model_snr


In [135]:
# Transform the data to create a new dataset containing only the most important features
X_important_train = sfm.transform(X_train_scaled)
X_important_test = sfm.transform(X_test_scaled)

In [136]:
# Create a new random forest classifier for the most important features
clf_important = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)

# Train the new classifier on the new dataset containing the most important features
clf_important.fit(X_important_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=-1,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [139]:
# Apply The Full Featured Classifier To The Test Data
y_pred = rf.predict(X_test_scaled)

# View The Accuracy Of Our Full Feature (40 Features) Model
print(f"RF - Accuracy Score 40 features: {accuracy_score(y_test, y_pred)}") 

RF - Accuracy Score 40 features: 0.8947849954254345


In [140]:
# Apply The Full Featured Classifier To The Test Data
y_important_pred = clf_important.predict(X_important_test)

# View The Accuracy Of Our Limited Feature Model
print(f"RF - Accuracy Score 8 features: {accuracy_score(y_test, y_important_pred)}") 

RF - Accuracy Score 8 features: 0.8847209515096066
