In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFECV


In [2]:
dataset = pd.read_csv("Resources/cumulative.csv")

#drop some of the unwanted columns
dataset = dataset.drop(columns=["rowid", "kepid", "kepoi_name", "kepler_name", "koi_pdisposition", "koi_score", "koi_tce_delivname"])
dataset = dataset.dropna(axis='columns', how='all')

# Drop null columns
dataset = dataset.dropna(axis='columns', how='all')

# Drop null rows
dataset = dataset.dropna()
dataset.head()

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,9.488036,2.775e-05,-2.775e-05,170.53875,0.00216,...,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,CONFIRMED,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,...,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
2,FALSE POSITIVE,0,1,0,0,19.89914,1.494e-05,-1.494e-05,175.850252,0.000581,...,-176.0,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
3,FALSE POSITIVE,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,...,-174.0,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
4,CONFIRMED,0,0,0,0,2.525592,3.761e-06,-3.761e-06,171.59555,0.00113,...,-211.0,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509


In [3]:
#convert int64 to float64
for column, i in dataset.items():
    if dataset[column].dtype =='int64':
        dataset = dataset.astype({column: 'float64'})
        
dataset.head()

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CONFIRMED,0.0,0.0,0.0,0.0,9.488036,2.775e-05,-2.775e-05,170.53875,0.00216,...,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,CONFIRMED,0.0,0.0,0.0,0.0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,...,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
2,FALSE POSITIVE,0.0,1.0,0.0,0.0,19.89914,1.494e-05,-1.494e-05,175.850252,0.000581,...,-176.0,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
3,FALSE POSITIVE,0.0,1.0,0.0,0.0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,...,-174.0,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
4,CONFIRMED,0.0,0.0,0.0,0.0,2.525592,3.761e-06,-3.761e-06,171.59555,0.00113,...,-211.0,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509


In [4]:
# assign data to X and y
X = dataset.drop("koi_disposition", axis= 1)
y = dataset["koi_disposition"]

#split dataset to into training and testing groups
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [5]:
# Scale X values
X_scaler = MinMaxScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [8]:
from sklearn.ensemble import RandomForestClassifier
#train the model
model1 = RandomForestClassifier(n_estimators=200)
model1.fit(X_train_scaled, y_train)

model1_training_score = round(model1.score(X_train_scaled, y_train)* 100,3)
base_accuracy = round(model1.score(X_test_scaled, y_test) * 100,3)

print(f"Training Data Score: {model1_training_score} %")
print(f"Testing Data Score: {base_accuracy} %")


Training Data Score: 100.0 %
Testing Data Score: 89.224 %


In [9]:
predictions = model1.predict(X_test_scaled)
classifications = y_test.unique().tolist()

prediction_actual = {
    'Actual': y_test,
    'Prediction': predictions
}

df = pd.DataFrame(prediction_actual)
df = df.set_index('Actual').reset_index()
df.head(15)

Unnamed: 0,Actual,Prediction
0,FALSE POSITIVE,FALSE POSITIVE
1,CANDIDATE,CANDIDATE
2,CONFIRMED,CONFIRMED
3,CONFIRMED,CONFIRMED
4,CANDIDATE,FALSE POSITIVE
5,FALSE POSITIVE,FALSE POSITIVE
6,CANDIDATE,CONFIRMED
7,CONFIRMED,CONFIRMED
8,CONFIRMED,CONFIRMED
9,FALSE POSITIVE,FALSE POSITIVE


In [17]:
# Determine which features ought to be kept
feature_names = X.columns.tolist()
preSelected_features = sorted(zip(model1.feature_importances_, feature_names), reverse=True)
ranked_features = pd.DataFrame(preSelected_features, columns=['Score', 'Feature'])
ranked_features = ranked_features.set_index('Feature')
ranked_features

Unnamed: 0_level_0,Score
Feature,Unnamed: 1_level_1
koi_fpflag_co,0.106552
koi_fpflag_nt,0.094046
koi_fpflag_ss,0.065752
koi_model_snr,0.057062
koi_prad,0.050997
koi_steff_err1,0.034997
koi_prad_err1,0.034485
koi_fpflag_ec,0.033948
koi_duration_err2,0.033501
koi_duration_err1,0.032255


In [18]:
# Remove features with score < 0.011
selected_features = []
for tup in preSelected_features:
    if tup[0] > 0.011:
        selected_features.append(tup[1])

In [19]:
# Use new data for all subsequent models
# Assign new data to X 
X_train_select = X_train[selected_features]
X_test_select = X_test[selected_features]

X_scaler = MinMaxScaler().fit(X_train_select)
X_train_scaled = X_scaler.transform(X_train_select)
X_test_scaled = X_scaler.transform(X_test_select)

## Train new model
model2 = RandomForestClassifier(n_estimators=200)
model2.fit(X_train_scaled, y_train)

model2_training_score = round(model2.score(X_train_scaled, y_train)*100,3)
select_features_accuracy = round(model2.score(X_test_scaled, y_test)*100,3)

print(f"Training Data Score: {model2_training_score} %")
print(f"Testing Data Score: {select_features_accuracy} %")

Training Data Score: 100.0 %
Testing Data Score: 89.709 %


In [24]:
# Create the RandomSearchCV model
model3 = RandomForestClassifier(random_state=42)

param_grid = {
    'n_estimators': [200, 600, 1200, 1400],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth': [14, 15, 16, 17, 18, None]
}
grid = GridSearchCV(model3, param_grid, cv=5, verbose=3, n_jobs=-1)

# Train the model with GridSearch
grid.fit(X_train_scaled, y_train)


Fitting 5 folds for each of 72 candidates, totalling 360 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:   37.4s
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:  4.3min
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed: 10.3min
[Parallel(n_jobs=-1)]: Done 360 out of 360 | elapsed: 13.7min finished


GridSearchCV(cv=5, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False, random_state=42,
                                  

In [25]:
print(grid.best_params_)
print(grid.best_score_)

{'max_depth': 16, 'max_features': 'auto', 'n_estimators': 200}
0.8929669807608794


In [26]:
# Tuned parameters
max_features = grid.best_params_['max_features']
n_estimators = grid.best_params_['n_estimators']
max_depth = grid.best_params_['max_depth']
criterion = 'entropy'

# Tuned model
tuned_model = RandomForestClassifier(max_features=max_features, n_estimators=n_estimators, 
                                     criterion=criterion, max_depth=max_depth, random_state=42)
tuned_model.fit(X_train_scaled, y_train)
model_3_training_score = round(tuned_model.score(X_train_scaled, y_train)*100,3)
tuned_accuracy = round(tuned_model.score(X_test_scaled, y_test)*100,3)

print(f"Training Data Score: {model_3_training_score} %")
print(f"Testing Data Score: {tuned_accuracy} %")


Training Data Score: 98.532 %
Testing Data Score: 89.328 %


In [27]:
predictions = tuned_model.predict(X_test_scaled)
classifications = y_test.unique().tolist()

prediction_actual = {
    'Actual': y_test,
    'Prediction': predictions
}

df = pd.DataFrame(prediction_actual)
df = df.set_index('Actual').reset_index()
df.head(15)

Unnamed: 0,Actual,Prediction
0,FALSE POSITIVE,FALSE POSITIVE
1,CANDIDATE,CANDIDATE
2,CONFIRMED,CONFIRMED
3,CONFIRMED,CANDIDATE
4,CANDIDATE,FALSE POSITIVE
5,FALSE POSITIVE,FALSE POSITIVE
6,CANDIDATE,CONFIRMED
7,CONFIRMED,CONFIRMED
8,CONFIRMED,CONFIRMED
9,FALSE POSITIVE,FALSE POSITIVE
