In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFECV
import joblib

In [11]:
# Load the raw table and clean it in one pass:
#   1. drop the columns that are not wanted for modelling,
#   2. drop every column that is entirely null,
#   3. drop every remaining row that still contains a null.
# (The all-null column drop used to be issued twice; the second call was a no-op.)
dataset = (
    pd.read_csv("Resources/cumulative.csv")
    .drop(columns=["rowid", "kepid", "kepoi_name", "kepler_name", "koi_pdisposition", "koi_score", "koi_tce_delivname"])
    .dropna(axis='columns', how='all')
    .dropna()
)
dataset.head()

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,9.488036,2.775e-05,-2.775e-05,170.53875,0.00216,...,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,CONFIRMED,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,...,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
2,FALSE POSITIVE,0,1,0,0,19.89914,1.494e-05,-1.494e-05,175.850252,0.000581,...,-176.0,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
3,FALSE POSITIVE,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,...,-174.0,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
4,CONFIRMED,0,0,0,0,2.525592,3.761e-06,-3.761e-06,171.59555,0.00113,...,-211.0,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509


In [12]:
# Convert every int64 column to float64.
# The target dtypes are collected first and applied with a single astype call,
# so the frame is copied once instead of once per integer column, and the frame
# is no longer rebound while it is being iterated.
float_casts = {
    column: 'float64'
    for column, dtype in dataset.dtypes.items()
    if dtype == 'int64'
}
dataset = dataset.astype(float_casts)

dataset.head()

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CONFIRMED,0.0,0.0,0.0,0.0,9.488036,2.775e-05,-2.775e-05,170.53875,0.00216,...,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,CONFIRMED,0.0,0.0,0.0,0.0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,...,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
2,FALSE POSITIVE,0.0,1.0,0.0,0.0,19.89914,1.494e-05,-1.494e-05,175.850252,0.000581,...,-176.0,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
3,FALSE POSITIVE,0.0,1.0,0.0,0.0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,...,-174.0,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
4,CONFIRMED,0.0,0.0,0.0,0.0,2.525592,3.761e-06,-3.761e-06,171.59555,0.00113,...,-211.0,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509


In [13]:
# Separate the label column (y) from the remaining feature columns (X)
y = dataset["koi_disposition"]
X = dataset.drop(columns=["koi_disposition"])

# Hold out 33% of the rows for testing; the fixed random_state keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [15]:
# Fit the min-max scaler on the training features only,
# then apply that same fitted scaler to both splits.
X_scaler = MinMaxScaler()
X_scaler.fit(X_train)
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

In [23]:
# Baseline model: logistic regression trained on all scaled features.
# The solver is set explicitly to 'newton-cg' (one of the solvers that supports the
# 'multinomial' option) instead of relying on the library default, which differs
# between scikit-learn versions.
model1 = LogisticRegression(multi_class='auto', solver='newton-cg')
model1.fit(X_train_scaled, y_train)

# Accuracy on each split, as a percentage rounded to three decimals
model1_training_score = round(100 * model1.score(X_train_scaled, y_train), 3)
base_accuracy = round(100 * model1.score(X_test_scaled, y_test), 3)

print(f"Training Data Score: {model1_training_score} %")
print(f"Testing Data Score: {base_accuracy} %")


Training Data Score: 85.575 %
Testing Data Score: 84.442 %


In [24]:
# Compare the baseline model's test-set predictions with the true labels
predictions = model1.predict(X_test_scaled)
classifications = y_test.unique().tolist()  # distinct labels present in y_test

prediction_actual = {
    'Actual': y_test,
    'Prediction': predictions
}

# The frame inherits y_test's row labels; reset_index(drop=True) replaces them with
# a plain 0..n-1 index directly, instead of round-tripping 'Actual' through the index.
df = pd.DataFrame(prediction_actual).reset_index(drop=True)
df.head(15)

Unnamed: 0,Actual,Prediction
0,FALSE POSITIVE,FALSE POSITIVE
1,CANDIDATE,CANDIDATE
2,CONFIRMED,CONFIRMED
3,CONFIRMED,CANDIDATE
4,CANDIDATE,FALSE POSITIVE
5,FALSE POSITIVE,FALSE POSITIVE
6,CANDIDATE,CONFIRMED
7,CONFIRMED,CONFIRMED
8,CONFIRMED,CANDIDATE
9,FALSE POSITIVE,FALSE POSITIVE


In [26]:
# Rank the features with recursive feature elimination under 5-fold cross-validation,
# eliminating one feature per step.
# NOTE: X_train_scaled must still hold every column of X when this cell runs;
# a later cell rebinds it to the selected-feature subset.
feature_names = list(X.columns)
selector = RFECV(estimator=model1, step=1, cv=5)
selector.fit(X_train_scaled, y_train)

In [27]:
# Pair every feature with its RFECV ranking and order the pairs by (ranking, name)
preSelected_features = list(zip(selector.ranking_, feature_names))
preSelected_features.sort()

# Tabulate the rankings, indexed by feature name, for display
ranked_features = (
    pd.DataFrame(preSelected_features, columns=['Ranking', 'Feature'])
    .set_index('Feature')
)
ranked_features

Unnamed: 0_level_0,Ranking
Feature,Unnamed: 1_level_1
dec,1
koi_depth,1
koi_depth_err1,1
koi_depth_err2,1
koi_duration,1
koi_duration_err1,1
koi_duration_err2,1
koi_fpflag_co,1
koi_fpflag_ec,1
koi_fpflag_nt,1


In [28]:
# Keep only the features whose ranking is strictly below the cutoff
# (rankings 1-11); features ranked 12 or higher are removed.
RANKING_CUTOFF = 12
selected_features = [
    feature
    for ranking, feature in preSelected_features
    if ranking < RANKING_CUTOFF
]

In [29]:
# From this cell onward the models are trained on the selected features only.
# Note: X_scaler, X_train_scaled and X_test_scaled are rebound here to the
# selected-feature versions.
X_train_select, X_test_select = X_train[selected_features], X_test[selected_features]

X_scaler = MinMaxScaler()
X_scaler.fit(X_train_select)
X_train_scaled = X_scaler.transform(X_train_select)
X_test_scaled = X_scaler.transform(X_test_select)

## Train a fresh logistic regression on the selected features
model2 = LogisticRegression(multi_class='auto', solver='newton-cg')
model2.fit(X_train_scaled, y_train)

# Accuracy on each split, as a percentage rounded to three decimals
model2_training_score = round(100 * model2.score(X_train_scaled, y_train), 3)
select_features_accuracy = round(100 * model2.score(X_test_scaled, y_test), 3)

print(f"Training Data Score: {model2_training_score} %")
print(f"Testing Data Score: {select_features_accuracy} %")

Training Data Score: 85.575 %
Testing Data Score: 84.442 %


In [32]:
# Hyper-parameter tuning with a cross-validated grid search.
# Base estimator whose parameters the search will vary:
model3 = LogisticRegression(multi_class='auto', solver='newton-cg')

# Candidate values: 10 values of C spaced logarithmically from 10**0 to 10**4, L2 penalty only
param_grid = dict(
    C=np.logspace(0, 4, 10),
    penalty=['l2'],
)
grid = GridSearchCV(estimator=model3, param_grid=param_grid, cv=5, verbose=0)

# Run the 5-fold search on the scaled training data
grid.fit(X_train_scaled, y_train)


GridSearchCV(cv=5, error_score=nan,
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='auto',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='newton-cg',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': array([1.00000000e+00, 2.78255940e+00, 7.74263683e+00, 2.15443469e+01,
       5.99484250e+01, 1.66810054e+02, 4.64158883e+02, 1.29154967e+03,
       3.59381366e+03, 1.00000000e+04]),
                         'penalty': ['l2']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             s

In [33]:
# Report the winning parameter combination and its score, one per line
print(grid.best_params_, grid.best_score_, sep="\n")

{'C': 464.15888336127773, 'penalty': 'l2'}
0.887678335660137


In [36]:
# Tuned parameters (kept as standalone names in case other cells read them)
C = grid.best_params_['C']
penalty = grid.best_params_['penalty']

# Tuned model
# The grid search ran with refit=True, so it has already fitted a LogisticRegression
# with the best parameters on the same training data. Reuse that estimator instead of
# constructing and fitting an identical model a second time.
tuned_model = grid.best_estimator_

# Accuracy on each split, as a percentage rounded to three decimals
model3_training_score = round(tuned_model.score(X_train_scaled, y_train)*100,3)
tuned_accuracy = round(tuned_model.score(X_test_scaled, y_test)*100,3)

print(f"Training Data Score: {model3_training_score} %")
print(f"Testing Data Score: {tuned_accuracy} %")


Training Data Score: 89.041 %
Testing Data Score: 88.115 %


In [37]:
# Compare the tuned model's test-set predictions with the true labels
predictions = tuned_model.predict(X_test_scaled)
classifications = y_test.unique().tolist()  # distinct labels present in y_test

prediction_actual = {
    'Actual': y_test,
    'Prediction': predictions
}

# The frame inherits y_test's row labels; reset_index(drop=True) replaces them with
# a plain 0..n-1 index directly, instead of round-tripping 'Actual' through the index.
df = pd.DataFrame(prediction_actual).reset_index(drop=True)
df.head(15)

Unnamed: 0,Actual,Prediction
0,FALSE POSITIVE,FALSE POSITIVE
1,CANDIDATE,CANDIDATE
2,CONFIRMED,CONFIRMED
3,CONFIRMED,CANDIDATE
4,CANDIDATE,FALSE POSITIVE
5,FALSE POSITIVE,FALSE POSITIVE
6,CANDIDATE,CONFIRMED
7,CONFIRMED,CONFIRMED
8,CONFIRMED,CANDIDATE
9,FALSE POSITIVE,FALSE POSITIVE
