In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFECV
import joblib

In [5]:
data = pd.read_csv("Resources/exoplanet_data.csv")

# Drop null columns
data = data.dropna(axis='columns', how='all')

# Drop null rows
data = data.dropna()

# Convert dtypes of int64 to float64
for column, content in data.items():
    if data[column].dtype == 'int64':
        data = data.astype({column: 'float64'})

In [6]:
# Assign data to X and Y
X = data.drop("koi_disposition", axis=1)
Y = data["koi_disposition"]

# Split data into training and testing groups
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=42, stratify=Y)

In [7]:
# Scale X values
X_scaler = MinMaxScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [8]:
model_1 = LogisticRegression(solver='newton-cg', multi_class='auto')
model_1.fit(X_train_scaled, Y_train)

model_1_training_score = round(model_1.score(X_train_scaled, Y_train)*100,3)
base_accuracy = round(model_1.score(X_test_scaled, Y_test)*100,3)

print(f"Training Data Score: {model_1_training_score} %")
print(f"Testing Data score: {base_accuracy} %")

Training Data Score: 85.504 %
Testing Data score: 86.213 %


In [11]:
# Evaluate Features
feature_names = X.columns.tolist()
selector = RFECV(estimator=model_1, cv=5, step=1)
_ = selector.fit(X_train_scaled, Y_train)

In [12]:
# determine whichfeatures ought to be kept
preSelected_features = sorted(zip(selector.ranking_, feature_names))
ranked_features = pd.DataFrame(preSelected_features, columns=['Ranking', 'Feature'])
ranked_features = ranked_features.set_index('Feature')
ranked_features

Unnamed: 0_level_0,Ranking
Feature,Unnamed: 1_level_1
dec,1
koi_depth,1
koi_duration,1
koi_duration_err1,1
koi_duration_err2,1
koi_fpflag_co,1
koi_fpflag_ec,1
koi_fpflag_nt,1
koi_fpflag_ss,1
koi_impact,1


In [13]:
# remove features with ranking > 16
selected_features = []
for tup in preSelected_features:
    if tup[0] < 17:
        selected_features.append(tup[1])

In [18]:
# Use new data for all subsequent models
# Assign new data to X
X_train_select = X_train[selected_features]
X_test_select = X_test[selected_features]

X_scaler = MinMaxScaler().fit(X_train_select)
X_train_scaled = X_scaler.transform(X_train_select)
X_test_scaled = X_scaler.transform(X_test_select)

# Train new model
model_2 = LogisticRegression(solver='newton-cg', multi_class='auto')
model_2.fit(X_train_scaled, Y_train)

model_2_training_score = round(model_2.score(X_train_scaled, Y_train)*100,3)
select_features_accuracy = round(model_2.score(X_test_scaled, Y_test)*100,3)

print(f"Training Data Score: {model_2_training_score} %")
print(f"Testing Data Score: {select_features_accuracy} %")

Training Data Score: 85.504 %
Testing Data Score: 86.213 %


In [20]:
# Create the GridSearchCV model
model_3 = LogisticRegression(solver='newton-cg', multi_class='auto')

param_grid = {
    'C': np.logspace(0, 4, 10),
    'penalty': ['12']
}
grid = GridSearchCV(model_3, param_grid, cv=5, verbose=0)

#Train the model with GridSearch
_ = grid.fit(X_train_scaled, Y_train)

Traceback (most recent call last):
  File "C:\Users\akspe\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\akspe\Anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1304, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\akspe\Anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 438, in _check_solver
    raise ValueError("Logistic Regression supports only penalties in %s,"
ValueError: Logistic Regression supports only penalties in ['l1', 'l2', 'elasticnet', 'none'], got 12.



ValueError: Logistic Regression supports only penalties in ['l1', 'l2', 'elasticnet', 'none'], got 12.

In [23]:
# Tuned parameters
C = grid.best_params_['C']
penalty = grid.best_params_['penalty']

# Tuned model
tuned_model = LogisticRegression(solver='newton-cg', multi_class='auto', C=C, penalty=penalty)
tuned_model.fit(X_train_scaled, Y_train)

model_3_training_score = round(tuned_model.score(X-train_scaled, Y_train)*100,3)
tuned_accuracy = round(tuned_model.score(X_test_scaled, Y_test)*100,3)

print(f"Training Data Score: {model_3_training_score} %")
print(f"Testing Data Score: {tuned-accuracy} %")

ValueError: Logistic Regression supports only penalties in ['l1', 'l2', 'elasticnet', 'none'], got 12.

In [None]:
predictions = tuned_model.predict(X_test_scaled)
classifications = Y_test.unique().tolist()

prediction_actual = {
    'Actual': Y_test,
    'Prediction': predictions
}

PA_df = pd.DataFrame(prediction_actual)
PA_df = PA_df.set_index('Actual').reset_index()
PA_df.head(15)

In [None]:
evaluations = {'': ['Base Model', 'Select Features Model', 'Tuned Model'], 'Accuracy': [f"{base_accuracy}%", f"{select_features_accuracy}%", f"{tuned_accuracy}%"]}

evaluations_df = pd.DataFrame(evaluations)
evaluations_df = evaluations_df.set_index('')

evaluations_df.to_csv('Resources/LogisticRegression_eval.csv')
evaluations_df

In [None]:
filename = 'Models/OtherModel_LogisticRegression.sav'
_ = joblib.dump(tuned_model, filename)