# Imports

In [1]:
import pickle
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
from sklearn.pipeline import make_pipeline
from category_encoders import OneHotEncoder

from sklearn.linear_model import LogisticRegressionCV

In [3]:
from sklearn.metrics import classification_report, confusion_matrix, plot_confusion_matrix
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.inspection import permutation_importance

# Read in data

In [4]:
# Load the CSV at ../DATA/clean.csv (path is relative to the notebook's
# working directory) into the DataFrame used by every later cell.
df = pd.read_csv(filepath_or_buffer="../DATA/clean.csv")

# Train / Validation / Test Split

In [5]:
# Two-stage split with a fixed seed (42):
#   1. hold out 10% of all rows as the test set;
#   2. hold out 9% of the remaining rows as the validation set.
# The target is the 'class' column; every other column is a feature.
features = df.drop(columns='class')
target = df['class']

X, X_test, y, y_test = train_test_split(
    features, target, test_size=.1, random_state=42
)

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=.09, random_state=42
)

# Baseline

In [6]:
# Share of the most frequent class in the training labels, i.e. the accuracy
# of always predicting that class. Used as the reference score below.
class_shares = y_train.value_counts(normalize=True)
baseline = class_shares.max()

# Logistic Regression

In [7]:
# Untuned reference model: one-hot encoding followed by a cross-validated
# logistic regression that keeps its default hyperparameters (only the
# parallelism and the seed are set).
log_encoder = OneHotEncoder(use_cat_names=True)
log_classifier = LogisticRegressionCV(n_jobs=-1, random_state=42)

# make_pipeline names each step after its lowercased class name, which is
# what the 'logisticregressioncv__*' keys in the tuning section refer to.
log = make_pipeline(log_encoder, log_classifier)

In [8]:
# Fit on the training split only; the trailing ';' suppresses the repr.
# NOTE: the recorded run of this cell hit the solver's iteration limit
# (see the warning in the output below).
log.fit(X=X_train, y=y_train);

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


# Metrics

In [9]:
# Report the majority-class baseline, then the untuned pipeline's score on
# each split (Pipeline.score delegates to the final estimator's score).
print(f"Baseline: {baseline}\n")
for split_label, split_X, split_y in (
    ("Train Score", X_train, y_train),
    ("Val score", X_val, y_val),
    ("Test Score", X_test, y_test),
):
    print(f"{split_label}: {log.score(split_X, split_y)}")

Baseline: 0.43687834736036724

Train Score: 0.6323641928079572
Val score: 0.642512077294686
Test Score: 0.6523101018010963


# Hyperparameter Tuning

In [10]:
# Search space for RandomizedSearchCV. Keys are prefixed with
# 'logisticregressioncv__', the step name make_pipeline gives the classifier.
#
# The space is a list of sub-spaces instead of one flat dict because solver,
# penalty and dual cannot be combined freely in scikit-learn:
#   * newton-cg / lbfgs / sag support only the 'l2' penalty;
#   * liblinear supports 'l1' and 'l2';
#   * saga supports 'l1', 'l2' and 'elasticnet' (which also needs l1_ratios);
#   * dual=True is only implemented for liblinear with 'l2'.
# A flat dict lets the sampler draw invalid combinations, and each of those
# is a failed fit that wastes one of the search iterations.

# Settings that are valid for every solver.
common_space = {
    'logisticregressioncv__Cs': [2, 5, 7, 10, 12, 15, 17, 20],
    'logisticregressioncv__fit_intercept': [True, False],
    'logisticregressioncv__max_iter': [100, 300, 500, 700, 1000],
}

params = [
    # L2 penalty, primal formulation.
    {
        **common_space,
        'logisticregressioncv__solver': ['newton-cg', 'lbfgs', 'sag', 'saga'],
        'logisticregressioncv__penalty': ['l2'],
        'logisticregressioncv__dual': [False],
    },
    # L1 penalty.
    {
        **common_space,
        'logisticregressioncv__solver': ['liblinear', 'saga'],
        'logisticregressioncv__penalty': ['l1'],
        'logisticregressioncv__dual': [False],
    },
    # liblinear with L2 is the only setting where dual may be True.
    {
        **common_space,
        'logisticregressioncv__solver': ['liblinear'],
        'logisticregressioncv__penalty': ['l2'],
        'logisticregressioncv__dual': [True, False],
    },
    # Elastic net: saga only, and l1_ratios must be given explicitly.
    # Each candidate is itself a list of ratios to cross-validate over.
    {
        **common_space,
        'logisticregressioncv__solver': ['saga'],
        'logisticregressioncv__penalty': ['elasticnet'],
        'logisticregressioncv__dual': [False],
        'logisticregressioncv__l1_ratios': [[0.25, 0.5, 0.75]],
    },
]

In [11]:
# Randomized hyperparameter search over `params`, wrapped around the untuned
# pipeline. 200 candidates are sampled; scoring and the number of CV folds
# are left at the library defaults.
search_options = dict(
    param_distributions=params,
    n_iter=200,
    n_jobs=-1,        # use every available core
    verbose=1,
    random_state=42,  # makes the sampled candidates reproducible
)
random_search = RandomizedSearchCV(log, **search_options)

In [12]:
#random_search.fit(X_train,y_train);

In [13]:
#hyper_log = random_search.best_estimator_

In [14]:
#hyper_log.fit(X_train,y_train);

In [15]:
#random_search.best_params_

In [16]:
# Pipeline with hand-fixed hyperparameters (presumably taken from an earlier
# run of the randomized search, which is commented out above; confirm).
tuned_classifier = LogisticRegressionCV(
    Cs=7,
    penalty='l2',
    solver='newton-cg',
    max_iter=500,
    fit_intercept=True,
    dual=False,
    n_jobs=-1,
    random_state=42,
)
tune_log = make_pipeline(OneHotEncoder(use_cat_names=True), tuned_classifier)

In [17]:
# Fit the tuned pipeline on the training split; ';' suppresses the repr.
tune_log.fit(X=X_train, y=y_train);

In [18]:
# Same report as for the untuned model, now for the tuned pipeline, so the
# two sets of scores can be compared line by line.
print(f"Baseline: {baseline}\n")
for split_label, split_X, split_y in (
    ("Train Score", X_train, y_train),
    ("Val score", X_val, y_val),
    ("Test Score", X_test, y_test),
):
    print(f"{split_label}: {tune_log.score(split_X, split_y)}")

Baseline: 0.43687834736036724

Train Score: 0.6326511094108646
Val score: 0.6444444444444445
Test Score: 0.6483946750195771


In [20]:
# Persist both fitted pipelines so they can be reloaded without retraining.
# File names are unchanged: ../Models/log and ../Models/tuned_log.
MODELS_DIR = Path('../Models')

# Create the output folder up front: open(..., 'wb') raises FileNotFoundError
# when the parent directory does not exist (e.g. on a fresh checkout).
MODELS_DIR.mkdir(parents=True, exist_ok=True)

for model_name, fitted_model in (('log', log), ('tuned_log', tune_log)):
    with open(MODELS_DIR / model_name, 'wb') as myfile:
        pickle.dump(fitted_model, myfile)