# Logistic Regression

In [1]:
import pandas as pd
import numpy as np
# imports from sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, StratifiedKFold

lg = LogisticRegression()

In [2]:
#loading in the datasets
df_training_pca = pd.read_csv('../data/archive/train_pca.csv')
df_testing_pca = pd.read_csv('../data/archive/test_pca.csv')

#original datasets
df_training = pd.read_csv('../data/archive/train.csv')
df_testing = pd.read_csv('../data/archive/test.csv')

In [3]:
#defining x and y with pca dataset
x_pca_train = df_training_pca.iloc[:, :-1]
y_pca_train = df_training_pca.iloc[:,-1]

x_pca_test = df_testing_pca.iloc[:, :-1]
y_pca_test = df_testing_pca.iloc[:,-1]

In [4]:
#defining x and y with original dataset
x_train = df_training.iloc[:, :-2]
y_train = df_training.iloc[:,-1]

x_test = df_testing.iloc[:, :-2]
y_test = df_testing.iloc[:,-1]

## With PCA

In [5]:
#Making a grid of values we want our grid search to test to find the best parameters

grid_values = [{'penalty': ['l2'], 'C' :[.01,.05,.1,.3,.5,.8,1], 'solver': ['newton-cg', 'lbfgs', 'sag', 'saga'], 'multi_class': ['multinomial'], 'max_iter' : [2000] }]


lg_classifier = GridSearchCV(estimator = lg, param_grid = grid_values, cv = StratifiedKFold(n_splits = 5),
                      scoring = ['accuracy', 'roc_auc_ovr', 'f1_micro'], refit = False, verbose = 0)

lg_model = lg_classifier.fit(x_pca_train, y_pca_train)

In [6]:
#initializing our lists so we can later store the best params that achieve best scores across metrics
accuracy_best_params_pca = []
precision_best_params_pca = []
f1_best_params_pca = []

accuracy_best_params_pca.append(lg_model.cv_results_['params'][ np.argmin(lg_model.cv_results_['rank_test_accuracy'])])
precision_best_params_pca.append(lg_model.cv_results_['params'][ np.argmin(lg_model.cv_results_['rank_test_roc_auc_ovr'])])
f1_best_params_pca.append(lg_model.cv_results_['params'][ np.argmin(lg_model.cv_results_['rank_test_f1_micro']) ])

In [7]:
accuracy_best_params_pca

[{'C': 1,
  'max_iter': 2000,
  'multi_class': 'multinomial',
  'penalty': 'l2',
  'solver': 'newton-cg'}]

In [8]:
precision_best_params_pca

[{'C': 1,
  'max_iter': 2000,
  'multi_class': 'multinomial',
  'penalty': 'l2',
  'solver': 'newton-cg'}]

In [9]:
f1_best_params_pca

[{'C': 1,
  'max_iter': 2000,
  'multi_class': 'multinomial',
  'penalty': 'l2',
  'solver': 'newton-cg'}]

In [10]:
#finding the accuracy metrics
from sklearn.metrics import accuracy_score, f1_score, precision_score

    
def accuracy_metric(xtrain,ytrain,xtest,ytest):
    accuracy_test_score = []
    dt_clf = LogisticRegression(C=accuracy_best_params_pca[0]['C'], 
                                         multi_class = accuracy_best_params_pca[0]['multi_class'], 
                                         penalty = accuracy_best_params_pca[0]['penalty'],
                                         solver = accuracy_best_params_pca[0]['solver'],
                                         max_iter = accuracy_best_params_pca[0]['max_iter'])
    model = dt_clf.fit(xtrain, ytrain)
    y_pred = model.predict(xtest)
    accuracy_test_score.append(accuracy_score(ytest, y_pred))
    return accuracy_test_score

def f1_metric(xtrain,ytrain,xtest,ytest):
    f1_test_score = []
    dt_clf = LogisticRegression(C=f1_best_params_pca[0]['C'], 
                                         multi_class = f1_best_params_pca[0]['multi_class'], 
                                         penalty = f1_best_params_pca[0]['penalty'],
                                         solver = f1_best_params_pca[0]['solver'],
                                         max_iter = f1_best_params_pca[0]['max_iter'])
    model = dt_clf.fit(xtrain, ytrain)
    y_pred = model.predict(xtest)
    f1_test_score.append(f1_score(ytest, y_pred,average = 'micro'))
    return f1_test_score

def precision_metric(xtrain,ytrain,xtest,ytest):
    precision_test_score = []
    dt_clf = LogisticRegression(C=precision_best_params_pca[0]['C'], 
                                         multi_class = precision_best_params_pca[0]['multi_class'], 
                                         penalty = precision_best_params_pca[0]['penalty'],
                                         solver = precision_best_params_pca[0]['solver'],
                                         max_iter = precision_best_params_pca[0]['max_iter'])
    model = dt_clf.fit(xtrain, ytrain)
    y_pred = model.predict(xtest)
    precision_test_score.append(precision_score(ytest, y_pred,average = 'micro'))
    return precision_test_score

In [11]:
print(accuracy_metric(x_pca_train,y_pca_train,x_pca_test,y_pca_test))
print(f1_metric(x_pca_train,y_pca_train,x_pca_test,y_pca_test))
print(precision_metric(x_pca_train,y_pca_train,x_pca_test,y_pca_test))

[0.9226331862911435]
[0.9226331862911435]
[0.9226331862911435]


## Bootstrap with PCA

In [12]:
#Separating X and y from testing since this won't be used in the for loop
x_train_full_pca = df_training_pca.iloc[:,:-1]
y_train_full_pca = df_training_pca.iloc[:,-1]

x_test_pca = df_testing_pca.iloc[:, :-1]
y_test_pca = df_testing_pca.iloc[:,-1]

In [13]:
#getting all of our randomized samples from the training
all_samples_pca = []

for sample in range(0,5):
    all_samples_pca.append(df_training.sample(n=5000, replace= True))

In [17]:
accuracy_best_params_pca = []
precision_best_params_pca = []
f1_best_params_pca = []


for sample in all_samples_pca:
    
    
    #Making a grid of values we want our grid search to test to find the best parameters
    grid_values = [{'penalty': ['l2'], 'C' :[.01,.05,.1,.3,.5,.8,1], 'solver': ['newton-cg', 'lbfgs', 'sag', 'saga'], 'multi_class': ['multinomial'], 'max_iter' : [5000] }]


    lg_classifier = GridSearchCV(estimator = lg, param_grid = grid_values, cv = StratifiedKFold(n_splits = 5),
                      scoring = ['accuracy', 'roc_auc_ovr', 'f1_micro'], refit = False, verbose = 0)

    x_train = sample.iloc[:, :-1]
    y_train = sample.iloc[:,-1]
    
    lg_model = lg_classifier.fit(x_train, y_train)
    
    
   

    accuracy_best_params_pca.append(lg_model.cv_results_['params'][ np.argmin(lg_model.cv_results_['rank_test_accuracy'])])
    precision_best_params_pca.append(lg_model.cv_results_['params'][ np.argmin(lg_model.cv_results_['rank_test_roc_auc_ovr'])])
    f1_best_params_pca.append(lg_model.cv_results_['params'][ np.argmin(lg_model.cv_results_['rank_test_f1_micro']) ])
    
    

KeyboardInterrupt: 

In [None]:
accuracy_best_params_pca

In [None]:
f1_best_params_pca

In [None]:
precision_best_params_pca

In [None]:
print(accuracy_metric(x_train,y_train,x_test,y_test))
print(f1_metric(x_train,y_train,x_test,y_test))
print(precision_metric(x_train,y_train,x_test,y_test))

In [None]:
accuracy_test_score_pca = []

i = 0
for param in accuracy_best_params_pca:
 
    dt_clf = LogisticRegression(C=accuracy_best_params_pca[i]['C'], 
                                         multi_class = accuracy_best_params_pca[i]['multi_class'], 
                                         penalty = accuracy_best_params_pca[i]['penalty'],
                                         solver = accuracy_best_params_pca[i]['solver'],
                                         max_iter = accuracy_best_params_pca[i]['max_iter'])
    model = dt_clf.fit(x_train_full_pca, y_train_full_pca)
    y_pred = model.predict(x_test_pca)
    accuracy_test_score_pca.append(accuracy_score(y_test_pca, y_pred_pca))
    i += 1

In [None]:
accuracy_test_score_pca

In [None]:
f1_test_score_pca = []

i = 0
for param in f1_best_params_pca:
    
    dt_clf = LogisticRegression(C=f1_best_params_pca[0]['C'], 
                                         multi_class = f1_best_params_pca[i]['multi_class'], 
                                         penalty = f1_best_params_pca[i]['penalty'],
                                         solver = f1_best_params_pca[i]['solver'],
                                         max_iter = f1_best_params_pca[i]['max_iter'])
    model = dt_clf.fit(x_train_full_pca, y_train_full_pca)
    y_pred = model.predict(x_test_pca)
    f1_test_score_pca.append(f1_score(y_test_pca, y_pred_pca,average = 'micro'))
    i += 1

In [None]:
f1_test_score_pca

In [None]:
precision_test_score_pca = []

i = 0
for param in precision_best_params_pca:
    
    dt_clf = LogisticRegression(C=precision_best_params_pca[0]['C'], 
                                         multi_class = precision_best_params_pca[0]['multi_class'], 
                                         penalty = precision_best_params_pca[0]['penalty'],
                                         solver = precision_best_params_pca[0]['solver'],
                                         max_iter = precision_best_params_pca[0]['max_iter'])
    model = dt_clf.fit(x_train_full_pca, y_train_full_pca)
    y_pred = model.predict(x_test_pca)
    precision_test_score_pca.append(precision_score(y_test_pca, y_pred_pca,average = 'micro'))
    i += 1

In [None]:
precision_test_score_pca

# Logistic Regression (Non-PCA)

In [19]:
#Making a grid of values we want our grid search to test to find the best parameters

grid_values = [{'penalty': ['l2'], 'C' :[.01,.05,.1,.3,.5,.8,1], 'solver': ['newton-cg', 'lbfgs', 'sag', 'saga'], 'multi_class': ['multinomial'], 'max_iter' : [2000] }]


lg_classifier = GridSearchCV(estimator = lg, param_grid = grid_values, cv = StratifiedKFold(n_splits = 5),
                      scoring = ['accuracy', 'roc_auc_ovr', 'f1_micro'], refit = False, verbose = 0)

lg_model = lg_classifier.fit(x_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [20]:
#initializing our lists so we can later store the best params that achieve best scores across metrics
accuracy_best_params = []
precision_best_params = []
f1_best_params = []

accuracy_best_params.append(lg_model.cv_results_['params'][ np.argmin(lg_model.cv_results_['rank_test_accuracy'])])
precision_best_params.append(lg_model.cv_results_['params'][ np.argmin(lg_model.cv_results_['rank_test_roc_auc_ovr'])])
f1_best_params.append(lg_model.cv_results_['params'][ np.argmin(lg_model.cv_results_['rank_test_f1_micro']) ])

In [21]:
accuracy_best_params

[{'C': 1,
  'max_iter': 2000,
  'multi_class': 'multinomial',
  'penalty': 'l2',
  'solver': 'sag'}]

In [22]:
precision_best_params

[{'C': 1,
  'max_iter': 2000,
  'multi_class': 'multinomial',
  'penalty': 'l2',
  'solver': 'lbfgs'}]

In [23]:
f1_best_params

[{'C': 1,
  'max_iter': 2000,
  'multi_class': 'multinomial',
  'penalty': 'l2',
  'solver': 'sag'}]

In [24]:
def accuracy_metric(xtrain,ytrain,xtest,ytest):
    accuracy_test_score = []
    dt_clf = LogisticRegression(C=accuracy_best_params[0]['C'], 
                                         multi_class = accuracy_best_params[0]['multi_class'], 
                                         penalty = accuracy_best_params[0]['penalty'],
                                         solver = accuracy_best_params[0]['solver'],
                                         max_iter = accuracy_best_params[0]['max_iter'])
    model = dt_clf.fit(xtrain, ytrain)
    y_pred = model.predict(xtest)
    accuracy_test_score.append(accuracy_score(ytest, y_pred))
    return accuracy_test_score

def f1_metric(xtrain,ytrain,xtest,ytest):
    f1_test_score = []
    dt_clf = LogisticRegression(C=f1_best_params[0]['C'], 
                                         multi_class = f1_best_params[0]['multi_class'], 
                                         penalty = f1_best_params[0]['penalty'],
                                         solver = f1_best_params[0]['solver'],
                                         max_iter = f1_best_params[0]['max_iter'])
    model = dt_clf.fit(xtrain, ytrain)
    y_pred = model.predict(xtest)
    f1_test_score.append(f1_score(ytest, y_pred,average = 'micro'))
    return f1_test_score

def precision_metric(xtrain,ytrain,xtest,ytest):
    precision_test_score = []
    dt_clf = LogisticRegression(C=precision_best_params[0]['C'], 
                                         multi_class = precision_best_params[0]['multi_class'], 
                                         penalty = precision_best_params[0]['penalty'],
                                         solver = precision_best_params[0]['solver'],
                                         max_iter = precision_best_params[0]['max_iter'])
    model = dt_clf.fit(xtrain, ytrain)
    y_pred = model.predict(xtest)
    precision_test_score.append(precision_score(ytest, y_pred,average = 'micro'))
    return precision_test_score

In [25]:
print(accuracy_metric(x_train,y_train,x_test,y_test))
print(f1_metric(x_train,y_train,x_test,y_test))
print(precision_metric(x_train,y_train,x_test,y_test))

ValueError: X has 561 features per sample; expecting 562

## Bootstrap without PCA

In [None]:
#Separating X and y from testing since this won't be used in the for loop
x_train_full = df_training.iloc[:,:-1]
y_train_full = df_training.iloc[:,-1]

x_test = df_testing.iloc[:, :-1]
y_test = df_testing.iloc[:,-1]

In [None]:
#getting all of our randomized samples from the training
all_samples = []

for sample in range(0,5):
    all_samples.append(df_training.sample(n=5000, replace= True))

In [None]:
accuracy_best_params = []
precision_best_params = []
f1_best_params = []


for sample in all_samples:
    
    
    #Making a grid of values we want our grid search to test to find the best parameters
    grid_values = [{'penalty': ['l2'], 'C' :[.01,.05,.1,.3,.5,.8,1], 'solver': ['newton-cg', 'lbfgs', 'sag', 'saga'], 'multi_class': ['multinomial'], 'max_iter' : [2000] }]


    lg_classifier = GridSearchCV(estimator = lg, param_grid = grid_values, cv = StratifiedKFold(n_splits = 5),
                      scoring = ['accuracy', 'roc_auc_ovr', 'f1_micro'], refit = False, verbose = 0)

    x_train = sample.iloc[:, :-1]
    y_train = sample.iloc[:,-1]
    
    lg_model = lg_classifier.fit(x_train, y_train)
    
    
   

    accuracy_best_params.append(lg_model.cv_results_['params'][ np.argmin(lg_model.cv_results_['rank_test_accuracy'])])
    precision_best_params.append(lg_model.cv_results_['params'][ np.argmin(lg_model.cv_results_['rank_test_roc_auc_ovr'])])
    f1_best_params.append(lg_model.cv_results_['params'][ np.argmin(lg_model.cv_results_['rank_test_f1_micro']) ])

In [None]:
accuracy_best_params

In [None]:
f1_best_params

In [None]:
precision_best_params

In [None]:
print(accuracy_metric(x_train,y_train,x_test,y_test))
print(f1_metric(x_train,y_train,x_test,y_test))
print(precision_metric(x_train,y_train,x_test,y_test))

In [None]:
accuracy_test_score = []

i = 0
for param in accuracy_best_params:
 
    dt_clf = LogisticRegression(C=accuracy_best_params[i]['C'], 
                                         multi_class = accuracy_best_params[i]['multi_class'], 
                                         penalty = accuracy_best_params[i]['penalty'],
                                         solver = accuracy_best_params[i]['solver'],
                                         max_iter = accuracy_best_params[i]['max_iter'])
    model = dt_clf.fit(x_train_full, y_train_full)
    y_pred = model.predict(x_test)
    accuracy_test_score.append(accuracy_score(y_test, y_pred))
    i += 1

In [None]:
accuracy_test_score

In [None]:
f1_test_score = []

i = 0
for param in f1_best_params:
    
    dt_clf = LogisticRegression(C=f1_best_params[0]['C'], 
                                         multi_class = f1_best_params[i]['multi_class'], 
                                         penalty = f1_best_params[i]['penalty'],
                                         solver = f1_best_params[i]['solver'],
                                         max_iter = f1_best_params[i]['max_iter'])
    model = dt_clf.fit(x_train_full, y_train_full)
    y_pred = model.predict(x_test)
    f1_test_score.append(f1_score(y_test, y_pred,average = 'micro'))
    i += 1

In [None]:
f1_test_score

In [None]:
precision_test_score = []

i = 0
for param in precision_best_params:
    
    dt_clf = LogisticRegression(C=precision_best_params[0]['C'], 
                                         multi_class = precision_best_params[0]['multi_class'], 
                                         penalty = precision_best_params[0]['penalty'],
                                         solver = precision_best_params[0]['solver'],
                                         max_iter = precision_best_params[0]['max_iter'])
    model = dt_clf.fit(x_train_full, y_train_full)
    y_pred = model.predict(x_test)
    precision_test_score.append(precision_score(y_test, y_pred,average = 'micro'))
    i += 1

In [None]:
precision_test_score