# AML Project - Classification with Logistic Regression

In [62]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix,accuracy_score, classification_report, f1_score
from sklearn.linear_model import LogisticRegression

import warnings
warnings.filterwarnings('ignore')

np.random.seed(18091998)

In [63]:
# Returns a DataFrame containing the 'df' numeric variables
def numeric(df):
    return df.select_dtypes(include = np.number)

# Returns a DataFrame containing the 'df' categorical variables
def categoric(df):
    return df.select_dtypes(include = 'object')

def compute_metrics(y_true,y_pred):
    accuracy = accuracy_score(y_true,y_pred)
    f1_score_macro = f1_score(y_true,y_pred,average='macro')
    return [accuracy,f1_score_macro]
    
def confusion(true, pred):
    pred = pd.Series(pred)
    true = pd.Series(true)
    
    true.name = 'target'
    pred.name = 'predicted'
    cm = pd.crosstab(true.reset_index(drop=True), pred.reset_index(drop=True))
    cm = cm[cm.index]
    return cm

results = pd.DataFrame(columns=['Accuracy', 'F1-score (macro avg)'])

## Logistic Regression with Std and all numeric features (except missings)

In [64]:
# We will not use missing variables as a preemptive measure as it assumes multicollinearity 

# Loading train data
train = pd.read_csv('./data/preprocessed/trainStd.csv')
train_data = train.drop(['outcome','outcome_lived','outcome_died','outcome_euthanized'], axis=1)

cols = [c for c in numeric(train_data).columns if ('Discr' not in c) and ('Missing' not in c)]
train_data=train_data[cols]

train_label = train.outcome

# Loading test data
test = pd.read_csv('./data/preprocessed/testStd.csv')
test_data = test.drop(['outcome','outcome_lived','outcome_died','outcome_euthanized'], axis=1)
test_data=test_data[cols]

test_label = test.outcome

In [65]:
logreg = LogisticRegression()

lr_cv = GridSearchCV(estimator = logreg,  
                           param_grid = {
                               'penalty': ['l1','l2'], 
                               'C': [0.001,0.01,0.1,1,10,100,1000],
                               'class_weight': ['balanced'],
                               'solver': ["newton-cg",'lbfgs', 'liblinear']
                           },
                           scoring = ['accuracy', 'f1_macro'],
                        refit =False,
                           cv = 10,
                           verbose=0)

lr_cv.fit(train_data, train_label)
results_cv = pd.DataFrame(lr_cv.cv_results_)

In [66]:
cols = ['param_penalty', 'param_C', 'param_class_weight', 'param_solver',
        'mean_test_accuracy', 'mean_test_f1_macro']
results_cv[cols].sort_values(by='mean_test_accuracy',ascending=False).head(5)

Unnamed: 0,param_penalty,param_C,param_class_weight,param_solver,mean_test_accuracy,mean_test_f1_macro
4,l2,0.001,balanced,lbfgs,0.722414,0.646608
3,l2,0.001,balanced,newton-cg,0.722414,0.646608
11,l2,0.01,balanced,liblinear,0.712414,0.619667
9,l2,0.01,balanced,newton-cg,0.70908,0.637971
10,l2,0.01,balanced,lbfgs,0.70908,0.637971


In [67]:
lr = LogisticRegression(penalty='l2',C=0.001,class_weight='balanced',solver='lbfgs')
lr.fit(train_data, train_label)

test_predicted=lr.predict(test_data)

results.loc['LR-Std-all',:] = compute_metrics(test_label, test_predicted)
results

Unnamed: 0,Accuracy,F1-score (macro avg)
LR-Std-all,0.671642,0.554042


## Logistic Regression with MinMax and all numeric features (except missings)

In [68]:
# We will not use missing variables as a preemptive measure as it assumes multicollinearity 

# Loading train data
train = pd.read_csv('./data/preprocessed/trainMinMax.csv')
train_data = train.drop(['outcome','outcome_lived','outcome_died','outcome_euthanized'], axis=1)

cols = [c for c in numeric(train_data).columns if ('Discr' not in c) and ('Missing' not in c)]
train_data=train_data[cols]

train_label = train.outcome

# Loading test data
test = pd.read_csv('./data/preprocessed/testMinMax.csv')
test_data = test.drop(['outcome','outcome_lived','outcome_died','outcome_euthanized'], axis=1)
test_data=test_data[cols]

test_label = test.outcome

In [69]:
logreg = LogisticRegression()

lr_cv = GridSearchCV(estimator = logreg,  
                           param_grid = {
                               'penalty': ['l1','l2'], 
                               'C': [0.001,0.01,0.1,1,10,100,1000],
                               'class_weight': ['balanced'],
                               'solver': ["newton-cg",'lbfgs', 'liblinear']
                           },
                           scoring = ['accuracy', 'f1_macro'],
                        refit =False,
                           cv = 10,
                           verbose=0)

lr_cv.fit(train_data, train_label)
results_cv = pd.DataFrame(lr_cv.cv_results_)

In [70]:
cols = ['param_penalty', 'param_C', 'param_class_weight', 'param_solver',
        'mean_test_accuracy', 'mean_test_f1_macro']
results_cv[cols].sort_values(by='mean_test_accuracy',ascending=False).head(5)

Unnamed: 0,param_penalty,param_C,param_class_weight,param_solver,mean_test_accuracy,mean_test_f1_macro
20,l1,1,balanced,liblinear,0.682299,0.607662
29,l2,10,balanced,liblinear,0.67908,0.59588
41,l2,1000,balanced,liblinear,0.67908,0.59692
38,l1,1000,balanced,liblinear,0.67908,0.601903
26,l1,10,balanced,liblinear,0.675747,0.597397


In [71]:
lr = LogisticRegression(penalty='l1',C=1,class_weight='balanced',solver='liblinear')
lr.fit(train_data, train_label)

test_predicted=lr.predict(test_data)

results.loc['LR-MinMax-all',:] = compute_metrics(test_label, test_predicted)
results

Unnamed: 0,Accuracy,F1-score (macro avg)
LR-Std-all,0.671642,0.554042
LR-MinMax-all,0.656716,0.57215


## Logistic Regression with Std and minimal numeric features

In [72]:
# We will not use missing variables as a preemptive measure as it assumes multicollinearity 

# Loading train data
train = pd.read_csv('./data/preprocessed/trainStd.csv')
train_data = numeric(train).iloc[:,:5]
train_label = train.outcome

# Loading test data
test = pd.read_csv('./data/preprocessed/testStd.csv')
test_data = numeric(test).iloc[:,:5]
test_label = test.outcome


In [73]:
logreg = LogisticRegression()

lr_cv = GridSearchCV(estimator = logreg,  
                           param_grid = {
                               'penalty': ['l1','l2'], 
                               'C': [0.001,0.01,0.1,1,10,100,1000],
                               'class_weight': ['balanced'],
                               'solver': ["newton-cg",'lbfgs', 'liblinear']
                           },
                           scoring = ['accuracy', 'f1_macro'],
                        refit =False,
                           cv = 10,
                           verbose=0)

lr_cv.fit(train_data, train_label)
results_cv = pd.DataFrame(lr_cv.cv_results_)

In [74]:
cols = ['param_penalty', 'param_C', 'param_class_weight', 'param_solver',
        'mean_test_accuracy', 'mean_test_f1_macro']
results_cv[cols].sort_values(by='mean_test_accuracy',ascending=False).head(5)

Unnamed: 0,param_penalty,param_C,param_class_weight,param_solver,mean_test_accuracy,mean_test_f1_macro
11,l2,0.01,balanced,liblinear,0.692414,0.582706
20,l1,1.0,balanced,liblinear,0.692299,0.573018
17,l2,0.1,balanced,liblinear,0.685747,0.570725
14,l1,0.1,balanced,liblinear,0.685747,0.571151
41,l2,1000.0,balanced,liblinear,0.685632,0.57487


In [75]:
lr = LogisticRegression(penalty='l2',C=0.01,class_weight='balanced',solver='liblinear')
lr.fit(train_data, train_label)

test_predicted=lr.predict(test_data)

results.loc['LR-Std-minimal',:] = compute_metrics(test_label, test_predicted)
results

Unnamed: 0,Accuracy,F1-score (macro avg)
LR-Std-all,0.671642,0.554042
LR-MinMax-all,0.656716,0.57215
LR-Std-minimal,0.731343,0.590409


## Logistic Regression with MinMax and minimal numeric features

In [76]:
# We will not use missing variables as a preemptive measure as it assumes multicollinearity 

# Loading train data
train = pd.read_csv('./data/preprocessed/trainMinMax.csv')
train_data = numeric(train).iloc[:,:5]
train_label = train.outcome

# Loading test data
test = pd.read_csv('./data/preprocessed/testMinMax.csv')
test_data = numeric(test).iloc[:,:5]
test_label = test.outcome


In [77]:
logreg = LogisticRegression()

lr_cv = GridSearchCV(estimator = logreg,  
                           param_grid = {
                               'penalty': ['l1','l2'], 
                               'C': [0.001,0.01,0.1,1,10,100,1000],
                               'class_weight': ['balanced'],
                               'solver': ["newton-cg",'lbfgs', 'liblinear']
                           },
                           scoring = ['accuracy', 'f1_macro'],
                        refit =False,
                           cv = 10,
                           verbose=0)

lr_cv.fit(train_data, train_label)
results_cv = pd.DataFrame(lr_cv.cv_results_)

In [78]:
cols = ['param_penalty', 'param_C', 'param_class_weight', 'param_solver',
        'mean_test_accuracy', 'mean_test_f1_macro']
results_cv[cols].sort_values(by='mean_test_accuracy',ascending=False).head(5)

Unnamed: 0,param_penalty,param_C,param_class_weight,param_solver,mean_test_accuracy,mean_test_f1_macro
23,l2,1,balanced,liblinear,0.69908,0.56858
26,l1,10,balanced,liblinear,0.692299,0.579426
20,l1,1,balanced,liblinear,0.692299,0.578085
41,l2,1000,balanced,liblinear,0.685632,0.57487
38,l1,1000,balanced,liblinear,0.685632,0.57487


In [79]:
lr = LogisticRegression(penalty='l2',C=1,class_weight='balanced',solver='liblinear')
lr.fit(train_data, train_label)

test_predicted=lr.predict(test_data)

results.loc['LR-MinMax-minimal',:] = compute_metrics(test_label, test_predicted)
results

Unnamed: 0,Accuracy,F1-score (macro avg)
LR-Std-all,0.671642,0.554042
LR-MinMax-all,0.656716,0.57215
LR-Std-minimal,0.731343,0.590409
LR-MinMax-minimal,0.716418,0.551321


## Logistic Regression with Std and reduced features

In [88]:
# We will not use missing variables as a preemptive measure as it assumes multicollinearity 

# Loading train data
train = pd.read_csv('./data/preprocessed/trainStd.csv')
train_data = numeric(train).drop(['outcome_lived','outcome_died','outcome_euthanized'], axis=1)
train_data = train_data.drop(['nasogastricTube_none','nasogastricTube_significant','nasogastricTube_slight',
                             'nasogastricReflux_<1liter','nasogastricReflux_>1liter','nasogastricReflux_none',
                             'rectalExamination_absent','rectalExamination_decreased','rectalExamination_increased',
                             'rectalExamination_normal','abdomen_distendedLarge','abdomen_distendedSmall',
                             'abdomen_firmFecesLarge','abdomen_normal','abdomen_other'], axis=1)

cols = [c for c in numeric(train_data).columns if ('Discr' not in c) and ('Missing' not in c)]
train_data=train_data[cols]
train_label=train.outcome

# Loading test 
test = pd.read_csv('./data/preprocessed/testStd.csv')
test_data = numeric(test).drop(['outcome_lived','outcome_died','outcome_euthanized'], axis=1)
test_data = test_data.drop(['nasogastricTube_none','nasogastricTube_significant','nasogastricTube_slight',
                             'nasogastricReflux_<1liter','nasogastricReflux_>1liter','nasogastricReflux_none',
                             'rectalExamination_absent','rectalExamination_decreased','rectalExamination_increased',
                             'rectalExamination_normal','abdomen_distendedLarge','abdomen_distendedSmall',
                             'abdomen_firmFecesLarge','abdomen_normal','abdomen_other'], axis=1)
test_data=test_data[cols]
test_label=test.outcome

In [89]:
logreg = LogisticRegression()

lr_cv = GridSearchCV(estimator = logreg,  
                           param_grid = {
                               'penalty': ['l1','l2'], 
                               'C': [0.001,0.01,0.1,1,10,100,1000],
                               'class_weight': ['balanced'],
                               'solver': ["newton-cg",'lbfgs', 'liblinear']
                           },
                           scoring = ['accuracy', 'f1_macro'],
                        refit =False,
                           cv = 10,
                           verbose=0)

lr_cv.fit(train_data, train_label)
results_cv = pd.DataFrame(lr_cv.cv_results_)

In [90]:
cols = ['param_penalty', 'param_C', 'param_class_weight', 'param_solver',
        'mean_test_accuracy', 'mean_test_f1_macro']
results_cv[cols].sort_values(by='mean_test_accuracy',ascending=False).head(5)

Unnamed: 0,param_penalty,param_C,param_class_weight,param_solver,mean_test_accuracy,mean_test_f1_macro
11,l2,0.01,balanced,liblinear,0.725747,0.630133
3,l2,0.001,balanced,newton-cg,0.71908,0.644808
4,l2,0.001,balanced,lbfgs,0.71908,0.644808
17,l2,0.1,balanced,liblinear,0.715632,0.628448
9,l2,0.01,balanced,newton-cg,0.70908,0.643453


In [93]:
lr = LogisticRegression(penalty='l2',C=0.01,class_weight='balanced',solver='liblinear')
lr.fit(train_data, train_label)

test_predicted=lr.predict(test_data)

results.loc['LR-Std-reduced',:] = compute_metrics(test_label, test_predicted)
results

confusion(test_label,test_predicted)

Unnamed: 0,Accuracy,F1-score (macro avg)
LR-Std-all,0.671642,0.554042
LR-MinMax-all,0.656716,0.57215
LR-Std-minimal,0.731343,0.590409
LR-MinMax-minimal,0.716418,0.551321
LR-Std-reduced,0.746269,0.618031
LR-MinMax-reduced,0.671642,0.425978


predicted,died,euthanized,lived
target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
died,8,1,3
euthanized,1,3,4
lived,5,3,39


## Logistic Regression with MinMax and reduced features

In [84]:
# We will not use missing variables as a preemptive measure as it assumes multicollinearity 

# Loading train data
train = pd.read_csv('./data/preprocessed/trainMinMax.csv')
train_data = numeric(train).drop(['outcome_lived','outcome_died','outcome_euthanized'], axis=1)
train_data = train_data.drop(['nasogastricTube_none','nasogastricTube_significant','nasogastricTube_slight',
                             'nasogastricReflux_<1liter','nasogastricReflux_>1liter','nasogastricReflux_none',
                             'rectalExamination_absent','rectalExamination_decreased','rectalExamination_increased',
                             'rectalExamination_normal','abdomen_distendedLarge','abdomen_distendedSmall',
                             'abdomen_firmFecesLarge','abdomen_normal','abdomen_other'], axis=1)

cols = [c for c in numeric(train_data).columns if ('Discr' not in c) and ('Missing' not in c)]
train_data=train_data[cols]
train_label=train.outcome

# Loading test 
test = pd.read_csv('./data/preprocessed/testMinMax.csv')
test_data = numeric(test).drop(['outcome_lived','outcome_died','outcome_euthanized'], axis=1)
test_data = test_data.drop(['nasogastricTube_none','nasogastricTube_significant','nasogastricTube_slight',
                             'nasogastricReflux_<1liter','nasogastricReflux_>1liter','nasogastricReflux_none',
                             'rectalExamination_absent','rectalExamination_decreased','rectalExamination_increased',
                             'rectalExamination_normal','abdomen_distendedLarge','abdomen_distendedSmall',
                             'abdomen_firmFecesLarge','abdomen_normal','abdomen_other'], axis=1)
test_data=test_data[cols]
test_label=test.outcome

In [85]:
logreg = LogisticRegression()

lr_cv = GridSearchCV(estimator = logreg,  
                           param_grid = {
                               'penalty': ['l1','l2'], 
                               'C': [0.001,0.01,0.1,1,10,100,1000],
                               'class_weight': ['balanced'],
                               'solver': ["newton-cg",'lbfgs', 'liblinear']
                           },
                           scoring = ['accuracy', 'f1_macro'],
                        refit =False,
                           cv = 10,
                           verbose=0)

lr_cv.fit(train_data, train_label)
results_cv = pd.DataFrame(lr_cv.cv_results_)

In [86]:
cols = ['param_penalty', 'param_C', 'param_class_weight', 'param_solver',
        'mean_test_accuracy', 'mean_test_f1_macro']
results_cv[cols].sort_values(by='mean_test_accuracy',ascending=False).head(5)

Unnamed: 0,param_penalty,param_C,param_class_weight,param_solver,mean_test_accuracy,mean_test_f1_macro
14,l1,0.1,balanced,liblinear,0.705862,0.604969
20,l1,1.0,balanced,liblinear,0.689195,0.613962
23,l2,1.0,balanced,liblinear,0.679195,0.579276
26,l1,10.0,balanced,liblinear,0.675747,0.582592
17,l2,0.1,balanced,liblinear,0.675747,0.55019


In [87]:
lr = LogisticRegression(penalty='l1',C=0.1,class_weight='balanced',solver='liblinear')
lr.fit(train_data, train_label)

test_predicted=lr.predict(test_data)

results.loc['LR-MinMax-reduced',:] = compute_metrics(test_label, test_predicted)
results

Unnamed: 0,Accuracy,F1-score (macro avg)
LR-Std-all,0.671642,0.554042
LR-MinMax-all,0.656716,0.57215
LR-Std-minimal,0.731343,0.590409
LR-MinMax-minimal,0.716418,0.551321
LR-Std-reduced,0.746269,0.618031
LR-MinMax-reduced,0.671642,0.425978
