In [1]:
%matplotlib inline

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler

from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report

from sklearn.linear_model import LogisticRegression

In [3]:
diabetes_data = pd.read_csv('data/dataset_diabetes/diabetic_data.csv')

In [4]:
diabetes_data.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),?,6,25,1,1,...,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),?,1,1,7,3,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),?,1,1,7,2,...,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),?,1,1,7,2,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),?,1,1,7,1,...,No,Steady,No,No,No,No,No,Ch,Yes,NO


In [5]:
diabetes_data.dtypes

encounter_id                 int64
patient_nbr                  int64
race                        object
gender                      object
age                         object
weight                      object
admission_type_id            int64
discharge_disposition_id     int64
admission_source_id          int64
time_in_hospital             int64
payer_code                  object
medical_specialty           object
num_lab_procedures           int64
num_procedures               int64
num_medications              int64
number_outpatient            int64
number_emergency             int64
number_inpatient             int64
diag_1                      object
diag_2                      object
diag_3                      object
number_diagnoses             int64
max_glu_serum               object
A1Cresult                   object
metformin                   object
repaglinide                 object
nateglinide                 object
chlorpropamide              object
glimepiride         

In [6]:
diabetes_data.shape

(101766, 50)

In [7]:
diabetes_target = diabetes_data['readmitted']
diabetes_attributes = diabetes_data.drop('readmitted', axis = 1)

In [8]:
diabetes_attributes = pd.get_dummies(diabetes_attributes)

In [9]:
diabetes_attributes.shape

(101766, 2472)

In [10]:
diabetes_attributes_scaled = MinMaxScaler().fit_transform(diabetes_attributes)

In [11]:
logistic_regression_base = LogisticRegression(C = 1e9)
logistic_regression_base.fit(diabetes_attributes_scaled, diabetes_target)
logistic_regression_base.score(diabetes_attributes_scaled, diabetes_target)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.5953068804905371

In [12]:
logistic_regression_base = LogisticRegression(C = 0.001)
logistic_regression_base.fit(diabetes_attributes_scaled, diabetes_target)
logistic_regression_base.score(diabetes_attributes_scaled, diabetes_target)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.5645402197197492

In [13]:
diabetes_attributes_train, diabetes_attributes_test, diabetes_target_train, diabetes_target_test = train_test_split(diabetes_attributes, diabetes_target, test_size = 10000, random_state = 30, stratify = diabetes_target)

In [14]:
diabetes_attributes_train.shape, diabetes_attributes_test.shape

((91766, 2472), (10000, 2472))

In [15]:
diabetes_target_train.shape, diabetes_target_test.shape

((91766,), (10000,))

In [16]:
diabetes_data.groupby('readmitted').size()

readmitted
<30    11357
>30    35545
NO     54864
dtype: int64

In [17]:
diabetes_target_train.groupby(diabetes_target_train).size() / len(diabetes_target_train)

readmitted
<30    0.111599
>30    0.349280
NO     0.539121
Name: readmitted, dtype: float64

In [18]:
diabetes_target_test.groupby(diabetes_target_test).size() / len(diabetes_target_test)

readmitted
<30    0.1116
>30    0.3493
NO     0.5391
Name: readmitted, dtype: float64

In [19]:
logistic_regression_split = LogisticRegression(C = 5)
logistic_regression_split.fit(diabetes_attributes_train, diabetes_target_train)

LogisticRegression(C=5)

In [20]:
logistic_regression_split.score(diabetes_attributes_train, diabetes_target_train)

0.5413551860166075

In [21]:
logistic_regression_split.score(diabetes_attributes_test, diabetes_target_test)

0.5436

In [22]:
# high bias as they are very similar

In [23]:
diabetes_predictions_test = logistic_regression_split.predict(diabetes_attributes_test)

In [24]:
print(classification_report(diabetes_target_test, diabetes_predictions_test))

              precision    recall  f1-score   support

         <30       0.00      0.00      0.00      1116
         >30       0.45      0.14      0.21      3493
          NO       0.56      0.92      0.69      5391

    accuracy                           0.54     10000
   macro avg       0.34      0.35      0.30     10000
weighted avg       0.46      0.54      0.45     10000



  _warn_prf(average, modifier, msg_start, len(result))


In [25]:
params = {
    'C': [0.01, 0.1, 1, 10, 100, 1000],
    'fit_intercept': [True, False],
    'max_iter': [10, 100, 1000]
}

In [26]:
grid_search = GridSearchCV(estimator = LogisticRegression(), param_grid = params)

In [27]:
grid_search.fit(diabetes_attributes_train[:100], diabetes_target_train[:100])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

GridSearchCV(estimator=LogisticRegression(),
             param_grid={'C': [0.01, 0.1, 1, 10, 100, 1000],
                         'fit_intercept': [True, False],
                         'max_iter': [10, 100, 1000]})

In [28]:
grid_search.best_estimator_

LogisticRegression(C=0.01, max_iter=10)

In [30]:
grid_search.best_estimator_.score(diabetes_attributes_test, diabetes_target_test)

0.5378

In [31]:
print(classification_report(diabetes_target_test, grid_search.best_estimator_.predict(diabetes_attributes_test)))

              precision    recall  f1-score   support

         <30       0.00      0.00      0.00      1116
         >30       0.40      0.03      0.06      3493
          NO       0.54      0.98      0.70      5391

    accuracy                           0.54     10000
   macro avg       0.31      0.34      0.25     10000
weighted avg       0.43      0.54      0.40     10000



  _warn_prf(average, modifier, msg_start, len(result))


In [33]:
grid_search.cv_results_

{'mean_fit_time': array([0.17997503, 0.06403942, 0.06701779, 0.04983034, 0.06988449,
        0.07157297, 0.06085544, 0.06218419, 0.06781635, 0.0504847 ,
        0.05746408, 0.07029939, 0.05506029, 0.05610857, 0.06343541,
        0.06047397, 0.0635006 , 0.0720048 , 0.06397943, 0.06820183,
        0.06081071, 0.07073421, 0.06424093, 0.06873016, 0.05122352,
        0.07289701, 0.07121177, 0.055336  , 0.06346822, 0.07329526,
        0.06439762, 0.05660434, 0.06436262, 0.05356731, 0.05857096,
        0.05747204]),
 'std_fit_time': array([0.17117755, 0.00517839, 0.01375815, 0.00296324, 0.01902178,
        0.00732693, 0.01570345, 0.0146041 , 0.01334568, 0.00698539,
        0.00766566, 0.00982823, 0.00981914, 0.00816387, 0.00733646,
        0.0199084 , 0.01060445, 0.01209664, 0.00575975, 0.02052409,
        0.00718166, 0.02208718, 0.00968519, 0.0124881 , 0.00789364,
        0.01121707, 0.0055387 , 0.01570527, 0.01330005, 0.00409036,
        0.0240858 , 0.01075452, 0.01293016, 0.013449  , 0.015