## Libs

In [85]:
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import GridSearchCV

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import (confusion_matrix,
                             classification_report)

## Data

In [4]:
data = pd.read_csv("patients_diabetes_data.csv")

## EDA

In [90]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 129 entries, 0 to 128
Data columns (total 22 columns):
 #   Column                                       Non-Null Count  Dtype  
---  ------                                       --------------  -----  
 0   id                                           129 non-null    int64  
 1   gender                                       129 non-null    object 
 2   dead                                         129 non-null    int64  
 3   religion                                     128 non-null    object 
 4   ethnicity                                    129 non-null    object 
 5   insurance                                    129 non-null    object 
 6   number_of_emergency_stays                    129 non-null    int64  
 7   number_of_elective_stays                     129 non-null    int64  
 8   number_of_urgent_stays                       129 non-null    int64  
 9   A1c_hemoglobin_%_mean                        21 non-null     float64
 10  cr

In [93]:
data.describe()

Unnamed: 0,id,dead,number_of_emergency_stays,number_of_elective_stays,number_of_urgent_stays,A1c_hemoglobin_%_mean,creatinine_mg/dL_mean,glucose_mg/dL_mean,A1c_absolute_mean,number_of_abnormal_results_A1c_hemoglobin_%,number_of_abnormal_results_creatinine,number_of_abnormal_results_glucose,number_of_abnormal_results_A1c_absolute,number_of_A1c_hemoglobin_%_tests,number_of_creatinine_tests,number_of_glucose_tests,number_of_A1c_absolute_tests,diabetes
count,129.0,129.0,129.0,129.0,129.0,21.0,128.0,129.0,0.0,21.0,129.0,129.0,3.0,21.0,129.0,129.0,3.0,129.0
mean,28010.410853,1.0,2.79845,0.077519,0.015504,6.235714,1.374484,140.182349,,0.571429,19.209302,27.263566,0.0,1.333333,38.410853,37.573643,1.333333,0.379845
std,16048.502883,0.0,4.479781,0.268456,0.124027,1.320903,1.308696,39.312695,,0.74642,29.938916,38.162178,0.0,0.483046,49.507041,49.135459,0.57735,0.48724
min,10006.0,1.0,0.0,0.0,0.0,4.9,0.367,78.364,,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0
25%,10088.0,1.0,1.0,0.0,0.0,5.4,0.66,112.323,,0.0,0.0,4.0,0.0,1.0,8.0,7.0,1.0,0.0
50%,40310.0,1.0,1.0,0.0,0.0,5.8,0.9655,134.692,,0.0,5.0,11.0,0.0,1.0,18.0,17.0,1.0,0.0
75%,42135.0,1.0,2.0,0.0,0.0,6.55,1.3125,158.572,,1.0,20.0,22.0,0.0,2.0,35.0,35.0,1.5,1.0
max,44228.0,1.0,15.0,1.0,1.0,10.1,10.083,344.0,,2.0,125.0,138.0,0.0,2.0,171.0,167.0,2.0,1.0


## Data cleaning

In [10]:
data.columns

Index(['id', 'gender', 'dead', 'religion', 'ethnicity', 'insurance',
       'number_of_emergency_stays', 'number_of_elective_stays',
       'number_of_urgent_stays', 'A1c_hemoglobin_%_mean',
       'creatinine_mg/dL_mean', 'glucose_mg/dL_mean', 'A1c_absolute_mean',
       'number_of_abnormal_results_A1c_hemoglobin_%',
       'number_of_abnormal_results_creatinine',
       'number_of_abnormal_results_glucose',
       'number_of_abnormal_results_A1c_absolute',
       'number_of_A1c_hemoglobin_%_tests', 'number_of_creatinine_tests',
       'number_of_glucose_tests', 'number_of_A1c_absolute_tests', 'diabetes'],
      dtype='object')

In [18]:
df = data[['gender', 'dead', 'religion', 'ethnicity', 'insurance', 'number_of_emergency_stays', 'number_of_elective_stays', 'number_of_urgent_stays', 'creatinine_mg/dL_mean', 'glucose_mg/dL_mean', 'number_of_abnormal_results_creatinine', 'number_of_abnormal_results_glucose', 'diabetes']]

In [19]:
df.head()

Unnamed: 0,gender,dead,religion,ethnicity,insurance,number_of_emergency_stays,number_of_elective_stays,number_of_urgent_stays,creatinine_mg/dL_mean,glucose_mg/dL_mean,number_of_abnormal_results_creatinine,number_of_abnormal_results_glucose,diabetes
0,F,1,CATHOLIC,BLACK/AFRICAN AMERICAN,Medicare,1,0,0,5.418,126.82,62,41,1
1,F,1,CATHOLIC,UNKNOWN/NOT SPECIFIED,Private,1,0,0,0.563,115.526,0,11,0
2,F,1,CATHOLIC,UNKNOWN/NOT SPECIFIED,Medicare,1,0,0,1.7,149.5,2,2,0
3,F,1,CATHOLIC,WHITE,Medicare,1,0,0,0.44,204.8,2,13,1
4,M,1,CATHOLIC,WHITE,Medicare,1,0,0,5.8,194.8,4,3,0


In [20]:
def encoding(original_dataframe, feature_to_encode):
    dummies = pd.get_dummies(original_dataframe[[feature_to_encode]])
    res = pd.concat([original_dataframe, dummies], axis=1)
    res = res.drop([feature_to_encode], axis=1)
    return(res) 

In [21]:
df = encoding(df, 'gender')
df = encoding(df, 'religion')
df = encoding(df, 'ethnicity')
df = encoding(df, 'insurance')

In [23]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 129 entries, 0 to 128
Data columns (total 34 columns):
 #   Column                                                              Non-Null Count  Dtype  
---  ------                                                              --------------  -----  
 0   dead                                                                129 non-null    int64  
 1   number_of_emergency_stays                                           129 non-null    int64  
 2   number_of_elective_stays                                            129 non-null    int64  
 3   number_of_urgent_stays                                              129 non-null    int64  
 4   creatinine_mg/dL_mean                                               128 non-null    float64
 5   glucose_mg/dL_mean                                                  129 non-null    float64
 6   number_of_abnormal_results_creatinine                               129 non-null    int64  
 7   number_of_abnorma

## Train, test split

In [45]:
data[['gender','number_of_emergency_stays', 'number_of_elective_stays', 'number_of_urgent_stays', 'creatinine_mg/dL_mean', 'glucose_mg/dL_mean', 'number_of_abnormal_results_creatinine', 'number_of_abnormal_results_glucose', 'diabetes']].describe()

Unnamed: 0,number_of_emergency_stays,number_of_elective_stays,number_of_urgent_stays,creatinine_mg/dL_mean,glucose_mg/dL_mean,number_of_abnormal_results_creatinine,number_of_abnormal_results_glucose,diabetes
count,129.0,129.0,129.0,128.0,129.0,129.0,129.0,129.0
mean,2.79845,0.077519,0.015504,1.374484,140.182349,19.209302,27.263566,0.379845
std,4.479781,0.268456,0.124027,1.308696,39.312695,29.938916,38.162178,0.48724
min,0.0,0.0,0.0,0.367,78.364,0.0,0.0,0.0
25%,1.0,0.0,0.0,0.66,112.323,0.0,4.0,0.0
50%,1.0,0.0,0.0,0.9655,134.692,5.0,11.0,0.0
75%,2.0,0.0,0.0,1.3125,158.572,20.0,22.0,1.0
max,15.0,1.0,1.0,10.083,344.0,125.0,138.0,1.0


In [56]:
df[df.isnull().any(axis=1)]

Unnamed: 0,index,dead,number_of_emergency_stays,number_of_elective_stays,number_of_urgent_stays,creatinine_mg/dL_mean,glucose_mg/dL_mean,number_of_abnormal_results_creatinine,number_of_abnormal_results_glucose,diabetes,...,ethnicity_HISPANIC OR LATINO,ethnicity_HISPANIC/LATINO - PUERTO RICAN,ethnicity_OTHER,ethnicity_UNABLE TO OBTAIN,ethnicity_UNKNOWN/NOT SPECIFIED,ethnicity_WHITE,insurance_Government,insurance_Medicaid,insurance_Medicare,insurance_Private
53,53,1,1,0,0,,134.692,59,83,0,...,0,0,0,0,0,1,0,0,0,1


In [59]:
df = df.fillna(df.mean())

In [60]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 129 entries, 0 to 128
Data columns (total 35 columns):
 #   Column                                                              Non-Null Count  Dtype  
---  ------                                                              --------------  -----  
 0   index                                                               129 non-null    int64  
 1   dead                                                                129 non-null    int64  
 2   number_of_emergency_stays                                           129 non-null    int64  
 3   number_of_elective_stays                                            129 non-null    int64  
 4   number_of_urgent_stays                                              129 non-null    int64  
 5   creatinine_mg/dL_mean                                               129 non-null    float64
 6   glucose_mg/dL_mean                                                  129 non-null    float64
 7   number_of_abnorma

In [61]:
df = df.reset_index()

In [62]:
x = df.loc[:, df.columns != 'diabetes']
y = df[['diabetes']]

In [63]:
x.head()

Unnamed: 0,level_0,index,dead,number_of_emergency_stays,number_of_elective_stays,number_of_urgent_stays,creatinine_mg/dL_mean,glucose_mg/dL_mean,number_of_abnormal_results_creatinine,number_of_abnormal_results_glucose,...,ethnicity_HISPANIC OR LATINO,ethnicity_HISPANIC/LATINO - PUERTO RICAN,ethnicity_OTHER,ethnicity_UNABLE TO OBTAIN,ethnicity_UNKNOWN/NOT SPECIFIED,ethnicity_WHITE,insurance_Government,insurance_Medicaid,insurance_Medicare,insurance_Private
0,0,0,1,1,0,0,5.418,126.82,62,41,...,0,0,0,0,0,0,0,0,1,0
1,1,1,1,1,0,0,0.563,115.526,0,11,...,0,0,0,0,1,0,0,0,0,1
2,2,2,1,1,0,0,1.7,149.5,2,2,...,0,0,0,0,1,0,0,0,1,0
3,3,3,1,1,0,0,0.44,204.8,2,13,...,0,0,0,0,0,1,0,0,1,0
4,4,4,1,1,0,0,5.8,194.8,4,3,...,0,0,0,0,0,1,0,0,1,0


In [64]:
x_train, x_test, y_train, y_test = train_test_split(x, y,test_size=0.3, random_state=13)

## Modelling

In [65]:
model_params={
    'svm':{
        'model': svm.SVC(gamma='auto',tol=1e-1, cache_size=2000,max_iter=150),
        'params': {
            'C': [1,3,10],
            'kernel': ['rbf','linear','poly']
        }
    },
    'random_forest':{
        'model': RandomForestClassifier(random_state=15),
        'params': {
            'n_estimators':[1,3,8,13],
            'min_samples_split': [3,5,10,20]
        }
    },
    'logistic_regession':{
        'model': LogisticRegression(random_state=15, solver='liblinear'),
        'params':{
            'C':[1,5,10],
            'penalty':['l1','l2','elasticnet']
        }
    }
}

In [76]:
results=[]
models = {}

for model_name, mod_par in model_params.items():
    print(model_name)
    clasifier=GridSearchCV(mod_par['model'],mod_par['params'],cv=5,return_train_score=False)
    clasifier.fit(x_train, y_train)
    results.append({
        'model': model_name,
        'best_score': clasifier.best_score_,
        'best_params': clasifier.best_params_
    })
    
    info = model_name + str(clasifier.best_params_)
    models[info] = clasifier

svm


  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)


  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


random_forest


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_

logistic_regession


Traceback (most recent call last):
  File "/Users/mslapek002/opt/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/mslapek002/opt/anaconda3/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py", line 1304, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/Users/mslapek002/opt/anaconda3/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py", line 449, in _check_solver
    raise ValueError("Only 'saga' solver supports elasticnet penalty,"
ValueError: Only 'saga' solver supports elasticnet penalty, got solver=liblinear.

  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
Traceback (most recent call last):
  File "/Users/mslapek002/opt/anaconda3/lib/python3.8/site-pa

In [77]:
answer=pd.DataFrame(results,columns=['model','best_score','best_params'])
answer

Unnamed: 0,model,best_score,best_params
0,svm,0.833333,"{'C': 3, 'kernel': 'rbf'}"
1,random_forest,0.811111,"{'min_samples_split': 3, 'n_estimators': 8}"
2,logistic_regession,0.744444,"{'C': 1, 'penalty': 'l2'}"


In [78]:
results

[{'model': 'svm',
  'best_score': 0.8333333333333334,
  'best_params': {'C': 3, 'kernel': 'rbf'}},
 {'model': 'random_forest',
  'best_score': 0.8111111111111111,
  'best_params': {'min_samples_split': 3, 'n_estimators': 8}},
 {'model': 'logistic_regession',
  'best_score': 0.7444444444444444,
  'best_params': {'C': 1, 'penalty': 'l2'}}]

## Predict

In [79]:
models

{"svm{'C': 3, 'kernel': 'rbf'}": GridSearchCV(cv=5,
              estimator=SVC(cache_size=2000, gamma='auto', max_iter=150,
                            tol=0.1),
              param_grid={'C': [1, 3, 10], 'kernel': ['rbf', 'linear', 'poly']}),
 "random_forest{'min_samples_split': 3, 'n_estimators': 8}": GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=15),
              param_grid={'min_samples_split': [3, 5, 10, 20],
                          'n_estimators': [1, 3, 8, 13]}),
 "logistic_regession{'C': 1, 'penalty': 'l2'}": GridSearchCV(cv=5,
              estimator=LogisticRegression(random_state=15, solver='liblinear'),
              param_grid={'C': [1, 5, 10],
                          'penalty': ['l1', 'l2', 'elasticnet']})}

In [81]:
y_hat = models['svm{\'C\': 3, \'kernel\': \'rbf\'}'].predict(x_test)

In [82]:
y_hat

array([1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1])

## Metrics

In [84]:
matrix = confusion_matrix(y_test, y_hat)
print(matrix)

[[22  0]
 [12  5]]


In [87]:
report = classification_report(y_test, y_hat)
print(report)

              precision    recall  f1-score   support

           0       0.65      1.00      0.79        22
           1       1.00      0.29      0.45        17

    accuracy                           0.69        39
   macro avg       0.82      0.65      0.62        39
weighted avg       0.80      0.69      0.64        39

