In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from scipy import stats
from scipy.stats import zscore
from sklearn.preprocessing import MinMaxScaler
# Load the raw table from a CSV expected in the current working directory.
# NOTE(review): two columns already report NaN right after loading (see the null counts
# below) — presumably pandas' default NA parsing of a literal such as "None"; confirm.
dataset = pd.read_csv('diabetic_data.csv')
# Preview the first five rows to sanity-check the parsed columns.
dataset.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),?,6,25,1,1,...,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),?,1,1,7,3,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),?,1,1,7,2,...,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),?,1,1,7,2,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),?,1,1,7,1,...,No,Steady,No,No,No,No,No,Ch,Yes,NO


In [2]:
# (rows, columns) of the freshly loaded frame.
dataset.shape

(101766, 50)

In [3]:
# Count NaN cells per column. "?" placeholders are still ordinary strings at this point,
# so they are not counted as missing until they are replaced with NaN further down.
dataset.isnull().sum()

encounter_id                    0
patient_nbr                     0
race                            0
gender                          0
age                             0
weight                          0
admission_type_id               0
discharge_disposition_id        0
admission_source_id             0
time_in_hospital                0
payer_code                      0
medical_specialty               0
num_lab_procedures              0
num_procedures                  0
num_medications                 0
number_outpatient               0
number_emergency                0
number_inpatient                0
diag_1                          0
diag_2                          0
diag_3                          0
number_diagnoses                0
max_glu_serum               96420
A1Cresult                   84748
metformin                       0
repaglinide                     0
nateglinide                     0
chlorpropamide                  0
glimepiride                     0
acetohexamide 

In [4]:
# Turn every "?" placeholder into NaN so pandas recognises those cells as missing.
dataset = dataset.replace("?", np.nan)

In [5]:
# Binarise the target: the '<30' label is the positive class, while '>30' and 'NO'
# are merged into 0. Labels outside this table would become NaN.
readmitted_to_flag = {'<30': 1, '>30': 0, 'NO': 0}
dataset['readmitted'] = dataset['readmitted'].map(readmitted_to_flag)

In [6]:
# Class balance of the binarised target (rows per 0/1 value).
counts = dataset['readmitted'].value_counts()
print(counts)

readmitted
0    90409
1    11357
Name: count, dtype: int64


In [7]:
# Re-count missing values per column now that "?" placeholders have been converted to NaN.
dataset.isnull().sum()


encounter_id                    0
patient_nbr                     0
race                         2273
gender                          0
age                             0
weight                      98569
admission_type_id               0
discharge_disposition_id        0
admission_source_id             0
time_in_hospital                0
payer_code                  40256
medical_specialty           49949
num_lab_procedures              0
num_procedures                  0
num_medications                 0
number_outpatient               0
number_emergency                0
number_inpatient                0
diag_1                         21
diag_2                        358
diag_3                       1423
number_diagnoses                0
max_glu_serum               96420
A1Cresult                   84748
metformin                       0
repaglinide                     0
nateglinide                     0
chlorpropamide                  0
glimepiride                     0
acetohexamide 

In [8]:
# Give missing entries in these two columns an explicit 'no data' category so the rows
# are not lost when rows containing NaN are dropped later on.
for column_to_fill in ('A1Cresult', 'max_glu_serum'):
    dataset[column_to_fill] = dataset[column_to_fill].fillna('no data')

In [9]:
# Confirm that A1Cresult and max_glu_serum no longer report missing values after the fill.
dataset.isnull().sum()

encounter_id                    0
patient_nbr                     0
race                         2273
gender                          0
age                             0
weight                      98569
admission_type_id               0
discharge_disposition_id        0
admission_source_id             0
time_in_hospital                0
payer_code                  40256
medical_specialty           49949
num_lab_procedures              0
num_procedures                  0
num_medications                 0
number_outpatient               0
number_emergency                0
number_inpatient                0
diag_1                         21
diag_2                        358
diag_3                       1423
number_diagnoses                0
max_glu_serum                   0
A1Cresult                       0
metformin                       0
repaglinide                     0
nateglinide                     0
chlorpropamide                  0
glimepiride                     0
acetohexamide 

In [10]:
# Drop unnecessary columns.
# weight, payer_code and medical_specialty are missing in a large share of rows (see the
# null counts above). NOTE(review): the two identifier columns and citoglipton/examide are
# presumably uninformative for modelling — confirm.
columns_to_drop = [
    'encounter_id',
    'patient_nbr',
    'weight',
    'citoglipton',
    'examide',
    'payer_code',
    'medical_specialty',
]

dataset = dataset.drop(columns=columns_to_drop)

In [11]:
# Shape check after removing the columns listed in columns_to_drop.
dataset.shape

(101766, 43)

In [12]:
# Drop every row that still contains a missing value in ANY remaining column.
# No column is exempted here: columns whose gaps should be tolerated must already have
# been filled or dropped before this cell runs.
dataset = dataset.dropna(axis=0, how='any')

In [13]:
# Shape check after dropping rows with missing values.
dataset.shape

(98053, 43)

In [14]:
# Map each ten-wide age bracket label to its midpoint so that age becomes numeric:
# '[0-10)' -> 5, '[10-20)' -> 15, ..., '[90-100)' -> 95.
replaceDict = {f'[{start}-{start + 10})': start + 5 for start in range(0, 100, 10)}

# Plain dict lookup per value: a label missing from replaceDict raises KeyError
# instead of silently turning into NaN.
dataset['age'] = dataset['age'].apply(replaceDict.__getitem__)
print(dataset['age'].head())

1    15
2    25
3    35
4    45
5    55
Name: age, dtype: int64


In [15]:
# Encode the two binary flag columns as integers via explicit lookup tables.
# Chained Series.replace() calls only produce an integer column through pandas' implicit
# downcasting of object results, which is deprecated (FutureWarning in pandas >= 2.2);
# without it the columns stay object-typed and are missed by numeric dtype selection.
# .map() builds the numeric values directly, and astype('int64') fails loudly if a label
# outside the table (or an already-encoded value on re-run) shows up as NaN.
dataset['change'] = dataset['change'].map({'Ch': 1, 'No': 0}).astype('int64')
dataset['diabetesMed'] = dataset['diabetesMed'].map({'Yes': 1, 'No': 0}).astype('int64')

In [16]:
# Preview the frame after the age, change and diabetesMed columns were converted to numbers.
dataset.head()


Unnamed: 0,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,...,tolazamide,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
1,Caucasian,Female,15,1,1,7,3,59,0,18,...,No,Up,No,No,No,No,No,1,1,0
2,AfricanAmerican,Female,25,1,1,7,2,11,5,13,...,No,No,No,No,No,No,No,0,1,0
3,Caucasian,Male,35,1,1,7,2,44,1,16,...,No,Up,No,No,No,No,No,1,1,0
4,Caucasian,Male,45,1,1,7,1,51,0,8,...,No,Steady,No,No,No,No,No,1,1,0
5,Caucasian,Male,55,2,1,2,3,31,6,16,...,No,Steady,No,No,No,No,No,0,1,0


In [17]:
# Split the frame by dtype: integer/float columns versus object (string) columns.
numerical_dataset = dataset.select_dtypes(include=['int', 'float'])
categorical_dataset = dataset.select_dtypes(include=['object'])

# Column labels of each subset.
numerical_features = numerical_dataset.columns
categorical_features = categorical_dataset.columns

In [18]:
# Outlier screen: for every numeric column, report the percentage of rows whose value
# lies more than Z_SCORE_THRESHOLD standard deviations away from the column mean.
Z_SCORE_THRESHOLD = 3

# Choose the numerical columns you want to check for outliers
numerical_columns = dataset.select_dtypes(include=np.number).columns.tolist()

# Column-wise absolute z-scores; mean() and std() are aligned to the matching columns.
numeric_values = dataset[numerical_columns]
abs_z_scores = ((numeric_values - numeric_values.mean()) / numeric_values.std()).abs()

# Number of flagged rows per column as a percentage of all rows, keyed by column name.
outliers_percentage = (
    (abs_z_scores > Z_SCORE_THRESHOLD).sum() / len(dataset) * 100
).to_dict()

# Print percentage of outliers for each column
print("Percentage of outliers in each column using Z-score method:")
for column, percentage in outliers_percentage.items():
    print(f"{column}: {percentage:.2f}%")

Percentage of outliers in each column using Z-score method:
age: 0.54%
admission_type_id: 0.34%
discharge_disposition_id: 3.57%
admission_source_id: 0.18%
time_in_hospital: 1.04%
num_lab_procedures: 0.04%
num_procedures: 0.00%
num_medications: 1.34%
number_outpatient: 1.46%
number_emergency: 0.95%
number_inpatient: 2.03%
number_diagnoses: 0.06%
change: 0.00%
diabetesMed: 0.00%
readmitted: 0.00%


In [19]:
# Summary statistics for number_diagnoses, one of the columns filtered by z-score below.
# (The variable is named after the column it actually describes.)
number_diagnoses_description = dataset['number_diagnoses'].describe()
print(number_diagnoses_description)

count    98053.000000
mean         7.512060
std          1.832497
min          3.000000
25%          6.000000
50%          8.000000
75%          9.000000
max         16.000000
Name: number_diagnoses, dtype: float64


In [20]:
# Columns whose extreme values are removed with the z-score rule
columns_with_zscore = ['num_lab_procedures','number_diagnoses']

# The columns are filtered one after another, so the mean and standard deviation of a
# column are computed on the rows that survived the filters of the columns before it.
for column in columns_with_zscore:
    values = dataset[column]
    deviation_in_sigmas = ((values - values.mean()) / values.std()).abs()

    # Keep every row that is not strictly more than three standard deviations out.
    dataset = dataset.loc[~deviation_in_sigmas.gt(3)]

In [21]:
# Columns whose extreme values are removed with the 1.5 * IQR fence rule
columns_with_iqr = ['num_medications','age','time_in_hospital']

# Filtering is sequential: the quartiles of each column are taken from the rows that
# are still present after the previous column has been filtered.
for column in columns_with_iqr:
    first_quartile, third_quartile = dataset[column].quantile([0.25, 0.75])

    # Fences sit 1.5 interquartile ranges outside the quartiles.
    whisker = 1.5 * (third_quartile - first_quartile)

    # between() includes both end points, so values exactly on a fence are kept.
    within_fences = dataset[column].between(first_quartile - whisker, third_quartile + whisker)
    dataset = dataset[within_fences]

In [22]:
# Final shape after the z-score and IQR outlier filters.
dataset.shape

(92377, 43)