In [217]:
#Data Cleaning
import pandas as pd
import numpy as np
data = pd.read_csv("diabetic_data.csv")

In [218]:
data.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),?,6,25,1,1,...,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),?,1,1,7,3,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),?,1,1,7,2,...,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),?,1,1,7,2,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),?,1,1,7,1,...,No,Steady,No,No,No,No,No,Ch,Yes,NO


In [219]:
data.columns

Index(['encounter_id', 'patient_nbr', 'race', 'gender', 'age', 'weight',
       'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
       'time_in_hospital', 'payer_code', 'medical_specialty',
       'num_lab_procedures', 'num_procedures', 'num_medications',
       'number_outpatient', 'number_emergency', 'number_inpatient', 'diag_1',
       'diag_2', 'diag_3', 'number_diagnoses', 'max_glu_serum', 'A1Cresult',
       'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
       'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
       'tolazamide', 'examide', 'citoglipton', 'insulin',
       'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted'],
      dtype='object')

In [220]:
data.replace('?', pd.NA, inplace=True)

In [221]:
missing_values = data.isnull().sum()
print("Missing values per column:")
print(missing_values)

Missing values per column:
encounter_id                    0
patient_nbr                     0
race                         2273
gender                          0
age                             0
weight                      98569
admission_type_id               0
discharge_disposition_id        0
admission_source_id             0
time_in_hospital                0
payer_code                  40256
medical_specialty           49949
num_lab_procedures              0
num_procedures                  0
num_medications                 0
number_outpatient               0
number_emergency                0
number_inpatient                0
diag_1                         21
diag_2                        358
diag_3                       1423
number_diagnoses                0
max_glu_serum               96420
A1Cresult                   84748
metformin                       0
repaglinide                     0
nateglinide                     0
chlorpropamide                  0
glimepiride          

In [222]:
data.dtypes

encounter_id                 int64
patient_nbr                  int64
race                        object
gender                      object
age                         object
weight                      object
admission_type_id            int64
discharge_disposition_id     int64
admission_source_id          int64
time_in_hospital             int64
payer_code                  object
medical_specialty           object
num_lab_procedures           int64
num_procedures               int64
num_medications              int64
number_outpatient            int64
number_emergency             int64
number_inpatient             int64
diag_1                      object
diag_2                      object
diag_3                      object
number_diagnoses             int64
max_glu_serum               object
A1Cresult                   object
metformin                   object
repaglinide                 object
nateglinide                 object
chlorpropamide              object
glimepiride         

In [223]:
#Dropping dead patients data
data = data.loc[~data.discharge_disposition_id.isin([11,13,14,19,20,21])]

In [224]:
data['Output_Label'] = (data.readmitted == '<30').astype('int64')

In [225]:
data.dtypes

encounter_id                 int64
patient_nbr                  int64
race                        object
gender                      object
age                         object
weight                      object
admission_type_id            int64
discharge_disposition_id     int64
admission_source_id          int64
time_in_hospital             int64
payer_code                  object
medical_specialty           object
num_lab_procedures           int64
num_procedures               int64
num_medications              int64
number_outpatient            int64
number_emergency             int64
number_inpatient             int64
diag_1                      object
diag_2                      object
diag_3                      object
number_diagnoses             int64
max_glu_serum               object
A1Cresult                   object
metformin                   object
repaglinide                 object
nateglinide                 object
chlorpropamide              object
glimepiride         

In [226]:
def calc_prevalence(y_actual):
    return(sum(y_actual/len(y_actual)))

print('Prevalence: %.3f' %calc_prevalence(data['Output_Label'].values))

Prevalence: 0.114


In [227]:
# 11% population is rehospitalized, we saw that using prevalance 

In [228]:
numerical_feature = ['time_in_hospital','num_lab_procedures','num_procedures','num_medications','number_outpatient','number_emergency','number_inpatient','number_diagnoses']

In [229]:
data[numerical_feature].isnull().sum()

time_in_hospital      0
num_lab_procedures    0
num_procedures        0
num_medications       0
number_outpatient     0
number_emergency      0
number_inpatient      0
number_diagnoses      0
dtype: int64

In [230]:
categorical_columns = []
for column in data.columns:
    # Check if the data type is object
    if data[column].dtype == 'object':
        # If it is, append the column name to the list of categorical columns
        categorical_columns.append(column)

# Print the list of categorical column names
print("Categorical columns:")
for column in categorical_columns:
    print(column)

Categorical columns:
race
gender
age
weight
payer_code
medical_specialty
diag_1
diag_2
diag_3
max_glu_serum
A1Cresult
metformin
repaglinide
nateglinide
chlorpropamide
glimepiride
acetohexamide
glipizide
glyburide
tolbutamide
pioglitazone
rosiglitazone
acarbose
miglitol
troglitazone
tolazamide
examide
citoglipton
insulin
glyburide-metformin
glipizide-metformin
glimepiride-pioglitazone
metformin-rosiglitazone
metformin-pioglitazone
change
diabetesMed
readmitted


In [231]:
categorical_features = ['race', 'gender',  'max_glu_serum', 'AlCresult',
                                        'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
                                        'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
                                        'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
                                        'tolazamide', 'insulin',
                                        'glyburide-metformin', 'glipizide-metformin',
                                        'glimepiride-pioglitazone', 'metformin-rosiglitazone',
                                        'metformin-pioglitazone', 'change', 'diabetesMed', 'payer_code']



In [232]:
data['race'] = data['race'].fillna('UNK')  
data['payer_code'] = data['payer_code'].fillna('UNK')  # Fills missing values in 'payer_code' column with 'UNK'
data['medical_specialty'] = data['medical_specialty'].fillna('UNK') # Fills missing values in 'medical_specialty' column with 'UNK'


In [233]:
print('Numbermedical_specialty:',data.medical_specialty.nunique())
data.groupby('medical_specialty').size().sort_values(ascending=False)

Numbermedical_specialty: 73


medical_specialty
UNK                                 48616
InternalMedicine                    14237
Emergency/Trauma                     7419
Family/GeneralPractice               7252
Cardiology                           5279
                                    ...  
Surgery-PlasticwithinHeadandNeck        1
Dermatology                             1
Proctology                              1
Psychiatry-Addictive                    1
Speech                                  1
Length: 73, dtype: int64

In [234]:
top_10 = ['UNK','InternalMedicine','Emergency/Trauma','Family/GeneralPractice ','Cardiology ','Surgery-General','Nephrology','Orthopdics','Orthopedics-Reconstructive','Radiologist']
data['med_spec'] = data['medical_specialty'].copy()
data.loc[~data.med_spec.isin(top_10),'med_spec']= 'Other'

In [235]:
data.groupby('med_spec').size()

med_spec
Emergency/Trauma               7419
InternalMedicine              14237
Nephrology                     1539
Orthopedics-Reconstructive     1230
Other                         22122
Radiologist                    1121
Surgery-General                3059
UNK                           48616
dtype: int64

In [238]:
data.shape

(99343, 52)

In [239]:
null_counts = data.isnull().sum()

# Print the number of null values in each column
print("Number of null values in each column:")
print(null_counts)

Number of null values in each column:
encounter_id                    0
patient_nbr                     0
race                            0
gender                          0
age                             0
weight                      96218
admission_type_id               0
discharge_disposition_id        0
admission_source_id             0
time_in_hospital                0
payer_code                      0
medical_specialty               0
num_lab_procedures              0
num_procedures                  0
num_medications                 0
number_outpatient               0
number_emergency                0
number_inpatient                0
diag_1                         20
diag_2                        356
diag_3                       1419
number_diagnoses                0
max_glu_serum               94191
A1Cresult                   82509
metformin                       0
repaglinide                     0
nateglinide                     0
chlorpropamide                  0
glimepirid