In [1]:
from ucimlrepo import fetch_ucirepo 
  
# Download the "Diabetes 130-US hospitals for years 1999-2008" dataset
# (UCI ML Repository id 296).
diabetes_130_us_hospitals_for_years_1999_2008 = fetch_ucirepo(id=296)

# Pull the feature and target tables (pandas dataframes) out of the bundle.
uci_tables = diabetes_130_us_hospitals_for_years_1999_2008.data
X, y = uci_tables.features, uci_tables.targets

# Show the dataset-level metadata first, then the per-variable information.
for section in ("metadata", "variables"):
    print(getattr(diabetes_130_us_hospitals_for_years_1999_2008, section))

{'uci_id': 296, 'name': 'Diabetes 130-US hospitals for years 1999-2008', 'repository_url': 'https://archive.ics.uci.edu/dataset/296/diabetes+130-us+hospitals+for+years+1999-2008', 'data_url': 'https://archive.ics.uci.edu/static/public/296/data.csv', 'abstract': 'The dataset represents ten years (1999-2008) of clinical care at 130 US hospitals and integrated delivery networks. Each row concerns hospital records of patients diagnosed with diabetes, who underwent laboratory, medications, and stayed up to 14 days. The goal is to determine the early readmission of the patient within 30 days of discharge.\nThe problem is important for the following reasons. Despite high-quality evidence showing improved clinical outcomes for diabetic patients who receive various preventive and therapeutic interventions, many patients do not receive them. This can be partially attributed to arbitrary diabetes management in hospital environments, which fail to attend to glycemic control. Failure to provide pro

  df = pd.read_csv(data_url)


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns



In [3]:
# Work on a private copy so the fetched feature table X is left untouched.
data = X.copy()  # DataFrame.copy() is deep by default
# Keep the raw (not yet one-hot encoded) 'diabetesMed' column as y.
y = data['diabetesMed']

### One-Hot Encoding and Removing Columns

In [4]:
# Categorical columns to one-hot encode. The list order is the order in which
# the groups of dummy columns are appended after the untouched columns, so it
# must stay as-is to keep the resulting column layout stable.
categorical_columns = [
    # --- demographics ---
    'race',
    'age',            # sections: 0-10, 10-20, ...
    'gender',         # 'Female', 'Male', 'Unknown/Invalid'
    # --- lab results / measurements ---
    'metformin',      # 'No', 'Steady', 'Up' or 'Down' for usage
    'A1Cresult',      # measures blood sugar: none if not conducted, >8%, >7% and norm
    'max_glu_serum',  # 'None', '>300', 'Norm', '>200'
    'weight',         # nan, '[75-100)', '[50-75)', '[0-25)', ..., '>200'
    # --- medications, each 'No', 'Up', 'Steady', 'Down' ---
    'repaglinide',
    'nateglinide',
    'chlorpropamide',
    'glimepiride',
    'acetohexamide',
    'glipizide',
    'glyburide',
    'tolbutamide',
    'pioglitazone',
    'rosiglitazone',
    'acarbose',
    'miglitol',
    'troglitazone',
    'tolazamide',
    'examide',
    'citoglipton',
    'insulin',
    'glyburide-metformin',
    'glipizide-metformin',
    'glimepiride-pioglitazone',
    'metformin-rosiglitazone',
    'metformin-pioglitazone',
    # --- treatment flags ---
    'change',         # change or no change
    'diabetesMed',    # yes or no; yields diabetesMed_No / diabetesMed_Yes
]
# 'readmitted' is deliberately not encoded here.

# One call encodes every listed column. With no explicit prefix, get_dummies
# uses each column's own name as the prefix (e.g. race_*, age_*), which is
# what the per-column calls spelled out by hand.
# NOTE(review): missing values get no dummy column (dummy_na is left at its
# default), so such rows are all-zero within that group - confirm this is intended.
data = pd.get_dummies(data, columns=categorical_columns)

In [5]:
# Columns removed from the frame before balancing and export.
columns_to_drop = [
    'diag_1', 'diag_2', 'diag_3',
    'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
    'payer_code', 'medical_specialty',
]
data = data.drop(columns=columns_to_drop)

### Balancing Data

In [6]:
# Fixed seed so the down-sampling and the shuffle pick the same rows on every
# Restart & Run All (otherwise the exported CSV differs from run to run).
BALANCE_SEED = 42

# Split the rows by the one-hot encoded target column.
positive = data[data['diabetesMed_Yes'] == 1]
negative = data[data['diabetesMed_Yes'] == 0]

# Number of positive rows that gives a 60% / 40% positive / negative mix while
# keeping every negative row. Capped at the number of positives available,
# because sampling without replacement raises ValueError when asked for more
# rows than exist (in that case the mix ends up below 60% positive).
target_positive = min(int(len(negative) / 0.4 * 0.6), len(positive))

# Down-sample the positive class without replacement
positive_sample = positive.sample(
    target_positive, replace=False, random_state=BALANCE_SEED
)

# Concatenate the sampled positive class with the full negative class
balanced_data = pd.concat([positive_sample, negative])

# Shuffle the rows and renumber the index from 0
balanced_data = (
    balanced_data
    .sample(frac=1, random_state=BALANCE_SEED)
    .reset_index(drop=True)
)


print("\nBalanced Data Class Distribution:")
print(balanced_data['diabetesMed_Yes'].value_counts())


Balanced Data Class Distribution:
1    35104
0    23403
Name: diabetesMed_Yes, dtype: int64


In [7]:
# Class shares computed as the mean of a boolean mask. This avoids indexing
# value_counts() with the integers 1 / 0: that is a label lookup only when the
# dummy column is integer-typed. With boolean dummies (the get_dummies default
# in pandas >= 2.0) an integer key is not a label of the True/False index, so
# the lookup is positional (swapping the two numbers) or fails outright.
target_column = balanced_data['diabetesMed_Yes']
percentage_positive = (target_column == 1).mean() * 100
percentage_negative = (target_column == 0).mean() * 100

print(f"\nPercentage of Patients With Diabetes Medication : {percentage_positive:.2f}%")
print(f"Percentage of Patients Without Diabetes Medication : {percentage_negative:.2f}%")


Percentage of Patients With Diabetes Medication : 60.00%
Percentage of Patients Without Diabetes Medication : 40.00%


In [8]:
# Persist the balanced, shuffled frame to disk. to_csv keeps its default
# settings here, so the row index is written as the first column.
output_csv = 'Preprocessed_data3.csv'
balanced_data.to_csv(output_csv)