In [None]:
import pandas as pd

In [None]:
import matplotlib.pyplot as plt

In [None]:
import numpy as np

In [None]:
# Load the healthcare records from the CSV file (relative path).
df = pd.read_csv(filepath_or_buffer="dataset/healthcare_dataset.csv")

In [None]:
# Parse the raw 'Discharge Date' column and collect the distinct years it contains.
discharge_dates = pd.to_datetime(df['Discharge Date'])
unique_years = discharge_dates.dt.year.unique()

In [None]:
# Preview only the first rows: rendering the whole frame puts patient-level
# records (including names) into the saved notebook output.
df.head()

In [None]:
"""Rename the column names"""
# Raw CSV header -> snake_case column name used by the later cells.
column_mapping = {
    'Name': 'patient_name',
    'Age': 'patient_age',
    'Gender': 'patient_gender',
    'Blood Type': 'blood_type',
    'Medical Condition': 'medical_condition',
    'Date of Admission': 'admission_date',
    'Doctor': 'doctor_name',
    'Hospital': 'hospital_name',
    'Insurance Provider': 'insurance_provider',
    'Billing Amount': 'billing_amount',
    'Room Number': 'room_number',
    'Admission Type': 'admission_type',
    'Discharge Date': 'discharge_date',
    'Medication': 'medication',
    'Test Results': 'test_results',
}
# rename() returns a new frame; headers missing from the mapping are left unchanged.
df = df.rename(column_mapping, axis="columns")

In [None]:
# Check the renamed columns on the first few rows instead of rendering the
# whole frame (keeps patient-level records out of the saved output).
df.head()

In [None]:
def _title_case_name(raw_name):
    """Return ``raw_name`` with every whitespace-separated word capitalised.

    The value is converted with ``str()``, lower-cased, split on runs of
    whitespace, and re-joined with single spaces.
    """
    return ' '.join(word.capitalize() for word in str(raw_name).lower().split())


# Fix patient names.
# na_action='ignore' leaves missing names as NaN; sending them through str()
# would turn them into the literal text "Nan".
df['patient_name'] = df['patient_name'].map(_title_case_name, na_action='ignore')

In [None]:
# Show the dtype pandas currently holds for each column.
df.dtypes

In [None]:
# Remap genders to single-letter codes.
# Values are trimmed and lower-cased before the lookup; anything that is not a
# key of gender_map becomes NaN.
gender_map = {'male': 'M', 'female': 'F', 'm': 'M', 'f': 'F'}
normalised_gender = df['patient_gender'].str.strip().str.lower()
df['patient_gender'] = normalised_gender.map(gender_map)

# Display the distinct codes that remain after the remap.
df["patient_gender"].unique()

In [None]:
# Convert admission_date to a datetime column, then show the dtypes to confirm.
admission_parsed = pd.to_datetime(df['admission_date'])
df['admission_date'] = admission_parsed
df.dtypes

In [None]:
# Convert discharge_date to a datetime column.
discharge_parsed = pd.to_datetime(df['discharge_date'])
df['discharge_date'] = discharge_parsed

In [None]:
"""
Remove symbols from hospital names.
"""
# Double quotes and commas are each replaced by a space before the name is split.
_SYMBOLS_TO_SPACES = str.maketrans({'"': ' ', ',': ' '})


def _clean_hospital_name(name):
    """Return ``name`` without quotes/commas, with each word capitalised.

    The value is converted with ``str()``, split on runs of whitespace, and
    the words are re-joined with single spaces.
    """
    words = str(name).translate(_SYMBOLS_TO_SPACES).split()
    return ' '.join(word.capitalize() for word in words)


df['hospital_name'] = df['hospital_name'].apply(_clean_hospital_name)

In [None]:
# Drop room_number; none of the later visible cells reads it.
# Reassigning with errors="ignore" (instead of inplace=True) keeps the cell
# re-runnable: a second execution no longer raises KeyError.
df = df.drop(columns="room_number", errors="ignore")

In [None]:
"""Add length_of_stay column. -- in days."""
# Whole days between admission and discharge (both columns are datetimes by now).
stay_duration = df['discharge_date'].sub(df['admission_date'])
df['length_of_stay'] = stay_duration.dt.days

In [None]:
# Check the cleaned frame on the first few rows instead of rendering the
# whole frame (keeps patient-level records out of the saved output).
df.head()

In [None]:
# Mean length of stay for each patient age.
# A line plot of the raw rows connects every record in file order, so repeated
# and unsorted ages produce an unreadable zig-zag. Grouping by age gives one
# point per age, and groupby returns the ages in ascending order.
mean_stay_by_age = df.groupby('patient_age')['length_of_stay'].mean()

fig, ax = plt.subplots()
mean_stay_by_age.plot(ax=ax, kind='line')
ax.set_title('Second_EDA')
ax.set_xlabel('patient_age')
ax.set_ylabel('mean length_of_stay (days)')
ax.grid(True)
plt.show()

In [None]:
# Columns carried into the exploratory plots below.
feature_columns = [
    "medical_condition",
    "patient_age",
    "admission_type",
    "billing_amount",
    "medication",
    "length_of_stay",
    "blood_type",
]
features = df[feature_columns]

# Print the summary statistics of the selected columns.
print(features.describe())

In [None]:
# One histogram per int64/float64 feature column.
numeric_features = features.select_dtypes(include=['int64', 'float64'])
numeric_features.hist(bins=30, figsize=(12, 8))

# hist() leaves its figure as the current one, so title and layout go on it.
hist_figure = plt.gcf()
hist_figure.suptitle('Distribution', y=1.02)
hist_figure.tight_layout()
plt.show()


In [None]:
# Scatter of patient age against length of stay; the low alpha lets
# overlapping points show up as darker areas.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(
    features["patient_age"],
    features["length_of_stay"],
    s=20,
    alpha=0.2,
)
ax.set_xlabel("Patient Age")
ax.set_ylabel("Length of Stay")
ax.set_title("Patient Age vs Length of Stay")
ax.grid(True, alpha=0.3)
plt.show()

In [None]:
# Length-of-stay histograms split by billing quartile.
# Grouping directly on billing_amount (a continuous amount) asks for one
# subplot per distinct value, so the amounts are first cut into four
# equal-sized bins. The result is cast to object so the grouping key is a
# plain label column rather than a categorical.
billing_quartile = pd.qcut(
    features["billing_amount"],
    q=4,
    labels=["Q1 (lowest)", "Q2", "Q3", "Q4 (highest)"],
).astype(object)

features.hist(column="length_of_stay", by=billing_quartile, figsize=(15, 10))
plt.suptitle("Length of stay by billing-amount quartile")
plt.show()
