In [31]:
#Install packages for the project
#!pip install pandas numpy seaborn matplotlib scikit-learn scipy networkx


## Importing required modules

In [32]:
# Import the required building blocks
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
import matplotlib.pyplot as plt


## Read the source file

In [33]:
#reading the file

df = pd.read_csv('hospital_data_analysis.csv')
print(df.head())

   Patient_ID  Age  Gender      Condition                 Procedure   Cost  \
0           1   45  Female  Heart Disease               Angioplasty  15000   
1           2   60    Male       Diabetes           Insulin Therapy   2000   
2           3   32  Female  Fractured Arm          X-Ray and Splint    500   
3           4   75    Male         Stroke    CT Scan and Medication  10000   
4           5   50  Female         Cancer  Surgery and Chemotherapy  25000   

   Length_of_Stay Readmission    Outcome  Satisfaction  
0               5          No  Recovered             4  
1               3         Yes     Stable             3  
2               1          No  Recovered             5  
3               7         Yes     Stable             2  
4              10          No  Recovered             4  


## Data Profiling

In [34]:
# Get an overview of the dataset
print("Dataset Information:")
df.info()

# Display summary statistics
print("\nSummary Statistics:")
print(df.describe())


Dataset Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 984 entries, 0 to 983
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Patient_ID      984 non-null    int64 
 1   Age             984 non-null    int64 
 2   Gender          984 non-null    object
 3   Condition       984 non-null    object
 4   Procedure       984 non-null    object
 5   Cost            984 non-null    int64 
 6   Length_of_Stay  984 non-null    int64 
 7   Readmission     984 non-null    object
 8   Outcome         984 non-null    object
 9   Satisfaction    984 non-null    int64 
dtypes: int64(5), object(5)
memory usage: 77.0+ KB

Summary Statistics:
        Patient_ID         Age          Cost  Length_of_Stay  Satisfaction
count   984.000000  984.000000    984.000000      984.000000    984.000000
mean    500.329268   53.754065   8367.479675       37.663618      3.598577
std     288.979531   14.941135   7761.990976     

In [35]:
# Display first few rows
print("\nFirst Few Rows:")
print(df.head())


First Few Rows:
   Patient_ID  Age  Gender      Condition                 Procedure   Cost  \
0           1   45  Female  Heart Disease               Angioplasty  15000   
1           2   60    Male       Diabetes           Insulin Therapy   2000   
2           3   32  Female  Fractured Arm          X-Ray and Splint    500   
3           4   75    Male         Stroke    CT Scan and Medication  10000   
4           5   50  Female         Cancer  Surgery and Chemotherapy  25000   

   Length_of_Stay Readmission    Outcome  Satisfaction  
0               5          No  Recovered             4  
1               3         Yes     Stable             3  
2               1          No  Recovered             5  
3               7         Yes     Stable             2  
4              10          No  Recovered             4  


In [36]:
# Check unique values in Condition and Procedure
print("\nUnique Conditions:")
print(df["Condition"].unique())

print("\nUnique Procedures:")
print(df["Procedure"].unique())


Unique Conditions:
['Heart Disease' 'Diabetes' 'Fractured Arm' 'Stroke' 'Cancer'
 'Hypertension' 'Appendicitis' 'Fractured Leg' 'Heart Attack'
 'Allergic Reaction' 'Respiratory Infection' 'Prostate Cancer'
 'Childbirth' 'Kidney Stones' 'Osteoarthritis']

Unique Procedures:
['Angioplasty' 'Insulin Therapy' 'X-Ray and Splint'
 'CT Scan and Medication' 'Surgery and Chemotherapy'
 'Medication and Counseling' 'Appendectomy' 'Cast and Physical Therapy'
 'Cardiac Catheterization' 'Epinephrine Injection' 'Antibiotics and Rest'
 'Radiation Therapy' 'Delivery and Postnatal Care' 'Lithotripsy'
 'Physical Therapy and Pain Management']


In [None]:
# Define relevant procedures related to Medical Imaging & Radiotherapy
relevant_procedures = ["X-Ray and Splint", "CT Scan and Medication", "MRI", "Ultrasound", "Radiation Therapy","'Cardiac Catheterization'"]

# Filter dataset based on relevant procedures
filtered_df = df[df["Procedure"].isin(relevant_procedures)]

# Display the filtered dataset
print("\nFiltered Dataset:")
print(filtered_df)


Filtered Dataset:
     Patient_ID  Age  Gender        Condition               Procedure   Cost  \
2             3   32  Female    Fractured Arm        X-Ray and Splint    500   
3             4   75    Male           Stroke  CT Scan and Medication  10000   
11           12   65    Male  Prostate Cancer       Radiation Therapy  20000   
17           18   35  Female    Fractured Arm        X-Ray and Splint    500   
18           19   78    Male           Stroke  CT Scan and Medication  10000   
..          ...  ...     ...              ...                     ...    ...   
961         978   35  Female    Fractured Arm        X-Ray and Splint    500   
962         979   78    Male           Stroke  CT Scan and Medication  10000   
970         987   67    Male  Prostate Cancer       Radiation Therapy  20000   
976         993   30  Female    Fractured Arm        X-Ray and Splint    500   
977         994   70    Male           Stroke  CT Scan and Medication  10000   

     Length_of_Stay 

In [38]:
# Save the filtered dataset for further processing
filtered_df.to_csv("filtered_data.csv", index=False)

## Data Cleaning

In [39]:
# Remove duplicate rows
filtered_df = filtered_df.drop_duplicates()

# Check if duplicates are removed
print(f"Total records after removing duplicates: {filtered_df.shape[0]}")

Total records after removing duplicates: 197


In [40]:
# Since Procedure, Condition, and Outcome are key fields, we need to handle missing values:
# Check missing values in each column
print("\nMissing Values Before Cleaning:")
print(filtered_df.isnull().sum())


Missing Values Before Cleaning:
Patient_ID        0
Age               0
Gender            0
Condition         0
Procedure         0
Cost              0
Length_of_Stay    0
Readmission       0
Outcome           0
Satisfaction      0
dtype: int64


In [None]:
import re

# Standardizing text data to ensure consistency
filtered_df["Procedure"] = filtered_df["Procedure"].str.strip().str.title()
filtered_df["Condition"] = filtered_df["Condition"].str.strip().str.title()
filtered_df["Outcome"] = filtered_df["Outcome"].str.strip().str.title()

# Replace incorrect/misclassified values if needed
corrections = {
    "Ct Scan And Medication": "CT Scan and Medication", 
    "Xray And Splint": "X-Ray and Splint"
}
filtered_df["Procedure"] = filtered_df["Procedure"].replace(corrections)

# Function to split procedures into a list
def split_procedure(procedure):
    return [proc.strip() for proc in re.split(r"\s*and\s*|\s*,\s*", procedure, flags=re.IGNORECASE) if proc.strip()]

# Apply function to convert Procedure column into lists
filtered_df["Procedure"] = filtered_df["Procedure"].apply(split_procedure)

# Display updated DataFrame
filtered_df.head()




Unique Conditions:
['Heart Disease' 'Diabetes' 'Fractured Arm' 'Stroke' 'Cancer'
 'Hypertension' 'Appendicitis' 'Fractured Leg' 'Heart Attack'
 'Allergic Reaction' 'Respiratory Infection' 'Prostate Cancer'
 'Childbirth' 'Kidney Stones' 'Osteoarthritis']


In [21]:
# Save the cleaned dataset
filtered_df.to_csv("cleaned_data.csv", index=False)

print("\nData Cleaning Complete. Cleaned data saved as 'cleaned_data.csv'.")


Data Cleaning Complete. Cleaned data saved as 'cleaned_data.csv'.


## Data Reduction


In [22]:
# Drop cost-related columns
filtered_df = filtered_df.drop(columns=["Cost"], errors="ignore")


In [23]:
# Define relevant radiology and radiotherapy procedures
radiology_procedures = ["X-Ray", "CT Scan", "MRI", "Ultrasound"]
radiotherapy_procedures = ["Radiation Therapy"]

# Function to check if any procedure in the list is related to radiology/radiotherapy
def is_relevant_procedure(procedure_list):
    return any(proc in radiology_procedures or proc in radiotherapy_procedures for proc in procedure_list)

# Filter dataset to keep only relevant procedures
filtered_df = filtered_df[filtered_df["Procedure"].apply(is_relevant_procedure)]

In [None]:
# Display reduced dataset
print("Dataset after data reduction:")
print(filtered_df.head())

# Save the reduced dataset
filtered_df.to_csv("reduced_data.csv", index=False)


Dataset after data reduction:
    Patient_ID  Age  Gender        Condition              Procedure   Cost  \
2            3   32  Female    Fractured Arm        [X-Ray, Splint]    500   
3            4   75    Male           Stroke  [CT Scan, Medication]  10000   
11          12   65    Male  Prostate Cancer    [Radiation Therapy]  20000   
17          18   35  Female    Fractured Arm        [X-Ray, Splint]    500   
18          19   78    Male           Stroke  [CT Scan, Medication]  10000   

    Length_of_Stay Readmission    Outcome  Satisfaction  
2                1          No  Recovered             5  
3                7         Yes     Stable             2  
11               9          No  Recovered             3  
17               2         Yes  Recovered             5  
18               8          No     Stable             2  

Unique Conditions:
['Fractured Arm' 'Stroke' 'Prostate Cancer']


## Data Transformation

In [25]:
# Convert 'Condition' and 'Outcome' into categorical values
filtered_df["Condition"] = filtered_df["Condition"].astype("category")
filtered_df["Outcome"] = filtered_df["Outcome"].astype("category")

In [26]:
for proc in radiology_procedures:
    filtered_df[f"Procedure_{proc}"] = filtered_df["Procedure"].apply(lambda x: 1 if proc in x else 0)


In [None]:
# Normalize numerical values (Age, Length of Stay)
# scaler = MinMaxScaler()
# filtered_df[["Age", "Length_of_Stay"]] = scaler.fit_transform(filtered_df[["Age", "Length_of_Stay"]])


In [27]:
# Display transformed dataset
print("Dataset after transformation:")
print(filtered_df.head())

Dataset after transformation:
    Patient_ID  Age  Gender        Condition              Procedure  \
2            3   32  Female    Fractured Arm        [X-Ray, Splint]   
3            4   75    Male           Stroke  [CT Scan, Medication]   
11          12   65    Male  Prostate Cancer    [Radiation Therapy]   
17          18   35  Female    Fractured Arm        [X-Ray, Splint]   
18          19   78    Male           Stroke  [CT Scan, Medication]   

    Length_of_Stay Readmission    Outcome  Satisfaction  Procedure_X-Ray  \
2                1          No  Recovered             5                1   
3                7         Yes     Stable             2                0   
11               9          No  Recovered             3                0   
17               2         Yes  Recovered             5                1   
18               8          No     Stable             2                0   

    Procedure_CT Scan  Procedure_MRI  Procedure_Ultrasound  
2                   0    

## Data Enrichment


In [55]:
# Define condition severity levels and enrich data
def enrich_data(filtered_df):
    condition_severity = {
        "Fractured Arm": 1,
        "Stroke": 3,
        "Prostate Cancer": 2
    }

    # Calculate risk score 
    def calculate_risk(row):
        if row['Age'] < 50:
            age_factor = 1
        elif row['Age'] < 70:
            age_factor = 2
        else:
            age_factor = 3
        
        if row['Readmission'] == 'Yes':
            readmission_factor = 2
        else:
            readmission_factor = 0
        
        condition_factor = condition_severity.get(row['Condition'], 1)
        
        risk_score = age_factor + readmission_factor + condition_factor
        return risk_score

    # Apply the risk score calculation
    filtered_df['Risk_Score'] = filtered_df.apply(calculate_risk, axis=1)
    return filtered_df


filtered_df = enrich_data(filtered_df)

# Display the updated DataFrame
print(filtered_df.head())
    

    Patient_ID  Age  Gender        Condition              Procedure   Cost  \
2            3   32  Female    Fractured Arm        [X-Ray, Splint]    500   
3            4   75    Male           Stroke  [CT Scan, Medication]  10000   
11          12   65    Male  Prostate Cancer    [Radiation Therapy]  20000   
17          18   35  Female    Fractured Arm        [X-Ray, Splint]    500   
18          19   78    Male           Stroke  [CT Scan, Medication]  10000   

    Length_of_Stay Readmission    Outcome  Satisfaction  Risk_Score  
2                1          No  Recovered             5           2  
3                7         Yes     Stable             2           8  
11               9          No  Recovered             3           4  
17               2         Yes  Recovered             5           4  
18               8          No     Stable             2           6  


# Data Validation

In [None]:
#Ensure only relevant radiology-related records remain
print("Unique procedures in final dataset:", filtered_df["Procedure"].unique())

Unique procedures in final dataset: []


In [22]:
# Check for remaining missing values
print("Missing values after cleaning:\n", filtered_df.isnull().sum())

Missing values after cleaning:
 Patient_ID              0
Age                     0
Gender                  0
Condition               0
Procedure               0
Length_of_Stay          0
Readmission             0
Outcome                 0
Satisfaction            0
Procedure_X-Ray         0
Procedure_CT Scan       0
Procedure_MRI           0
Procedure_Ultrasound    0
dtype: int64


In [23]:
#Confirm categorical encoding
print("Data types after transformation:\n", filtered_df.dtypes)

Data types after transformation:
 Patient_ID                 int64
Age                      float64
Gender                    object
Condition               category
Procedure                 object
Length_of_Stay           float64
Readmission               object
Outcome                 category
Satisfaction               int64
Procedure_X-Ray            int64
Procedure_CT Scan          int64
Procedure_MRI              int64
Procedure_Ultrasound       int64
dtype: object


In [28]:
#Preview the final dataset
display(filtered_df.head())


Unnamed: 0,Patient_ID,Age,Gender,Condition,Procedure,Length_of_Stay,Readmission,Outcome,Satisfaction,Procedure_X-Ray,Procedure_CT Scan,Procedure_MRI,Procedure_Ultrasound
2,3,32,Female,Fractured Arm,"[X-Ray, Splint]",1,No,Recovered,5,1,0,0,0
3,4,75,Male,Stroke,"[CT Scan, Medication]",7,Yes,Stable,2,0,1,0,0
11,12,65,Male,Prostate Cancer,[Radiation Therapy],9,No,Recovered,3,0,0,0,0
17,18,35,Female,Fractured Arm,"[X-Ray, Splint]",2,Yes,Recovered,5,1,0,0,0
18,19,78,Male,Stroke,"[CT Scan, Medication]",8,No,Stable,2,0,1,0,0


In [None]:
# Save the final validated dataset
filtered_df.to_csv("cleaned_hospital_data.csv", index=False)