In [None]:
# Load libraries
import pandas as pd
import os

def impute_missing_values(frame):
    """Return a copy of *frame* with missing values filled column by column.

    Columns of dtype ``float64``/``int64`` are filled with the column mean.
    Columns of dtype ``object`` are filled with the column's most frequent
    value (the first one returned by ``Series.mode`` when several tie).
    A column with no observed values has neither a mean nor a mode and is
    left unchanged.

    Args:
        frame: The DataFrame to impute. It is not modified.

    Returns:
        A new DataFrame with the same columns and index as *frame*.
    """
    filled = frame.copy()

    # Fill numerical columns with mean
    numerical_cols = filled.select_dtypes(include=["float64", "int64"]).columns
    for col in numerical_cols:
        col_mean = filled[col].mean()
        if pd.isna(col_mean):
            # Entirely-missing column: there is nothing to fill with.
            continue
        # Assign the result back instead of calling fillna(inplace=True) on
        # `filled[col]`: the in-place form operates on an intermediate Series
        # and does not update the frame under pandas Copy-on-Write.
        filled[col] = filled[col].fillna(col_mean)

    # Fill categorical columns with mode
    categorical_cols = filled.select_dtypes(include=["object"]).columns
    for col in categorical_cols:
        modes = filled[col].mode()
        if modes.empty:
            # mode() is empty when every value is missing; indexing it
            # would raise, so skip the column.
            continue
        filled[col] = filled[col].fillna(modes.iloc[0])

    return filled


if __name__ == "__main__":
    # Set the data folder path
    data_folder = "data"

    # Load datasets
    diagnoses = pd.read_csv(os.path.join(data_folder, "0-1000 diagnoses.csv"))
    meds = pd.read_csv(os.path.join(data_folder, "0-1000 meds.csv"))
    patients = pd.read_csv(os.path.join(data_folder, "0-1000 patient.csv"))

    # Merge datasets (inner joins on patient_id).
    # NOTE(review): if a patient can have several diagnoses and several
    # medications, this join yields every diagnosis/medication pairing for
    # that patient -- confirm that this row multiplication is intended.
    merged_data = diagnoses.merge(meds, on="patient_id").merge(patients, on="patient_id")

    # Handle missing values
    merged_data = impute_missing_values(merged_data)

    # Verify no missing values remain
    print("Missing values per column after imputation:")
    print(merged_data.isnull().sum())

    # Save processed data
    output_path = os.path.join(data_folder, "processed_data.csv")
    merged_data.to_csv(output_path, index=False)

    print(f"Processed data saved to {output_path}")
