# HealthAI Suite - Data Preprocessing Notebook

## Dataset Overview
- Total Samples: 100,000 patient records
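- Columns: 14 (12 clinical attributes plus the 2 target columns listed below)
- Target Variables: Disease Risk and Length of Stay (dataset columns); Patient Clusters (assigned by K-Means in this notebook, not a dataset column)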

## Features Description

### Raw Features:
- age: Patient age in years
- gender: Male (1) or Female (0)
- bmi: Body Mass Index
- systolic_bp: Systolic Blood Pressure
- diastolic_bp: Diastolic Blood Pressure
- heart_rate: Beats per minute
- cholesterol: Total cholesterol level
- blood_sugar: Fasting blood glucose
- diagnosis: Primary clinical diagnosis
- medication: Current medications
- procedure: Recent procedures
- length_of_stay: Hospital stay duration (Target)
- disease_risk: Risk classification (Target)
- clinical_notes: Medical notes text

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
import joblib

# Read the patient records from CSV into the working DataFrame
df = pd.read_csv('healthcare_dataset.csv')

# Quick sanity report: (rows, columns), then the number of missing cells per column
dataset_shape = df.shape
missing_per_column = df.isna().sum()
print('Dataset Shape:', dataset_shape)
print('\nMissing Values:\n', missing_per_column)

In [None]:
# Feature Engineering
# Adds four derived columns to df: age_group, bmi_category, bp_category and
# metabolic_risk. Bin edges are open-ended at the top so that large values land
# in the last category instead of becoming NaN.

# Age Groups: 0 = [0, 30], 1 = (30, 45], 2 = (45, 60], 3 = above 60.
# include_lowest=True keeps age 0 in the first bin; the np.inf edge keeps ages
# above 100 in the last bin.
df['age_group'] = pd.cut(
    df['age'],
    bins=[0, 30, 45, 60, np.inf],
    labels=[0, 1, 2, 3],
    include_lowest=True,
)

# BMI Categories: 0 = [0, 18.5), 1 = [18.5, 25), 2 = [25, 30), 3 = 30 and above.
# right=False makes every bin include its lower edge, so a BMI of exactly 25 or
# 30 falls into the higher category -- consistent with the ">=" thresholds used
# for blood pressure, blood sugar and cholesterol below.
df['bmi_category'] = pd.cut(
    df['bmi'],
    bins=[0, 18.5, 25, 30, np.inf],
    labels=[0, 1, 2, 3],
    right=False,
)

# Blood Pressure Categories: 2 when systolic >= 140 or diastolic >= 90,
# otherwise 1 when systolic >= 130 or diastolic >= 80, otherwise 0.
bp_high = (df['systolic_bp'] >= 140) | (df['diastolic_bp'] >= 90)
bp_elevated = (df['systolic_bp'] >= 130) | (df['diastolic_bp'] >= 80)
df['bp_category'] = np.where(bp_high, 2, np.where(bp_elevated, 1, 0))

# Metabolic Risk: count (0-3) of how many of these flags are raised:
# BMI category >= 2, blood sugar >= 120, cholesterol >= 200.
# Rows with a missing value in a column simply do not raise that flag.
df['metabolic_risk'] = (
    (df['bmi_category'] >= 2).astype(int)
    + (df['blood_sugar'] >= 120).astype(int)
    + (df['cholesterol'] >= 200).astype(int)
)

In [None]:
# Assemble the model inputs and targets

# Column order of the feature matrix: the eight raw columns first, followed by
# the four engineered categories.
feature_order = [
    'age',
    'gender',
    'bmi',
    'systolic_bp',
    'diastolic_bp',
    'heart_rate',
    'cholesterol',
    'blood_sugar',
    'age_group',
    'bmi_category',
    'bp_category',
    'metabolic_risk',
]

# Feature matrix plus the two target columns
X = df.loc[:, feature_order]
y_los, y_risk = df['length_of_stay'], df['disease_risk']

# Scaling: fit the scaler on the feature matrix, then transform that same data
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

print('Features prepared:', X.shape)

In [None]:
# Train Length of Stay Model
import os

from sklearn.ensemble import RandomForestRegressor

# Random forest with 100 trees, fitted on the scaled features against
# length_of_stay; random_state is fixed for repeatable runs.
los_model = RandomForestRegressor(n_estimators=100, random_state=42)
los_model.fit(X_scaled, y_los)

# Save model
# joblib.dump does not create missing parent directories, so make sure the
# output folder exists before writing into it.
os.makedirs('models', exist_ok=True)
joblib.dump(los_model, 'models/los_model.pkl')
# The fitted scaler is stored next to the model -- presumably so the same
# scaling can be re-applied at prediction time (confirm against the consumer).
joblib.dump(scaler, 'models/los_scaler.pkl')

print('LOS Model trained and saved')

In [None]:
# Train Clustering Model
import os

from sklearn.cluster import KMeans

# Three clusters on the scaled features. n_init is pinned explicitly because
# scikit-learn 1.4 changed its default from 10 to 'auto'; leaving it unset makes
# the resulting clusters depend on the installed version (and emits a
# FutureWarning on 1.2/1.3).
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
df['cluster'] = kmeans.fit_predict(X_scaled)

# Save clustering model
# joblib.dump does not create missing parent directories, so make sure the
# output folder exists before writing into it.
os.makedirs('models', exist_ok=True)
joblib.dump(kmeans, 'models/kmeans_cluster_model.pkl')
# Same fitted scaler object that is also saved as models/los_scaler.pkl,
# written here under a second file name.
joblib.dump(scaler, 'models/cluster_scaler_final.pkl')

print('Clustering model trained and saved')

In [None]:
# Summary Statistics: report the row count and the distinct cluster labels
print('Dataset Preprocessing Complete!')

record_total = len(df)
print('Total Records:', record_total)

# Distinct values present in the 'cluster' column, in order of first appearance
cluster_labels = df['cluster'].unique()
print('Clusters:', cluster_labels)