In [None]:
import os

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error
# Location of the pre-cleaned dataset. The GLOBAL_HEALTH_CSV environment
# variable overrides it, so the notebook can run on machines where the file
# lives elsewhere; when the variable is unset or empty, the original
# hard-coded path is used, keeping existing setups working unchanged.
DATA_PATH = (
    os.environ.get("GLOBAL_HEALTH_CSV")
    or "D:\\project\\Global Health Statistics\\cleaned_data.csv"
)
df = pd.read_csv(DATA_PATH)

In [None]:
# Remove the 'id' and 'disease_name' columns when they exist; columns that
# are absent are skipped silently instead of raising.
df = df.drop(columns=['id', 'disease_name'], errors='ignore')

# Columns expanded into one-hot indicator columns below.
categorical_cols = [
    'country',
    'age_group',
    'gender',
    'disease_category',
    'treatment_type',
    'availability_of_vaccines',
]

# drop_first=True omits the first level of each categorical column.
df_encoded = pd.get_dummies(
    df,
    columns=categorical_cols,
    drop_first=True,
)

print("‚úÖ Data encoded successfully!")
# Summary of how many columns ended up with each dtype after encoding.
print(df_encoded.dtypes.value_counts())











In [None]:
# --- 3. Define target & split ---
# The target column is separated from the encoded frame; every remaining
# column is used as a feature.
target = 'mortality_rate'
y = df_encoded[target]
X = df_encoded.drop(columns=[target])

# 80/20 train/test split with a fixed seed so the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print(f"‚úÖ Split done: Train={X_train.shape}, Test={X_test.shape}")


In [None]:
# --- 4. Train model ---
# Hyperparameters are collected in one place so they are easy to inspect.
rf_params = dict(
    n_estimators=300,
    max_depth=None,
    min_samples_split=5,
    min_samples_leaf=4,
    random_state=42,  # fixed seed for repeatable training
    n_jobs=-1,
)

# fit() returns the fitted estimator, so construction and training chain.
model = RandomForestRegressor(**rf_params).fit(X_train, y_train)
print("‚úÖ Model trained successfully!")

In [None]:
# --- 5. Evaluate performance ---
# Score the fitted model on the held-out test rows.
y_pred = model.predict(X_test)

r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # root of the mean squared error

# One print call, one line per item (header, R2, RMSE).
print(
    "\nüìä MODEL PERFORMANCE",
    f"R¬≤ Score: {r2:.3f}",
    f"RMSE: {rmse:.3f}",
    sep="\n",
)


In [None]:
# --- 6. Feature importance ---
# Pair each importance score from the fitted forest with its feature name,
# then keep the 15 highest-scoring features (sorted descending).
# The cell body used to be pasted twice (the second copy was fused onto the
# print line), so the same table was computed and printed twice; it now runs
# once.
importances = pd.Series(model.feature_importances_, index=X_train.columns)
top_features = importances.sort_values(ascending=False).head(15)

print("\nüèÜ TOP 15 PREDICTORS OF MORTALITY RATE:")
print(top_features)

In [None]:
# --- 7. Optional: visualize feature importance ---
import matplotlib.pyplot as plt

# Horizontal bar chart of the top features; sorting ascending places the
# largest importance at the top of the chart.
fig, ax = plt.subplots(figsize=(10, 6))
top_features.sort_values().plot(kind='barh', ax=ax)
ax.set_title("Top 15 Predictors of Mortality Rate")
ax.set_xlabel("Feature Importance")
fig.tight_layout()
plt.show()

In [None]:
# --- 8. (Next step) intervention simulation example ---
# What-if scenario: scale 'doctors_per_thousand' by 1.2 (+20%) on a copy of
# the test features and compare the model's mean prediction before and after.
# A positive `impact` means the mean predicted mortality went down.
# NOTE(review): this measures how the fitted model responds to the changed
# input, not a causal effect; tree-based models also presumably cannot
# extrapolate beyond feature values seen in training -- confirm before
# drawing conclusions from the number.
sim_feature = 'doctors_per_thousand'
if sim_feature in X_test.columns:
    # Work on a copy so X_test itself stays untouched.
    X_future = X_test.copy()
    X_future[sim_feature] *= 1.2

    y_pred_current = model.predict(X_test)
    y_pred_future = model.predict(X_future)

    impact = y_pred_current.mean() - y_pred_future.mean()
    print(f"\nüí° Predicted mortality drop with +20% doctors: {impact:.2f}")
else:
    # Previously the simulation was skipped without any notice; report it so
    # a missing or renamed column is not mistaken for a completed run.
    print(
        f"\nIntervention simulation skipped: column '{sim_feature}' "
        "not found in the test features."
    )

print("\n‚úÖ Full pipeline executed successfully!")
