In [1]:
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

# --- Configuration ---------------------------------------------------------
# Location of the ML-ready dataset (resolved against the kernel's working directory).
DATA_PATH = "../data/processed/final_ml_dataset.csv"
# Last year (inclusive) used for training; every later year is held out for testing.
LAST_TRAIN_YEAR = 2022

# Baseline features (NO spatial)
baseline_features = ["cases_lag1", "cases_lag2"]
target = "risk_encoded"

# --- Load ------------------------------------------------------------------
ml_df = pd.read_csv(DATA_PATH)

# Fail early, with a readable message, if a column used below is absent.
required_columns = {"risk_label", "year", *baseline_features}
missing_columns = required_columns - set(ml_df.columns)
if missing_columns:
    raise KeyError(f"Dataset is missing required columns: {sorted(missing_columns)}")

# --- Encode target (repeatable & explicit) ---------------------------------
# LabelEncoder numbers the classes in sorted label order; `le.classes_` keeps
# the code -> label mapping that the evaluation cells use for report names.
le = LabelEncoder()
ml_df[target] = le.fit_transform(ml_df["risk_label"])

# --- Temporal split --------------------------------------------------------
# Rows are assigned purely by `year`; a row whose year is NaN matches neither
# comparison and therefore ends up in neither split.
train_df = ml_df[ml_df["year"] <= LAST_TRAIN_YEAR]
test_df = ml_df[ml_df["year"] > LAST_TRAIN_YEAR]

# An empty split would otherwise only surface later as an opaque estimator error.
if train_df.empty or test_df.empty:
    raise ValueError(
        f"Temporal split at year {LAST_TRAIN_YEAR} produced an empty set "
        f"(train={len(train_df)} rows, test={len(test_df)} rows)"
    )

X_train = train_df[baseline_features]
y_train = train_df[target]

X_test = test_df[baseline_features]
y_test = test_df[target]


In [2]:
# Baseline 1: logistic regression on the non-spatial lag features.
# `multi_class` is deliberately not passed: it is deprecated from scikit-learn 1.5,
# and the default solver already fits a multinomial model when the target has
# more than two classes (assumes y_train contains all three risk levels).
log_reg = LogisticRegression(max_iter=1000)

log_reg.fit(X_train, y_train)

y_pred_lr = log_reg.predict(X_test)

# Encoded class ids in the same order as `le.classes_`. Passing them explicitly
# keeps `target_names` aligned even if a class is absent from the test year.
class_ids = list(range(len(le.classes_)))

print("Logistic Regression (Baseline)")
print("Accuracy:", accuracy_score(y_test, y_pred_lr))
print(
    classification_report(
        y_test,
        y_pred_lr,
        labels=class_ids,
        target_names=le.classes_,
        # A class that is never predicted gets precision 0.0 without emitting
        # an UndefinedMetricWarning for every averaged metric.
        zero_division=0,
    )
)


Logistic Regression (Baseline)
Accuracy: 0.5
              precision    recall  f1-score   support

        High       0.50      0.89      0.64         9
         Low       0.00      0.00      0.00         5
      Medium       0.50      0.38      0.43         8

    accuracy                           0.50        22
   macro avg       0.33      0.42      0.36        22
weighted avg       0.39      0.50      0.42        22



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [3]:
# Baseline 2: random forest on the same non-spatial lag features.
# `random_state` is fixed so the fitted forest is reproducible across runs.
rf = RandomForestClassifier(
    n_estimators=200,
    random_state=42
)

rf.fit(X_train, y_train)

y_pred_rf = rf.predict(X_test)

# Encoded class ids in the same order as `le.classes_`. Passing them explicitly
# keeps `target_names` aligned even if a class is absent from the test year.
class_ids = list(range(len(le.classes_)))

print("Random Forest (Baseline, Non-Spatial)")
print("Accuracy:", accuracy_score(y_test, y_pred_rf))
print(
    classification_report(
        y_test,
        y_pred_rf,
        labels=class_ids,
        target_names=le.classes_,
        # A class that is never predicted gets precision 0.0 without emitting
        # an UndefinedMetricWarning for every averaged metric.
        zero_division=0,
    )
)


Random Forest (Baseline, Non-Spatial)
Accuracy: 0.5
              precision    recall  f1-score   support

        High       0.47      0.89      0.62         9
         Low       0.00      0.00      0.00         5
      Medium       0.60      0.38      0.46         8

    accuracy                           0.50        22
   macro avg       0.36      0.42      0.36        22
weighted avg       0.41      0.50      0.42        22



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [4]:
# Collect each baseline's test-set predictions under its display name,
# then score them all the same way to build the comparison table.
baseline_predictions = {
    "Logistic Regression": y_pred_lr,
    "Random Forest (Non-Spatial)": y_pred_rf,
}

baseline_results = pd.DataFrame(
    [
        {"Model": model_name, "Accuracy": accuracy_score(y_test, predictions)}
        for model_name, predictions in baseline_predictions.items()
    ]
)

baseline_results


Unnamed: 0,Model,Accuracy
0,Logistic Regression,0.5
1,Random Forest (Non-Spatial),0.5


In [5]:
from pathlib import Path

# Destination of the baseline comparison table.
baseline_results_path = Path("../outputs/tables/baseline_model_performance.csv")

# `to_csv` does not create missing directories, so make sure the target folder
# exists first (no-op when it is already there).
baseline_results_path.parent.mkdir(parents=True, exist_ok=True)

# index=False keeps the positional index out of the file.
baseline_results.to_csv(baseline_results_path, index=False)
