In [9]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report

In [11]:
df = pd.read_csv("../data/processed/processed_data.csv")

In [12]:
features = [
    'age', 'scholarship', 'hypertension', 'diabetes', 'alcoholism', 'handicap',
    'sms_received', 'days_wait', 'appointment_day_of_week', 'is_weekend',
    'scheduled_time', 'patient_total_appointments', 'patient_past_no_shows',
    'patient_no_show_ratio'
]

X = df[features]
y = df['target']

In [13]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [14]:
X_train_lr, X_test_lr, y_train_lr, y_test_lr = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42, stratify=y
)

In [15]:
log_reg = LogisticRegression(max_iter=2000)
log_reg.fit(X_train_lr, y_train_lr)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,2000


In [16]:
y_pred_lr = log_reg.predict(X_test_lr)
y_prob_lr = log_reg.predict_proba(X_test_lr)[:, 1]

print("Logistic Regression")
print("Accuracy:", accuracy_score(y_test_lr, y_pred_lr))
print("ROC-AUC:", roc_auc_score(y_test_lr, y_prob_lr))
print("\nClassification Report:\n", classification_report(y_test_lr, y_pred_lr))

Logistic Regression
Accuracy: 0.7945806568352484
ROC-AUC: 0.6752992753830771

Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.99      0.88     17642
           1       0.38      0.03      0.05      4464

    accuracy                           0.79     22106
   macro avg       0.59      0.51      0.47     22106
weighted avg       0.72      0.79      0.72     22106



In [17]:
from sklearn.ensemble import RandomForestClassifier

In [18]:
X_train_rf, X_test_rf, y_train_rf, y_test_rf = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [19]:
rf = RandomForestClassifier(
    n_estimators=200,
    max_depth=None,
    random_state=42,
    n_jobs=-1
)
rf.fit(X_train_rf, y_train_rf)

0,1,2
,n_estimators,200
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [20]:
y_pred_rf = rf.predict(X_test_rf)
y_prob_rf = rf.predict_proba(X_test_rf)[:, 1]

print("Random Forest")
print("Accuracy:", accuracy_score(y_test_rf, y_pred_rf))
print("ROC-AUC:", roc_auc_score(y_test_rf, y_prob_rf))

Random Forest
Accuracy: 0.7801049488826562
ROC-AUC: 0.7104729419835119


In [21]:
rf_importance = pd.DataFrame({
    'feature': features,
    'importance': rf.feature_importances_
}).sort_values('importance', ascending=False)

rf_importance.head(10)

Unnamed: 0,feature,importance
0,age,0.327359
7,days_wait,0.287114
10,scheduled_time,0.150363
8,appointment_day_of_week,0.077025
11,patient_total_appointments,0.048333
12,patient_past_no_shows,0.029309
13,patient_no_show_ratio,0.020852
6,sms_received,0.016219
1,scholarship,0.011772
2,hypertension,0.010421


In [22]:
from sklearn.ensemble import GradientBoostingClassifier

In [23]:
gb = GradientBoostingClassifier(
    n_estimators=200,
    learning_rate=0.05,
    max_depth=3,
    random_state=42
)
gb.fit(X_train_rf, y_train_rf)

0,1,2
,loss,'log_loss'
,learning_rate,0.05
,n_estimators,200
,subsample,1.0
,criterion,'friedman_mse'
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_depth,3
,min_impurity_decrease,0.0


In [24]:
y_pred_gb = gb.predict(X_test_rf)
y_prob_gb = gb.predict_proba(X_test_rf)[:, 1]

print("Gradient Boosting")
print("Accuracy:", accuracy_score(y_test_rf, y_pred_gb))
print("ROC-AUC:", roc_auc_score(y_test_rf, y_prob_gb))

Gradient Boosting
Accuracy: 0.8029494254953407
ROC-AUC: 0.746349862752173


In [25]:
gb_importance = pd.DataFrame({
    'feature': features,
    'importance': gb.feature_importances_
}).sort_values('importance', ascending=False)

gb_importance.head(10)

Unnamed: 0,feature,importance
7,days_wait,0.703375
0,age,0.104932
13,patient_no_show_ratio,0.091845
12,patient_past_no_shows,0.034519
6,sms_received,0.020574
11,patient_total_appointments,0.017088
10,scheduled_time,0.012746
4,alcoholism,0.005226
1,scholarship,0.004652
8,appointment_day_of_week,0.002506


In [26]:
results = pd.DataFrame({
    'model': ['Logistic Regression', 'Random Forest', 'Gradient Boosting'],
    'accuracy': [
        accuracy_score(y_test_lr, y_pred_lr),
        accuracy_score(y_test_rf, y_pred_rf),
        accuracy_score(y_test_rf, y_pred_gb)
    ],
    'roc_auc': [
        roc_auc_score(y_test_lr, y_prob_lr),
        roc_auc_score(y_test_rf, y_prob_rf),
        roc_auc_score(y_test_rf, y_prob_gb)
    ]
})

results

Unnamed: 0,model,accuracy,roc_auc
0,Logistic Regression,0.794581,0.675299
1,Random Forest,0.780105,0.710473
2,Gradient Boosting,0.802949,0.74635


In [27]:
# Cost assumptions
revenue_per_appointment = 200  # $200 lost per no-show (you can tweak this)

In [28]:
# Predicted probability of no-show from Gradient Boosting
df_model = df.copy()
df_model['gb_no_show_prob'] = gb.predict_proba(X)[:, 1]
df_model[['gb_no_show_prob']].head()

Unnamed: 0,gb_no_show_prob
0,0.203089
1,0.330502
2,0.287196
3,0.362288
4,0.288116


In [29]:
df_model['expected_loss_baseline'] = revenue_per_appointment * df_model['gb_no_show_prob']
df_model[['gb_no_show_prob', 'expected_loss_baseline']].head()

Unnamed: 0,gb_no_show_prob,expected_loss_baseline
0,0.203089,40.617819
1,0.330502,66.100421
2,0.287196,57.439217
3,0.362288,72.457613
4,0.288116,57.623171


In [30]:
total_loss_baseline = df_model['expected_loss_baseline'].sum()
avg_loss_per_appointment = df_model['expected_loss_baseline'].mean()

total_loss_baseline, avg_loss_per_appointment

(np.float64(4462206.9939023685), np.float64(40.37209906993195))

In [31]:
high_risk_threshold = 0.5  # you can tune this later (0.4, 0.6, etc.)
df_model['high_risk_flag'] = (df_model['gb_no_show_prob'] >= high_risk_threshold).astype(int)

df_model['high_risk_flag'].value_counts()

high_risk_flag
0    108494
1      2033
Name: count, dtype: int64

In [32]:
sms_effect = 0.30  # 30% reduction in no-show probability for high-risk who get SMS

# Start from original probability
df_model['no_show_prob_sms'] = df_model['gb_no_show_prob']

# Apply reduction ONLY to high-risk appointments
mask_high_risk = df_model['high_risk_flag'] == 1
df_model.loc[mask_high_risk, 'no_show_prob_sms'] = df_model.loc[mask_high_risk, 'gb_no_show_prob'] * (1 - sms_effect)

df_model[['gb_no_show_prob', 'no_show_prob_sms', 'high_risk_flag']].head()

Unnamed: 0,gb_no_show_prob,no_show_prob_sms,high_risk_flag
0,0.203089,0.203089,0
1,0.330502,0.330502,0
2,0.287196,0.287196,0
3,0.362288,0.362288,0
4,0.288116,0.288116,0


In [33]:
df_model['expected_loss_sms'] = revenue_per_appointment * df_model['no_show_prob_sms']

total_loss_sms = df_model['expected_loss_sms'].sum()
avg_loss_sms = df_model['expected_loss_sms'].mean()

total_loss_sms, avg_loss_sms

(np.float64(4389120.583718592), np.float64(39.71084516650766))

In [34]:
absolute_savings = total_loss_baseline - total_loss_sms
relative_savings_pct = absolute_savings / total_loss_baseline * 100

absolute_savings, relative_savings_pct

(np.float64(73086.41018377617), np.float64(1.637898248190843))