In [None]:
# One-cell launcher: wrote app, started Streamlit, opened Cloudflared tunnel
import os, sys, subprocess, time, re, urllib.request, stat

# installed deps (into the interpreter that ran this cell)
subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "streamlit", "plotly", "scikit-learn"])

# ensured cloudflared binary (no signup needed)
CF_PATH = "/content/cloudflared"  # kept under /content/
CF_URL = "https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-linux-amd64"
if not (os.path.isfile(CF_PATH) and os.access(CF_PATH, os.X_OK)):
    # downloaded to a temporary name first: an interrupted transfer must never
    # leave a truncated file at CF_PATH, because the check above would accept it next run
    cf_tmp = CF_PATH + ".part"
    try:
        urllib.request.urlretrieve(CF_URL, cf_tmp)
        # rwxr-xr-x: owner read/write/execute, group and others read/execute
        os.chmod(cf_tmp, stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR |
                         stat.S_IRGRP | stat.S_IXGRP |
                         stat.S_IROTH | stat.S_IXOTH)
        # atomic rename, so CF_PATH only ever pointed at a complete executable
        os.replace(cf_tmp, CF_PATH)
    finally:
        # removed the partial download if anything above failed
        if os.path.exists(cf_tmp):
            os.remove(cf_tmp)

# wrote Streamlit app with renamed sections; removed "Next steps"; comments in past tense
APP_PATH = "/content/EDA_in_streamlit_ORD.py"
APP_CODE = r'''
import os
import streamlit as st
import pandas as pd
import numpy as np
import plotly.express as px

from sklearn.model_selection import TimeSeriesSplit, RandomizedSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import RobustScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import (
    mean_absolute_error, mean_squared_error, r2_score,
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, confusion_matrix, roc_curve
)
from sklearn.inspection import permutation_importance

# configured the page shell (wide layout) and the headline
st.set_page_config(layout="wide", page_title="ORD TSA — EDA & ML")
st.title("ORD TSA Throughput — EDA & Predictive Modelling")

# sidebar navigation: section picker first, then upload / granularity / theme controls
with st.sidebar:
    section = st.radio(
        label="Section",
        options=[
            "1. Project Introduction",
            "2. Data Overview",
            "3. Exploratory Data Analysis",
            "4. Feature Engineering",
            "5. Predictive Modeling",
            "6. Model Selection & Tuning",
            "7. Results & Conclusion",
        ],
        index=0,
    )
    uploaded_file = st.file_uploader(label="Upload TSA CSV (TsaThroughput.ORD.csv)", type=["csv"])
    granularity = st.radio(label="Granularity", options=["Daily", "Hourly"], index=0)
    plot_theme = st.selectbox(label="Plot theme", options=["plotly", "plotly_white", "plotly_dark"], index=0)

# loaded CSV (or small synthetic fallback), parsed date/hour, engineered basic features
@st.cache_data(show_spinner=False)
def load_ord_csv(file_like):
    """Load ORD throughput rows and derive the hourly and daily frames.

    Source priority: ``file_like`` when given, then ``TsaThroughput.ORD.csv`` in
    the working directory, then a 150-day synthetic sample.

    Args:
        file_like: Anything ``pd.read_csv`` accepts (the sidebar upload), or ``None``.

    Returns:
        Tuple ``(df, daily, info)``:
        - ``df``: the raw rows plus ``pax`` (row-wise sum of the ``ORD ...`` columns,
          or of all other numeric columns when none exist), parsed ``date``,
          integer ``hour`` and calendar columns.
        - ``daily``: ``pax`` summed per date, reindexed to daily frequency with gaps
          interpolated, plus calendar fields, lags (1/7/14) and rolling means (7/14)
          computed on the series shifted by one day.
        - ``info``: dict with ``source`` (``uploaded`` / ``default_file`` /
          ``synthetic``) and a user-facing ``message``.
    """
    info = {"source": None, "message": ""}
    if file_like is not None:
        df_raw = pd.read_csv(file_like)
        info["source"] = "uploaded"
    elif os.path.exists("TsaThroughput.ORD.csv"):
        df_raw = pd.read_csv("TsaThroughput.ORD.csv")
        info["source"] = "default_file"
    else:
        # synthetic fallback: weekly sine wave plus noise, spread evenly across 24 hours
        days = 150
        idx = pd.date_range(end=pd.Timestamp.today().normalize(), periods=days, freq="D")
        base = 42000 + 4000*np.sin(2*np.pi*idx.dayofweek/7)
        noise = np.random.normal(0, 2500, size=days)
        synth_pax = np.asarray(np.maximum(1000, base + noise), dtype=float)
        df_raw = pd.DataFrame({
            "Date": np.repeat(idx, 24),
            "Hour": np.tile(pd.date_range("00:00", "23:00", freq="1h").strftime("%H:%M:%S"), days),
            "ORD A": np.repeat(synth_pax/24, 24)
        })
        info["source"] = "synthetic"
        info["message"] = "No CSV found; used a small synthetic sample. Upload the real CSV in the sidebar."

    # checkpoint columns are named "ORD ..."; otherwise fell back to every other numeric column
    ord_cols = [c for c in df_raw.columns if isinstance(c, str) and c.startswith("ORD ")]
    if not ord_cols:
        ignore = {"Date","Hour","date","hour"}
        ord_cols = [c for c in df_raw.columns if c not in ignore and pd.api.types.is_numeric_dtype(df_raw[c])]
    df = df_raw.copy()
    df["pax"] = df[ord_cols].sum(axis=1) if ord_cols else (df.iloc[:, -1] if len(df.columns) else 0)

    if "date" in df.columns:
        df["date"] = pd.to_datetime(df["date"], errors="coerce")
    else:
        df["date"] = pd.to_datetime(df.get("Date", pd.NaT), errors="coerce")

    def _int_hour(values):
        # missing or unparseable hours became 0 instead of crashing astype(int)
        return pd.to_numeric(values, errors="coerce").fillna(0).astype(int)

    if "hour" in df.columns and pd.api.types.is_numeric_dtype(df["hour"]):
        df["hour"] = _int_hour(df["hour"])
    elif "Hour" in df.columns:
        if pd.api.types.is_numeric_dtype(df["Hour"]):
            df["hour"] = _int_hour(df["Hour"])
        else:
            # text such as "13:00:00": parsed as a timestamp and kept only the hour
            parsed = pd.to_datetime(df["Hour"], errors="coerce")
            df["hour"] = _int_hour(parsed.dt.hour)
    else:
        df["hour"] = 0

    df["dow"] = df["date"].dt.dayofweek
    df["day_of_week"] = df["date"].dt.day_name()
    df["month"] = df["date"].dt.month
    df["month_name"] = df["date"].dt.month_name()

    # daily totals on a gap-free calendar; holes were interpolated in both directions
    daily = (
        df.groupby("date", as_index=False)["pax"].sum()
          .sort_values("date").set_index("date").asfreq("D")
    )
    daily["pax"] = daily["pax"].interpolate(limit_direction="both")
    daily["dow"] = daily.index.dayofweek
    daily["day_of_week"] = daily.index.day_name()
    daily["month"] = daily.index.month
    daily["month_name"] = daily.index.month_name()
    daily["dom"] = daily.index.day
    daily["is_weekend"] = (daily["dow"] >= 5).astype(int)

    for L in [1,7,14]:
        daily[f"lag_{L}"] = daily["pax"].shift(L)
    for W in [7,14]:
        # shift(1) kept the current day's value out of its own rolling mean
        daily[f"rollmean_{W}"] = daily["pax"].shift(1).rolling(W).mean()

    return df, daily, info

# loaded both frames (cached per upload) and flagged the synthetic fallback to the user
loaded_frames = load_ord_csv(uploaded_file)
df_hourly, df_daily, info = loaded_frames
if info["source"] == "synthetic":
    st.warning(info["message"])

def mape(y_true, y_pred):
    """Return the mean absolute percentage error, in percent.

    Each error is divided by ``|y_true|`` floored at 1e-8, so zero actuals do not
    divide by zero and negative actuals are scaled by their magnitude.

    Args:
        y_true: Array-like of actual values.
        y_pred: Array-like of predictions, same shape as ``y_true``.

    Returns:
        The mean of ``|y_true - y_pred| / max(|y_true|, 1e-8)`` times 100.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    denom = np.maximum(np.abs(y_true), 1e-8)
    return np.mean(np.abs(y_true - y_pred) / denom) * 100

# built ML frames with time-wise split; prepared encoders/scalers
@st.cache_data(show_spinner=False)
def build_ml_frames(daily):
    """Assemble the train/test matrices and targets used by every model section.

    Rows with any missing value are dropped first; fewer than 30 remaining rows
    yields ``None``. The split is chronological: the first 70% of rows train, the
    rest test. The surge label is 1 where ``pax`` is at or above the 75th
    percentile of the training rows. The preprocessor is fitted on the training
    rows only.

    Returns:
        ``None`` or a dict holding the raw and preprocessed feature matrices, the
        log1p / raw / surge targets for both slices, the fitted preprocessor,
        output feature names, the surge threshold and the column lists.
    """
    frame = daily.dropna().copy()
    if len(frame) < 30:
        return None

    # name columns feed the one-hot encoder
    frame["dow_name"] = frame.index.day_name()
    frame["month_name"] = frame.index.month_name()

    numeric_candidates = ["dow","dom","month","is_weekend","lag_1","lag_7","lag_14","rollmean_7","rollmean_14"]
    num_cols = [name for name in numeric_candidates if name in frame.columns]
    cat_cols = ["dow_name","month_name"]

    y_raw = frame["pax"]
    y_reg = np.log1p(y_raw)
    X = frame[num_cols + cat_cols]

    # chronological cut: earlier rows train, later rows test
    cut = int(len(X) * 0.7)

    def _halves(obj):
        return obj.iloc[:cut], obj.iloc[cut:]

    X_train, X_test = _halves(X)
    y_reg_train, y_reg_test = _halves(y_reg)
    y_raw_train, y_raw_test = _halves(y_raw)

    # threshold came from the training rows only, then labelled the whole series
    thresh = y_raw_train.quantile(0.75)
    y_cls_train, y_cls_test = _halves((y_raw >= thresh).astype(int))

    numeric_step = ("num", RobustScaler(), num_cols)
    onehot_step = ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), cat_cols)
    pre = ColumnTransformer(
        transformers=[numeric_step, onehot_step],
        remainder="drop",
        sparse_threshold=0.0,
    )
    X_train_pp = pre.fit_transform(X_train)
    X_test_pp = pre.transform(X_test)

    if cat_cols:
        cat_names = list(pre.named_transformers_["cat"].get_feature_names_out(cat_cols))
    else:
        cat_names = []
    feat_names = num_cols + cat_names

    return {
        "X_train": X_train,
        "X_test": X_test,
        "X_train_pp": X_train_pp,
        "X_test_pp": X_test_pp,
        "y_reg_train": y_reg_train,
        "y_reg_test": y_reg_test,
        "y_raw_train": y_raw_train,
        "y_raw_test": y_raw_test,
        "y_cls_train": y_cls_train,
        "y_cls_test": y_cls_test,
        "pre": pre,
        "feat_names": feat_names,
        "thresh": thresh,
        "num_cols": num_cols,
        "cat_cols": cat_cols,
    }

# built the shared ML frames once; None meant there was too little data
ml = build_ml_frames(df_daily)

# sections
if section == "1. Project Introduction":
    st.subheader("Problem Statement")
    st.write("Forecast daily passenger throughput at ORD and flag high-surge days so operations could staff lanes and resources proactively.")

    st.subheader("Context & Use Cases")
    context_lines = [
        "- Why it mattered: unexpected surges created delays, SLA breaches, and overtime.",
        "- Who used it: checkpoint managers, schedulers, airline ops, duty managers.",
        "- Decisions supported: next-day staffing, lane configuration, airline comms.",
    ]
    st.markdown("\n".join(context_lines))

    st.subheader("Targets & Labelling")
    with st.expander("How the two targets were defined", expanded=True):
        target_lines = [
            "- Regression target: `log1p(pax)` where `pax` is total daily passengers.",
            "- Classification target (surge): 1 if the day's `pax` ≥ the 75th percentile of the training period; else 0.",
        ]
        st.markdown("\n".join(target_lines))
        # the threshold only existed when the ML frames could be built
        if ml and "thresh" in ml:
            st.info(f"Training surge threshold ≈ {int(ml['thresh']):,} pax/day.")

    st.subheader("Assumptions")
    assumption_lines = [
        "- Hourly rows were summed to daily; short gaps were interpolated.",
        "- Seasonality was represented by calendar fields, lags, and rolling means.",
        "- External drivers (holidays/weather) were not included in this version.",
    ]
    st.markdown("\n".join(assumption_lines))

    st.subheader("Key Fields")
    field_lines = [
        "- date, pax",
        "- dow, dom, month, is_weekend",
        "- lag_1, lag_7, lag_14; rollmean_7, rollmean_14",
        "- dow_name, month_name (for one-hots)",
    ]
    st.markdown("\n".join(field_lines))

elif section == "2. Data Overview":
    st.subheader("Quick Metrics & Trend")
    if len(df_daily):
        dr_min, dr_max = df_daily.index.min(), df_daily.index.max()
        dmin = dr_min.to_pydatetime().date(); dmax = dr_max.to_pydatetime().date()
        rng = st.slider("Pick date range", min_value=dmin, max_value=dmax, value=(dmin, dmax), format="YYYY-MM-DD", key="ovr_rng")
        mask = (df_daily.index >= pd.to_datetime(rng[0])) & (df_daily.index <= pd.to_datetime(rng[1]))
        st.plotly_chart(px.line(df_daily.loc[mask].reset_index(), x="date", y="pax", template=plot_theme, title="Daily Throughput"), use_container_width=True)
        st.subheader("Top Correlations with pax")
        cols = [c for c in ["dow","dom","month","is_weekend","lag_1","lag_7","lag_14","rollmean_7","rollmean_14"] if c in df_daily.columns]
        corr = df_daily[["pax"] + cols].corr().loc[cols, "pax"].sort_values(key=lambda s: s.abs(), ascending=False)
        st.dataframe(corr.to_frame("corr_with_pax"))
    else:
        st.info("No data yet. Upload the CSV in the sidebar.")

elif section == "3. Exploratory Data Analysis":
    st.subheader("Preview & Summary")
    if granularity == "Hourly":
        st.write(df_hourly.head()); st.caption(f"Rows: {len(df_hourly):,} | Columns: {len(df_hourly.columns)}")
    else:
        st.write(df_daily.head()); st.caption(f"Rows: {len(df_daily):,} | Columns: {len(df_daily.columns)}")
    st.write((df_hourly if granularity=="Hourly" else df_daily)[["pax"]].describe())

    if len(df_daily):
        st.subheader("Daily Throughput")
        dmin = df_daily.index.min().to_pydatetime().date(); dmax = df_daily.index.max().to_pydatetime().date()
        rng = st.slider("Pick date range", min_value=dmin, max_value=dmax, value=(dmin, dmax), format="YYYY-MM-DD", key="eda_rng")
        mask = (df_daily.index >= pd.to_datetime(rng[0])) & (df_daily.index <= pd.to_datetime(rng[1]))
        st.plotly_chart(px.line(df_daily.loc[mask].reset_index(), x="date", y="pax", template=plot_theme, title="Daily Pax"), use_container_width=True)

        c1, c2 = st.columns(2)
        with c1:
            st.subheader("Histogram")
            group = st.selectbox("Color by", ["day_of_week","month_name"], key="hist_group")
            src = df_hourly if granularity=="Hourly" else df_daily.reset_index()
            st.plotly_chart(px.histogram(src, x="pax", color=group, nbins=40, template=plot_theme), use_container_width=True)
        with c2:
            st.subheader("Boxplot")
            grp = st.selectbox("Group by", ["day_of_week","month_name"], key="box_group")
            src = df_hourly if granularity=="Hourly" else df_daily.reset_index()
            st.plotly_chart(px.box(src, x=grp, y="pax", template=plot_theme), use_container_width=True)

        st.subheader("Hourly Heatmap by Day of Week")
        if granularity == "Hourly":
            pivot = df_hourly.pivot_table(index="day_of_week", columns="hour", values="pax", aggfunc="mean")
            days = ["Monday","Tuesday","Wednesday","Thursday","Friday","Saturday","Sunday"]
            st.plotly_chart(px.imshow(pivot.reindex(days), aspect="auto", color_continuous_scale="Viridis", template=plot_theme), use_container_width=True)
        else:
            pivot = df_daily.groupby("day_of_week")["pax"].mean().reindex(["Monday","Tuesday","Wednesday","Thursday","Friday","Saturday","Sunday"])
            st.plotly_chart(px.imshow(pivot.to_frame().T, aspect="auto", color_continuous_scale="Viridis", template=plot_theme), use_container_width=True)

        st.subheader("Correlation (Daily Features)")
        corr_cols = [c for c in ["pax","dow","dom","month","is_weekend","lag_1","lag_7","lag_14","rollmean_7","rollmean_14"] if c in df_daily.columns]
        corr = df_daily[corr_cols].corr()
        st.plotly_chart(px.imshow(corr, text_auto=True, color_continuous_scale="RdBu_r", origin="lower", template=plot_theme), use_container_width=True)
    else:
        st.info("No data yet. Upload the CSV in the sidebar.")

elif section == "4. Feature Engineering":
    st.subheader("What was engineered and why")
    st.markdown(
        "- Calendar: `dow`, `dom`, `month`, `is_weekend`, plus names for one-hot encodings\n"
        "- Lags: `lag_1`, `lag_7`, `lag_14` (yesterday / weekly / fortnight)\n"
        "- Rolling means: `rollmean_7`, `rollmean_14` (smoothed baseline)"
    )
    if len(df_daily):
        st.subheader("Feature–Target Relationship")
        cols = [c for c in ["pax","dow","dom","month","is_weekend","lag_1","lag_7","lag_14","rollmean_7","rollmean_14"] if c in df_daily.columns]
        corr = df_daily[cols].corr().loc[["pax"]].T.sort_values("pax", key=np.abs, ascending=False)
        st.dataframe(corr.rename(columns={"pax":"corr_with_pax"}))

        c1, c2 = st.columns(2)
        with c1:
            if "day_of_week" in df_daily.columns:
                st.plotly_chart(px.bar(df_daily.groupby("day_of_week")["pax"].mean().reindex(
                    ["Monday","Tuesday","Wednesday","Thursday","Friday","Saturday","Sunday"]).reset_index(),
                    x="day_of_week", y="pax", template=plot_theme, title="Avg pax by weekday"), use_container_width=True)
        with c2:
            if "month_name" in df_daily.columns:
                order = ["January","February","March","April","May","June","July","August","September","October","November","December"]
                st.plotly_chart(px.bar(df_daily.groupby("month_name")["pax"].mean().reindex(order).reset_index(),
                    x="month_name", y="pax", template=plot_theme, title="Avg pax by month"), use_container_width=True)

        st.subheader("Feature importance (Random Forest on train, log target)")
        if ml:
            Xtr, ytr = ml["X_train_pp"], ml["y_reg_train"]
            rf = RandomForestRegressor(n_estimators=200, random_state=42, n_jobs=-1).fit(Xtr, ytr)
            importances = pd.Series(rf.feature_importances_, index=ml["feat_names"]).sort_values(ascending=True).tail(20)
            st.plotly_chart(px.bar(importances.reset_index(), x="index", y=0, template=plot_theme, title="Top feature importances").update_layout(xaxis_title="feature", yaxis_title="importance"), use_container_width=True)

            st.subheader("Permutation importance (on test slice)")
            Xte, yte = ml["X_test_pp"], ml["y_reg_test"]
            if len(yte) > 0:
                take = min(len(yte), 300)
                pi = permutation_importance(rf, Xte[:take], yte[:take], n_repeats=5, random_state=42, scoring="r2")
                pi_ser = pd.Series(pi.importances_mean, index=ml["feat_names"]).sort_values(ascending=True).tail(20)
                st.plotly_chart(px.bar(pi_ser.reset_index(), x="index", y=0, template=plot_theme, title="Top permutation importances").update_layout(xaxis_title="feature", yaxis_title="Δscore"), use_container_width=True)
        else:
            st.info("Not enough data to compute importances. Upload a CSV with at least ~30 days.")
    else:
        st.info("No data yet. Upload the CSV in the sidebar.")

elif section == "5. Predictive Modeling":
    if not ml:
        st.info("Not enough data to train models. Upload a CSV with at least ~30 days.")
    else:
        st.subheader("Regression (log1p target)")
        Xtr, Xte = ml["X_train_pp"], ml["X_test_pp"]
        ytr, yte = ml["y_reg_train"], ml["y_reg_test"]

        lin = LinearRegression().fit(Xtr, ytr); pred_lin = lin.predict(Xte)
        dtr = DecisionTreeRegressor(max_depth=4, random_state=42).fit(Xtr, ytr); pred_dtr = dtr.predict(Xte)
        rfr = RandomForestRegressor(n_estimators=200, random_state=42, n_jobs=-1).fit(Xtr, ytr); pred_rfr = rfr.predict(Xte)

        def reg_eval(y_true_log, y_pred_log):
            y_true = np.expm1(y_true_log); y_pred = np.expm1(y_pred_log)
            return dict(R2_log=float(r2_score(y_true_log, y_pred_log)),
                        RMSE=float(np.sqrt(mean_squared_error(y_true, y_pred))),
                        MAE=float(mean_absolute_error(y_true, y_pred)))
        reg_scores = {
            "LinearRegression": reg_eval(yte, pred_lin),
            "DecisionTree(max_depth=4)": reg_eval(yte, pred_dtr),
            "RandomForest(n=200)": reg_eval(yte, pred_rfr)
        }
        st.dataframe(pd.DataFrame(reg_scores).T)

        best_name = min(reg_scores, key=lambda k: reg_scores[k]["MAE"])
        best_pred = {"LinearRegression": pred_lin, "DecisionTree(max_depth=4)": pred_dtr, "RandomForest(n=200)": pred_rfr}[best_name]
        y_true = np.expm1(yte); y_hat = np.expm1(best_pred)
        st.plotly_chart(px.line(pd.DataFrame({"date": ml["y_raw_test"].index, "Actual": y_true, "Pred": y_hat}), x="date", y=["Actual","Pred"], template=plot_theme, title=f"Regression — Test ({best_name})"), use_container_width=True)

        st.subheader("Coefficients (Linear Regression)")
        coefs = pd.Series(lin.coef_, index=ml["feat_names"]).sort_values(key=lambda s: s.abs(), ascending=False)
        st.dataframe(coefs.rename("coef").head(20))

        st.divider()
        st.subheader("Classification (surge vs normal)")
        ytr_c, yte_c = ml["y_cls_train"], ml["y_cls_test"]

        logit = LogisticRegression(max_iter=1000, class_weight='balanced').fit(Xtr, ytr_c)
        p_log = logit.predict_proba(Xte)[:,1]; y_log = (p_log >= 0.5).astype(int)

        best_d, best_f1 = 0, -1
        for d in [2,3,4,5]:
            tmp = DecisionTreeClassifier(max_depth=d, random_state=42, class_weight='balanced').fit(Xtr, ytr_c)
            f1 = f1_score(yte_c, tmp.predict(Xte), zero_division=0)
            if f1 > best_f1: best_f1, best_d = f1, d
        dtc = DecisionTreeClassifier(max_depth=best_d, random_state=42, class_weight='balanced').fit(Xtr, ytr_c)
        p_dt = dtc.predict_proba(Xte)[:,1]; y_dt = (p_dt >= 0.5).astype(int)

        rfc = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1, class_weight='balanced').fit(Xtr, ytr_c)
        p_rf = rfc.predict_proba(Xte)[:,1]; y_rf = (p_rf >= 0.5).astype(int)

        def cls_eval(y_true, y_pred, y_prob):
            return dict(Accuracy=float(accuracy_score(y_true,y_pred)),
                        Precision=float(precision_score(y_true,y_pred,zero_division=0)),
                        Recall=float(recall_score(y_true,y_pred,zero_division=0)),
                        F1=float(f1_score(y_true,y_pred,zero_division=0)),
                        ROC_AUC=float(roc_auc_score(y_true,y_prob)) if len(np.unique(y_true))>1 else np.nan)

        cls_scores = {
            "Logistic(bal)": cls_eval(yte_c, y_log, p_log),
            f"DecisionTree(depth={best_d})": cls_eval(yte_c, y_dt, p_dt),
            "RandomForest(200,bal)": cls_eval(yte_c, y_rf, p_rf)
        }
        st.dataframe(pd.DataFrame(cls_scores).T)

        st.subheader("Logistic Regression — Top log-odds (drivers of surge=1)")
        log_coefs = pd.Series(logit.coef_[0], index=ml["feat_names"]).sort_values()
        st.dataframe(pd.concat([log_coefs.tail(10).rename("positive"), log_coefs.head(10).rename("negative")], axis=1))

        names = list(cls_scores.keys())
        best_cls = max(names, key=lambda n: (cls_scores[n]["F1"], cls_scores[n]["ROC_AUC"]))
        best_pred = {"Logistic(bal)": y_log, f"DecisionTree(depth={best_d})": y_dt, "RandomForest(200,bal)": y_rf}[best_cls]
        cm = confusion_matrix(ml["y_cls_test"], best_pred)
        st.plotly_chart(px.imshow(cm, text_auto=True, color_continuous_scale="Blues", origin="lower", labels=dict(x="Pred", y="Actual", color="Count"), title=f"Confusion Matrix — {best_cls}", template=plot_theme), use_container_width=True)

elif section == "6. Model Selection & Tuning":
    if not ml:
        st.info("Not enough data to tune models. Upload a CSV with at least ~30 days.")
    else:
        st.subheader("Random Forest Classifier — Fast Randomized Search")
        Xtr, Xte = ml["X_train_pp"], ml["X_test_pp"]
        ytr, yte = ml["y_cls_train"], ml["y_cls_test"]

        tscv = TimeSeriesSplit(n_splits=3)
        rf_est = RandomForestClassifier(class_weight='balanced', random_state=42, n_jobs=-1)
        rf_space = {
            "n_estimators": [100, 150, 200],
            "max_depth": [None, 6, 10, 14],
            "min_samples_split": [2, 5, 10],
            "min_samples_leaf": [1, 2, 4],
            "max_features": ["sqrt", "log2", 0.5]
        }
        rs = RandomizedSearchCV(rf_est, rf_space, n_iter=15, cv=tscv, scoring="f1", random_state=42, n_jobs=-1, refit=True)
        rs.fit(Xtr, ytr)

        st.subheader("Best params")
        st.json(rs.best_params_)
        st.subheader("Best CV F1")
        st.write(round(rs.best_score_, 4))

        best = rs.best_estimator_.fit(Xtr, ytr)
        prob = best.predict_proba(Xte)[:,1]; pred = (prob >= 0.5).astype(int)
        test_scores = dict(
            Accuracy=float(accuracy_score(yte, pred)),
            Precision=float(precision_score(yte, pred, zero_division=0)),
            Recall=float(recall_score(yte, pred, zero_division=0)),
            F1=float(f1_score(yte, pred, zero_division=0)),
            ROC_AUC=float(roc_auc_score(yte, prob)) if len(np.unique(yte))>1 else np.nan
        )
        st.subheader("Test metrics")
        st.json(test_scores)

        cm = confusion_matrix(yte, pred)
        st.plotly_chart(px.imshow(cm, text_auto=True, color_continuous_scale="Blues", origin="lower",
                                  labels=dict(x="Pred", y="Actual", color="Count"),
                                  title="Confusion Matrix — Tuned RF", template=plot_theme), use_container_width=True)

        fpr, tpr, _ = roc_curve(yte, prob)
        st.plotly_chart(px.area(x=fpr, y=tpr, title="ROC Curve — Tuned RF",
                                labels=dict(x="FPR", y="TPR"), template=plot_theme).update_traces(mode="lines"), use_container_width=True)

elif section == "7. Results & Conclusion":
    st.subheader("What the data showed")
    if len(df_daily):
        bullets = []
        if "lag_7" in df_daily.columns:
            bullets.append(f"- Weekly seasonality was strong (corr with lag_7 ≈ {df_daily['pax'].corr(df_daily['lag_7']):.2f}).")
        if "is_weekend" in df_daily.columns:
            wk = df_daily.groupby("is_weekend")["pax"].mean()
            if len(wk)==2:
                bullets.append(f"- Weekend mean ≈ {int(wk[1]):,} pax vs weekday mean ≈ {int(wk[0]):,} pax.")
        if bullets:
            st.write("\n".join(bullets))
    else:
        st.write("Patterns were summarized in EDA.")

    st.subheader("Model outcomes ")
    st.markdown(
        "- Regression: after the log transform, the Random Forest typically minimized MAE versus Linear/Tree and tracked the weekly wave reliably.\n"
        "- Classification: Logistic Regression provided clear drivers via log-odds; the Tree/Forest often delivered higher F1 for surge detection.\n"
        "- Consistent drivers: weekly lag and rolling means dominated; weekend flag and month dummies added smaller but stable shifts."
    )


'''
# wrote the embedded app source to disk
with open(APP_PATH, "w", encoding="utf-8") as f:
    f.write(APP_CODE)

# closed leftovers and restarted cleanly (pkill exiting non-zero just meant nothing matched)
for name in ("streamlit", "cloudflared"):
    try:
        subprocess.run(["pkill", "-f", name], check=False)
    except FileNotFoundError:
        # pkill itself was unavailable; nothing else to clean up with
        break

# started Streamlit; output went to a log file because an unread PIPE filled up
# and blocked the child once the OS pipe buffer was full
ST_LOG = APP_PATH + ".log"
with open(ST_LOG, "w", encoding="utf-8") as st_log:
    st_proc = subprocess.Popen(
        ["streamlit", "run", APP_PATH, "--server.port", "8501", "--server.address", "0.0.0.0"],
        stdout=st_log, stderr=subprocess.STDOUT,
    )

# gave Streamlit time to boot, then reported an early crash instead of tunnelling to nothing
time.sleep(3)
if st_proc.poll() is not None:
    print(f"Streamlit exited early (code {st_proc.returncode}); see {ST_LOG}")

# opened Cloudflared tunnel (also logged to a file, for the same pipe-buffer reason)
CF_LOG = CF_PATH + ".log"
with open(CF_LOG, "w", encoding="utf-8") as cf_log:
    cf_proc = subprocess.Popen(
        [CF_PATH, "tunnel", "--url", "http://localhost:8501", "--no-autoupdate"],
        stdout=cf_log, stderr=subprocess.STDOUT,
    )

# polled the log for the public URL, bounded by a deadline so the cell could not hang;
# the API host was excluded because it shows up in cloudflared error lines
url_pattern = re.compile(r"https://(?!api\.)[a-zA-Z0-9.-]+\.trycloudflare\.com")
public_url = None
deadline = time.monotonic() + 60
while time.monotonic() < deadline:
    # sampled the exit status before reading so the final lines of a dead process were still scanned
    exited = cf_proc.poll() is not None
    with open(CF_LOG, encoding="utf-8", errors="replace") as log_in:
        m = url_pattern.search(log_in.read())
    if m:
        public_url = m.group(0)
        break
    if exited:
        break
    time.sleep(0.5)

print("\n>>> Open this in a new browser tab:", public_url if public_url else "(no URL yet — re-run cell)")
if not public_url:
    print(f"    cloudflared log: {CF_LOG}")