# In [29]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer

# ----------------------
# Load data
# ----------------------
df = pd.read_csv("./data/macdonald.csv")

# Drop unwanted column if present (errors="ignore" skips it when absent)
df = df.drop(columns=["gOrml"], errors="ignore")

# Split the frame: the two descriptive columns go to `meta`,
# every remaining column becomes a PCA input feature in `X`
meta_cols = ["Menu Category", "Menu Items"]
meta = df[meta_cols]
X = df.drop(columns=meta_cols)

# ----------------------
# Handle missing values
# ----------------------
# Each missing entry is replaced by the mean of its column
imputer = SimpleImputer(strategy="mean").fit(X)
X_imputed = imputer.transform(X)

# ----------------------
# Standardize data
# ----------------------
scaler = StandardScaler().fit(X_imputed)
X_scaled = scaler.transform(X_imputed)

# ----------------------
# PCA
# ----------------------
# n_components is left unset, so no component is discarded here
pca = PCA()
X_pca = pca.fit_transform(X_scaled)

# Explained variance for labels, expressed in percent
explained_var_ratio = 100 * pca.explained_variance_ratio_
# Component names "PC1", "PC2", ... - one per column of the score matrix
pcs = [f"PC{number}" for number in range(1, X_pca.shape[1] + 1)]

def pc_label(pc):
    """Return *pc* followed by its explained-variance share in parentheses.

    *pc* is a component name such as ``"PC1"``; the number left after
    removing ``"PC"`` is the 1-based position looked up in the module-level
    ``explained_var_ratio``. The share is formatted with one decimal and a
    trailing ``%``.
    """
    component_number = int(pc.replace("PC", ""))
    share = explained_var_ratio[component_number - 1]
    return f"{pc} ({share:.1f}%)"

# ----------------------
# 1. Explained variance plot
# ----------------------
# Running total of the per-component percentages
cum_explained = explained_var_ratio.cumsum()

fig_var = px.line(
    x=range(1, cum_explained.size + 1),  # component counts start at 1
    y=cum_explained,
    title="Explained Variance by Components",
    labels=dict(x="Number of Components", y="Cumulative Explained Variance (%)"),
    markers=True,
)
fig_var.update_traces(line_color="royalblue")
fig_var.update_layout(width=1200, height=700)
fig_var.show()

# ----------------------
# 2. Interactive 2D Score Plot (choose PCs separately for X and Y)
# ----------------------
score_df = pd.DataFrame(X_pca, columns=pcs)
score_df["Menu Category"] = meta["Menu Category"].values
score_df["Menu Items"] = meta["Menu Items"].values

# Default axes
x_pc, y_pc = "PC1", "PC2"

fig_score = px.scatter(
    score_df,
    x=x_pc, y=y_pc,
    color="Menu Category",
    text="Menu Items",
    title=f"Score Plot ({pc_label(x_pc)} vs {pc_label(y_pc)})",
    labels={"Menu Category": "Category"}
)
fig_score.update_traces(marker=dict(size=9, line=dict(width=0.5, color="DarkSlateGrey")),
                        textposition="top center")

# Because of color="Menu Category" the figure holds one trace per category,
# and each trace is named after its category value. Switching the component
# shown on an axis therefore means handing every trace its own slice of the
# newly selected column, so record which rows of score_df each trace draws.
# NOTE(review): relies on trace.name == str(category value) - confirm if the
# category column ever contains missing values.
category_names = score_df["Menu Category"].astype(str)
trace_rows = [category_names == trace.name for trace in fig_score.data]

# Dropdowns
# method="update" takes [trace changes, layout changes]. The previous
# "relayout" buttons only renamed and rescaled the axis and left the plotted
# points on the default components.
x_buttons, y_buttons = [], []
for pc in pcs:
    per_trace_values = [score_df.loc[rows, pc].tolist() for rows in trace_rows]
    for axis, buttons in (("x", x_buttons), ("y", y_buttons)):
        buttons.append(dict(
            label=pc_label(pc),
            method="update",
            args=[
                # One list per trace, in trace order
                {axis: per_trace_values},
                {
                    f"{axis}axis.title.text": pc_label(pc),
                    # Refit the axis to the new data; scaling min/max by 1.1
                    # would shrink the range when both bounds share a sign.
                    f"{axis}axis.autorange": True,
                    # The initial title names the default components only,
                    # so drop them once the user picks something else.
                    "title.text": "Score Plot",
                },
            ],
        ))

fig_score.update_layout(
    width=1200, height=700,
    xaxis=dict(title=pc_label(x_pc)),
    yaxis=dict(title=pc_label(y_pc)),
    updatemenus=[
        # `active` makes each menu start on the component actually plotted
        # (the default of 0 would highlight PC1 in the Y menu as well).
        dict(buttons=x_buttons, active=pcs.index(x_pc), direction="down",
             x=0.0, y=1.1, showactive=True, xanchor="left"),
        dict(buttons=y_buttons, active=pcs.index(y_pc), direction="down",
             x=0.15, y=1.1, showactive=True, xanchor="left")
    ]
)
fig_score.show()

# ----------------------
# 3. Correlation Loading Plot
# ----------------------
# Raw loadings: the eigenvector entries of the first two components
# (kept under the original name and with the original contents).
loadings = pd.DataFrame(
    pca.components_.T[:, 0:2],
    columns=["PC1", "PC2"],
    index=X.columns
)

# The reference circles below only make sense for correlations, which are
# bounded by 1 in magnitude. Raw eigenvector entries are not correlations,
# so convert them:
#     corr(x_j, t_k) = v_kj * sqrt(lambda_k) / sd(x_j)
# explained_variance_ uses n_samples - 1 degrees of freedom, hence ddof=1
# for the variable standard deviations.
scaled_by_component = loadings.to_numpy() * np.sqrt(pca.explained_variance_[:2])
variable_sd = X_scaled.std(axis=0, ddof=1)[:, np.newaxis]
corr_loadings = pd.DataFrame(
    # A constant variable has sd 0; report 0 for it instead of dividing by 0
    np.divide(scaled_by_component, variable_sd,
              out=np.zeros_like(scaled_by_component),
              where=variable_sd > 0),
    columns=["PC1", "PC2"],
    index=X.columns
)

theta = np.linspace(0, 2 * np.pi, 300)
circle1 = go.Scatter(x=np.cos(theta), y=np.sin(theta),
                     mode="lines", name="r=1",
                     line=dict(color="black", dash="dash"))
circle05 = go.Scatter(x=0.5*np.cos(theta), y=0.5*np.sin(theta),
                      mode="lines", name="r=0.5",
                      line=dict(color="gray", dash="dot"))
points = go.Scatter(x=corr_loadings["PC1"], y=corr_loadings["PC2"],
                    mode="markers+text",
                    text=corr_loadings.index,
                    textposition="top center",
                    marker=dict(color="crimson", size=9),
                    name="Variables")

fig_corr = go.Figure([circle1, circle05, points])
fig_corr.update_layout(
    title="Correlation Loading Plot (PC1 vs PC2)",
    width=1200, height=700,
    # Lock x to y at a 1:1 ratio so the circles are drawn round. scaleratio
    # only takes effect on the axis that carries scaleanchor.
    xaxis=dict(title=pc_label("PC1"), scaleanchor="y", scaleratio=1),
    yaxis=dict(title=pc_label("PC2"))
)
fig_corr.show()

# ----------------------
# 4. Loading histograms (PC1, PC2, PC3)
# ----------------------
hist_pcs = ["PC1", "PC2", "PC3"]

# One row per input variable, one column per component
loading_df = pd.DataFrame(pca.components_[:3].T, index=X.columns, columns=hist_pcs)
# Turn the variable names into an ordinary "Variable" column for the x axis
loading_df = loading_df.reset_index().rename(columns={"index": "Variable"})

# One bar chart per component, bars annotated with the value to 2 decimals
for pc in hist_pcs:
    fig_load = px.bar(
        loading_df,
        x="Variable",
        y=pc,
        text_auto=".2f",
        title=f"Loadings Histogram for {pc_label(pc)}",
    )
    fig_load.update_layout(width=1200, height=700, xaxis_tickangle=-45)
    fig_load.show()

# ----------------------
# 5. Hotelling’s T² vs Q residuals (Outlier detection)
# ----------------------
# Number of components that make up the model. T² measures distance inside
# this subspace, Q measures what the subspace fails to reproduce.
n_retained = 2
scores = X_pca[:, :n_retained]

# T²: squared scores, each scaled by the spread of its component
T2 = np.sum((scores / np.std(scores, axis=0))**2, axis=1)

# Q: squared reconstruction error using ONLY the retained components.
# Reconstructing from every component (pca.inverse_transform(X_pca)) gives
# X_scaled back exactly, which would leave Q at floating-point noise.
X_reconstructed = scores @ pca.components_[:n_retained] + pca.mean_
Q = np.sum((X_scaled - X_reconstructed)**2, axis=1)

# .values keeps the meta columns positionally aligned with the T2/Q arrays,
# the same way the score plot's frame is assembled.
outlier_df = pd.DataFrame({"Hotelling_T2": T2, "Q_residuals": Q,
                           "Menu Items": meta["Menu Items"].values,
                           "Menu Category": meta["Menu Category"].values})

fig_outlier = px.scatter(
    outlier_df,
    x="Hotelling_T2", y="Q_residuals",
    color="Menu Category", text="Menu Items",
    title="Outlier Detection: Hotelling’s T² vs Q Residuals",
    labels={"Hotelling_T2": "Hotelling's T²", "Q_residuals": "Q Residuals"}
)
fig_outlier.update_traces(marker=dict(size=9, line=dict(width=0.5, color="DarkSlateGrey")),
                          textposition="top center")
fig_outlier.update_layout(width=1200, height=700)
fig_outlier.show()
