In [None]:
import sys, subprocess, json, math, os
from pathlib import Path

def pip_install(pkgs):
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q"] + pkgs)

pip_install([
    "pygwalker>=0.4.9",
    "duckdb>=0.10.0",
    "pandas>=2.0.0",
    "numpy>=1.24.0",
    "seaborn>=0.13.0"
])

import numpy as np
import pandas as pd
import seaborn as sns

df_raw = sns.load_dataset("titanic").copy()
print("Raw shape:", df_raw.shape)
display(df_raw.head(3))

In [None]:
def make_safe_bucket(series, bins=None, labels=None, q=None, prefix="bucket"):
    s = pd.to_numeric(series, errors="coerce")
    if q is not None:
        try:
            cuts = pd.qcut(s, q=q, duplicates="drop")
            return cuts.astype("string").fillna("Unknown")
        except Exception:
            pass
    if bins is not None:
        cuts = pd.cut(s, bins=bins, labels=labels, include_lowest=True)
        return cuts.astype("string").fillna("Unknown")
    return s.astype("float64")

def preprocess_titanic_advanced(df):
    out = df.copy()
    out.columns = [c.strip().lower().replace(" ", "_") for c in out.columns]

    for c in ["survived", "pclass", "sibsp", "parch"]:
        if c in out.columns:
            out[c] = pd.to_numeric(out[c], errors="coerce").fillna(-1).astype("int64")

    if "age" in out.columns:
        out["age"] = pd.to_numeric(out["age"], errors="coerce").astype("float64")
        out["age_is_missing"] = out["age"].isna()
        out["age_bucket"] = make_safe_bucket(
            out["age"],
            bins=[0, 12, 18, 30, 45, 60, 120],
            labels=["child", "teen", "young_adult", "adult", "mid_age", "senior"],
        )

    if "fare" in out.columns:
        out["fare"] = pd.to_numeric(out["fare"], errors="coerce").astype("float64")
        out["fare_is_missing"] = out["fare"].isna()
        out["log_fare"] = np.log1p(out["fare"].fillna(0))
        out["fare_bucket"] = make_safe_bucket(out["fare"], q=8)

    for c in ["sex", "class", "who", "embarked", "alone", "adult_male"]:
        if c in out.columns:
            out[c] = out[c].astype("string").fillna("Unknown")

    if "cabin" in out.columns:
        out["deck"] = out["cabin"].astype("string").str.strip().str[0].fillna("Unknown")
        out["deck_is_missing"] = out["cabin"].isna()
    else:
        out["deck"] = "Unknown"
        out["deck_is_missing"] = True

    if "ticket" in out.columns:
        t = out["ticket"].astype("string")
        out["ticket_len"] = t.str.len().fillna(0).astype("int64")
        out["ticket_has_alpha"] = t.str.contains(r"[A-Za-z]", regex=True, na=False)
        out["ticket_prefix"] = t.str.extract(r"^([A-Za-z\.\/\s]+)", expand=False).fillna("None").str.strip()
        out["ticket_prefix"] = out["ticket_prefix"].replace("", "None").astype("string")

    if "sibsp" in out.columns and "parch" in out.columns:
        out["family_size"] = (out["sibsp"] + out["parch"] + 1).astype("int64")
        out["is_alone"] = (out["family_size"] == 1)

    if "name" in out.columns:
        title = out["name"].astype("string").str.extract(r",\s*([^\.]+)\.", expand=False).fillna("Unknown").str.strip()
        vc = title.value_counts(dropna=False)
        keep = set(vc[vc >= 15].index.tolist())
        out["title"] = title.where(title.isin(keep), other="Rare").astype("string")
    else:
        out["title"] = "Unknown"

    out["segment"] = (
        out["sex"].fillna("Unknown").astype("string")
        + " | "
        + out["class"].fillna("Unknown").astype("string")
        + " | "
        + out["age_bucket"].fillna("Unknown").astype("string")
    )

    for c in out.columns:
        if out[c].dtype == bool:
            out[c] = out[c].astype("int64")
        if out[c].dtype == "object":
            out[c] = out[c].astype("string")

    return out

df = preprocess_titanic_advanced(df_raw)
print("Prepped shape:", df.shape)
display(df.head(3))

In [None]:
def data_quality_report(df):
    rows = []
    n = len(df)
    for c in df.columns:
        s = df[c]
        miss = int(s.isna().sum())
        miss_pct = (miss / n * 100.0) if n else 0.0
        nunique = int(s.nunique(dropna=True))
        dtype = str(s.dtype)
        sample = s.dropna().head(3).tolist()
        rows.append({
            "col": c,
            "dtype": dtype,
            "missing": miss,
            "missing_%": round(miss_pct, 2),
            "nunique": nunique,
            "sample_values": sample
        })
    return pd.DataFrame(rows).sort_values(["missing", "nunique"], ascending=[False, False])

dq = data_quality_report(df)
display(dq.head(20))

RANDOM_SEED = 42
MAX_ROWS_FOR_UI = 200_000

df_for_ui = df
if len(df_for_ui) > MAX_ROWS_FOR_UI:
    df_for_ui = df_for_ui.sample(MAX_ROWS_FOR_UI, random_state=RANDOM_SEED).reset_index(drop=True)

agg = (
    df.groupby(["segment", "deck", "embarked"], dropna=False)
      .agg(
          n=("survived", "size"),
          survival_rate=("survived", "mean"),
          avg_fare=("fare", "mean"),
          avg_age=("age", "mean"),
      )
      .reset_index()
)

for c in ["survival_rate", "avg_fare", "avg_age"]:
    agg[c] = agg[c].astype("float64")

Path("/content").mkdir(parents=True, exist_ok=True)
df_for_ui.to_csv("/content/titanic_prepped_for_ui.csv", index=False)
agg.to_csv("/content/titanic_agg_segment_deck_embarked.csv", index=False)

In [None]:
import pygwalker as pyg

SPEC_PATH = Path("/content/pygwalker_spec_titanic.json")

def load_spec(path):
    if path.exists():
        try:
            return json.loads(path.read_text())
        except Exception:
            return None
    return None

def save_spec(path, spec_obj):
    try:
        if isinstance(spec_obj, str):
            spec_obj = json.loads(spec_obj)
        path.write_text(json.dumps(spec_obj, indent=2))
        return True
    except Exception:
        return False

def launch_pygwalker(df, spec_path):
    spec = load_spec(spec_path)
    kwargs = {}
    if spec is not None:
        kwargs["spec"] = spec

    try:
        walker = pyg.walk(df, use_kernel_calc=True, **kwargs)
    except TypeError:
        walker = pyg.walk(df, **kwargs) if spec is not None else pyg.walk(df)

    captured = None
    for attr in ["spec", "_spec"]:
        if hasattr(walker, attr):
            try:
                captured = getattr(walker, attr)
                break
            except Exception:
                pass
    for meth in ["to_spec", "export_spec", "get_spec"]:
        if captured is None and hasattr(walker, meth):
            try:
                captured = getattr(walker, meth)()
                break
            except Exception:
                pass

    if captured is not None:
        save_spec(spec_path, captured)

    return walker

walker_rows = launch_pygwalker(df_for_ui, SPEC_PATH)
walker_agg = pyg.walk(agg)

In [2]:
HTML_PATH = Path("/content/pygwalker_titanic_dashboard.html")

def export_html_best_effort(df, spec_path, out_path):
    spec = load_spec(spec_path)
    html = None

    try:
        html = pyg.walk(df, spec=spec, return_html=True) if spec is not None else pyg.walk(df, return_html=True)
    except Exception:
        html = None

    if html is None:
        for fn in ["to_html", "export_html"]:
            if hasattr(pyg, fn):
                try:
                    f = getattr(pyg, fn)
                    html = f(df, spec=spec) if spec is not None else f(df)
                    break
                except Exception:
                    continue

    if html is None:
        return None

    if not isinstance(html, str):
        html = str(html)

    out_path.write_text(html, encoding="utf-8")
    return out_path

export_html_best_effort(df_for_ui, SPEC_PATH, HTML_PATH)

Raw shape: (891, 15)


Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True


Prepped shape: (891, 25)


Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,...,age_is_missing,age_bucket,fare_is_missing,log_fare,fare_bucket,deck_is_missing,family_size,is_alone,title,segment
0,0,3,male,22.0,1,0,7.25,S,Third,man,...,0,young_adult,0,2.110213,"(-0.001, 7.75]",1,2,0,Unknown,male | Third | young_adult
1,1,1,female,38.0,1,0,71.2833,C,First,woman,...,0,adult,0,4.280593,"(69.488, 512.329]",1,2,0,Unknown,female | First | adult
2,1,3,female,26.0,0,0,7.925,S,Third,woman,...,0,young_adult,0,2.188856,"(7.91, 9.841]",1,1,1,Unknown,female | Third | young_adult


Data quality report (top 20):


Unnamed: 0,col,dtype,missing,missing_%,nunique,sample_values
3,age,float64,177,19.87,88,"[22.0, 38.0, 26.0]"
12,embark_town,string,2,0.22,3,"[Southampton, Cherbourg, Southampton]"
6,fare,float64,0,0.0,248,"[7.25, 71.2833, 7.925]"
18,log_fare,float64,0,0.0,248,"[2.1102132003465894, 4.2805931204649, 2.188856..."
24,segment,string,0,0.0,41,"[male | Third | young_adult, female | First | ..."
21,family_size,int64,0,0.0,9,"[2, 2, 1]"
19,fare_bucket,string,0,0.0,8,"[(-0.001, 7.75], (69.488, 512.329], (7.91, 9.8..."
4,sibsp,int64,0,0.0,7,"[1, 1, 0]"
5,parch,int64,0,0.0,7,"[0, 0, 0]"
16,age_bucket,string,0,0.0,7,"[young_adult, adult, young_adult]"


Agg table shape: (85, 7)


Unnamed: 0,segment,deck,embarked,n,survival_rate,avg_fare,avg_age
0,female | First | Unknown,Unknown,C,5,1.0,101.57582,
1,female | First | Unknown,Unknown,S,4,1.0,73.128125,
2,female | First | adult,Unknown,C,13,1.0,147.123392,38.384615
3,female | First | adult,Unknown,Q,1,1.0,90.0,33.0
4,female | First | adult,Unknown,S,15,1.0,104.99278,36.666667


Saved: /content/titanic_prepped_for_ui.csv /content/titanic_agg_segment_deck_embarked.csv

--- Launching PyGWalker on PREPPED ROW-LEVEL table ---



Box(children=(HTML(value='\n<div id="ifr-pyg-0006496339579967Sz10b3EPNy6GlYCQ" style="height: auto">\n    <hea…

Spec capture found but could not be saved (API mismatch).

--- Launching PyGWalker on AGGREGATED table (rate exploration) ---



Box(children=(HTML(value='\n<div id="ifr-pyg-00064963395bbc16dZo6u5tMUjY3cHip" style="height: auto">\n    <hea…

Box(children=(HTML(value='\n<div id="ifr-pyg-00064963395f7bedsZrHpMR7zGbDg42C" style="height: auto">\n    <hea…

Exported HTML -> /content/pygwalker_titanic_dashboard.html

ADVANCED UI CHECKLIST (do these inside PyGWalker):

A) Missingness-as-signal (often overlooked)
   - Plot: survived (color) vs age (x) vs log_fare (y)
   - Add filter: age_is_missing, deck_is_missing
   - Question: do missing ages/decks correlate with survival?

B) Segment survival "heatmaps" without coding
   - Use the AGG table:
     * X: deck, Y: embarked, Color: survival_rate, Size: n
     * Filter segment (sex|class|age_bucket) to compare cohorts instantly

C) Title effects (social structure proxy)
   - Bar: title (x) vs mean(survived) (y), sorted descending
   - Add pclass filter to see if title still matters within class

D) Family size non-linearity
   - X: family_size, Y: mean(survived), with smoothing if available
   - Check if '2-4' family sizes outperform 'alone' and 'very large'

E) Ticket prefix as hidden stratifier
   - Bar: ticket_prefix (x) vs survival rate (mean survived)
   - Filter to top prefixes using n t