In [9]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [10]:
# ======================================================
# Phase 1: The Danger of Randomness (Manual Split)
# ======================================================
# A hand-rolled shuffle-and-slice 80/20 split, followed by a comparison of
# the mean of 'survived' in each partition.

# --- Step 1: load the full dataset (the "population") ---
df = sns.load_dataset('titanic')
print(f"Total Population: {len(df)}")
print(f"Population Survival Rate: {df['survived'].mean():.4f}")

# --- Step 2: shuffle the row positions ---
# The seed is pinned purely so the resulting imbalance is reproducible and
# can be discussed; an unseeded shuffle would give a different gap each run.
np.random.seed(2026)
indices = np.random.permutation(len(df))

# --- Step 3: cut the shuffled positions 80/20 ---
split_point = int(len(indices) * 0.8)

# Everything before the cut goes to train, the remainder to test.
train_idx, test_idx = np.split(indices, [split_point])

# Materialise both partitions by positional lookup.
train_set, test_set = df.iloc[train_idx], df.iloc[test_idx]

# --- Step 4: compare the survival rate across partitions ---
train_surv, test_surv = (
    part['survived'].mean() for part in (train_set, test_set)
)
# Absolute gap between the two partitions' survival rates.
delta = abs(train_surv - test_surv)

print(f"Train Survival Rate: {train_surv:.4f}")
print(f"Test Survival Rate:  {test_surv:.4f}")
print(f"Sampling Bias (Delta): {delta:.4f}")

Total Population: 891
Population Survival Rate: 0.3838
Train Survival Rate: 0.3736
Test Survival Rate:  0.4246
Sampling Bias (Delta): 0.0510


In [12]:
from sklearn.model_selection import train_test_split

# ------------------------------------------------------
# Phase 2: Stratification (The Fix)
# ------------------------------------------------------
# Passing the 'pclass' column as `stratify` asks train_test_split to keep
# the passenger-class proportions the same in both partitions.
# NOTE(review): this balances 'pclass' only; the 'survived' gap measured in
# Phase 1 is not what is being stratified on here — confirm that is intended.
X_train, X_test = train_test_split(
    df,
    stratify=df['pclass'],
    random_state=2026,
    test_size=0.2,
)

# Share of each passenger class within each partition.
train_class_dist = X_train['pclass'].value_counts(normalize=True)
test_class_dist = X_test['pclass'].value_counts(normalize=True)

print("\n--- Stratified Split ---")
print("Train Class Dist:\n", train_class_dist)
print("Test Class Dist:\n", test_class_dist)




--- Stratified Split ---
Train Class Dist:
 pclass
3    0.550562
1    0.242978
2    0.206461
Name: proportion, dtype: float64
Test Class Dist:
 pclass
3    0.553073
1    0.240223
2    0.206704
Name: proportion, dtype: float64


In [13]:
from scipy.stats import chisquare

# ------------------------------------------------------
# SRM Forensic Check
# ------------------------------------------------------
# A Sample Ratio Mismatch (SRM) check compares the observed group sizes with
# the sizes the intended allocation would have produced for the same total,
# using a chi-square goodness-of-fit test.

# Significance threshold below which the split is flagged as an SRM.
SRM_ALPHA = 0.01


def expected_counts(observed_counts, allocation=(0.5, 0.5)):
    """Return the group sizes implied by `allocation` for the observed total.

    Deriving the expected counts from the observed total keeps the two in
    sync: the chi-square goodness-of-fit test is only meaningful when both
    vectors sum to the same number.

    Parameters
    ----------
    observed_counts : sequence of numbers
        Observed number of units in each group.
    allocation : sequence of numbers, optional
        Intended relative weight of each group. Weights are normalised, so
        ``(0.5, 0.5)``, ``(1, 1)`` and ``(50, 50)`` all mean an even split.

    Returns
    -------
    list of float
        Expected count per group, in the same order as `observed_counts`.

    Raises
    ------
    ValueError
        If the two sequences differ in length or the weights do not sum to
        a positive number.
    """
    if len(observed_counts) != len(allocation):
        raise ValueError("observed_counts and allocation must have the same length")
    weight_sum = sum(allocation)
    if weight_sum <= 0:
        raise ValueError("allocation weights must sum to a positive number")
    total = sum(observed_counts)
    return [total * weight / weight_sum for weight in allocation]


# Observed counts from the experiment
observed = [450, 550]  # Control, Treatment

# Expected counts under a true 50/50 split of the observed total
expected = expected_counts(observed, allocation=(0.5, 0.5))

# Chi-Square test
chi2_stat, p_value = chisquare(f_obs=observed, f_exp=expected)

print(f"Chi-Square Statistic: {chi2_stat:.4f}")
print(f"P-value: {p_value:.6f}")

# Decision rule
if p_value < SRM_ALPHA:
    print("CRITICAL FAILURE: Sample Ratio Mismatch (SRM) Detected. Check Load Balancer.")
else:
    print("Variance is within natural limits.")


Chi-Square Statistic: 10.0000
P-value: 0.001565
CRITICAL FAILURE: Sample Ratio Mismatch (SRM) Detected. Check Load Balancer.
