<a href="https://colab.research.google.com/github/cassiecinzori/ECON3916/blob/main/Labs/Lab6/The_Architecture_of_Bias.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# The Architecture of Bias

### Cassandra Cinzori

## Phase 1: The Danger of Randomness (Manual Split)

#### Step 1: Ingestion and Manual Shuffling

In [None]:
import seaborn as sns
import pandas as pd

import numpy as np

# 1. Data Ingestion (The Population)
# The full Titanic table is treated as the "population" that every later
# split is compared against.
df = sns.load_dataset('titanic')
population_size = len(df)
population_survival = df['survived'].mean()
print(f"Total Population: {population_size}")
print(f"Population Survival Rate: {population_survival:.4f}")

# 2. Manual Shuffle (Simulation of Sampling)
# The seed is fixed so the lesson is reproducible; without it, every run
# would draw a different ordering (and therefore a different split).
np.random.seed(2026)
# A random ordering of the row positions 0..population_size-1.
indices = np.random.permutation(population_size)

#### Step 2: The Split and The Bias Check

In [None]:
# 3. Cut the deck (80/20 Split)
# The first 80% of the shuffled positions become the training rows and the
# remainder the test rows.
split_point = int(0.8 * len(df))
train_idx, test_idx = np.split(indices, [split_point])

# Materialise the two subsets by row position.
train_set, test_set = df.iloc[train_idx], df.iloc[test_idx]

# 4. Bias Check (The Delta)
# Compare the survival rate in each subset; the absolute gap is how far the
# random split pulled the two samples apart.
train_surv, test_surv = (
    subset['survived'].mean() for subset in (train_set, test_set)
)
delta = abs(train_surv - test_surv)

print(f"Train Survival Rate: {train_surv:.4f}")
print(f"Test Survival Rate:  {test_surv:.4f}")
print(f"Sampling Bias (Delta): {delta:.4f}")

## Phase 2: Stratification (The Fix)

#### Step 3: Fixing Covariate Shift

In [None]:
from sklearn.model_selection import train_test_split

# Stratifying on 'pclass' asks the splitter to keep the passenger-class
# proportions matched between the train and test partitions.
X_train, X_test = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['pclass'],
)

# Share of each passenger class within each partition, for side-by-side comparison.
train_class_dist = X_train['pclass'].value_counts(normalize=True)
test_class_dist = X_test['pclass'].value_counts(normalize=True)

print("\n--- Stratified Split ---")
print("Train Class Dist:\n", train_class_dist)
print("Test Class Dist:\n", test_class_dist)

#### Step 4: The SRM Diagnostic (Forensics)

In [None]:
from scipy.stats import chisquare

# SRM forensics: check whether the observed A/B assignment counts are
# consistent with the designed traffic split, using a chi-square
# goodness-of-fit test.

# Observed counts from the A/B test
observed = [450, 550]

# Expected counts under a perfect even split. They are derived from the
# observed total (1000 users here) rather than typed in by hand, so the two
# lists cannot drift apart if the observed counts are edited.
total_users = sum(observed)
expected = [total_users / len(observed)] * len(observed)

# Significance level below which the imbalance is flagged as SRM.
SRM_ALPHA = 0.01

chi2_stat, p_value = chisquare(f_obs=observed, f_exp=expected)

print(f"\nChi-Square Statistic: {chi2_stat:.4f}")
print(f"P-Value: {p_value:.4f}")

if p_value < SRM_ALPHA:
    print("CRITICAL FAILURE: Sample Ratio Mismatch (SRM) Detected. Check Load Balancer.")
else:
    print("Variance is within natural limits.")