# Simpson's paradox

Example reproduced from http://www.degeneratestate.org/posts/2017/Oct/22/generating-examples-of-simpsons-paradox/

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

%matplotlib inline

import utils as u
from IPython.core.display import HTML


In [None]:

def generate_p_values(n_subgroups):
    """
    Generates a set of conditional probabilities that obey

     - p(Y=1|x=1, z) > p(Y=1|x=0, z) for all z
     - p(Y=1|x, z=j) > p(Y=1|x, z=k) when j > k, for each x
     - p(Y=1|x=1, z=0) < p(Y=1|x=0, z=n) where n = max(z)
       (only meaningful when n_subgroups >= 2)

    The unit interval is cut into 2 * n_subgroups equal slots and one value
    is drawn uniformly inside each slot, so the drawn values are increasing.
    Consecutive pairs of values are then assigned to (x=0, z=i), (x=1, z=i).

    :param n_subgroups: int. The number of values $Z$ can take.
    :return: dictionary, p[x][z] = p(Y=1|x,z)
    """
    # Imported here because the notebook's import cell does not bring
    # defaultdict into scope; without this the function raises NameError.
    from collections import defaultdict

    p = defaultdict(dict)

    n_slots = 2 * n_subgroups
    offsets = np.random.uniform(0, 1, size=n_slots)
    # Slot k covers [k / n_slots, (k + 1) / n_slots); the offset places the
    # value somewhere inside its own slot, which keeps the sequence ordered.
    boundaries = [(k + offset) / n_slots for k, offset in enumerate(offsets)]

    for z in range(n_subgroups):
        p[0][z] = boundaries[2 * z]
        p[1][z] = boundaries[2 * z + 1]

    return p

In [None]:
def get_q_weights(ps, target):
    r"""
    Generates a mixture of the values in ps which is the solution to

    \sum_{i} p[i]q[i] = target

    The returned weights sum to 1. With exactly two values the solution is
    unique; with more values the last value is peeled off, a random
    intermediate target is drawn for the remaining values, and the problem
    is solved recursively.

    The two-value case divides by (p1 - p0), so those two values must differ.
    NOTE(review): the weights appear to land in [0, 1] only when ps is sorted
    ascending and target lies between ps[0] and ps[-1] -- confirm with callers.

    :param ps: list of number
    :param target: goal of the sum
    :return: qs: list of weightings of ps (always a list, same length as ps)
    :raises ValueError: if ps has fewer than two elements
    """
    if len(ps) <= 1:
        raise ValueError("ps cannot be shorter than 2")

    if len(ps) == 2:
        p0, p1 = ps
        q0 = (p1 - target) / (p1 - p0)
        # Return a list (not a tuple) so every code path has the same type.
        return [q0, 1 - q0]

    rest, last = ps[:-1], ps[-1]
    # The intermediate target must be reachable by `rest` and must not exceed
    # `target`, otherwise the weight on `last` would have to be negative.
    mid_target = np.random.uniform(low=rest[0], high=min(target, rest[-1]))

    q_rest, q_last = get_q_weights([mid_target, last], target)
    inner_qs = get_q_weights(rest, mid_target)

    # Scale the inner mixture by the share given to `rest` as a whole.
    return [q_rest * q for q in inner_qs] + [q_last]

In [None]:
def generate_gaussian_simpsons_paradox(n_subgroups=3, n_samples=1000):
    """
    Draws a 2-D Gaussian mixture intended to illustrate Simpson's paradox.

    Subgroup means are sampled from a Gaussian with strong positive x-y
    correlation (0.9), while every subgroup has unit variances and a negative
    x-y covariance drawn uniformly from [-0.8, -0.2]. The pooled data
    therefore tends to trend upwards while each subgroup trends downwards;
    because everything is random this is likely, not guaranteed.

    :param n_subgroups: int. Number of mixture components (values of z).
    :param n_samples: int. Total number of rows in the returned frame.
    :return: pandas DataFrame with columns "x", "y" and the subgroup label
        "z" (0 .. n_subgroups - 1), exactly n_samples rows, unique index.
    :raises ValueError: if n_subgroups is smaller than 1
    """
    if n_subgroups < 1:
        raise ValueError("n_subgroups must be at least 1")

    overall_cov = 3 * np.array([[1, 0.9], [0.9, 1]])
    means = np.random.multivariate_normal(
        mean=[0, 0], cov=overall_cov, size=n_subgroups
    )

    weights = np.random.uniform(size=n_subgroups)
    weights /= np.sum(weights)

    strengths = [np.random.uniform(0.2, 0.8) for _ in range(n_subgroups)]
    covs = [np.array([[1, -c], [-c, 1]]) for c in strengths]

    # Round the cumulative shares rather than each share on its own, so the
    # subgroup sizes always add up to exactly n_samples.
    bounds = np.round(np.cumsum(weights) * n_samples).astype(int)
    bounds = np.minimum(bounds, n_samples)
    bounds[-1] = n_samples
    counts = np.diff(bounds, prepend=0)

    samples = []
    for sg, (mean, cov, count) in enumerate(zip(means, covs, counts)):
        sample = np.random.multivariate_normal(mean=mean, cov=cov, size=int(count))
        sample = pd.DataFrame(sample, columns=["x", "y"])
        sample["z"] = sg
        samples.append(sample)

    # ignore_index avoids repeating each subgroup's 0..n-1 labels in the result.
    return pd.concat(samples, ignore_index=True)

In [None]:
# Draw a fresh dataset, then compare the pooled x-y covariance with the
# x-y covariance measured inside each subgroup.
df = generate_gaussian_simpsons_paradox()

xy = ["x", "y"]
pooled_cov = df[xy].cov().iloc[0, 1]
print("Total Covariance: {:.3f}".format(pooled_cov))

for label in df["z"].unique():
    members = df.loc[df["z"] == label, xy]
    print("Subgroup {} covariance: {:.3f}".format(label, members.cov().iloc[0, 1]))

In [None]:
# Pooled view: one regression line fitted through every sample, ignoring z.
sns.regplot(x="x", y="y", data=df);

In [None]:
# Per-subgroup view: one regression line per value of z, all drawn on the
# same axes with a fixed window of [-6, 6] on both axes.
fig, ax = plt.subplots()
ax.set_xlim(-6, 6)
ax.set_ylim(-6, 6)

for label in df["z"].unique():
    subgroup = df.loc[df["z"] == label]
    sns.regplot(x="x", y="y", data=subgroup, ax=ax)