#### Mean squared error

$$
    \text{MSE}(\hat{\theta}) = \text{Var}_\theta(\hat{\theta})
    + \text{Bias}(\hat{\theta}, \theta)^2
$$

In [None]:
import numpy as np
import pandas as pd
import scipy

# Two samples of ten observations each.
s1 = np.asarray([85, 90, 78, 92, 88, 84, 89, 92, 85, 87])
s2 = np.asarray([92, 94, 89, 95, 91, 93, 91, 95, 90, 92])
s1, s2

In [None]:
# Arithmetic mean of each sample.
np.mean(s1), np.mean(s2)

In [None]:
# Sample standard deviations. NumPy's default ddof=0 divides by n (the
# population formula); ddof=1 divides by n - 1, which is the estimate that
# belongs with sample data.
s1.std(ddof=1), s2.std(ddof=1)

In [None]:
# Standard error of the difference between the two sample means:
#     sqrt(var1 / n1 + var2 / n2)
# using the unbiased (ddof=1) sample variances. NumPy's default ddof=0 would
# understate the standard error and inflate the t statistic.
std_err_difference = np.sqrt(s1.var(ddof=1) / len(s1) + s2.var(ddof=1) / len(s2))
std_err_difference

In [None]:
# t statistic: difference of the sample means measured in units of its
# standard error.
t_stat = (s1.mean() - s2.mean()) / std_err_difference
t_stat


In [None]:
import scipy.stats as stats

# Degrees of freedom n1 + n2 - 2 for the two-sample t-test.
df = len(s1) + len(s2) - 2
# Two-sided p-value. The survival function sf(x) = 1 - cdf(x) is evaluated
# directly, which avoids the cancellation in 1 - cdf(...) when |t| is large.
p_value = 2 * stats.t.sf(abs(t_stat), df)
df, p_value


In [None]:
# Build the data from the intended contingency table:
#     Red:  30 "Yes", 70 "No"      Blue: 50 "Yes", 150 "No"
# Permuting each column independently (np.random.permutation applied to
# "Color" and to "Accident" separately) does NOT preserve this table; the
# cell counts drift towards the expected frequencies under independence.
# Shuffling whole rows instead keeps every (Color, Accident) pair intact.
df = pd.DataFrame(
    {
        "Color": ["Red"] * 100 + ["Blue"] * 200,
        "Accident": ["Yes"] * 30 + ["No"] * 70 + ["Yes"] * 50 + ["No"] * 150,
    }
)
df = df.sample(frac=1).reset_index(drop=True)
df


In [None]:
# Cross-tabulate the observed counts: one row per "Color" value, one column
# per "Accident" value.
contingency_table = pd.crosstab(index=df["Color"], columns=df["Accident"])
contingency_table


In [None]:
# Conventional 5% significance level.
alpha = 0.05
# Pearson chi-square test of independence on the observed table;
# correction=False turns off Yates' continuity correction.
chi2stat, pvalue, dof, expected_frequency = scipy.stats.chi2_contingency(
    contingency_table, correction=False
)
# The last entry is True when independence is rejected at level alpha.
chi2stat, pvalue, dof, expected_frequency, pvalue <= alpha

In [None]:
# Recompute the chi-square statistic by hand as the sum over all cells of
# (observed - expected)^2 / expected. With the expected counts rounded to two
# decimals this is:
#     (30 - 26.67)^2 / 26.67 + (70 - 73.33)^2 / 73.33
#     + (50 - 53.33)^2 / 53.33 + (150 - 146.67)^2 / 146.67
for expected_count in expected_frequency.ravel():
    print(expected_count)

chi2 = (contingency_table.to_numpy() - expected_frequency) ** 2 / expected_frequency
chi2.sum()