In [None]:
import pandas as pd
import seaborn as sns
from scipy import stats as st

## Загружаем данные

In [None]:
# Read Cowles.csv from GitHub into a DataFrame.
csv_url = "https://raw.githubusercontent.com/allatambov/PyPerm24/main/Cowles.csv"
data = pd.read_csv(csv_url)

## Числовые данные: сравниваем распределения и средние в двух группах

In [None]:
# Group sizes for both categorical columns, separated by an empty line.
sex_counts = data["sex"].value_counts()
volunteer_counts = data["volunteer"].value_counts()
print(sex_counts, volunteer_counts, sep="\n\n")

In [None]:
# Descriptive statistics of extraversion within each sex.
extraversion_by_sex = data.groupby("sex")["extraversion"]
extraversion_by_sex.describe()

In [None]:
# Descriptive statistics of extraversion within each volunteer group.
extraversion_by_volunteer = data.groupby("volunteer")["extraversion"]
extraversion_by_volunteer.describe()

In [None]:
# Switch seaborn to the "whitegrid" theme.
# Docs: https://seaborn.pydata.org/generated/seaborn.set_theme.html
theme_options = {"style": "whitegrid"}
sns.set_theme(**theme_options)

In [None]:
# Histogram of extraversion for the whole sample.
# Docs: https://seaborn.pydata.org/generated/seaborn.histplot.html
sns.histplot(data=data, x="extraversion");

In [None]:
# Same histogram, with volunteer status mapped to hue.
sns.histplot(data=data, x="extraversion", hue="volunteer");

In [None]:
# Kernel density estimate of extraversion with the volunteer groups stacked.
kde_options = {"x": "extraversion", "hue": "volunteer", "multiple": "stack"}
sns.kdeplot(data=data, **kde_options);

In [None]:
# Extraversion histograms drawn in a separate column for each volunteer group.
# Docs: https://seaborn.pydata.org/generated/seaborn.displot.html#seaborn.displot
sns.displot(data=data, x="extraversion", hue="volunteer", col="volunteer");

In [None]:
# Density version of the faceted plot: one KDE column per volunteer group.
facet_kde_options = dict(
    x="extraversion",
    hue="volunteer",
    col="volunteer",
    kind="kde",
    multiple="stack",
)
sns.displot(data=data, **facet_kde_options);

In [None]:
# Ingredients of a confidence interval for the mean: group size, mean and
# standard deviation of extraversion in each volunteer group.
summary_stats = ["count", "mean", "std"]
tab = data.groupby("volunteer")["extraversion"].agg(summary_stats)
tab

In [None]:
# Pull size, mean and standard deviation out of the summary table:
# suffix 1 is the "no" group, suffix 2 is the "yes" group.
n1, mean1, s1 = (tab.at["no", stat] for stat in ("count", "mean", "std"))
n2, mean2, s2 = (tab.at["yes", stat] for stat in ("count", "mean", "std"))

# Standard error of each group mean: s / sqrt(n).
se1 = s1 / (n1 ** 0.5)
se2 = s2 / (n2 ** 0.5)

# Report every group as "mean ± standard error", rounded to two decimals.
for group_mean, group_se in ((mean1, se1), (mean2, se2)):
    print(round(group_mean, 2), "±", round(group_se, 2))

In [None]:
# 0.95-level t intervals for the mean extraversion in the "no" and "yes" groups.
ci_no = st.t.interval(0.95, df=n1 - 1, loc=mean1, scale=se1)
ci_yes = st.t.interval(0.95, df=n2 - 1, loc=mean2, scale=se2)
print(ci_no)
print(ci_yes)

In [None]:
# Split extraversion into two samples by volunteer status; a reminder about
# the t-test variants follows.
one = data.loc[data["volunteer"] == "no", "extraversion"]
two = data.loc[data["volunteer"] == "yes", "extraversion"]

In [None]:
# Independent two-sample t-test of non-volunteers against volunteers.
# equal_var=True is SciPy's default, spelled out to show which variant runs.
st.ttest_ind(one, two, equal_var=True)

In [None]:
# One-sample t-test: compare the mean of `one` with a hypothesized value of 12.
hypothesized_mean = 12
st.ttest_1samp(one, popmean=hypothesized_mean)

## Качественные данные: сравниваем частоты и доли в двух группах

In [None]:
# Counts of volunteer answers within each sex.
volunteer_by_sex = data.groupby("sex")["volunteer"]
volunteer_by_sex.value_counts()

In [None]:
# Counts of volunteer answers within each sex, as a one-column DataFrame.
# The column is named explicitly: pandas >= 2.0 names the value_counts()
# result "count" instead of "volunteer", which would turn the later rename
# of "volunteer" into "counts" into a silent no-op and break the bar plot.
tab = data.groupby("sex")["volunteer"].value_counts().to_frame(name="volunteer")
tab

In [None]:
# Rename the "volunteer" column to "counts" (rebinding instead of inplace).
tab = tab.rename(columns={"volunteer": "counts"})
tab

In [None]:
# Turn both index levels of the table into ordinary columns.
fin = tab.reset_index()
fin

In [None]:
# Bar chart of counts per volunteer answer, with sex mapped to hue.
# Docs: https://seaborn.pydata.org/generated/seaborn.barplot.html
sex_palette = ["salmon", "steelblue"]
sns.barplot(data=fin, x="volunteer", y="counts", hue="sex", palette=sex_palette);

In [None]:
# Number of rows for each sex.
sex_column = data["sex"]
sex_column.value_counts()

In [None]:
# Sample sizes per sex, taken from a single value_counts() call.
sex_counts = data["sex"].value_counts()
n_female, n_male = sex_counts["female"], sex_counts["male"]
print(n_female, n_male)

In [None]:
# Absolute frequencies of volunteer answers within each sex.
data.groupby(by="sex")["volunteer"].value_counts(normalize=False)

In [None]:
# Relative frequencies of volunteer answers within each sex.
volunteer_by_sex = data.groupby("sex")["volunteer"]
volunteer_by_sex.value_counts(normalize=True)

In [None]:
# Keep the within-sex proportions under a name for later lookups.
volunteer_by_sex = data.groupby("sex")["volunteer"]
res = volunteer_by_sex.value_counts(normalize=True)

In [None]:
# Show the index of the proportions Series.
res_index = res.index
print(res_index)

In [None]:
# Values of the outer index level, addressed by name instead of position 0.
print(res.index.get_level_values("sex"))

In [None]:
# Values of the inner index level, addressed by name instead of position 1.
print(res.index.get_level_values("volunteer"))

In [None]:
# Scalar lookup with a full (sex, volunteer) key.
female_yes = ("female", "yes")
res.loc[female_yes]

In [None]:
# The same value through two successive lookups: first sex, then answer.
female_shares = res["female"]
female_shares["yes"]

In [None]:
# Cross-section: all proportions for women.
res.xs("female", level="sex")

In [None]:
# Cross-section: the proportion of "yes" answers for each sex.
res.xs("yes", level="volunteer")

In [None]:
# Proportion of "yes" answers per sex, scaled to percent.
yes_share = res.xs("yes", level="volunteer")
yes_share * 100

In [None]:
# Percentages of "yes" answers per sex as a DataFrame with sex as a column.
yes_percent = res.xs("yes", level="volunteer") * 100
yes_percent.reset_index()

In [None]:
# Proportion of "yes" answers among women and among men.
p_female = res.loc[("female", "yes")]
p_male = res.loc[("male", "yes")]
print(p_female, p_male)

In [None]:
# Standard error of a proportion: sqrt(p * (1 - p) / n).
var_female = p_female * (1 - p_female) / n_female
var_male = p_male * (1 - p_male) / n_male
se_female = var_female ** 0.5
se_male = var_male ** 0.5
print(se_female, se_male)

In [None]:
# Report each proportion as "p ± standard error", rounded to two decimals.
for share, share_se in ((p_female, se_female), (p_male, se_male)):
    print(round(share, 2), "±", round(share_se, 2))

In [None]:
# 0.90-level normal-approximation intervals for the two proportions.
ci_female = st.norm.interval(0.90, loc=p_female, scale=se_female)
ci_male = st.norm.interval(0.90, loc=p_male, scale=se_male)
print(ci_female)
print(ci_male)

In [None]:
# Exact binomial test: is the share of "yes" answers among women equal to 0.4?
# scipy.stats.binom_test was deprecated in SciPy 1.10 and removed in 1.12.
# Its replacement, binomtest, returns a result object, so .pvalue is taken
# to keep the cell's output a bare p-value as before.
# The number of successes is counted from the data instead of being typed in
# by hand (the original cell hard-coded 349).
k_female = int(((data["sex"] == "female") & (data["volunteer"] == "yes")).sum())
st.binomtest(k_female, int(n_female), p=0.4).pvalue

In [None]:
from statsmodels.stats.proportion import proportions_ztest

In [None]:
# Two-sample z-test comparing the share of "yes" answers among women and men.
# The success counts are derived from the data rather than hard-coded (the
# original cell typed in 349 and 248), so they always agree with nobs.
yes_by_sex = data.loc[data["volunteer"] == "yes", "sex"].value_counts()
proportions_ztest(count=[yes_by_sex["female"], yes_by_sex["male"]],
                  nobs=[n_female, n_male])