In [1]:
import pandas as pd
from scipy.stats import f_oneway
from scipy.stats import bartlett
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm
from statsmodels.stats.multicomp import pairwise_tukeyhsd

In [2]:
# Read the bike dataset from the CSV file and preview its first two rows.
df = pd.read_csv(filepath_or_buffer="./data/bike.csv")
df.head(n=2)

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40


In [3]:
# List the distinct season codes; season is the grouping factor in the tests below.
pd.unique(df["season"])

array([1, 2, 3, 4], dtype=int64)

In [5]:
# One-way ANOVA on "registered", with one sample per season code (1 through 4).
f_oneway(
    *(
        df.loc[df["season"] == season_code, "registered"]
        for season_code in (1, 2, 3, 4)
    )
)

F_onewayResult(statistic=167.97539126005708, pvalue=1.8882994650328087e-106)

In [6]:
# Two specifications of the same response variable:
#   formula1 wraps season in C(), so it enters the model as a categorical factor;
#   formula2 uses season as written, i.e. as a single numeric term.
formula1, formula2 = "registered ~ C(season)", "registered ~ season"

In [7]:
# Fit one ordinary least squares model per formula, both on the same frame.
model1 = ols(formula=formula1, data=df).fit()
model2 = ols(formula=formula2, data=df).fit()

In [8]:
# ANOVA table for model1 (season as a categorical factor); typ=1 is the default type.
anova_lm(model1, typ=1)

Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
C(season),3.0,10990190.0,3663398.0,167.975391,1.8882990000000002e-106
Residual,10882.0,237327000.0,21809.14,,


In [9]:
# ANOVA table for model2 (season as a numeric term); typ=1 is the default type.
anova_lm(model2, typ=1)

Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
season,1.0,6679598.0,6679598.0,300.866825,1.665411e-66
Residual,10884.0,241637600.0,22201.18,,


In [11]:
# Tukey HSD post-hoc test: pairwise comparison of mean "registered" between seasons.
# alpha=0.05 is the library default, stated explicitly here.
print(
    pairwise_tukeyhsd(
        endog=df["registered"],
        groups=df["season"],
        alpha=0.05,
    )
)

 Multiple Comparison of Means - Tukey HSD, FWER=0.05 
group1 group2 meandiff p-adj   lower    upper  reject
-----------------------------------------------------
     1      2  66.9509  0.001  56.6414 77.2605   True
     1      3  81.3432  0.001  71.0336 91.6527   True
     1      4  69.5538  0.001  59.2451 79.8624   True
     2      3  14.3922 0.0018   4.1275  24.657   True
     2      4   2.6029    0.9   -7.661 12.8667  False
     3      4 -11.7894 0.0167 -22.0532 -1.5256   True
-----------------------------------------------------


In [13]:
# Test for equal variances (Bartlett's test) of "registered" across the four seasons.
bartlett(
    *(
        df.loc[df["season"] == season_code, "registered"]
        for season_code in (1, 2, 3, 4)
    )
)

BartlettResult(statistic=527.1434314603279, pvalue=6.251500440844118e-114)