# Two-way ANOVA

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## Example 1: Store Quality of Service based on a Day of Week & Store

In [3]:
# Below data shows Customer satisfaction score on the Quality of Service 
# from each store on a given day

raw_data  = [['Day','Store-A','Store-B','Store-C','Store-D','Store-E'], [1,79, 81, 74, 77, 66],\
           [2, 78, 86, 89, 97, 86], [3, 81, 87, 84, 94, 82], [4, 80, 83, 81, 88, 83], [5, 70, 74, 77, 89, 68]]

cols = raw_data.pop(0)

store_df = pd.DataFrame(raw_data, columns=cols)
store_df

Unnamed: 0,Day,Store-A,Store-B,Store-C,Store-D,Store-E
0,1,79,81,74,77,66
1,2,78,86,89,97,86
2,3,81,87,84,94,82
3,4,80,83,81,88,83
4,5,70,74,77,89,68


In [9]:
day_values = store_df['Day'].values
A_values = store_df['Store-A'].values
B_values = store_df['Store-B'].values
C_values = store_df['Store-C'].values
D_values = store_df['Store-D'].values
E_values = store_df['Store-E'].values

ValueError: Wrong number of items passed 5, placement implies 1

In [10]:
# Splitting the data
df_a = pd.DataFrame({'Day': day_values,
                      'Store': 'A',
                      'QoS': A_values})
df_b = pd.DataFrame({'Day': day_values,
                    'Store': 'B',
                    'QoS': B_values})
df_c = pd.DataFrame({'Day': day_values,
                    'Store': 'C',
                    'QoS': C_values})
df_d = pd.DataFrame({'Day': day_values,
                     'Store': 'D',
                     'QoS': D_values})

df_e = pd.DataFrame({'Day': day_values,
                    'Store': "E",
                    "QoS": E_values})

df_a, df_b

(   Day Store  QoS
 0    1     A   79
 1    2     A   78
 2    3     A   81
 3    4     A   80
 4    5     A   70,
    Day Store  QoS
 0    1     B   81
 1    2     B   86
 2    3     B   87
 3    4     B   83
 4    5     B   74)

In [12]:
# Appending all the above individual store values to DF
QoS_df = pd.DataFrame()

QoS_df = QoS_df.append(df_a)
QoS_df = QoS_df.append(df_b)
QoS_df = QoS_df.append(df_c)
QoS_df = QoS_df.append(df_d)
QoS_df = QoS_df.append(df_e)

QoS_df

Unnamed: 0,Day,Store,QoS
0,1,A,79
1,2,A,78
2,3,A,81
3,4,A,80
4,5,A,70
0,1,B,81
1,2,B,86
2,3,B,87
3,4,B,83
4,5,B,74


In [13]:
import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm

In [14]:
formula = 'QoS ~ C(Day) + C(Store)'
model_ols = ols(formula=formula, data=QoS_df).fit()
model_ols.summary()

0,1,2,3
Dep. Variable:,QoS,R-squared:,0.792
Model:,OLS,Adj. R-squared:,0.689
Method:,Least Squares,F-statistic:,7.636
Date:,"Sat, 17 Jul 2021",Prob (F-statistic):,0.000309
Time:,12:34:46,Log-Likelihood:,-65.79
No. Observations:,25,AIC:,149.6
Df Residuals:,16,BIC:,160.5
Df Model:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,71.6400,2.522,28.408,0.000,66.294,76.986
C(Day)[T.2],11.8000,2.658,4.439,0.000,6.165,17.435
C(Day)[T.3],10.2000,2.658,3.837,0.001,4.565,15.835
C(Day)[T.4],7.6000,2.658,2.859,0.011,1.965,13.235
C(Day)[T.5],0.2000,2.658,0.075,0.941,-5.435,5.835
C(Store)[T.B],4.6000,2.658,1.730,0.103,-1.035,10.235
C(Store)[T.C],3.4000,2.658,1.279,0.219,-2.235,9.035
C(Store)[T.D],11.4000,2.658,4.289,0.001,5.765,17.035
C(Store)[T.E],-0.6000,2.658,-0.226,0.824,-6.235,5.035

0,1,2,3
Omnibus:,0.387,Durbin-Watson:,2.842
Prob(Omnibus):,0.824,Jarque-Bera (JB):,0.475
Skew:,0.252,Prob(JB):,0.789
Kurtosis:,2.551,Cond. No.,6.65


In [15]:
annova_table = anova_lm(model_ols, typ=2)
print(annova_table)

          sum_sq    df         F    PR(>F)
C(Day)    617.36   4.0  8.737051  0.000614
C(Store)  461.76   4.0  6.534956  0.002575
Residual  282.64  16.0       NaN       NaN


In [18]:
# One more data set
table2  = [['Day','Store-A','Store-B','Store-C','Store-D','Store-E'], [1,69, 71, 64, 67, 56],\
           [2, 68, 76, 79, 87, 76], [3, 71, 77, 74, 84, 72], [4, 70, 73, 71, 78, 73], [5, 60, 64, 67, 79, 58]]

In [16]:
print(f"""
p-Value of Day: {annova_table['PR(>F)'][0]} is less then 0.05, we reject the Ho in favor of Ha.
Conclude that the Day of Week is having effect on the QoS
p-Value of Store: {annova_table['PR(>F)'][1]} is less then 0.05, we reject the Ho in favor of Ha.
Conclude that the Store is having effect on the QoS
""")


p-Value of Day: 0.0006137845612060659 is less then 0.05, we reject the Ho in favor of Ha.
Conclude that the Day of Week is having effect on the QoS
p-Value of Store: 0.0025754076569080254 is less then 0.05, we reject the Ho in favor of Ha.
Conclude that the Store is having effect on the QoS



## Compare the data with Tukey Analysis

In [19]:
from statsmodels.stats.multicomp import pairwise_tukeyhsd

In [24]:
QoS_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 25 entries, 0 to 4
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Day     25 non-null     int64 
 1   Store   25 non-null     object
 2   QoS     25 non-null     int64 
dtypes: int64(2), object(1)
memory usage: 800.0+ bytes


In [25]:
QoS_df['Day-Store'] = QoS_df['Day'].astype(str)+"-"+QoS_df['Store'].astype(str)
QoS_df

Unnamed: 0,Day,Store,QoS,Day-Store
0,1,A,79,1-A
1,2,A,78,2-A
2,3,A,81,3-A
3,4,A,80,4-A
4,5,A,70,5-A
0,1,B,81,1-B
1,2,B,86,2-B
2,3,B,87,3-B
3,4,B,83,4-B
4,5,B,74,5-B


In [26]:
tukey_table = pairwise_tukeyhsd(endog=QoS_df['QoS'],
                                groups=QoS_df['Day-Store'],
                               alpha=0.05)
print(tukey_table)

  return _methods._var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  ret = ret.dtype.type(ret / rcount)


ValueError: v must be > 1 when p >= .9

In [28]:
print("""
Unable to perform Tukey test for Two Way ANOVA
""")


Unable to perform Tukey test for Two Way ANOVA



## Example 2: Height of Kids compared to nurition given daily/weekly or exposed to games

In [31]:
nut_arr = np.repeat(['daily','weekly'],repeats=15)
nut_arr

array(['daily', 'daily', 'daily', 'daily', 'daily', 'daily', 'daily',
       'daily', 'daily', 'daily', 'daily', 'daily', 'daily', 'daily',
       'daily', 'weekly', 'weekly', 'weekly', 'weekly', 'weekly',
       'weekly', 'weekly', 'weekly', 'weekly', 'weekly', 'weekly',
       'weekly', 'weekly', 'weekly', 'weekly'], dtype='<U6')

In [33]:
games_arr = np.tile( np.repeat(['low', 'med', 'high'], repeats=5), 2)
games_arr

array(['low', 'low', 'low', 'low', 'low', 'med', 'med', 'med', 'med',
       'med', 'high', 'high', 'high', 'high', 'high', 'low', 'low', 'low',
       'low', 'low', 'med', 'med', 'med', 'med', 'med', 'high', 'high',
       'high', 'high', 'high'], dtype='<U4')

In [34]:
height_arr = np.array([6, 6, 6, 5, 6, 5, 5, 6, 4, 5,
                              6, 6, 7, 8, 7, 3, 4, 4, 4, 5,
                              4, 4, 4, 4, 4, 5, 6, 6, 7, 8])
height_arr

array([6, 6, 6, 5, 6, 5, 5, 6, 4, 5, 6, 6, 7, 8, 7, 3, 4, 4, 4, 5, 4, 4,
       4, 4, 4, 5, 6, 6, 7, 8])

In [35]:
kids_df = pd.DataFrame({'Nut': nut_arr,
                       'games': games_arr,
                       'height': height_arr})
kids_df

Unnamed: 0,Nut,games,height
0,daily,low,6
1,daily,low,6
2,daily,low,6
3,daily,low,5
4,daily,low,6
5,daily,med,5
6,daily,med,5
7,daily,med,6
8,daily,med,4
9,daily,med,5


In [38]:
kids_model = ols('height ~ C(games) + C(Nut) + C(games):C(Nut)', data=kids_df).fit()
kids_model.summary()

0,1,2,3
Dep. Variable:,height,R-squared:,0.737
Model:,OLS,Adj. R-squared:,0.682
Method:,Least Squares,F-statistic:,13.45
Date:,"Sat, 17 Jul 2021",Prob (F-statistic):,2.62e-06
Time:,13:38:33,Log-Likelihood:,-29.792
No. Observations:,30,AIC:,71.58
Df Residuals:,24,BIC:,79.99
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,6.8000,0.327,20.821,0.000,6.126,7.474
C(games)[T.low],-1.0000,0.462,-2.165,0.041,-1.953,-0.047
C(games)[T.med],-1.8000,0.462,-3.897,0.001,-2.753,-0.847
C(Nut)[T.weekly],-0.4000,0.462,-0.866,0.395,-1.353,0.553
C(games)[T.low]:C(Nut)[T.weekly],-1.4000,0.653,-2.143,0.042,-2.748,-0.052
C(games)[T.med]:C(Nut)[T.weekly],-0.6000,0.653,-0.919,0.367,-1.948,0.748

0,1,2,3
Omnibus:,1.012,Durbin-Watson:,1.647
Prob(Omnibus):,0.603,Jarque-Bera (JB):,0.297
Skew:,0.201,Prob(JB):,0.862
Kurtosis:,3.275,Cond. No.,9.77


In [39]:
kids_anova_table = anova_lm(kids_model, typ=2)
kids_anova_table

Unnamed: 0,sum_sq,df,F,PR(>F)
C(games),24.866667,2.0,23.3125,2e-06
C(Nut),8.533333,1.0,16.0,0.000527
C(games):C(Nut),2.466667,2.0,2.3125,0.120667
Residual,12.8,24.0,,


In [43]:
print(f"""
1.Since the p_value of Games: {kids_anova_table['PR(>F)'][0]} is < 0.05 they have a statistical significance, Ho is rejected in favour of Ha.
So games have an impact of height.

2. Since the p_value of Nut: {kids_anova_table['PR(>F)'][1]} is < 0.05 they have a statistical significance, Ho is rejected in favour of Ha.
So Nut have an impact of height.

3. Since the p_value of Games & Nutrition: {kids_anova_table['PR(>F)'][2]} is > 0.05 they have no a statistical significance, failed to reject Ho.
So games & nitrition do not have an interaction between both of them
""")


1.Since the p_value of Games: 2.371555925858227e-06 is < 0.05 they have a statistical significance, Ho is rejected in favour of Ha.
So games have an impact of height.

2. Since the p_value of Nut: 0.0005269080727816927 is < 0.05 they have a statistical significance, Ho is rejected in favour of Ha.
So Nut have an impact of height.

3. Since the p_value of Games & Nutrition: 0.12066712248670274 is > 0.05 they have no a statistical significance, failed to reject Ho.
So games & nitrition do not have an interaction between both of them



In [None]:
# One more dataset
table2  = [['Day','Store-A','Store-B','Store-C','Store-D','Store-E'], [1,69, 71, 64, 67, 56],\
           [2, 68, 76, 79, 87, 76], [3, 71, 77, 74, 84, 72], [4, 70, 73, 71, 78, 73], [5, 60, 64, 67, 79, 58]]