In [1]:
import numpy as np
import pandas as pd
from scipy.stats import f as f_distr
from IPython.display import display

import statsmodels.api as sm
from statsmodels.formula.api import ols

# Multi-Way ANOVA

Reference (video): https://www.youtube.com/watch?v=qdPJRP3j5WM

In [3]:
def as_pandas(arr, cols):
    """Unpivot a 2-D array into a long-format DataFrame.

    Each cell ``arr[i, j]`` becomes one output row ``(i, j, arr[i, j])``,
    emitted in row-major order.

    Parameters
    ----------
    arr : array-like, 2-D
        Values to unpivot. Converted with ``np.asarray``, so nested lists
        are accepted as well as ndarrays.
    cols : sequence of two str
        Names for the row-index and column-index columns. Any sequence
        (tuple, list, ...) is accepted.

    Returns
    -------
    pandas.DataFrame
        Columns ``[cols[0], cols[1], 'rez']``. The two index columns are
        always integer-typed; ``'rez'`` keeps the dtype of the values.

    Raises
    ------
    ValueError
        If ``arr`` is not two-dimensional (shape unpacking fails).
    """
    arr = np.asarray(arr)
    nRows, nCols = arr.shape

    # Build per-row records rather than one homogeneous ndarray: going through
    # np.array() would cast the integer indices to the dtype of the values
    # (e.g. float), and would fail to frame an input with no cells.
    records = [
        (i, j, arr[i, j])
        for i in range(nRows)
        for j in range(nCols)
    ]

    return pd.DataFrame(records, columns=list(cols) + ['rez'])
            


def anova_homebrew(dataDF, keyrez):
    """Compute a main-effects (additive, no interaction) ANOVA table by hand.

    Every column of ``dataDF`` except ``keyrez`` is treated as a categorical
    factor. For each factor the between-group sum of squares is computed
    around the grand mean; the error sum of squares and its degrees of
    freedom are obtained by subtracting the factor terms from the totals.

    Parameters
    ----------
    dataDF : pandas.DataFrame
        Long-format data: one column per factor plus the response column.
    keyrez : str
        Name of the response column.

    Returns
    -------
    pandas.DataFrame
        Columns ``axis, sumsq, df, meansq, F, pval``. Rows are ``'tot'``,
        then one row per factor in the column order of ``dataDF``, then
        ``'err'``. ``F`` is each row's mean square divided by the error mean
        square and ``pval`` the matching upper-tail F probability; these two
        are computed for every row, but are only meaningful as tests on the
        factor rows.

    Notes
    -----
    With a single factor, group sizes may differ. With several factors the
    error term is found by subtraction, which is the usual decomposition
    only when the design is balanced.
    """
    response = dataDF[keyrez]
    muTot = np.mean(response)
    SST = np.sum((response - muTot) ** 2)
    nT = len(dataDF)

    # Walk the factors in column order; iterating a set would make the row
    # order of the result depend on hash ordering.
    factorNames = [colName for colName in dataDF.columns if colName != keyrez]

    # Each entry is (axis, sumsq, df).
    rows = [("tot", SST, nT - 1)]
    for colName in factorNames:
        # observed=True keeps only levels that actually occur in the data.
        grouped = dataDF.groupby(colName, observed=True)[keyrez]
        muB = grouped.mean().to_numpy()
        sizeB = grouped.size().to_numpy()

        # Weight each squared deviation by that group's own size instead of
        # assuming every group is as large as an arbitrary first one.
        SSB = np.sum(sizeB * (muB - muTot) ** 2)
        rows.append((colName, SSB, len(muB) - 1))

    # Whatever the factors do not explain is attributed to error.
    SSE = SST - sum(r[1] for r in rows[1:])
    dfE = (nT - 1) - sum(r[2] for r in rows[1:])
    rows.append(("err", SSE, dfE))

    rezDF = pd.DataFrame(rows, columns=('axis', 'sumsq', 'df'))

    # Mean squares, F-ratios against the error mean square, and p-values.
    rezDF['meansq'] = rezDF['sumsq'] / rezDF['df']
    meansqErr = rezDF['meansq'].iloc[-1]
    rezDF['F'] = rezDF['meansq'] / meansqErr
    # sf() is the upper tail computed directly; more accurate than 1 - cdf()
    # when the p-value is small.
    rezDF['pval'] = f_distr.sf(rezDF['F'].to_numpy(), rezDF['df'].to_numpy(), dfE)
    return rezDF

In [7]:
# Response matrix: as_pandas labels the row index 'sellers' and the
# column index 'towns', giving one long-format row per cell.
data = np.array(
    [
        [75, 75, 90],
        [70, 70, 70],
        [50, 55, 75],
        [65, 60, 85],
        [80, 65, 80],
        [65, 65, 65],
    ]
)
dataDF = as_pandas(data, ('sellers', 'towns'))
dataDF

Unnamed: 0,sellers,towns,rez
0,0,0,75
1,0,1,75
2,0,2,90
3,1,0,70
4,1,1,70
5,1,2,70
6,2,0,50
7,2,1,55
8,2,2,75
9,3,0,65


In [5]:
# Hand-rolled ANOVA table, with 'rez' as the response column
anova_homebrew(dataDF, keyrez='rez')

Unnamed: 0,axis,sumsq,df,meansq,F,pval
0,tot,1750.0,17,102.941176,2.167183,0.107324
1,sellers,750.0,5,150.0,3.157895,0.057399
2,towns,525.0,2,262.5,5.526316,0.024181
3,err,475.0,10,47.5,1.0,0.5


In [18]:
# Reference computation with statsmodels: fit a linear model with both
# columns as categorical terms, then request the type-1 ANOVA table.
linModel = ols(
    'rez ~ C(towns)+C(sellers)',
    data=dataDF,
).fit()
sm.stats.anova_lm(linModel, typ=1)

Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
C(towns),2.0,525.0,262.5,5.526316,0.024181
C(sellers),5.0,750.0,150.0,3.157895,0.057399
Residual,10.0,475.0,47.5,,
