In [1]:
# Required libraries for ANOVA

import numpy as np
import pandas as pd
import statsmodels.api as sm
from statsmodels.formula.api import ols # ols stand for ordinary least square

In [2]:
# Wide-format table: one column per drug, ten values in each column
data = {
    'Drug-1': [67, 42, 67, 56, 62, 64, 59, 72, 71, 60],
    'Drug-2': [50, 52, 43, 67, 67, 59, 67, 64, 63, 65],
    'Drug-3': [48, 49, 50, 55, 56, 61, 61, 60, 59, 64],
    'Drug-4': [47, 67, 54, 67, 68, 65, 65, 56, 60, 65],
}

# dict keys become the column labels (default orient='columns')
df = pd.DataFrame.from_dict(data)
df

Unnamed: 0,Drug-1,Drug-2,Drug-3,Drug-4
0,67,50,48,47
1,42,52,49,67
2,67,43,50,54
3,56,67,55,67
4,62,67,56,68
5,64,59,61,65
6,59,67,61,65
7,72,64,60,56
8,71,63,59,60
9,60,65,64,65


In [6]:
# ols takes a formula over named columns, so reshape the wide table to long format:
# the former column label goes into 'Treatment' and the cell value into 'Value'

data_melt = pd.melt(df, var_name='Treatment', value_name='Value')
data_melt

Unnamed: 0,Treatment,Value
0,Drug-1,67
1,Drug-1,42
2,Drug-1,67
3,Drug-1,56
4,Drug-1,62
5,Drug-1,64
6,Drug-1,59
7,Drug-1,72
8,Drug-1,71
9,Drug-1,60


In [10]:
# hypothesis
    # H0 : mean of Drug1 = Drug2 = Drug3 = Drug4
    # Ha : not all drug means are the same (at least one differs)

# interpretation
# we fail to reject the null hypothesis if
    # F value is close to 1
    # P value > 0.05 (5% significance level)

In [9]:
# One-way ANOVA via an OLS fit

# 'Value' is the response; C(Treatment) treats the Treatment column as categorical
model = ols(formula='Value ~ C(Treatment)', data=data_melt).fit()

# build the ANOVA table from the fitted model (typ=1 is anova_lm's default)
anova_table = sm.stats.anova_lm(model, typ=1)

# show the ANOVA table
print(anova_table)

                df  sum_sq    mean_sq         F   PR(>F)
C(Treatment)   3.0   196.5  65.500000  1.144327  0.34436
Residual      36.0  2060.6  57.238889       NaN      NaN


In [11]:
 # R - Squared: Models Accuracy / Proportion of variance in the dependent variable (value)

 r_square = model.rsquared
print('R-Square:', round(r_square*100, 2))

R-Square: 8.71


In [None]:
# when to use which test

# 1 column: z-test, t-test
# 2 columns: correlation
# >2 columns: ANOVA

In [1]:
# Other way of calculating anova without dataframe
from scipy.stats import f_oneway

# data: one plain Python list of values per drug
Drug_1, Drug_2, Drug_3, Drug_4 = (
    [67, 42, 67, 56, 62, 64, 59, 72, 71, 60],
    [50, 52, 43, 67, 67, 59, 67, 64, 63, 65],
    [48, 49, 50, 55, 56, 61, 61, 60, 59, 64],
    [47, 67, 54, 67, 68, 65, 65, 56, 60, 65],
)

In [2]:
# one-way ANOVA across the four groups; the result unpacks into (F statistic, p-value)
f_stats, p_value = f_oneway(
    Drug_1,
    Drug_2,
    Drug_3,
    Drug_4,
)

# output: one line per statistic
print(f'F-Stats: {f_stats}')
print(f'P-Value: {p_value}')

F-Stats: 1.144326895079103
P-Value: 0.3443595629359094


In [None]:
# DataScience
    # Data Analytics    - What does my data say? (EDA - Exploratory Data Analysis)
        # Business analytics
        # Data visualisation
    # Machine Learning  - How will my future data react?
        # deep learning
        # Natural Language Processing
        # Computer vision