In [108]:
import math
import random 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import t, norm, ttest_1samp, f, f_oneway

from scipy.stats import chi2

import statsmodels.api as sm
from statsmodels.formula.api import ols

In [95]:
class ZTest:
    def __init__(self, sample, pop_mean, pop_std):
        self.sample = sample
        self.sample_mean = sample.mean()
        self.pop_mean = pop_mean
        self.pop_std = pop_std
        
    def Z_score(self, alpha=0.05):
        z_score = (self.sample.mean() - self.pop_mean)/(self.pop_std/math.sqrt(self.sample.size))
        p_value = 2 * (1-norm.cdf(abs(z_score)))
        
        if p_value > alpha:
            print('Fail to reject H0 (Accept H0). The mean of the population is considered to be equal to', self.pop_mean)
        else:
            print('Reject H0 (Accept H1). The mean of the population is NOT equal to', self.pop_mean)


In [96]:
sample = np.array([4.18613688, 4.43860151, 4.78053989, 4.38801261, 4.94714259, 4.8528787,
 4.88875992, 4.68165426, 4.42798665, 4.53260313, 4.55727267, 4.76126105,
 4.49138963, 4.333771,   4.8295567,  4.26810296, 4.67983236, 4.43654222,
 5.38207414, 5.11092259, 5.09641506, 5.23208847, 5.03342048, 5.56957311,
 4.47390108, 4.7893098,  4.72459487, 4.84036066, 4.81666639, 4.43825881,
 4.75351214, 4.96596792, 4.28500486, 4.48239589, 4.39598646, 4.41771026,
 4.51422029, 5.38328062, 5.2636805,  4.3091652,  5.0583079,  4.39178018,
 4.78803634, 4.93375277, 4.96726921, 4.55379975, 4.42004136, 4.9998488,
 5.12604516, 5.08242965, 4.9436677,  4.65807828]).round(3)

pop_mean = 4.5
pop_std = 0.6

# Create ZTest object
z_test = ZTest(sample, pop_mean, pop_std)

# Test the hypothesis with alpha=0.05
z_test.Z_score()

Reject H0 (Accept H1). The mean of the population is NOT equal to 4.5


In [103]:
class TTest:
    def __init__(self, alpha = 0.05):
        self.alpha = alpha
        
    def One_sample(self, sample, pop_mean):
        df = sample.size-1
        sample_mean = sample.mean()
        sample_std = sample.std(ddof=1)
                
        t_statistic = round((sample_mean - pop_mean)/(sample_std/math.sqrt(sample.size)), 5)
        p_value = round(2 * (1-t.cdf(abs(t_statistic), df)), 5)
        
        self.Check(t_statistic, p_value)
        
    def Two_sample(self, sample1, sample2):
        n1 = sample1.size
        n2 = sample2.size
        df = n1 + n2 - 2
        
        mean1 = sample1.mean() 
        mean2 = sample2.mean()
        
        var1 = sample1.var(ddof=1)
        var2 = sample2.var(ddof=1)

        t_statistic = (mean1 - mean2)/math.sqrt((var1/n1)+(var2/n2))
        p_value = round(2 * (1-t.cdf(abs(t_statistic), df)), 5)
        
        self.Check(t_statistic, p_value)
        
    def Paired(self, before, after, lazy=False):
        n = before.size
        df = n - 1
        diff = after-before
        
        if lazy:
            self.One_sample(diff, 0)
        else:
            print(diff)
            std = diff.std(ddof=1)

            t_statistic = diff.mean()/(std/math.sqrt(n))
            p_value = round(2 * (1-t.cdf(abs(t_statistic), df)), 10)
            self.Check(t_statistic, p_value)

    def Check(self, t_statistic, p_value): 
        print("Test Statistic: ", t_statistic)
        print("p-value: ", p_value)
        if p_value > self.alpha:
            print('Fail to reject H0')
        else:
            print('Reject H0 ') 

In [99]:
sample = np.array([14.8, 15.2, 15.1, 15.3, 15.0, 14.9, 15.2, 14.8, 15.1, 15.0, 14.9, 14.8, 15.2, 
                   14.9, 15.0, 14.9, 15.1, 15.3, 15.0, 15.1, 14.8, 15.0, 15.2, 15.1, 15.3, 15.1, 
                   15.0, 14.8, 15.2, 15.0])

pop_mean = 15

# Create T-Test object
one_sample_test = TTest().One_sample(sample, pop_mean)

Test Statistic:  1.2661
p-value:  0.21555
Fail to reject H0


In [101]:
New_flavor = np.array([8, 7, 9, 6, 7, 8, 9, 7, 8, 7, 6, 8, 7, 9, 8, 7, 6, 9, 8, 7])
Old_flavor = np.array([6, 7, 8, 6, 7, 6, 7, 6, 8, 7, 6, 7, 6, 8, 7, 6, 7, 8, 6, 7])


two_sample_test = TTest().Two_sample(Old_flavor, New_flavor)

Test Statistic:  -2.6626213096882503
p-value:  0.01131
Reject H0 


In [104]:
Before = np.array([15, 18, 12, 10, 17, 16, 12, 14, 19, 18, 11, 13, 16, 17, 19, 14, 16, 13, 15, 12])
After = np.array([18, 20, 15, 13, 19, 18, 14, 16, 21, 20, 14, 16, 19, 20, 22, 16, 18, 15, 17, 14])

paired_sample_test = TTest().Paired(Before, After)

[3 2 3 3 2 2 2 2 2 2 3 3 3 3 3 2 2 2 2 2]
Test Statistic:  21.354156504062622
p-value:  0.0
Reject H0 


# ANOVA Test

In [105]:
class Anova:
    def __initi__(self, alpha=0.05):
        pass
    def One_way(self, categ):
        k = categ.shape[0]
        n_T = categ.size

        mean = categ.mean(axis=1).mean().round(2)
        
        SS_B  = sum([cat.size*((cat.mean() - mean)**2) for cat in categ])
        
        df_B = k - 1
        MS_B  = SS_B /df_B
       
        SS_W = sum([(cat.size - 1 ) * cat.var(ddof=1) for cat in groups])
        
        df_W = n_T - k
        MS_W  = SS_W /df_W

        TSS = SS_B + SS_B

        F_statistic = MS_B / MS_W

        p_value = 1 - f.cdf(F_statistic, df_B, df_W)

        print('ANOVA using the formula')
        print("MS B:", MS_B)
        print("MS W:", MS_W)
        print("F-statistic:", F_statistic)
        print("p-value:", p_value)

        if p_value < 0.05:
            print('Reject the null hypothesis H0, There is one group mean different from the others')
        else:
            print('Fail to reject the null hypothesis, all the group means are equal')
    def Two_way(self, df):
        model = ols('height ~ C(water) + C(sun) + C(water):C(sun)', data=df).fit()#
        return sm.stats.anova_lm(model, typ=2)

In [106]:
groups = np.array([[3, 2, 1, 1, 4],
                   [5, 2, 4, 2, 3],
                   [7, 4, 5, 3, 6]])

Anova().One_way(groups)

df = pd.DataFrame({'water': np.repeat(['daily', 'weekly'], 15),
                   'sun': np.tile(np.repeat(['low', 'med', 'high'], 5), 2),
                   'height': [6, 6, 6, 5, 6, 5, 5, 6, 4, 5,
                              6, 6, 7, 8, 7, 3, 4, 4, 4, 5,
                              4, 4, 4, 4, 4, 5, 6, 6, 7, 8]})

Anova().Two_way(df)

ANOVA using the formula
MS B: 10.066749999999999
MS W: 1.9666666666666668
F-statistic: 5.1186864406779655
p-value: 0.024693724889743307
Reject the null hypothesis H0, There is one group mean different from the others


Unnamed: 0,sum_sq,df,F,PR(>F)
C(water),8.533333,1.0,16.0,0.000527
C(sun),24.866667,2.0,23.3125,2e-06
C(water):C(sun),2.466667,2.0,2.3125,0.120667
Residual,12.8,24.0,,


In [107]:
class ChiSquare:
    def __init__(self, alpha=0.05):
        self.alpha = alpha
       
    def calculate_statistic(self, observed_freq, expected_prob):
        df = observed_frequencies.size - 1
        expected_freq = expected_freq = expected_probabilities * np.sum(observed_freq)
        
        chi_square_statistic = np.sum((observed_freq - expected_freq)**2 / expected_freq)
        p_value = 1 - chi2.cdf(chi_square_statistic, df)
    
        if p_value < self.alpha:
            print("Reject null hypothesis")
        else:
            print("Fail to reject null hypothesis")
    

In [43]:
# Define observed frequencies
observed_freq = np.array([18, 20, 16, 22, 14, 10])

# Define expected probabilities under the null hypothesis of a fair die
expected_prob = np.array([1/6]*6)

ChiSquare().calculate_statistic(observed_freq, expected_prob)

# "Fail to reject null hypothesis", indicating that there is insufficient evidence to reject the null hypothesis that the die is fair.

Fail to reject null hypothesis


In [109]:
class ABTest:
    def __init__(self, alpha=0.05):
        self.alpha = alpha
    
    def t_statistic(self, control, treatment):
        n1 = control.size
        n2 = treatment.size
        
        df = n1+n2-2

        mean_control = np.mean(control)
        mean_treatment = np.mean(treatment)
        
        var_control = np.var(control, ddof=1)
        var_treatment = np.var(treatment, ddof=1)
        
        pooled_var = ((n1 - 1) * var_control + (n2 - 1) * var_treatment) / df
        se_difference = np.sqrt(pooled_var * (1/n1 + 1/n2))
        t_statistic = (mean_treatment - mean_control) / se_difference

        p_value = 1 - t.cdf(t_statistic, df)
    
        if p_value < self.alpha:
            print("Reject null hypothesis")
        else:
            print("Fail to reject null hypothesis")

In [110]:
New_flavor = np.array([8, 9, 7, 8, 9, 6, 7, 8, 7, 8, 7, 8, 9, 6, 8])
Old_flavor = np.array([6, 7, 8, 5, 6, 7, 5, 8, 6, 7, 5, 6, 7, 6, 5])

AB_Test = ABTest().t_statistic(Old_flavor, New_flavor)

Reject null hypothesis
