# Analysis of Variance

The first block of code must be run before anything else.  It only needs to be run once each time you use the website.  It loads the class and the class methods needed to generate the practice problems.  You can hide the first block (it is long).

In [4]:
# @title Click to hide code
import numpy as np
import pandas as pd
import random, math
from scipy import stats
from IPython.display import Markdown, display

class RandomData():
    def __init__(self, groups = 1, n = 10, distribution = "normal"):
        self.groups = groups
        self.n = n # TODO add option for unequal sample sizes.
        self.df = self.generate_data()
        self.ss = self.sum_of_squares()
        self.means = self.group_means()
        self.sums = self.col_sums()
        self.g = self.anova_g()
        self.sum_squared_scores = self.grand_sum_squared_scores()
        self.var = self.variance()
        self.std = self.stdev()
        self.test = "" # value is set when a stats function is called
        self.alpha = self.set_alpha()
        self.tails = int
        self.null = int
        self.obt = float
        self.effect_size = float
        self.crit_values = {}; dict
        self.significance = bool
        if distribution == "normal":
            self.distribution = distribution 
        else:
            raise ValueError("only the normal distribution is currently supported")
            # TODO add the ability to generate data from other distribuions


    def set_alpha(self):
        self.alpha = random.choice([0.05, 0.01])
        return self.alpha
        
    
    def generate_data(self):
        self.df = pd.DataFrame()
        # list of letters for group labels
        letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"

        # create data for each group and add it to the dataframe
        for group in range(self.groups):
            mean = random.randint(10, 100)
            sd = mean * random.uniform(0.05, 0.50)

            # generate the sample based on the above values
            samples = np.random.normal(mean, sd, self.n)

            # round the data so it only includes whole numbers
            sample = np.round(samples).astype(int)

            # convert to a dataframe to display the data
            self.df[f'{letters[group]}'] = sample
        return self.df
    

    def generate_question(self):
        # determine the test type

        if self.tails == 2:
            text = "significantly different from"
        elif self.tails == 1:
            if self.crit_values["direction"] == "increase" :
                text = "significantly greater than"
            elif self.crit_values["direction"] == "decrease":
                text = "significantly less than"
            else:
                return ValueError("direction error for generating question")
        else:
            return ValueError("tails error for question generation")    

        print(self.df)
        
        if self.test == "z":
            display(Markdown(f"Given the following data, is the mean of $Group_A$ {text} ${{{self.null}}}$?  Use a ${{{self.tails}}}$ tailed-test with $\\alpha = {{{self.alpha}}}$"))
            display(Markdown(f"$M_A = {{{self.means[0]}}}$"))
            display(Markdown(f"$ {{\\sigma_A}} = {{{round(self.df['A'].std(ddof = 0), 2)}}}$"))
            display(Markdown(f"$ n = {{{len(self.df['A'])}}}$"))
        elif self.test == "one-sample t-test":
            display(Markdown(f"Given the following data, is the mean of $Group_A$ {text} ${{{self.null}}}$?  Use a ${{{self.tails}}}$ tailed-test with $\\alpha = {{{self.alpha}}}$"))
            display(Markdown(f"$M_A = {{{self.means[0]}}}$"))
            display(Markdown(f"$s^2 = {{{self.var[0]}}}$"))
            display(Markdown(f"$ n = {{{len(self.df['A'])}}}$"))
        elif self.test == "independent-samples t-test":
            display(Markdown(f"Given the following between-subjects data, is the mean of $Group_A$ {text} the mean of $Group_B$?  Use a ${{{self.tails}}}$ tailed-test with $\\alpha = {{{self.alpha}}}$"))
            display(Markdown(f"$M_A = {{{self.means[0]}}}, M_B = {{{self.means[1]}}}$"))
            display(Markdown(f"$SS_A = {{{self.ss[0]}}}, SS_B = {{{self.ss[1]}}}$"))
            display(Markdown(f"$ n_A = {{{len(self.df['A'])}}}, n_B = {{{len(self.df['B'])}}}$"))
        elif self.test == "dependent-samples t-test":
            display(Markdown(f"Given the following within-subjects data, is $M_D$ {text} ${{{self.null}}}$?  Use a ${{{self.tails}}}$ tailed-test with $\\alpha = {{{self.alpha}}}$"))
            display(Markdown(f"$M_A = {{{self.means[0]}}}, M_B = {{{self.means[1]}}}$"))
            display(Markdown(f"$ n = {{{len(self.df['A'])}}}$"))
        elif self.test == "one-way ANOVA":
            display(Markdown(f"Given the following between-subjects data, use a one-way ANOVA with $\\alpha = {{{self.alpha}}}$"))
            display(Markdown(f"$G = {{{self.g}}}, \\Sigma X^2 = {{{self.sum_squared_scores}}}, k = {{{self.groups}}}, N = {{{self.groups * self.n}}}$"))
            letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
            for group in range(self.groups):
                display(Markdown(f"$T_{{{letters[group]}}} = {{{self.sums[group]}}}, SS_{{{letters[group]}}} = {{{self.ss[group]}}}$"))   
        else:
            return ValueError("test-type specification error in question geneneration")
       
        
    def final_decision(self):
        if self.tails == 2:
            if self.obt > self.crit_values["positive"] or self.obt < self.crit_values["negative"]:
                self.significance = True
            else:
                self.significance = False
        elif self.tails == 1:
            if self.crit_values["direction"] == "increase" and self.obt > self.crit_values["positive"]:
                self.significance = True
            elif self.crit_values["direction"] == "decrease" and self.obt < self.crit_values["negative"]:
                self.significance = True
            else:
                self.significance = False
        else:
            return ValueError("error in tails specification for final decision")
        
        return self.significance 
    

    def write_result(self):
        # TODO add more elaborate functionality for the results
        if self.test in ["independent-samples t-test", "one-sample t-test", "dependent-samples t-test"]:
            # print the critical value for the test
            if self.tails == 2:
                display(Markdown(f"$t_{{crit}} = \\pm{{{self.crit_values['positive']}}}, \\alpha_{{two-tailed}} = {{{self.alpha}}}, df = {{{self.crit_values['degf']}}}$"))
            elif self.tails == 1 and self.crit_values["direction"] == "increase":
                display(Markdown(f"$t_{{crit}} = +{{{self.crit_values['positive']}}}, \\alpha_{{one-tailed}} = {{{self.alpha}}}, df = {{{self.crit_values['degf']}}}$"))
            elif self.tails == 1 and self.crit_values["direction"] == "decrease":
                display(Markdown(f"$t_{{crit}} = {{{self.crit_values['negative']}}}, \\alpha_{{one-tailed}} = {{{self.alpha}}}, df = {{{self.crit_values['degf']}}}$"))
            else:
                return ValueError("tails error in writing results")
            # determine significance
            if self.significance:
                print(f"reject the null hypothesis, results are significant, t({self.crit_values['degf']}) = {self.obt}, p < {self.alpha}, d = {self.effect_size}")
            elif not self.significance:
                print(f"fail to reject the null hypothesis, results not significant, t({self.crit_values['degf']}) = {self.obt}, p > {self.alpha}, d = {self.effect_size}")
            else:
                return ValueError("significance boolean error in writing results")
        elif self.test == "z":
            # print the critical value of t
            if self.tails == 2:
                display(Markdown(f"$z_{{crit}} = \\pm{{{self.crit_values['positive']}}}, \\alpha_{{two-tailed}} = {{{self.alpha}}}$"))
            elif self.tails == 1 and self.crit_values["direction"] == "increase":
                display(Markdown(f"$z_{{crit}} = +{{{self.crit_values['positive']}}}, \\alpha_{{one-tailed}} = {{{self.alpha}}}$"))
            elif self.tails == 1 and self.crit_values["direction"] == "decrease":
                display(Markdown(f"$z_{{crit}} = {{{self.crit_values['negative']}}}, \\alpha_{{one-tailed}} = {{{self.alpha}}}$"))
            else:
                return ValueError("tails error in writing results")
            # determine significance
            if self.significance:
                print(f"reject the null hypothesis, results are significant, z = {self.obt}, p < {self.alpha}, d = {self.effect_size}")
            elif not self.significance:
                print(f"fail to reject the null hypothesis, results not significant, z = {self.obt}, p > {self.alpha}, d = {self.effect_size}")
            else:
                return ValueError("significance boolean error in writing results")
        elif self.test in ["one-way ANOVA"]:
            display(Markdown(f"$F_{{crit}} = {{{self.crit_values['positive']}}}, \\alpha = {{{self.alpha}}}$"))
            if self.significance:
                display(Markdown(f"reject the null hypothesis, results are significant, $ F({{{self.crit_values['degf_b']}}}, {{{self.crit_values['degf_w']}}}), p < {{{self.alpha}}}, \\eta^2 = {{{self.effect_size}}}$"))
            elif not self.significance:
                display(Markdown(f"fail to reject the null hypothesis, results not significant, $ F({{{self.crit_values['degf_b']}}}, {{{self.crit_values['degf_w']}}}), p > {{{self.alpha}}}, \\eta^2 = {{{self.effect_size}}}$"))

        else:
            return ValueError("test specificaion error when writing results")
        
        
    def set_null_hypothesis(self):
        # for one sample tests, sets a null hypothess between -3 to + 3 x the mean
        if  self.test in ["one-sample t-test", "z"]:
            mean = self.means[0]
            multiplier = random.uniform(-3, 3)
            self.null = round(mean * multiplier)
        else:
            self.null = 0
        return self.null


    def sum_of_squares(self):  # calculating the values presented in the problem.
        ss = []
        for column in self.df: 
            sum_scores = self.df[column].sum()
            sum_sqared_scores= (self.df[column].apply(lambda x: x ** 2)).sum()  
            ss_vals = sum_sqared_scores - round((sum_scores ** 2)/self.n, 2)
            ss.append(round(ss_vals, 2))
        return ss


    def group_means(self):
        means = []
        for column in self.df:
            means.append(round(self.df[column].mean(), 2))  
        return means
    

    def variance(self):
        vars = []
        for column in self.df:
            vars.append(round(self.df[column].var(ddof = 1), 2))
        return vars


    def stdev(self):
        stdevs = []
        for column in self.df:
            stdevs.append(round(self.df[column].std(ddof = 1), 2))
        return stdevs
    

    def col_sums(self):
        sums = []
        for column in self.df:
            sums.append(self.df[column].sum())
        return sums


    def anova_g(self):
        g = 0
        for column in self.df:
            g += self.df[column].sum()
        return g
    

    def grand_sum_squared_scores(self): # for anova
        sum_squared_scores = 0
        for column in self.df: 
            score = (self.df[column].apply(lambda x: x ** 2)).sum()  
            sum_squared_scores += score
        return sum_squared_scores
    

    def critical_value(self):
        # calculate the degrees of freedom based on the type of test used
        if self.test == "independent-samples t-test":
            degf = (self.n - 1) + (self.n - 1)
            self.tails = random.choice([1, 2])
            if self.tails == 1:
                crit = round(stats.t.ppf(1 - self.alpha, degf), 2)
            elif self.tails == 2:
                crit = round(stats.t.ppf(1 - self.alpha/2, degf), 2)
            self.crit_values = {"positive": crit, "negative": -crit, "degf": degf}

        elif self.test == "one-sample t-test" or self.test == "dependent-samples t-test":
            degf = self.n - 1
            self.tails = random.choice([1, 2])
            if self.tails == 1:
                crit = round(stats.t.ppf(1 - self.alpha, degf), 2)
            elif self.tails == 2:
                crit = round(stats.t.ppf(1 - self.alpha/2, degf), 2)
            self.crit_values = {"positive": crit, "negative": -crit, "degf": degf}

        # TODO manually specify crit values for z scores.  or figure out why the math is incorrect
        elif self.test == "z":
            self.tails = random.choice([1, 2])
            if self.tails == 1:
                crit = round(stats.norm.ppf(1 - self.alpha), 2)
            elif self.tails == 2:
                crit = round(stats.norm.ppf(1 - self.alpha/2), 2)
            self.crit_values = {"positive": crit, "negative": -crit}
        
        elif self.test == "one-way ANOVA":
            degf_w = (self.n * self.groups) - self.groups
            degf_b = self.groups - 1
            self.tails = 1
            crit = round(stats.f.ppf(q = (1 - self.alpha), dfn = degf_b, dfd = degf_w), 2)
            self.crit_values = {"positive": crit, "degf_w": degf_w, "degf_b": degf_b}
            
        
        else:
            raise ValueError("incorrect test specification - degrees of freedom")

        # add a direction for one-tailed tests 
        if self.test in ["one-way ANOVA"]:
            self.crit_values["direction"] = "increase"
        else:
            if self.tails == 1:  
                direction = random.choice(["increase", "decrease"])
                self.crit_values["direction"] = direction
            else:
                return ValueError("tails must be 1 for directional crit values")
        return self.crit_values


    def z_test(self):
        if len(self.df.columns) > 1:
            raise Exception("Data contains more than one sample")
        elif len(self.df.columns) == 0:
            raise Exception("Dataframe error: no data columns")
        else:
            self.test = "z"

            # set the null and write out the question
            self.set_null_hypothesis()
            self.critical_value()
            self.generate_question()

            # calculate the standard error
            # TODO double check the work here to make sure it is accurate
            sd = round(self.df['A'].std(ddof = 0), 2)
            n = len(self.df['A'])
            sem = round(sd/(round(math.sqrt(n),2)),2)
            self.obt = round((self.means[0] - self.null) / sem, 2)
            self.effect_size = round((self.means[0] - self.null) / sd, 2)

            # TODO add a way to determine environment so output can display in terminal or notebook
            # print calculations for the standard error
            display(Markdown("Calculating the standard error..."))
            display(Markdown(f"$\\sigma_M = \\frac{{\\sigma}}{{\\sqrt{{N}}}}$"))
            display(Markdown(f"$\\sigma_M = \\frac{{{sd}}}{{\\sqrt{n}}}$"))
            display(Markdown(f"$\\sigma_M = \\frac{{{sd}}}{{{round(math.sqrt(n),2)}}}$"))
            display(Markdown(f"$\\sigma_M = {{{sem}}}$"))
            print() # blank space
            # print the caluclations for z_obt
            display(Markdown("calculating $z_{{obt}}$..."))
            display(Markdown(f"$z_{{obt}} = {{\\frac{{M - \\mu}}{{\\sigma_M}}}}$"))
            display(Markdown(f"$z_{{obt}} = \\frac{{{self.means[0]} - {self.null}}}{{{sem}}}$"))
            display(Markdown(f"$z_{{obt}} = \\frac{{{self.means[0] - self.null}}}{{{sem}}}$"))
            display(Markdown(f"$z_{{obt}} = {{{self.obt}}}$"))
            print() # blank space
            # print calculations for cohen's d
            display(Markdown("calculating Cohen's d..."))
            display(Markdown("Cohen's d = $\\frac{{M - \\mu}}{{\\sigma}}$"))
            display(Markdown(f"Cohen's d = $\\frac{{{self.means[0]} - {self.null}}}{{{sd}}}$"))
            display(Markdown(f"Cohen's d = $\\frac{{{self.means[0] - self.null}}}{{{sd}}}$"))
            display(Markdown(f"Cohen's d = ${{{self.effect_size}}}$"))
            print() # blank space

            self.significance = self.final_decision()
            self.write_result()

            # return self.obt - not returning b/c it was printing out the value of self.obt.  need to figure out why but commenting out fixed it


    def one_sample_t_test(self):
        if len(self.df.columns) > 1:
            raise Exception("Data contains more than one sample")
        elif len(self.df.columns) == 0:
            raise Exception("Dataframe error: no data columns")
        else:
            self.test = "one-sample t-test"   
            
            # set the null and write out the question
            self.set_null_hypothesis()
            self.critical_value()
            self.generate_question()

            # calculate the standard error
            sem = round(math.sqrt(round((self.var[0]/self.n),2)),2)
            self.obt = round((self.means[0] - self.null) / sem, 2)
            self.effect_size = round((self.means[0] - self.null) / self.std[0], 2)

            # print the caluclations for the standard error
            # TODO add a way to determine environment so output can display in terminal or notebook
            print("calculating the standard error...")
            display(Markdown("$s_M = \\sqrt{{\\frac{{s^2}}{{n}}}}$"))
            display(Markdown(f"$s_M = \\sqrt{{\\frac{{{self.var[0]}}}{{{self.n}}}}}$"))
            display(Markdown(f"$s_M = \\sqrt{{{round((self.var[0]/self.n),2)}}}$"))
            display(Markdown(f"$s_M = {{{sem}}}$"))
            print() # blank space
            # print the caluclations for t_obt
            display(Markdown("calculating $t_{{obt}}$..."))
            display(Markdown(f"$t_{{obt}} = {{\\frac{{M - \\mu}}{{s_M}}}}$"))
            display(Markdown(f"$t_{{obt}} = \\frac{{{self.means[0]} - {self.null}}}{{{sem}}}$"))
            display(Markdown(f"$t_{{obt}} = \\frac{{{self.means[0] - self.null}}}{{{sem}}}$"))
            display(Markdown(f"$t_{{obt}} = {{{self.obt}}}$"))
            print() # blank space
            # print calculations for cohen's d
            display(Markdown("calculating Cohen's d..."))
            display(Markdown("Cohen's d = $\\frac{{M - \\mu}}{{s}}$"))
            display(Markdown(f"Cohen's d = $\\frac{{{self.means[0]} - {self.null}}}{{{self.std[0]}}}$"))
            display(Markdown(f"Cohen's d = $\\frac{{{self.means[0] - self.null}}}{{{self.std[0]}}}$"))
            display(Markdown(f"Cohen's d = ${{{self.effect_size}}}$"))
            print() # blank space

            self.significance = self.final_decision()
            self.write_result()

            # return self.obt - not returning b/c it was printing out the value of self.obt.  need to figure out why but commenting out fixed it        


    def independent_samples_t_test(self):
        if len(self.df.columns) == 1 or len(self.df.columns) > 2:
            raise ValueError("Data does not contain two samples")
        elif len(self.df.columns) == 0:
            raise ValueError("Dataframe error: no data columns")
        else: 
            self.test = "independent-samples t-test"           
                        
            # set the null and write out the question
            self.set_null_hypothesis()
            self.critical_value()
            self.generate_question()
            
            # primary calculations
            pooled_var = round(((self.ss[0] + self.ss[1]) / ((self.n - 1) + (self.n - 1))), 2)
            sem = round(math.sqrt((round((pooled_var/self.n),2))+(round((pooled_var/self.n),2))),2)
            self.obt = round(((self.means[0] - self.means[1]) - self.null) / sem, 2)
            self.effect_size = round(((self.means[0] - self.means[1])) / round(math.sqrt(pooled_var),2), 2)

            # TODO adapt to display in the terminal or a notebook
            # display the caluclations for the pooled variance
            print("calculating the pooled variance...")
            display(Markdown("$s_p^2 = {{\\frac{{SS_A + SS_B}}{{df_A + df_B}}}}$"))
            display(Markdown(f"$s_p^2 = {{\\frac{{{self.ss[0]} + {self.ss[1]}}}{{{self.n - 1} + {self.n - 1}}}}}$"))
            display(Markdown(f"$s_p^2 = \\frac{{{round(self.ss[0] + self.ss[1],2)}}}{{{(self.n - 1) + (self.n - 1)}}}$"))
            display(Markdown(f"$s_p^2 = {{{pooled_var}}}$"))
            # display the calculations for the estimated standard error
            print("calculating the estimated standard error of the difference between means...")
            display(Markdown("$s_{{(M_A - M_B)}} = \\sqrt{{\\frac{{s_p^2}}{{n_1}} + \\frac{{s_p^2}}{{n_1}}}}$"))
            display(Markdown(f"$s_{{(M_A - M_B)}} = \\sqrt{{\\frac{{{pooled_var}}}{{{self.n}}} + \\frac{{{pooled_var}}}{{{self.n}}}}}$"))
            display(Markdown(f"$s_{{(M_A - M_B)}} = \\sqrt{{{round(pooled_var/self.n, 2)} + {round(pooled_var/self.n, 2)}}}$"))
            display(Markdown(f"$s_{{(M_A - M_B)}} = \\sqrt{{{round(pooled_var/self.n, 2) + round(pooled_var/self.n, 2)}}}$"))
            display(Markdown(f"$s_{{(M_A - M_B)}} = {{{sem}}}$"))
            # display the caluclations for t_obt
            display(Markdown("calculating $t_{{obt}}$..."))
            display(Markdown(f"$t_{{obt}} = {{\\frac{{(M_A - M_B) - (\\mu_A - \\mu_B)}}{{s_{{(M_A - M_B)}}}}}}$"))
            display(Markdown(f"$t_{{obt}} = \\frac{{({self.means[0]} - {self.means[1]}) - {{{self.null}}}}}{{{sem}}}$")) 
            display(Markdown(f"$t_{{obt}} = \\frac{{{round(self.means[0] - self.means[1] - self.null, 2)}}}{{{sem}}}$"))
            display(Markdown(f"$t_{{obt}} = {{{self.obt}}}$"))
            print() # blank space
            # print calculations for cohen's d
            display(Markdown("calculating Cohen's d..."))
            display(Markdown("Cohen's d = $\\frac{{M_A - M_B}}{{\\sqrt{{{s_p^2}}}}}$"))
            display(Markdown(f"Cohen's d = $\\frac{{({self.means[0]} - {self.means[1]})}}{{{{{{\\sqrt{{{pooled_var}}}}}}}}}$"))
            display(Markdown(f"Cohen's d = $\\frac{{({self.means[0] - self.means[1]})}}{{{round(math.sqrt(pooled_var),2)}}}$"))
            display(Markdown(f"Cohen's d = ${{{self.effect_size}}}$"))
            print() # blank space

            self.significance = self.final_decision()
            self.write_result()

            # return self.obt - not returning b/c it was printing out the value of self.obt.  need to figure out why but commenting out fixed it
        

    def dependent_samples_t_test(self):
        if len(self.df.columns) == 1 or len(self.df.columns) > 2:
            raise ValueError("Data does not contain two samples")
        elif len(self.df.columns) == 0:
            raise ValueError("Dataframe error: no data columns")
        else:
            self.test = "dependent-samples t-test"     
            
            # set the null and write out the question
            self.set_null_hypothesis()
            self.critical_value()
            self.generate_question()

            # primary calculations
            # need to gather the difference scores
            self.df['D'] = self.df['B'] - self.df['A']
            
            # print the dataframe with the difference scores
            display(Markdown("Calculating the difference scores $D = X_B - X_A$"))
            print(self.df.to_string(index = False))
            print() # blank space
            # Calculate the Mean of the Difference Scores
            sum_d = self.df['D'].sum()
            n = len(self.df['D'])
            mean_d = round(sum_d/n, 2)
            display(Markdown("Calculating the Mean of the Difference Scores..."))
            display(Markdown("$M_D = \\frac{{\\Sigma D}}{{n}}$"))
            display(Markdown(f"$M_D = \\frac{{{sum_d}}}{{{n}}}$"))
            display(Markdown(f"$M_D = {{{mean_d}}}$"))
            print() # blank space
            # calculate the SS for the difference scores
            self.df['D^2'] = self.df['D'].apply(lambda x: x ** 2)
            sum_sqared_scores = self.df['D^2'].sum()
            ss = round(sum_sqared_scores - round((sum_d ** 2)/n, 2), 2)
            # print the dataframe with the squared difference scores
            display(Markdown("Calculating the sum of the squared deviations..."))
            print(self.df.to_string(index = False))
            display(Markdown("$ SS_D = \\Sigma D^2 - \\frac{{(\\Sigma D)^2}}{{n}}$"))
            display(Markdown(f"$ SS_D = {{{sum_sqared_scores}}} - \\frac{{{sum_d ** 2}}}{{{n}}}$"))
            display(Markdown(f"$ SS_D = {{{sum_sqared_scores}}} - {{{round((sum_d ** 2)/n, 2)}}}$"))
            display(Markdown(f"$ SS_D = {{{ss}}}$"))
            print() # blank space
            # calculate the variance    
            variance = round(ss / (n - 1), 2)
            display(Markdown("$ s^2 = \\frac{{SS_D}}{{df}}$")) 
            display(Markdown(f"$ s^2 = \\frac{{{ss}}}{{{n - 1}}}$"))   
            display(Markdown(f"$ s^2 = \\frac{{{round(ss/(n - 1), 2)}}}$"))
            display(Markdown(f"$ s^2 = {{{variance}}}$"))  
            print() # blank space
            # Calculate the estimated standard error
            sem = round(math.sqrt(variance/n), 2)
            display(Markdown("Calculating the estimated standard error..."))
            display(Markdown("$ s_{M_D} = \\sqrt{{\\frac{{s^2}}{{n}}}}$"))
            display(Markdown(f"$ s_{{M_D}} = \\sqrt{{\\frac{{{variance}}}{{{n}}}}}$"))
            display(Markdown(f"$ s_{{M_D}} = \\sqrt{{{round(variance/n, 2)}}}$"))
            display(Markdown(f"$ s_{{M_D}} = {{{sem}}}$"))
            print() # blank space
            # caclulate the t-statistic
            self.obt = round((mean_d - self.null) / sem, 2)
            display(Markdown("calculating $t_{{obt}}$..."))
            display(Markdown("$t_{{obt}} = {{\\frac{{M_D - \\mu_D}}{{s_{M_D}}}}}$"))
            display(Markdown(f"$t_{{obt}} = \\frac{{{mean_d} - {self.null}}}{{{sem}}}$"))
            display(Markdown(f"$t_{{obt}} = \\frac{{{mean_d - self.null}}}{{{sem}}}$"))
            display(Markdown(f"$t_{{obt}} = {{{self.obt}}}$"))
            print() # blank space
            # print calculations for cohen's d
            self.effect_size = round(mean_d / round(math.sqrt(variance),2), 2)
            display(Markdown("calculating Cohen's d..."))
            display(Markdown("Cohen's d = $\\frac{{M_D}}{{\\sqrt{{s^2}}}}$"))
            display(Markdown(f"Cohen's d = $\\frac{{{mean_d}}}{{{{{{\\sqrt{{{variance}}}}}}}}}$"))
            display(Markdown(f"Cohen's d = $\\frac{{{mean_d}}}{{{round(math.sqrt(variance),2)}}}$"))
            display(Markdown(f"Cohen's d = ${{{self.effect_size}}}$"))
            print() # blank space

            self.significance = self.final_decision()
            self.write_result()

            # return self.obt - not returning b/c it was printing out the value of self.obt.  need to figure out why but commenting out fixed it


    def one_way_anova(self):
        if len(self.df.columns) == 1:
            raise ValueError("Data does not contain at least two samples")
        elif len(self.df.columns) == 0:
            raise ValueError("Dataframe error: no data columns")
        else:
            self.test = "one-way ANOVA"     

            # set the null and write out the question
            self.set_null_hypothesis()
            self.critical_value()
            self.generate_question()

            # Primary Calculations
            big_n = self.groups * self.n
            
            # degrees of freedom
            df_total = big_n - 1
            df_between = self.groups - 1
            df_within = big_n - self.groups

            print() # blank space
            print("calculating the degrees of freedom...")
    
            display(Markdown(f"$df_{{total}} = N - 1$"))
            display(Markdown(f"$df_{{total}} = {{{big_n}}} - 1$"))
            display(Markdown(f"$df_{{total}} = {{{df_total}}}$"))
            print() # blank space

            display(Markdown(f"$df_{{between}} = k - 1 $"))
            display(Markdown(f"$df_{{between}} = {{{self.groups}}} - 1 $"))
            display(Markdown(f"$df_{{between}} = {{{df_between}}}$"))
            print() # blank space

            display(Markdown(f"$df_{{within}} = N - K $"))
            display(Markdown(f"$df_{{within}} = {{{big_n}}} - {{{self.groups}}} $"))
            display(Markdown(f"$df_{{within}} = {{{df_within}}}$"))
            print() # blank space

            # sum of squares
            ss_total = self.sum_squared_scores - round(((self.g**2)/big_n), 2)
            ss_within = 0
            for group in range(self.groups):
                ss_within += self.ss[group]
            ss_between = ss_total - ss_within

            print("calulating the sum of squares...")
            display(Markdown(f"$ SS_{{total}} = \\Sigma X^2 - \\frac{{G^2}}{{N}} $"))
            display(Markdown(f"$ SS_{{total}} = {{{self.sum_squared_scores}}} - \\frac{{{self.g}^2}}{{{big_n}}} $"))
            display(Markdown(f"$ SS_{{total}} = {{{self.sum_squared_scores}}} - \\frac{{{self.g**2}}}{{{big_n}}} $"))
            display(Markdown(f"$ SS_{{total}} = {{{self.sum_squared_scores}}} - {{{round(((self.g**2)/big_n), 2)}}} $"))
            display(Markdown(f"$ SS_{{total}} = {{{round(ss_total, 2)}}} $"))
            print() # blank space

            display(Markdown(f"$ SS_{{within}} = \\Sigma SS_{{inside\\_each\\_condition}} $"))
            values = ""
            for group in range(self.groups):
                if group == 0:
                    values += f"{self.ss[group]}"
                else:
                    values += f" + {self.ss[group]}"
            display(Markdown(f"$ SS_{{within}} = {{{values}}}$"))
            display(Markdown(f"$ SS_{{within}} = {{{round(ss_within, 2)}}}$"))
            print() # blank space

            display(Markdown(f"$ SS_{{between}} = SS_{{total}} - SS_{{within}} $"))
            display(Markdown(f"$ SS_{{between}} = {{{round(ss_total, 2)}}} - {{{round(ss_within, 2)}}} $"))
            display(Markdown(f"$ SS_{{between}} = {{{round(ss_between, 2)}}} $"))
            print() # blank space

            display(Markdown("note: the other way to calculate $SS_{{betwen}}$ is:"))
            display(Markdown("$ SS_{{between}} = \\Sigma{{\\frac{{T^2}}{{n}}}} - \\frac{{G^2}}{{N}} $"))
            print() # blank space

            # mean squares
            ms_between = round(round(ss_between, 2)/df_between, 2)
            ms_within = round(round(ss_within, 2)/df_within, 2)

            print("calculating the mean squares...")
            display(Markdown(f"$ MS_{{between}} = \\frac{{SS_{{between}}}}{{df_{{between}}}} $"))
            display(Markdown(f"$ MS_{{between}} = \\frac{{{round(ss_between, 2)}}}{{{df_between}}} $"))
            display(Markdown(f"$ MS_{{between}} = {{{round(ms_between, 2)}}} $"))
            print() # blank space

            display(Markdown(f"$ MS_{{within}} = \\frac{{SS_{{within}}}}{{df_{{within}}}} $"))
            display(Markdown(f"$ MS_{{within}} = \\frac{{{round(ss_within, 2)}}}{{{df_within}}} $"))
            display(Markdown(f"$ MS_{{within}} = {{{round(ms_within, 2)}}} $"))
            print() # blank space

            # F obtained
            self.obt = round(ms_between/ms_within, 2)

            print("calculating the f ratio...")
            display(Markdown(f"$ F_{{obt}} = \\frac{{MS_{{between}}}}{{MS_{{within}}}} $"))
            display(Markdown(f"$ F_{{obt}} = \\frac{{{round(ms_between,2)}}}{{{round(ms_within,2)}}} $"))
            display(Markdown(f"$ F_{{obt}} = {{{round(self.obt, 2)}}} $"))
            print() # blank space

            # effect size
            self.effect_size = round(round(ss_between, 2)/round(ss_total, 2), 2)
            print("calculating eta squared...")
            display(Markdown(f"$ \\eta^2 = \\frac{{SS{{between}}}}{{SS_{{total}}}} $"))
            display(Markdown(f"$ \\eta^2 = \\frac{{{round(ss_between, 2)}}}{{{round(ss_total, 2)}}} $"))
            display(Markdown(f"$ \\eta^2 = {{{round(self.effect_size, 2)}}} $"))
            print() # blank space

            self.significance = self.final_decision()
            self.write_result()
            print() # blank space

            # Tukeys HSD

# One-Way ANOVA

Conducting a one-way ANOVA.  `groups` needs to be set to 2 or more.  The value for `n` (the number of scores in each group) can be any whole number of 2 or more.

In [6]:
RandomData(groups = 4, n = 10).one_way_anova()

    A   B    C   D
0  69  16  103  48
1  64  11   99  45
2  65  17   85  41
3  37  18   90  44
4  88  16   91  50
5  81  18   84  32
6  77  15   88  26
7  92  12  142  47
8  84  15  117  48
9  84  16   92  42


Given the following between-subjects data, use a one-way ANOVA with $\alpha = {0.05}$

$G = {2309}, \Sigma X^2 = {179237}, k = {4}, N = {40}$

$T_{A} = {741}, SS_{A} = {2352.9}$

$T_{B} = {154}, SS_{B} = {48.4}$

$T_{C} = {991}, SS_{C} = {2924.9}$

$T_{D} = {423}, SS_{D} = {530.1}$


calculating the degrees of freedom...


$df_{total} = N - 1$

$df_{total} = {40} - 1$

$df_{total} = {39}$




$df_{between} = k - 1 $

$df_{between} = {4} - 1 $

$df_{between} = {3}$




$df_{within} = N - k $

$df_{within} = {40} - {4} $

$df_{within} = {36}$


calculating the sum of squares...


$ SS_{total} = \Sigma X^2 - \frac{G^2}{N} $

$ SS_{total} = {179237} - \frac{2309^2}{40} $

$ SS_{total} = {179237} - \frac{5331481}{40} $

$ SS_{total} = {179237} - {133287.02} $

$ SS_{total} = {45949.98} $




$ SS_{within} = \Sigma SS_{inside\_each\_condition} $

$ SS_{within} = {2352.9 + 48.4 + 2924.9 + 530.1}$

$ SS_{within} = {5856.3}$




$ SS_{between} = SS_{total} - SS_{within} $

$ SS_{between} = {45949.98} - {5856.3} $

$ SS_{between} = {40093.68} $




note: the other way to calculate $SS_{between}$ is:

$ SS_{between} = \Sigma{\frac{T^2}{n}} - \frac{G^2}{N} $


calculating the mean squares...


$ MS_{between} = \frac{SS_{between}}{df_{between}} $

$ MS_{between} = \frac{40093.68}{3} $

$ MS_{between} = {13364.56} $




$ MS_{within} = \frac{SS_{within}}{df_{within}} $

$ MS_{within} = \frac{5856.3}{36} $

$ MS_{within} = {162.68} $


calculating the f ratio...


$ F_{obt} = \frac{MS_{between}}{MS_{within}} $

$ F_{obt} = \frac{13364.56}{162.68} $

$ F_{obt} = {82.15} $


calculating eta squared...


$ \eta^2 = \frac{SS_{between}}{SS_{total}} $

$ \eta^2 = \frac{40093.68}{45949.98} $

$ \eta^2 = {0.87} $




$F_{crit} = {2.87}, \alpha = {0.05}$

reject the null hypothesis, results are significant, $ F({3}, {36}) = {82.15}, p < {0.05}, \eta^2 = {0.87}$


