# Heirarchical Modelling 

Heirarchical modelling may be applicable any time the observation unit is nested within the randomization unit or unit at which the intervention occcurs. Examples include user level data where treatment occured at the geographic level, Student performance where the intervnetion wass at the class level....

In [1]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px 
import plotly.graph_objects as go
import statsmodels.api as sm
import statsmodels.formula.api as smf
import scipy

sns.set_theme(context = 'notebook', style = 'whitegrid')

## Data Generation: College student class performance given 5 different teaching methods  

### Assign students to an instructors and instructors to methods (creating the nested structure in the data)

In [82]:

num_student = 100000
num_instructor = 50
num_method = 5

all_students = np.arange(0, num_student, step =1)
all_instructors = np.arange(0, num_instructor, step = 1)
all_methods = np.arange(0, num_method, 1)

students_per_instructor = int(num_student/num_instructor)
instructors_per_method = int(num_instructor/num_method)

### 1) Assign students to an instrcutor
student_assignment = []

for i in all_instructors:
    # randomly assign the appropriate number of students to an instructor 
    assigned_students = np.random.choice(all_students, size = students_per_instructor, replace = False)
    instructor = np.repeat(i, repeats = students_per_instructor)
    assignment = np.column_stack((assigned_students, instructor)).tolist()
    # remove students from the pool that have already been assigned
    all_students = np.setdiff1d(all_students, assigned_students)
    # 'extend' avoids nested list issues for simple df creation 
    student_assignment.extend(assignment)

df1 = pd.DataFrame(student_assignment, columns=['student','instructor']).sort_values(by = 'student').reset_index(drop = True)

### 2) Assign instructors to a method
instructor_assignment = []

for i in all_methods:
    # randomly assign the appropriate number of students to an instructor 
    assigned_instructors = np.random.choice(all_instructors, size = instructors_per_method, replace = False)
    method = np.repeat(i, repeats = instructors_per_method)
    assignment = np.column_stack((assigned_instructors, method)).tolist()
    # remove students from the pool that have already been assigned
    all_instructors = np.setdiff1d(all_instructors, assigned_instructors)
    # 'extend' avoids nested list issues for simple df creation 
    instructor_assignment.extend(assignment)

df2 = pd.DataFrame(instructor_assignment, columns=['instructor','method']).sort_values(by = 'instructor').reset_index(drop = True)

df = df1.merge(df2, on = 'instructor', how = 'left')


### 3) Generating the dependant variable and effects of instrcutor/method
instructor_effect = pd.Series(np.random.normal(loc= 1, scale = .1, size = num_instructor), name = 'instructor_effect')
method_effect = pd.Series(np.random.normal(loc= 1, scale = .05, size = num_method), name = 'method_effect')
print("typical instructor effect (in absolute terms)", ((instructor_effect -1).abs().mean())*100, "%")
print("typical method effect (in absolute terms)", ((method_effect -1).abs().mean())*100, "%")
display(instructor_effect.sample(5))
display(method_effect)

# Adding in a continuous variable so regression makes sense :) high school gps
df['hs_gpa'] = np.random.normal(loc = 3,scale = .25, size = num_student)
df.loc[df['hs_gpa'] > 4] == 4



df = df.merge(instructor_effect, left_on='instructor', right_index=True, how = 'left')
df = df.merge(method_effect, left_on='method', right_index=True, how = 'left')
df['score'] = (20 + 20*df['hs_gpa'])*np.random.normal(loc = 75, scale = 10, size = num_student)*df.instructor_effect*df.method_effect
df.loc[df['score'] > 100, 'score'] == 100



display(df.head())



typical instructor effect (in absolute terms) 7.675879307738049 %
typical method effect (in absolute terms) 1.9808051056535558 %


38    1.146159
18    1.102902
19    0.999004
23    0.847660
21    1.072892
Name: instructor_effect, dtype: float64

0    1.041167
1    0.991910
2    1.024238
3    1.009130
4    0.983585
Name: method_effect, dtype: float64

Unnamed: 0,student,instructor,method,hs_gpa,instructor_effect,method_effect,score
0,0,13,0,3.187389,0.993342,1.041167,8084.161662
1,1,49,2,2.654985,0.896771,1.024238,5221.355664
2,2,8,1,3.116515,0.916649,0.99191,5952.946253
3,3,2,2,3.065969,0.956179,1.024238,7175.601085
4,4,14,3,2.787788,1.067673,1.00913,6461.992435


In [79]:
df = df.astype({'student':'object','instructor':'object','method':'object'})

In [83]:

fig = px.histogram(df, x = 'score', color = 'method', marginal= 'box', opacity = .5)
fig.update_layout(height = 500, width = 1000, barmode = 'overlay')
fig.show()

In [85]:
mixed = smf.mixedlm("score ~ hs_gpa", data = df,  groups = df['instructor'])
print(mixed.fit().summary())

            Mixed Linear Model Regression Results
Model:             MixedLM  Dependent Variable:  score       
No. Observations:  100000   Method:              REML        
No. Groups:        50       Scale:               677760.3860 
Min. group size:   2000     Log-Likelihood:      -813384.7301
Max. group size:   2000     Converged:           Yes         
Mean group size:   2000.0                                    
-------------------------------------------------------------
            Coef.    Std.Err.    z    P>|z|  [0.025   0.975] 
-------------------------------------------------------------
Intercept   1514.610   88.574  17.100 0.000 1341.009 1688.212
hs_gpa      1538.677   10.430 147.527 0.000 1518.235 1559.119
Group Var 343017.691   84.892                                



In [87]:
vcf = {'student':'0+C(student)'}
mixed = smf.mixedlm("score ~ hs_gpa", data = df,  groups = df['instructor'], vc_formula = vcf)
print(mixed.fit().summary())

KeyboardInterrupt: 