Exploring DSC SET Data

In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from statsmodels.formula.api import ols

In [40]:
# Load the raw SET export. It is kept under its own name so the cleaning steps
# below always start from the untouched file contents when the cell is re-run.
dsc_sets_raw = pd.read_csv('datasets/dsc_sets.csv')

# The response-rate column appears under two headers that differ only in
# whitespace; take the first and fall back to the second where it is missing.
enrolled_resp_rate = dsc_sets_raw["Enrolled/ Resp Rate"].fillna(
    dsc_sets_raw["Enrolled/  Resp Rate"]
)

# Keep only the first number found in "Avg Grade Received" (drops the letter).
# expand=False makes str.extract return a Series, so a Series (not a one-column
# DataFrame) is what gets stored back into the column.
avg_grade_received = (
    dsc_sets_raw["Avg Grade Received"]
    .str.extract(r'(\d+\.\d+|\d+)', expand=False)
    .astype(float)
)

dsc_sets = (
    dsc_sets_raw
    .drop(columns=["Enrolled/ Resp Rate", "Enrolled/  Resp Rate"])
    .assign(**{
        "Enrolled Resp Rate": enrolled_resp_rate,
        "Avg Grade Received": avg_grade_received,
    })
)

# Rename columns for easier (formula / attribute) access. Both replacements are
# literal: regex=False is passed explicitly because the default for
# str.replace differs between pandas versions.
dsc_sets.columns = (
    dsc_sets.columns
    .str.replace('*', '', regex=False)
    .str.replace(' ', '_', regex=False)
)

# Delete unnecessary columns.
# NOTE(review): a later preview in this notebook still shows a `Course` column;
# confirm that dropping it here matches the state the later cells expect.
dsc_sets = dsc_sets.drop(columns=["Course"])
dsc_sets.head()

Unnamed: 0,Instructor,Term,Avg_Grade_Received,Avg_Hours_Worked,Learning_Average,Structure_Average,Environment_Average,course_title,Enrolled_Resp_Rate
0,"Bellur, Umesh",WI25,3.93,9.93,3.74,3.55,4.12,dsc180b,114 (14.04%)
1,"Bellur, Umesh",WI25,3.99,11.89,4.02,3.74,3.8,dsc180b,118 (16.95%)
2,"Rampure, Suraj",WI24,3.96,9.6,4.6,4.59,4.58,dsc180b,97 (10.31%)
3,"Rampure, Suraj",WI24,3.99,6.23,4.57,4.57,4.53,dsc180b,138 (9.42%)
4,"Weng, Tsui-Wei (lily)",SP25,3.71,7.38,4.19,4.13,4.24,dsc140b,124 (45.16%)


In [42]:
# Read the WebReg results CSV into a DataFrame and preview its first five rows.
webreg_data = pd.read_csv(
    filepath_or_buffer='webreg_data/results/webreg_processed_data.csv',
)
webreg_data.head(n=5)

Unnamed: 0,course,course_number,quarter,quarter_label,quarter_numeric,enrolled,available,waitlisted,total_capacity,division,...,waitlist_rate,available_rate,demand_pressure,is_oversubscribed,has_waitlist,quarters_offered,is_every_quarter,is_fall,is_winter,is_spring
0,DSC_80,80,fa24,Fall 2024,1,202,38,0,240,lower_division,...,0.0,15.83,84.166667,0,0,3,1,1,0,0
1,DSC_95,95,fa24,Fall 2024,1,10,40,0,50,lower_division,...,0.0,80.0,20.0,0,0,3,1,1,0,0
2,DSC_90,90,fa24,Fall 2024,1,7,13,0,20,lower_division,...,0.0,65.0,35.0,0,0,1,0,1,0,0
3,DSC_20,20,fa24,Fall 2024,1,74,76,0,150,lower_division,...,0.0,50.67,49.333333,0,0,3,1,1,0,0
4,DSC_40A,40,fa24,Fall 2024,1,158,7,0,165,lower_division,...,0.0,4.24,95.757576,1,0,3,1,1,0,0


Unnamed: 0,Instructor,Course,Term,Avg_Grade_Received,Avg_Hours_Worked,Learning_Average,Structure_Average,Environment_Average,course_title,Enrolled_Resp_Rate,division,is_elective
16,"Mishne, Gal",DSC 120 - Signal Processing/Data Analys (A00),FA23,3.08,9.44,4.64,4.34,4.29,dsc120,29 (31.03%),Upper,Elective
17,"Roberts, Margaret Earling",DSC 161 - Text as Data (A00),SP25,3.73,4.6,4.6,4.6,4.6,dsc161,32 (15.63%),Upper,Elective
18,"Roberts, Margaret Earling",DSC 161 - Text as Data (A00),WI24,3.5,6.5,4.75,4.75,4.75,dsc161,14 (28.57%),Upper,Elective
63,"Tiefenbruck, Janine LoBue",DSC 95 - Tutor Appshp in Data Science (A00),SP25,,1.5,5.0,5.0,5.0,dsc95,22 (18.18%),Lower,Elective
64,"Langlois, Marina",DSC 95 - Tutor Appshp in Data Science (A00),WI25,,1.67,5.0,5.0,5.0,dsc95,24 (16.67%),Lower,Elective


### Two-Way ANOVA: Division × Elective (on Learning Average)
Factors
- Division: Lower vs Upper
- Elective: Required vs Elective
- Interaction: Division × Elective

In [32]:
# Two-way ANOVA on Learning_Average with both factors treated as categorical;
# the `*` in the formula expands to both main effects plus their interaction.
# NOTE(review): `division` and `is_elective` are not created by the cells
# visible in this part of the notebook -- confirm the cell that derives them
# runs before this one on a fresh kernel.
anova_formula = "Learning_Average ~ C(division) * C(is_elective)"
fitted_ols = ols(formula=anova_formula, data=dsc_sets)
model = fitted_ols.fit()

# Type II sums of squares for each term of the fitted model.
sm.stats.anova_lm(model, typ=2)

Unnamed: 0,sum_sq,df,F,PR(>F)
C(division),0.725219,1.0,7.980119,0.005406
C(is_elective),0.918282,1.0,10.104538,0.001813
C(division):C(is_elective),0.012054,1.0,0.132639,0.71625
Residual,12.995581,143.0,,
