# WebReg Statistical Analysis

**Objective**: Analyze enrollment data to identify key features predicting utilization rates, perform hypothesis testing, and calculate feature importance.


## Setup


In [2]:
pip install statsmodels

Collecting statsmodels
  Downloading statsmodels-0.14.6-cp312-cp312-macosx_11_0_arm64.whl.metadata (9.5 kB)
Collecting patsy>=0.5.6 (from statsmodels)
  Downloading patsy-1.0.2-py2.py3-none-any.whl.metadata (3.6 kB)
Downloading statsmodels-0.14.6-cp312-cp312-macosx_11_0_arm64.whl (10.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.0/10.0 MB[0m [31m7.4 MB/s[0m  [33m0:00:01[0mm0:00:01[0m0:01[0mm
[?25hDownloading patsy-1.0.2-py2.py3-none-any.whl (233 kB)
Installing collected packages: patsy, statsmodels
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2/2[0m [statsmodels][0m [statsmodels]
[1A[2KSuccessfully installed patsy-1.0.2 statsmodels-0.14.6

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.2[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use up

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import re
import warnings
warnings.filterwarnings('ignore')

from scipy.stats import ttest_ind, f_oneway, chi2_contingency, pearsonr, spearmanr
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm
import statsmodels.api as sm

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.inspection import permutation_importance

sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (14, 8)


## Data Loading & Feature Engineering


In [4]:
def load_all_webreg_data():
    quarters = ['fa24', 'wi25', 'sp25']
    quarter_labels = {'fa24': 'Fall 2024', 'wi25': 'Winter 2025', 'sp25': 'Spring 2025'}
    quarter_numeric = {'fa24': 1, 'wi25': 2, 'sp25': 3}
    
    all_data = []
    
    for quarter in quarters:
        for division in ['lower_division', 'upper_division']:
            data_path = Path(f'webreg_data/{quarter}/{division}')
            if not data_path.exists():
                continue
            
            for csv_file in data_path.glob('DSC_*.csv'):
                df = pd.read_csv(csv_file)
                if len(df) == 0 or df['total'].iloc[-1] == 0:
                    continue
                
                final_row = df.iloc[-1]
                course_name = csv_file.stem
                course_num = course_name.replace('DSC_', '')
                
                match = re.match(r'(\d+)', course_num)
                if not match:
                    continue
                course_number = int(match.group(1))
                
                record = {
                    'course': course_name,
                    'course_number': course_number,
                    'quarter': quarter,
                    'quarter_label': quarter_labels[quarter],
                    'quarter_numeric': quarter_numeric[quarter],
                    'enrolled': final_row['enrolled'],
                    'available': final_row['available'],
                    'waitlisted': final_row['waitlisted'],
                    'total_capacity': final_row['total'],
                    'division': division,
                    'is_upper_division': 1 if division == 'upper_division' else 0,
                    'is_lower_division': 1 if division == 'lower_division' else 0,
                }
                all_data.append(record)
    
    return pd.DataFrame(all_data)

df = load_all_webreg_data()
print(f"Loaded {len(df)} observations, {df['course'].nunique()} unique courses")


Loaded 45 observations, 20 unique courses


In [5]:
# Course classification
REQUIRED_COURSES = {
    'DSC_10', 'DSC_20', 'DSC_30', 'DSC_40A', 'DSC_40B', 'DSC_80',
    'DSC_100', 'DSC_102', 'DSC_106', 'DSC_180A', 'DSC_180B', 'DSC_190'
}

# Add features
df['is_required'] = df['course'].apply(lambda x: 1 if x in REQUIRED_COURSES else 0)
df['is_elective'] = 1 - df['is_required']
df['requirement_type'] = df['is_required'].map({1: 'Required', 0: 'Elective'})

df['utilization_rate'] = (df['enrolled'] / df['total_capacity'] * 100).round(2)
df['waitlist_rate'] = (df['waitlisted'] / df['total_capacity'] * 100).round(2)
df['available_rate'] = (df['available'] / df['total_capacity'] * 100).round(2)
df['demand_pressure'] = (df['enrolled'] + df['waitlisted']) / df['total_capacity'] * 100
df['is_oversubscribed'] = (df['utilization_rate'] >= 95).astype(int)
df['has_waitlist'] = (df['waitlisted'] > 0).astype(int)

course_frequency = df.groupby('course')['quarter'].nunique().to_dict()
df['quarters_offered'] = df['course'].map(course_frequency)
df['is_every_quarter'] = (df['quarters_offered'] == 3).astype(int)

df['is_fall'] = (df['quarter'] == 'fa24').astype(int)
df['is_winter'] = (df['quarter'] == 'wi25').astype(int)
df['is_spring'] = (df['quarter'] == 'sp25').astype(int)

df.to_csv('webreg_processed_data.csv', index=False)
df.head()


Unnamed: 0,course,course_number,quarter,quarter_label,quarter_numeric,enrolled,available,waitlisted,total_capacity,division,...,waitlist_rate,available_rate,demand_pressure,is_oversubscribed,has_waitlist,quarters_offered,is_every_quarter,is_fall,is_winter,is_spring
0,DSC_80,80,fa24,Fall 2024,1,202,38,0,240,lower_division,...,0.0,15.83,84.166667,0,0,3,1,1,0,0
1,DSC_95,95,fa24,Fall 2024,1,10,40,0,50,lower_division,...,0.0,80.0,20.0,0,0,3,1,1,0,0
2,DSC_90,90,fa24,Fall 2024,1,7,13,0,20,lower_division,...,0.0,65.0,35.0,0,0,1,0,1,0,0
3,DSC_20,20,fa24,Fall 2024,1,74,76,0,150,lower_division,...,0.0,50.67,49.333333,0,0,3,1,1,0,0
4,DSC_40A,40,fa24,Fall 2024,1,158,7,0,165,lower_division,...,0.0,4.24,95.757576,1,0,3,1,1,0,0


## Hypothesis Testing


### T-Test: Required vs. Elective


In [6]:
required_util = df[df['is_required'] == 1]['utilization_rate']
elective_util = df[df['is_required'] == 0]['utilization_rate']

t_stat, p_value = ttest_ind(required_util, elective_util)

print(f"Required: n={len(required_util)}, mean={required_util.mean():.2f}%")
print(f"Elective: n={len(elective_util)}, mean={elective_util.mean():.2f}%")
print(f"t-statistic: {t_stat:.4f}, p-value: {p_value:.6f}")
print(f"Significant: {'Yes' if p_value < 0.05 else 'No'}")


Required: n=31, mean=82.69%
Elective: n=14, mean=65.51%
t-statistic: 2.9789, p-value: 0.004741
Significant: Yes


### T-Test: Lower vs. Upper Division


In [7]:
lower_util = df[df['is_lower_division'] == 1]['utilization_rate']
upper_util = df[df['is_upper_division'] == 1]['utilization_rate']

t_stat, p_value = ttest_ind(lower_util, upper_util)

print(f"Lower: n={len(lower_util)}, mean={lower_util.mean():.2f}%")
print(f"Upper: n={len(upper_util)}, mean={upper_util.mean():.2f}%")
print(f"t-statistic: {t_stat:.4f}, p-value: {p_value:.6f}")
print(f"Significant: {'Yes' if p_value < 0.05 else 'No'}")


Lower: n=23, mean=73.64%
Upper: n=22, mean=81.23%
t-statistic: -1.3199, p-value: 0.193837
Significant: No


### ANOVA: Quarters & Two-Way


In [8]:
# One-way ANOVA: Quarters
groups = [df[df['quarter'] == q]['utilization_rate'].values for q in df['quarter'].unique()]
f_stat, p_value = f_oneway(*groups)
print(f"Quarters ANOVA: F={f_stat:.4f}, p={p_value:.6f}, Significant: {'Yes' if p_value < 0.05 else 'No'}")

# Two-way ANOVA: Division × Requirement
formula = 'utilization_rate ~ C(division) + C(requirement_type) + C(division):C(requirement_type)'
model = ols(formula, data=df).fit()
anova_table = anova_lm(model, typ=2)
anova_table


Quarters ANOVA: F=0.0686, p=0.933839, Significant: No


Unnamed: 0,sum_sq,df,F,PR(>F)
C(division),1392.289925,1.0,4.780872,0.03454
C(requirement_type),3591.633447,1.0,12.33302,0.001098
C(division):C(requirement_type),466.27885,1.0,1.601117,0.212889
Residual,11940.057864,41.0,,


### Chi-Square & Correlation


In [9]:
# Chi-square: Oversubscription by type
contingency_table = pd.crosstab(df['requirement_type'], df['is_oversubscribed'])
chi2, p_value, dof, expected = chi2_contingency(contingency_table)
print(f"Chi-square: χ²={chi2:.4f}, p={p_value:.6f}, Significant: {'Yes' if p_value < 0.05 else 'No'}")
print("\nContingency Table:")
print(contingency_table)

# Correlation: Capacity vs Utilization
pearson_r, pearson_p = pearsonr(df['total_capacity'], df['utilization_rate'])
spearman_r, spearman_p = spearmanr(df['total_capacity'], df['utilization_rate'])
print(f"\nCapacity vs Utilization:")
print(f"Pearson: r={pearson_r:.4f}, p={pearson_p:.6f}")
print(f"Spearman: ρ={spearman_r:.4f}, p={spearman_p:.6f}")


Chi-square: χ²=0.4775, p=0.489570, Significant: No

Contingency Table:
is_oversubscribed   0  1
requirement_type        
Elective           12  2
Required           22  9

Capacity vs Utilization:
Pearson: r=0.3139, p=0.035731
Spearman: ρ=0.2840, p=0.058692


## Linear Regression


### Simple Linear Regressions


In [10]:
predictors = ['is_required', 'is_upper_division', 'total_capacity', 'quarters_offered', 
              'is_every_quarter', 'waitlisted', 'demand_pressure', 'course_number']

results = []
for predictor in predictors:
    X = sm.add_constant(df[[predictor]])
    model = sm.OLS(df['utilization_rate'], X).fit()
    results.append({
        'predictor': predictor,
        'coefficient': model.params[1],
        'std_error': model.bse[1],
        't_stat': model.tvalues[1],
        'p_value': model.pvalues[1],
        'r_squared': model.rsquared
    })

simple_reg_results = pd.DataFrame(results).sort_values('p_value')
simple_reg_results.to_csv('simple_regression_results.csv', index=False)
simple_reg_results


Unnamed: 0,predictor,coefficient,std_error,t_stat,p_value,r_squared
6,demand_pressure,0.968276,0.011719,82.625554,5.116934e-49,0.993741
0,is_required,17.182903,5.768264,2.978869,0.004741498,0.171063
5,waitlisted,2.560658,1.117382,2.291658,0.02688462,0.10884
2,total_capacity,0.063655,0.02936,2.168098,0.03573073,0.098545
1,is_upper_division,7.59251,5.752137,1.319946,0.1938371,0.03894
7,course_number,0.01247,0.053131,0.234707,0.8155506,0.001279
4,is_every_quarter,1.133889,7.330539,0.15468,0.8777972,0.000556
3,quarters_offered,0.558336,3.989529,0.13995,0.8893526,0.000455


### Multiple Linear Regression


In [None]:
feature_columns = ['is_required', 'is_upper_division', 'total_capacity', 'quarters_offered',
                   'waitlisted', 'is_fall', 'is_winter', 'course_number']

X = sm.add_constant(df[feature_columns])
multi_model = sm.OLS(df['utilization_rate'], X).fit()

print(multi_model.summary())

coef_df = pd.DataFrame({
    'Feature': ['Intercept'] + feature_columns,
    'Coefficient': multi_model.params,
    'Std_Error': multi_model.bse,
    't_stat': multi_model.tvalues,
    'p_value': multi_model.pvalues
})
coef_df.to_csv('multiple_regression_coefficients.csv', index=False)
coef_df


## Feature Importance Analysis


### Method 1: Standardized Coefficients


In [None]:
X = df[feature_columns]
y = df['utilization_rate']

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

model_std = LinearRegression()
model_std.fit(X_scaled, y)

std_importance = pd.DataFrame({
    'Feature': feature_columns,
    'Std_Coefficient': model_std.coef_,
    'Abs_Std_Coefficient': np.abs(model_std.coef_)
}).sort_values('Abs_Std_Coefficient', ascending=False)

std_importance


### Method 2: Permutation Importance


In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

model_perm = LinearRegression()
model_perm.fit(X_train, y_train)

perm_importance = permutation_importance(model_perm, X_test, y_test, n_repeats=30, random_state=42)

perm_df = pd.DataFrame({
    'Feature': feature_columns,
    'Importance_Mean': perm_importance.importances_mean,
    'Importance_Std': perm_importance.importances_std
}).sort_values('Importance_Mean', ascending=False)

perm_df


### Method 3: Correlation-Based


In [None]:
corr_importance = []
for col in feature_columns:
    corr, p_val = pearsonr(df[col], df['utilization_rate'])
    corr_importance.append({
        'Feature': col,
        'Correlation': corr,
        'Abs_Correlation': abs(corr),
        'p_value': p_val
    })

corr_df = pd.DataFrame(corr_importance).sort_values('Abs_Correlation', ascending=False)
corr_df


### Combined Feature Importance Ranking


In [None]:
combined = std_importance[['Feature', 'Abs_Std_Coefficient']].copy()
combined = combined.merge(perm_df[['Feature', 'Importance_Mean']], on='Feature')
combined = combined.merge(corr_df[['Feature', 'Abs_Correlation']], on='Feature')

# Normalize to 0-1
for col in ['Abs_Std_Coefficient', 'Importance_Mean', 'Abs_Correlation']:
    max_val = combined[col].max()
    if max_val > 0:
        combined[f'{col}_norm'] = combined[col] / max_val

combined['Average_Importance'] = combined[['Abs_Std_Coefficient_norm', 
                                            'Importance_Mean_norm', 
                                            'Abs_Correlation_norm']].mean(axis=1)

combined = combined.sort_values('Average_Importance', ascending=False)
combined.to_csv('feature_importance_analysis.csv', index=False)
combined[['Feature', 'Abs_Std_Coefficient', 'Importance_Mean', 'Abs_Correlation', 'Average_Importance']]


## Model Performance


In [None]:
final_model = LinearRegression()
final_model.fit(X_train, y_train)

y_train_pred = final_model.predict(X_train)
y_test_pred = final_model.predict(X_test)

train_r2 = r2_score(y_train, y_train_pred)
test_r2 = r2_score(y_test, y_test_pred)
train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))

cv_scores = cross_val_score(final_model, X, y, cv=5, scoring='r2')

print(f"Train R²: {train_r2:.4f}, RMSE: {train_rmse:.4f}")
print(f"Test R²: {test_r2:.4f}, RMSE: {test_rmse:.4f}")
print(f"5-Fold CV R²: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")


## Summary Visualization


In [None]:
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# Feature importance bar plot
combined_plot = combined.sort_values('Average_Importance')
axes[0, 0].barh(combined_plot['Feature'], combined_plot['Average_Importance'], color='purple', alpha=0.7)
axes[0, 0].set_xlabel('Average Importance')
axes[0, 0].set_title('Combined Feature Importance', fontweight='bold')

# Actual vs Predicted
axes[0, 1].scatter(y_test, y_test_pred, alpha=0.6)
axes[0, 1].plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=2)
axes[0, 1].set_xlabel('Actual Utilization (%)')
axes[0, 1].set_ylabel('Predicted Utilization (%)')
axes[0, 1].set_title(f'Predictions (R²={test_r2:.4f})', fontweight='bold')

# Residual plot
residuals = y_test - y_test_pred
axes[1, 0].scatter(y_test_pred, residuals, alpha=0.6)
axes[1, 0].axhline(y=0, color='r', linestyle='--', lw=2)
axes[1, 0].set_xlabel('Predicted Utilization (%)')
axes[1, 0].set_ylabel('Residuals')
axes[1, 0].set_title('Residual Plot', fontweight='bold')

# Box plots by type
df['division_label'] = df['division'].map({'lower_division': 'Lower', 'upper_division': 'Upper'})
comparison_data = pd.DataFrame({
    'Utilization': pd.concat([df[df['is_required']==1]['utilization_rate'], 
                              df[df['is_required']==0]['utilization_rate']]),
    'Type': ['Required']*len(df[df['is_required']==1]) + ['Elective']*len(df[df['is_required']==0])
})
comparison_data.boxplot(column='Utilization', by='Type', ax=axes[1, 1])
axes[1, 1].set_title('Utilization by Course Type', fontweight='bold')
axes[1, 1].set_xlabel('')
plt.suptitle('')

plt.tight_layout()
plt.savefig('webreg_plots/analysis_summary.png', dpi=300, bbox_inches='tight')
plt.show()


## Summary

### Files Generated:
- `webreg_processed_data.csv` - Full processed dataset
- `simple_regression_results.csv` - Simple regression p-values
- `multiple_regression_coefficients.csv` - Multiple regression coefficients
- `feature_importance_analysis.csv` - Feature importance rankings

### Key Features Analyzed:
1. **is_required** - Required vs elective course
2. **is_upper_division** - Division level
3. **total_capacity** - Class size
4. **quarters_offered** - How many quarters course is offered
5. **waitlisted** - Number on waitlist
6. **is_fall/is_winter** - Quarter indicators
7. **course_number** - Course level (numeric)

### Next Steps:
- Use these features for multivariate regression model
- Reference p-values for report significance testing
- Apply feature importance rankings to guide model selection
