In [19]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import numpy as np
from pathlib import Path
import re

In [20]:
# ------------------------------------------------------------
# Helper Function: Extract numeric and optional letter suffix
# ------------------------------------------------------------
def extract_course_num(course):
    """Split a course code such as DSC_40A or DSC_106 into a sortable key.

    Returns a ``(number, suffix)`` tuple built from the first run of digits
    found in ``course`` and the ASCII letters that immediately follow it,
    e.g. ``(40, 'A')`` or ``(106, '')``. A code containing no digits yields
    ``(0, '')``.
    """
    found = re.search(r'(\d+)([A-Za-z]*)', course)
    if found is None:
        return (0, '')
    digits, letters = found.groups()
    return (int(digits), letters)

In [21]:
# ------------------------------------------------------------
# Visualization Style (for later plots if needed)
# ------------------------------------------------------------
# Apply seaborn's "whitegrid" style first, then override the default figure
# size for every plot created afterwards
sns.set_style(style="whitegrid")
plt.rcParams.update({'figure.figsize': (16, 10)})

# ------------------------------------------------------------
# Quarters Configuration
# ------------------------------------------------------------
# Quarter codes in academic-calendar order (Fall -> Winter -> Spring) so that
# every loop over `quarters` reports the terms chronologically. The codes are
# also the sub-directory names looked up under webreg_data/.
quarters = ['fa24', 'wi25', 'sp25']
# Human-readable label for each quarter code
quarter_labels = {'fa24': 'Fall 2024', 'wi25': 'Winter 2025', 'sp25': 'Spring 2025'}

In [22]:
# ------------------------------------------------------------
# Get Courses and Enrollment Data
# ------------------------------------------------------------
def get_courses_in_quarter(quarter):
    """Return the set of course codes that have a CSV file for ``quarter``.

    Looks in ``webreg_data/<quarter>/lower_division`` and
    ``webreg_data/<quarter>/upper_division`` (skipping whichever does not
    exist) and collects the file stem of every ``DSC_*.csv`` found there.
    """
    quarter_dir = Path("webreg_data") / quarter
    division_dirs = [quarter_dir / name for name in ('lower_division', 'upper_division')]
    return {
        csv_file.stem
        for division_dir in division_dirs
        if division_dir.exists()
        for csv_file in division_dir.glob('DSC_*.csv')
    }

def get_final_enrollment_data(quarter, course):
    """Return the values of the last CSV row for ``course`` in ``quarter``.

    Checks ``webreg_data/<quarter>/<division>/<course>.csv`` for the lower
    and then the upper division. The first such file that exists and holds at
    least one data row supplies the result: a dict with the ``enrolled``,
    ``available``, ``waitlisted``, ``total`` and ``time`` values of its last
    row. Returns ``None`` when no file qualifies.
    """
    wanted_fields = ('enrolled', 'available', 'waitlisted', 'total', 'time')
    quarter_dir = Path("webreg_data") / quarter
    for division in ('lower_division', 'upper_division'):
        csv_path = quarter_dir / division / f"{course}.csv"
        if not csv_path.exists():
            continue
        snapshots = pd.read_csv(csv_path)
        if len(snapshots) == 0:
            # File has a header but no rows; try the other division
            continue
        final_row = snapshots.iloc[-1]
        return {field: final_row[field] for field in wanted_fields}
    return None

In [23]:
# ------------------------------------------------------------
# Step 1: Identify Course Offerings Across Quarters
# ------------------------------------------------------------
print("Finding shared and partial courses across all quarters...")
print("=" * 60)

# Map each configured quarter code to the set of course codes that have data
quarter_courses = {}
for quarter in quarters:
    quarter_courses[quarter] = get_courses_in_quarter(quarter)
    print(f"{quarter_labels[quarter]}: {len(quarter_courses[quarter])} courses")

# Shared = offered in every configured quarter; partial = offered in at least
# one quarter but not all of them. Both are derived from whatever quarters were
# collected above, so changing the `quarters` list needs no edit here.
shared_courses = set.intersection(*quarter_courses.values())
all_courses = set.union(*quarter_courses.values())
partial_courses = all_courses - shared_courses

n_quarters = len(quarter_courses)
print(f"\nShared across all {n_quarters} quarters: {len(shared_courses)} courses")
print(f"Partial (offered in 1–{n_quarters - 1} quarters): {len(partial_courses)} courses")
print("=" * 60)

Finding shared and partial courses across all quarters...
Fall 2024: 15 courses
Spring 2025: 14 courses
Winter 2025: 16 courses

Shared across all 3 quarters: 12 courses
Partial (offered in 1–2 quarters): 8 courses


In [24]:
# ------------------------------------------------------------
# Step 2: Collect Data Helper
# ------------------------------------------------------------
def collect_enrollment_data(course_list, quarters):
    """Build one DataFrame row per (course, quarter) pair that has data.

    For every course in ``course_list`` and every quarter code in
    ``quarters`` the last enrollment row is fetched with
    ``get_final_enrollment_data``; pairs without data are skipped. Each row
    carries ``course``, ``quarter``, ``quarter_label`` (looked up in
    ``quarter_labels``) followed by the fetched enrollment fields. Returns an
    empty DataFrame when nothing was found.
    """
    rows = []
    for course_code in course_list:
        for quarter_code in quarters:
            snapshot = get_final_enrollment_data(quarter_code, course_code)
            if not snapshot:
                continue
            row = {
                'course': course_code,
                'quarter': quarter_code,
                'quarter_label': quarter_labels[quarter_code],
            }
            row.update(snapshot)
            rows.append(row)
    return pd.DataFrame(rows)

In [25]:
# ------------------------------------------------------------
# Step 3A: Shared Courses Analysis
# ------------------------------------------------------------
shared_df = collect_enrollment_data(shared_courses, quarters)

if shared_df.empty:
    print("No shared courses found, skipping shared CSV export.")
else:
    # Express each count as a percentage of the `total` column (2 decimals)
    for rate_col, count_col in [
        ('utilization_rate', 'enrolled'),
        ('waitlist_rate', 'waitlisted'),
        ('available_rate', 'available'),
    ]:
        shared_df[rate_col] = (shared_df[count_col] / shared_df['total'] * 100).round(2)

    # Ordered categorical so courses sort by course number (DSC_20 before
    # DSC_100) instead of as plain strings
    course_order = sorted(shared_df['course'].unique(), key=lambda code: extract_course_num(code))
    shared_df['course'] = pd.Categorical(shared_df['course'], categories=course_order, ordered=True)
    shared_df = shared_df.sort_values(['course', 'quarter'])
    #shared_df.to_csv('enrollment_comparison_shared.csv', index=False)
    #print("Saved shared-course data to 'enrollment_comparison_shared.csv'")

shared_df.head()

Unnamed: 0,course,quarter,quarter_label,enrolled,available,waitlisted,total,time,utilization_rate,waitlist_rate,available_rate
27,DSC_10,fa24,Fall 2024,427,17,8,444,2024-11-24T22:03:51,96.17,1.8,3.83
28,DSC_10,sp25,Spring 2025,108,42,0,150,2025-05-15T03:50:50,72.0,0.0,28.0
29,DSC_10,wi25,Winter 2025,207,83,0,290,2025-01-31T12:41:46,71.38,0.0,28.62
21,DSC_20,fa24,Fall 2024,74,76,0,150,2024-11-24T22:03:52,49.33,0.0,50.67
22,DSC_20,sp25,Spring 2025,107,13,0,120,2025-05-15T03:50:51,89.17,0.0,10.83


In [26]:
# ------------------------------------------------------------
# Step 3B: Partial Courses Analysis
# ------------------------------------------------------------
partial_df = collect_enrollment_data(partial_courses, quarters)

if partial_df.empty:
    # Warning glyph restored: the original literal was mis-encoded mojibake
    print("⚠️ No partial courses found, skipping partial CSV export.")
else:
    # Express each count as a percentage of the `total` column (2 decimals)
    for rate_col, count_col in [
        ('utilization_rate', 'enrolled'),
        ('waitlist_rate', 'waitlisted'),
        ('available_rate', 'available'),
    ]:
        partial_df[rate_col] = (partial_df[count_col] / partial_df['total'] * 100).round(2)

    # Ordered categorical so courses sort by course number (DSC_90 before
    # DSC_102) instead of as plain strings
    course_order = sorted(partial_df['course'].unique(), key=extract_course_num)
    partial_df['course'] = pd.Categorical(partial_df['course'], categories=course_order, ordered=True)
    partial_df = partial_df.sort_values(['course', 'quarter'])
    #partial_df.to_csv('enrollment_comparison_partial.csv', index=False)
    #print("✓ Saved partial-course data to 'enrollment_comparison_partial.csv'")

partial_df.head()

Unnamed: 0,course,quarter,quarter_label,enrolled,available,waitlisted,total,time,utilization_rate,waitlist_rate,available_rate
5,DSC_90,fa24,Fall 2024,7,13,0,20,2024-11-24T22:03:57,35.0,0.0,65.0
0,DSC_96,wi25,Winter 2025,32,8,0,40,2025-01-31T12:41:54,80.0,0.0,20.0
3,DSC_102,fa24,Fall 2024,71,33,0,104,2024-11-24T22:04:00,68.27,0.0,31.73
4,DSC_102,sp25,Spring 2025,251,49,0,300,2025-05-15T03:50:58,83.67,0.0,16.33
7,DSC_148,wi25,Winter 2025,110,15,0,125,2025-01-31T12:41:59,88.0,0.0,12.0


In [27]:
# ------------------------------------------------------------
# Step 4: Summary Reporting
# ------------------------------------------------------------
def print_summary(df, label):
    """Print a fixed-width enrollment table for each course in ``df``.

    A banner with ``label`` in upper case is printed first. If ``df`` is empty
    the function prints "No data available." and returns. Otherwise, for each
    distinct value of ``df['course']`` (in order of first appearance) it
    prints one line per row showing the quarter label, enrolled, waitlisted,
    available and total counts, and the utilization rate to one decimal.
    """
    rule = "=" * 60
    print("\n" + rule)
    print(f"{label.upper()} COURSE SUMMARY")
    print(rule)
    if df.empty:
        print("No data available.")
        return

    # Column header and divider are identical for every course table
    column_header = f"  {'Quarter':<15} {'Enrolled':>8} {'Waitlist':>8} {'Avail':>8} {'Total':>7} {'Util%':>7}"
    divider = "  " + "-" * 60

    for course_code in df['course'].unique():
        course_rows = df[df['course'] == course_code]
        print(f"\n{course_code}:")
        print(column_header)
        print(divider)
        for _, entry in course_rows.iterrows():
            print(f"  {entry['quarter_label']:<15} {entry['enrolled']:>8.0f} {entry['waitlisted']:>8.0f} "
                  f"{entry['available']:>8.0f} {entry['total']:>7.0f} {entry['utilization_rate']:>6.1f}%")

In [35]:
# Print the shared-course summary (the partial-course summary is printed by
# the next cell)
print_summary(shared_df, "Shared")


SHARED COURSE SUMMARY

DSC_10:
  Quarter         Enrolled Waitlist    Avail   Total   Util%
  ------------------------------------------------------------
  Fall 2024            427        8       17     444   96.2%
  Spring 2025          108        0       42     150   72.0%
  Winter 2025          207        0       83     290   71.4%

DSC_20:
  Quarter         Enrolled Waitlist    Avail   Total   Util%
  ------------------------------------------------------------
  Fall 2024             74        0       76     150   49.3%
  Spring 2025          107        0       13     120   89.2%
  Winter 2025          309        0       75     384   80.5%

DSC_30:
  Quarter         Enrolled Waitlist    Avail   Total   Util%
  ------------------------------------------------------------
  Fall 2024             66        0       34     100   66.0%
  Spring 2025          254        0      126     380   66.8%
  Winter 2025           68        0       72     140   48.6%

DSC_40A:
  Quarter         E

In [36]:
# Print the summary for courses that were not offered in every quarter
print_summary(partial_df, "Partial")


PARTIAL COURSE SUMMARY

DSC_90:
  Quarter         Enrolled Waitlist    Avail   Total   Util%
  ------------------------------------------------------------
  Fall 2024              7        0       13      20   35.0%

DSC_96:
  Quarter         Enrolled Waitlist    Avail   Total   Util%
  ------------------------------------------------------------
  Winter 2025           32        0        8      40   80.0%

DSC_102:
  Quarter         Enrolled Waitlist    Avail   Total   Util%
  ------------------------------------------------------------
  Fall 2024             71        0       33     104   68.3%
  Spring 2025          251        0       49     300   83.7%

DSC_148:
  Quarter         Enrolled Waitlist    Avail   Total   Util%
  ------------------------------------------------------------
  Winter 2025          110        0       15     125   88.0%

DSC_161:
  Quarter         Enrolled Waitlist    Avail   Total   Util%
  ------------------------------------------------------------
  S

In [29]:
# ------------------------------------------------------------
# STEP 5: Popularity & COURSE TYPE ANALYSIS (Ordered Version)
# ------------------------------------------------------------
# Quarter labels in chronological order. Defined before the emptiness check
# because the per-quarter report cells below iterate over it unconditionally;
# leaving it inside the `else` branch made them fail with NameError whenever
# no data was found.
quarter_order = ['Fall 2024', 'Winter 2025', 'Spring 2025']

# --- Identify Required vs Elective ---
# Courses labelled 'Required' in this analysis; every other course is labelled
# 'Elective'. Also defined up front because a later cell reads it directly.
required_courses = {
    'DSC_10', 'DSC_20', 'DSC_30', 'DSC_40A', 'DSC_40B',
    'DSC_80', 'DSC_100', 'DSC_102', 'DSC_106',
    'DSC_140A', 'DSC_140B', 'DSC_148', 'DSC_180A', 'DSC_180B'
}

# Combine shared + partial data
combined_df = pd.concat([shared_df, partial_df], ignore_index=True)
if combined_df.empty:
    print("No combined data found, skipping popularity analysis.")
else:
    # Ordered categoricals make sorting and grouping follow calendar order for
    # quarters and course-number order for courses
    combined_df['quarter_label'] = pd.Categorical(combined_df['quarter_label'],
                                                  categories=quarter_order,
                                                  ordered=True)
    combined_df['course'] = pd.Categorical(combined_df['course'],
                                           categories=sorted(combined_df['course'].unique(),
                                                             key=extract_course_num),
                                           ordered=True)

    # Vectorised membership test instead of a per-element apply
    combined_df['course_type'] = np.where(
        combined_df['course'].isin(required_courses), 'Required', 'Elective'
    )

In [30]:
# --- Top Classes Per Quarter (by Enrollment) ---
print("\nTop 5 Courses per Quarter (by Enrollment Count):")
for quarter in quarter_order:
    qdata = combined_df[combined_df['quarter_label'] == quarter]
    if qdata.empty:
        continue
    # Highest enrollment first; equal counts fall back to course order.
    # Sorting on both keys in one call keeps ties deterministic, which two
    # successive single-key sorts with the default (unstable) algorithm do
    # not guarantee.
    top_enrolled = qdata.sort_values(['enrolled', 'course'], ascending=[False, True]).head(5)
    print(f"\n{quarter}")
    for _, row in top_enrolled.iterrows():
        print(f"  • {row['course']}: {int(row['enrolled'])} enrolled ({row['utilization_rate']:.1f}% full)")


Top 5 Courses per Quarter (by Enrollment Count):

Fall 2024
  • DSC_10: 427 enrolled (96.2% full)
  • DSC_180A: 231 enrolled (98.7% full)
  • DSC_80: 202 enrolled (84.2% full)
  • DSC_190: 199 enrolled (74.2% full)
  • DSC_40A: 158 enrolled (95.8% full)

Winter 2025
  • DSC_20: 309 enrolled (80.5% full)
  • DSC_106: 254 enrolled (87.0% full)
  • DSC_180B: 231 enrolled (98.3% full)
  • DSC_140A: 208 enrolled (71.7% full)
  • DSC_10: 207 enrolled (71.4% full)

Spring 2025
  • DSC_30: 254 enrolled (66.8% full)
  • DSC_102: 251 enrolled (83.7% full)
  • DSC_106: 177 enrolled (88.5% full)
  • DSC_80: 136 enrolled (97.1% full)
  • DSC_100: 135 enrolled (92.5% full)


In [31]:
# --- Top Classes Per Quarter (by Waitlist) ---
print("\nTop 5 Courses per Quarter (by Waitlist Rate):")
for quarter in quarter_order:
    qdata = combined_df[combined_df['quarter_label'] == quarter]
    if qdata.empty:
        continue
    # Highest waitlist rate first; equal rates (commonly 0.0) fall back to
    # course order. Sorting on both keys in one call keeps ties
    # deterministic, which two successive single-key sorts with the default
    # (unstable) algorithm do not guarantee.
    top_waitlist = qdata.sort_values(['waitlist_rate', 'course'], ascending=[False, True]).head(5)
    print(f"\n{quarter}")
    for _, row in top_waitlist.iterrows():
        print(f"  • {row['course']}: {row['waitlist_rate']:.1f}% waitlisted ({int(row['waitlisted'])} students)")


Top 5 Courses per Quarter (by Waitlist Rate):

Fall 2024
  • DSC_10: 1.8% waitlisted (8 students)
  • DSC_20: 0.0% waitlisted (0 students)
  • DSC_30: 0.0% waitlisted (0 students)
  • DSC_40A: 0.0% waitlisted (0 students)
  • DSC_40B: 0.0% waitlisted (0 students)

Winter 2025
  • DSC_100: 7.6% waitlisted (11 students)
  • DSC_80: 2.7% waitlisted (4 students)
  • DSC_10: 0.0% waitlisted (0 students)
  • DSC_20: 0.0% waitlisted (0 students)
  • DSC_30: 0.0% waitlisted (0 students)

Spring 2025
  • DSC_140B: 7.9% waitlisted (10 students)
  • DSC_80: 0.7% waitlisted (1 students)
  • DSC_10: 0.0% waitlisted (0 students)
  • DSC_20: 0.0% waitlisted (0 students)
  • DSC_30: 0.0% waitlisted (0 students)


In [32]:
# --- Most Popular Quarter per Course (ordered by course) ---
print("\nMost Popular Quarter per Course (by Enrollment):")

# Row label of the highest-enrollment entry for each course that has data
peak_rows = combined_df.groupby('course', observed=True)['enrolled'].idxmax()

most_popular_q = (
    combined_df.loc[peak_rows, ['course', 'quarter_label', 'enrolled', 'utilization_rate']]
    # Sort on a plain-string copy of the course code so the key function
    # receives str values rather than categorical entries
    .assign(course_str=lambda df: df['course'].astype(str))
    .sort_values('course_str', key=lambda col: col.map(extract_course_num))
    .drop(columns='course_str')
)

for _, row in most_popular_q.iterrows():
    print(f"  • {row['course']}: {int(row['enrolled'])} enrolled in {row['quarter_label']} ({row['utilization_rate']:.1f}% full)")


Most Popular Quarter per Course (by Enrollment):
  • DSC_10: 427 enrolled in Fall 2024 (96.2% full)
  • DSC_20: 309 enrolled in Winter 2025 (80.5% full)
  • DSC_30: 254 enrolled in Spring 2025 (66.8% full)
  • DSC_40A: 164 enrolled in Winter 2025 (85.4% full)
  • DSC_40B: 141 enrolled in Winter 2025 (97.2% full)
  • DSC_80: 202 enrolled in Fall 2024 (84.2% full)
  • DSC_90: 7 enrolled in Fall 2024 (35.0% full)
  • DSC_95: 24 enrolled in Winter 2025 (68.6% full)
  • DSC_96: 32 enrolled in Winter 2025 (80.0% full)
  • DSC_100: 145 enrolled in Winter 2025 (100.0% full)
  • DSC_102: 251 enrolled in Spring 2025 (83.7% full)
  • DSC_106: 254 enrolled in Winter 2025 (87.0% full)
  • DSC_140A: 208 enrolled in Winter 2025 (71.7% full)
  • DSC_140B: 129 enrolled in Fall 2024 (86.0% full)
  • DSC_148: 110 enrolled in Winter 2025 (88.0% full)
  • DSC_161: 28 enrolled in Spring 2025 (70.0% full)
  • DSC_170: 33 enrolled in Winter 2025 (66.0% full)
  • DSC_180A: 

In [33]:
# --- Summary Stats by Course Type ---
print("\nAverage Utilization & Enrollment by Course Type:")

# Source column -> display name; the key order fixes the column order
metric_labels = {
    'enrolled': 'Avg Enrolled',
    'utilization_rate': 'Avg Utilization (%)',
    'waitlist_rate': 'Avg Waitlist (%)'
}
# Mean of each metric per course type, rounded to two decimals
type_means = combined_df.groupby('course_type', observed=True)[list(metric_labels)].mean()
type_summary = type_means.round(2).rename(columns=metric_labels)
print(type_summary.to_string())



Average Utilization & Enrollment by Course Type:
             Avg Enrolled  Avg Utilization (%)  Avg Waitlist (%)
course_type                                                     
Elective            50.50                61.84              0.00
Required           158.94                81.78              0.59


In [34]:
# --- Overall Most Popular Courses (by total enrollment) ---
print("\nOverall Most Popular Courses (by Total Enrollment Across All Quarters):")

# Total enrollment per course, kept in course-number order
top_courses = (
    combined_df.groupby('course', observed=True)['enrolled'].sum()
    .reset_index()
    # Sort on a plain-string copy of the course code so the key function
    # receives str values rather than categorical entries
    .assign(course_str=lambda df: df['course'].astype(str))
    .sort_values('course_str', key=lambda col: col.map(extract_course_num))
    .drop(columns='course_str')
)

# Highest total first; equal totals fall back to course order so the top-5
# list is deterministic
top5 = top_courses.sort_values(['enrolled', 'course'], ascending=[False, True]).head(5)
for _, row in top5.iterrows():
    ctype = 'Required' if row['course'] in required_courses else 'Elective'
    print(f"  • {row['course']}: {int(row['enrolled'])} total enrolled ({ctype})")

# --- Export ordered CSVs ---
#type_summary.to_csv('enrollment_summary_by_type.csv', index=True)
#most_popular_q.to_csv('most_popular_quarters_by_course.csv', index=False)
#top_courses.to_csv('overall_enrollment_by_course.csv', index=False)
#print("\n Exported:")
#print("  - enrollment_summary_by_type.csv")
#print("  - most_popular_quarters_by_course.csv")
#print("  - overall_enrollment_by_course.csv (sorted numerically)")


Overall Most Popular Courses (by Total Enrollment Across All Quarters):
  • DSC_10: 742 total enrolled (Required)
  • DSC_106: 573 total enrolled (Required)
  • DSC_20: 490 total enrolled (Required)
  • DSC_80: 485 total enrolled (Required)
  • DSC_40A: 455 total enrolled (Required)
