<a href="https://colab.research.google.com/github/caseyeaston/BEA_PipelineEngagementAnalysis/blob/main/BEA_PipelineEngagementCleaningFinal_Logging.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Main

In [None]:
# Mount Google Drive so the CSV exports under /content/drive can be read.
# NOTE(review): google.colab is only importable inside a Colab runtime, so
# this cell fails on a plain Jupyter kernel.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Import libraries
import pandas as pd
import numpy as np

# Display settings: never truncate columns, cap printed rows at 100, and pass
# None for the width (pandas documents None as "auto-detect" for terminals).
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
pd.set_option('display.width', None)

### Initialize Change Tracking

In [None]:
# Create changes log dataframe
# changes_log accumulates every logged row; the log_* helpers rebind it via
# `global` each time they append.
changes_log = pd.DataFrame()
# Next unused _log_id; the log_* helpers advance it after each logged batch.
current_log_id = 0

def log_dropped(rows_df, reason, description):
    """
    Record rows that are about to be removed from the dataset.

    The rows are copied into the global ``changes_log`` unchanged (a dropped row
    is its own "original"), tagged with ``_row_version == 'dropped'`` and a
    ``"<reason> - <description>"`` label. Every row receives its own consecutive
    ``_log_id`` and the global counter is advanced past them.
    An empty ``rows_df`` is ignored.
    """
    global changes_log, current_log_id
    n_rows = len(rows_df)
    if n_rows == 0:
        return

    first_id = current_log_id
    tagged = rows_df.assign(**{
        '_change_reason_description': f"{reason} - {description}",
        '_row_version': 'dropped',
        # One distinct id per dropped row
        '_log_id': range(first_id, first_id + n_rows),
    })
    current_log_id = first_id + n_rows
    changes_log = pd.concat([changes_log, tagged], ignore_index=True)
    print(f"  Logged {n_rows} dropped rows: {reason}")

def log_original_only(rows_df, reason, description):
    """
    Record the pre-change state of rows that are about to be modified.

    Only the original version is stored (no "after" copy). Rows are appended to
    the global ``changes_log`` with ``_row_version == 'original'``, a
    ``"<reason> - <description>"`` label and one consecutive ``_log_id`` each;
    the global counter is advanced past them. An empty ``rows_df`` is ignored.
    """
    global changes_log, current_log_id
    n_rows = len(rows_df)
    if n_rows == 0:
        return

    first_id = current_log_id
    snapshot = rows_df.assign(**{
        '_change_reason_description': f"{reason} - {description}",
        '_row_version': 'original',
        # One distinct id per logged row
        '_log_id': range(first_id, first_id + n_rows),
    })
    current_log_id = first_id + n_rows
    changes_log = pd.concat([changes_log, snapshot], ignore_index=True)
    print(f"  Logged {n_rows} original rows: {reason}")

def log_duplicates(kept_rows_df, dropped_rows_df, reason, description):
    """
    Record a de-duplication step: the surviving rows plus the rows removed.

    Both frames are numbered from the same starting ``_log_id``, so the i-th
    kept row and the i-th dropped row receive the same id. The pairing is
    purely positional: callers must pass the two frames in matching order.
    The global counter advances by the longer of the two frames.
    Nothing is logged when ``dropped_rows_df`` is empty.
    """
    global changes_log, current_log_id
    if len(dropped_rows_df) == 0:
        return

    label = f"{reason} - {description}"
    first_id = current_log_id

    def _tagged(frame, version):
        # Copy of `frame` carrying the three log columns, ids starting at first_id.
        return frame.assign(**{
            '_change_reason_description': label,
            '_row_version': version,
            '_log_id': range(first_id, first_id + len(frame)),
        })

    kept_entry = _tagged(kept_rows_df, 'kept')
    dropped_entry = _tagged(dropped_rows_df, 'dropped')

    current_log_id = first_id + max(len(kept_rows_df), len(dropped_rows_df))

    changes_log = pd.concat([changes_log, kept_entry, dropped_entry], ignore_index=True)
    print(f"  Logged {len(dropped_rows_df)} dropped duplicates (with {len(kept_rows_df)} kept rows): {reason}")

def log_created(original_rows_df, created_rows_df, reason, description):
    """
    Log newly created rows (e.g., Pro101 splits) together with their sources.

    Parameters
    ----------
    original_rows_df : DataFrame
        Source rows, logged with ``_row_version == 'original'``.
    created_rows_df : DataFrame
        Rows derived from the sources, logged with ``_row_version == 'created'``.
    reason, description : str
        Joined as ``"<reason> - <description>"`` in ``_change_reason_description``.

    Both frames are numbered from the same starting ``_log_id``, so the i-th
    original and the i-th created row share an id (pairing is positional).
    Nothing is logged when ``created_rows_df`` is empty.
    """
    global changes_log, current_log_id
    if len(created_rows_df) > 0:
        label = f"{reason} - {description}"

        # Log original rows
        orig_entry = original_rows_df.copy()
        orig_entry['_change_reason_description'] = label
        orig_entry['_row_version'] = 'original'
        orig_entry['_log_id'] = range(current_log_id, current_log_id + len(original_rows_df))

        # Log created rows with same IDs as their originals
        created_entry = created_rows_df.copy()
        created_entry['_change_reason_description'] = label
        created_entry['_row_version'] = 'created'
        created_entry['_log_id'] = range(current_log_id, current_log_id + len(created_rows_df))

        # Advance past every id handed out above. Using only the number of
        # originals would let later log entries reuse ids whenever more rows
        # are created than there are sources (same rule as log_duplicates).
        current_log_id += max(len(original_rows_df), len(created_rows_df))

        changes_log = pd.concat([changes_log, orig_entry, created_entry], ignore_index=True)
        print(f"  Logged {len(created_rows_df)} created rows (with {len(original_rows_df)} source rows): {reason}")

# Confirms the cell ran; printed once the log globals and helpers are defined.
print("Change tracking initialized.")

Change tracking initialized.


### Load Data

In [None]:
# Define file paths
# NOTE(review): absolute Google Drive path; it only resolves after the Drive
# mount above, for a user who has this shared folder under MyDrive.
base_path = '/content/drive/MyDrive/Work/BEA/2025 BEA Data Project Shared Folder/Data/(Main) Data Sources/Existing/PPBEA Pipeline/CSVs/'
# One CSV export per school year; each key is reused below as the row's 'School Year'.
file_paths = {
    '2019-2020': f'{base_path}2019-2020_PPBEA Pipeline_Engagement.csv',
    '2020-2021': f'{base_path}2020-2021_PPBEA Pipeline_Engagement.csv',
    '2021-2022': f'{base_path}2021-2022_PPBEA Pipeline_Engagement.csv',
    '2022-2023': f'{base_path}2022-2023_PPBEA Pipeline_Engagement.csv',
    '2023-2024': f'{base_path}2023-2024_PPBEA Pipeline_Engagement.csv',
    '2024-2025': f'{base_path}2024-2025_PPBEA Pipeline_Engagement.csv',
}

# Load all CSV files
dfs = {}
for year, path in file_paths.items():
    df = pd.read_csv(path)
    df = df.dropna(how='all')  # Remove completely empty rows
    df['School Year'] = year  # Add school year identifier
    dfs[year] = df

# Combine all dataframes
# ignore_index=True renumbers the rows 0..n-1 across all years, so the index
# labels of dfmain no longer match row positions in the individual CSVs.
dfmain = pd.concat(dfs.values(), ignore_index=True)

In [None]:
# Rename PPBEA Member to District
dfmain = dfmain.rename(columns={'PPBEA Member': 'District'})

# Drop district columns and other unwanted columns
# NOTE: DataFrame.drop raises KeyError by default if any listed column is
# missing, so every name here must exist in the combined frame.
columns_to_drop = [
    ' ',  # Unnamed column
    # Per-district columns (the renamed 'District' column is kept instead)
    'Calhan District RJ-1', 'Harrison District 2', 'Widefield District 3',
    'Fountain Ft.Carson District 8', 'Colorado Springs District 11',
    'Cheyenne Mountain District 12', 'Manitou Springs District 14',
    'Academy District 20', 'Ellicott District 22', 'Peyton District 23JT',
    'Lewis Palmer District 38', 'El Paso County District 49',
    'Colorado Springs Early College (CSEC)', 'CO Digital BOCES PPOS & CPA',
    'Eastlake High School', 'Banning Lewis Ranch', 'Atlas Prep',
    'Woodland Park School District',
    'Unnamed: 24',
    # Contact, follow-up and staffing columns not used by this cleaning pipeline
    'Career Rep Email', 'Follow-up Task: ', 'Employer post Internship',
    'Sponsor Email', 'Placed into Employment Post Internship',
    'Staff Interactions with Businesses', 'Career Rep First Name',
    'Career Rep Last Name', 'Opp Number', 'Task Number',
    'PPBEA Staff Assigned', 'Next Action'
]
dfmain = dfmain.drop(columns=columns_to_drop)

In [None]:
# Merge duplicate columns
# Each pair is (column to keep, legacy column holding the same information).
# Gaps in the kept column are filled from the legacy column, which is then removed.
_duplicate_column_pairs = [
    ('Pro101 Certificates Earned',
     'Professionalism 101 Certificates Earned'),
    ('PPBEA Notes',
     'Notes: Student Name, Duration, School Name, Sponsor Name, Teacher Name, Flags'),
]

for _keep_col, _legacy_col in _duplicate_column_pairs:
    dfmain[_keep_col] = dfmain[_keep_col].fillna(dfmain[_legacy_col])
    dfmain = dfmain.drop(columns=[_legacy_col])

In [None]:
# Convert numeric columns
numeric_cols = [
    'Complete Student Trainings',
    'Complete Staff Trainings',
    'Complete Student Interactions',
    'Complete Student Internships',
    'Internships in Progress',
    'Pending Student Interactions',
    'Declined or Cancelled Student Interactions',
    'Pro101 Certificates Earned'
]

# Parse each count column as a number (anything unparseable becomes NaN),
# then treat every missing value as 0.
for col in numeric_cols:
    dfmain[col] = pd.to_numeric(dfmain[col], errors='coerce').fillna(0)

print(f"Loaded {len(dfmain):,} rows")

Loaded 10,165 rows


### Remove Summary and Empty Rows (LOG: dropped rows)

In [None]:
# Identify summary and empty rows
# A row is flagged when its completed-interaction count exceeds 2000
# (presumably a roll-up total rather than a single event) or it has no Event Title.
# The same mask drives both the log entry and the removal, so the two cannot drift apart.
summary_mask = (
    (dfmain['Complete Student Interactions'] > 2000) |
    (dfmain['Event Title'].isna())
)
summary_rows = dfmain[summary_mask]

# LOG: dropped summary/empty rows
log_dropped(summary_rows, "Summary/empty rows", "dropped")

# Remove them. The explicit .copy() makes dfmain an independent frame before
# any column is assigned below (assigning into a filtered slice is the
# SettingWithCopyWarning pattern).
dfmain = dfmain[~summary_mask].copy()

# Convert date columns to datetime (unparseable values become NaT)
dfmain['Initiation Date'] = pd.to_datetime(dfmain['Initiation Date'], errors='coerce')
dfmain['Status Update Date'] = pd.to_datetime(dfmain['Status Update Date'], errors='coerce')
dfmain['Event Date or Start Date'] = pd.to_datetime(dfmain['Event Date or Start Date'], errors='coerce')

# Standardize WBL Opportunity Type
dfmain['WBL Opportunity Type'] = dfmain['WBL Opportunity Type'].replace({
    "Speaker's Bureau": "Speakers Bureau"
})

# Clean text columns (strip whitespace/newlines)
dfmain['Business Champion Name'] = dfmain['Business Champion Name'].str.strip()
dfmain = dfmain.rename(columns={'Student and Sponsor\nor School POC Name': 'Student Sponsor Name'})
dfmain['Student Sponsor Name'] = dfmain['Student Sponsor Name'].str.strip()

print(f"Rows after removing summary/empty: {len(dfmain):,}")

  Logged 96 dropped rows: Summary/empty rows
Rows after removing summary/empty: 10,069


### Initial Numbers Comparison Before Cleaning

In [None]:
# Company reported numbers (annual)
company_reported = {
    '2019-2020': 3233,
    '2020-2021': 4056,
    '2021-2022': 6787,
    '2022-2023': 9815,
    '2023-2024': 11865,
    '2024-2025': 14135
}
company_total = 49891

# Separator lines reused throughout the report
heavy_rule = "=" * 70
light_rule = "-" * 70

print(heavy_rule)
print("INITIAL NUMBERS COMPARISON")
print("(After removing summary rows, before other cleaning)")
print(heavy_rule)

print(f"\nTotal rows: {len(dfmain):,}")

# Define complete columns
complete_cols_initial = [
    'Complete Student Interactions',
    'Complete Student Trainings',
    'Complete Student Internships'
]

# Comparison by School Year (Completed Status Only)
print("\n" + light_rule)
print("COMPARISON BY SCHOOL YEAR (Completed Status Only)")
print(light_rule)
print(f"{'Year':<15} {'Company':<12} {'Our Data':<12} {'Difference':<12}")
print(light_rule)

total_company = 0
total_ours = 0

# Only rows whose raw Placement Status is 'Completed' count toward our totals.
completed_rows = dfmain[dfmain['Placement Status'] == 'Completed']

for year in sorted(dfmain['School Year'].unique()):
    year_df = completed_rows[completed_rows['School Year'] == year]

    # Our figure = completed interactions + trainings + internships + Pro101 certificates
    year_total = (
        year_df[complete_cols_initial].sum().sum()
        + year_df['Pro101 Certificates Earned'].sum()
    )

    company_num = company_reported.get(year, 0)
    diff = year_total - company_num

    total_company += company_num
    total_ours += year_total

    print(f"{year:<15} {company_num:<12,} {year_total:<12,.0f} {diff:<+12,.0f}")

print(light_rule)
print(f"{'TOTAL':<15} {total_company:<12,} {total_ours:<12,.0f} {total_ours - total_company:<+12,.0f}")

print("\n" + heavy_rule)

INITIAL NUMBERS COMPARISON
(After removing summary rows, before other cleaning)

Total rows: 10,069

----------------------------------------------------------------------
COMPARISON BY SCHOOL YEAR (Completed Status Only)
----------------------------------------------------------------------
Year            Company      Our Data     Difference  
----------------------------------------------------------------------
2019-2020       3,233        3,905        +672        
2020-2021       4,056        3,965        -91         
2021-2022       6,787        6,858        +71         
2022-2023       9,815        9,825        +10         
2023-2024       11,865       12,022       +157        
2024-2025       14,135       14,732       +597        
----------------------------------------------------------------------
TOTAL           49,891       51,307       +1,416      



### Drop One-off WBL Type (LOG: dropped row)

In [None]:
# Identify the one-off WBL type row
# One mask drives both the log entry and the removal.
bad_wbl_mask = dfmain['WBL Opportunity Type'] == 'Jobs/Training/Apprenticeship'
bad_wbl_rows = dfmain[bad_wbl_mask]

# LOG: dropped row
log_dropped(bad_wbl_rows, "One-off WBL type", "dropped")

# Drop it. Rows with a missing WBL type do not match the mask and are kept.
# .copy() gives later cells an independent frame to assign into.
# (The name stripping and the 'Student Sponsor Name' rename already happened
# in the summary-row cell, so they are not repeated here.)
dfmain = dfmain[~bad_wbl_mask].copy()

# Checking shape
dfmain.shape

  Logged 1 dropped rows: One-off WBL type


(10068, 20)

# Date Errors and Nulls (LOG: original rows only)

In [None]:
# Fix unrealistic dates (before 2018)
date_cols = ['Initiation Date', 'Status Update Date', 'Event Date or Start Date']
cutoff_date = pd.Timestamp('2018-01-01')

for col in date_cols:
    unrealistic_mask = dfmain[col] < cutoff_date
    count = unrealistic_mask.sum()
    if count == 0:
        continue

    # LOG: original rows only (snapshot taken before the value is blanked)
    original_rows = dfmain[unrealistic_mask].copy()
    log_original_only(original_rows, "Unrealistic dates", f"set to null in {col}")

    # Make the modification
    dfmain.loc[unrealistic_mask, col] = pd.NaT
    print(f"Setting {count} unrealistic dates to null in {col}")

# Fill remaining nulls in date columns using cascade logic.
# Each column falls back first to the event date, then to the other column.
event_dates = dfmain['Event Date or Start Date']
dfmain['Initiation Date'] = (
    dfmain['Initiation Date']
    .fillna(event_dates)
    .fillna(dfmain['Status Update Date'])
)
dfmain['Status Update Date'] = (
    dfmain['Status Update Date']
    .fillna(event_dates)
    .fillna(dfmain['Initiation Date'])
)

# Create Derived Event Date column with fallback logic
# Priority: Event Date or Start Date → Status Update Date → Initiation Date
dfmain['Derived Event Date'] = (
    event_dates
    .fillna(dfmain['Status Update Date'])
    .fillna(dfmain['Initiation Date'])
)

  Logged 4 original rows: Unrealistic dates
Setting 4 unrealistic dates to null in Initiation Date
  Logged 7 original rows: Unrealistic dates
Setting 7 unrealistic dates to null in Status Update Date


# 'Placement Status' Parent Column & Separating Declined/Cancelled

In [None]:
# Create parent category column
def categorize_placement_status(status):
    """
    Map a raw 'Placement Status' value to its parent category.

    Returns 'Completed', 'Cancelled' or 'Pending' for the statuses listed
    below; every other value (including missing ones) is reported as 'Declined'.
    """
    category_members = (
        ('Completed', ('Completed',)),
        ('Cancelled', ('Cancelled-COVID', 'Cancelled-Weather', 'Cancelled-Illness')),
        ('Pending', ('Initial Contact Made', 'Pending-Scheduling', 'Scheduled Interview',
                     'Internship In Process', 'Scheduled Event (Pending Completion)')),
    )
    for category, members in category_members:
        if status in members:
            return category
    # Fallback for anything not listed above
    return 'Declined'

# Derive the parent category for every row from its raw Placement Status.
# Statuses not listed in categorize_placement_status (including missing
# values) land in 'Declined' through that function's else branch.
dfmain['Placement Status Category'] = dfmain['Placement Status'].apply(categorize_placement_status)

# Verify
dfmain['Placement Status Category'].value_counts()

Unnamed: 0_level_0,count
Placement Status Category,Unnamed: 1_level_1
Completed,7549
Declined,2161
Pending,277
Cancelled,81


In [None]:
# Create Cancelled column and split Declined/Cancelled
dfmain = dfmain.rename(columns={
    'Declined or Cancelled Student Interactions': 'Declined Student Interactions'
})

# Rows categorised as Cancelled carry their count in the new Cancelled column
# (float); every other row keeps its count in the Declined column.
cancelled_mask = dfmain['Placement Status Category'] == 'Cancelled'
combined_counts = dfmain['Declined Student Interactions']
dfmain['Cancelled Student Interactions'] = combined_counts.where(cancelled_mask, 0).astype(float)
dfmain['Declined Student Interactions'] = combined_counts.mask(cancelled_mask, 0)

# Redefine numeric columns
numeric_cols = [
    'Complete Student Trainings',
    'Complete Staff Trainings',
    'Complete Student Interactions',
    'Complete Student Internships',
    'Internships in Progress',
    'Pending Student Interactions',
    'Declined Student Interactions',
    'Cancelled Student Interactions',
    'Pro101 Certificates Earned'
]

# Pro101 (LOG: original and created rows)

In [None]:
# from rapidfuzz import fuzz

# # Extract student name (before "/") from Student Sponsor Name
# def extract_student_name(name):
#     if pd.isna(name):
#         return ''
#     name_str = str(name)
#     if '/' in name_str:
#         return name_str.split('/')[0].strip()
#     else:
#         return name_str.strip()

# # Add temporary column for student names
# dfmain['Student Name'] = dfmain['Student Sponsor Name'].apply(extract_student_name)

# # Find rows with Pro101 cert earned during other events
# pro101_during_other_event = dfmain[
#     (dfmain['Pro101 Certificates Earned'] > 0) &
#     (dfmain['WBL Opportunity Type'] != 'Professionalism 101 Training')
# ]

# print(f"Rows with Pro101 cert earned during OTHER events: {len(pro101_during_other_event)}")

# # Check if any already have matching standalone Pro101 rows
# potential_duplicates = []

# for idx, row in pro101_during_other_event.iterrows():
#     # Get student name from this row
#     student_name = row['Student Name']

#     if not student_name:  # Skip if no student name
#         continue

#     # Look for Pro101 Training rows with fuzzy match on student name
#     pro101_rows = dfmain[dfmain['WBL Opportunity Type'] == 'Professionalism 101 Training']

#     for pro101_idx, pro101_row in pro101_rows.iterrows():
#         pro101_student_name = pro101_row['Student Name']

#         if not pro101_student_name:
#             continue

#         # Fuzzy match on student names
#         similarity = fuzz.ratio(student_name.lower(), pro101_student_name.lower())

#         if similarity >= 85:  # 85% threshold
#             potential_duplicates.append((idx, pro101_idx, similarity))
#             break  # Found a match, move to next row

# print(f"\nRows that already have standalone Pro101 records: {len(potential_duplicates)}")

# if len(potential_duplicates) > 0:
#     print("\nSample matches (first 10):")
#     for orig_idx, pro101_idx, similarity in potential_duplicates[:10]:
#         print(f"\nOriginal event row {orig_idx} matches Pro101 row {pro101_idx} (similarity: {similarity}%)")
#         print(f"  Original: {dfmain.loc[orig_idx, 'Student Name']} - {dfmain.loc[orig_idx, 'WBL Opportunity Type']}")
#         print(f"  Pro101:   {dfmain.loc[pro101_idx, 'Student Name']} - {dfmain.loc[pro101_idx, 'WBL Opportunity Type']}")

In [None]:
# Drop the 2 existing Pro101 rows that we'll recreate from their matching events
# NOTE(review): 1098 and 540 are hard-coded index labels, so this only removes
# the intended rows while the input data and row order stay unchanged (and it
# raises KeyError if either label is absent). This drop is also not written to
# changes_log — confirm that is intended.
dfmain = dfmain.drop(index=[1098, 540])

# Find rows that need to be split (Pro101 earned during another event)
needs_split = (
    (dfmain['Pro101 Certificates Earned'] > 0) &
    (dfmain['WBL Opportunity Type'] != 'Professionalism 101 Training')
)
rows_to_split = dfmain[needs_split].copy()

# Values that turn a copy of the source event into a standalone Pro101 row:
# a completed PPBEA training counted as a single completed interaction.
pro101_overrides = {
    'Placement Status': 'Completed',
    'Placement Status Category': 'Completed',
    'Business Champion Name': 'PPBEA',
    'Event Title': 'PPBEA Professionalism 101 Course',
    'WBL Opportunity Type': 'Professionalism 101 Training',
    'Complete Student Interactions': 1,
    'Complete Student Trainings': 0,
    'Complete Staff Trainings': 0,
    'Complete Student Internships': 0,
    'Internships in Progress': 0,
    'Pending Student Interactions': 0,
    'Declined Student Interactions': 0,
    'Cancelled Student Interactions': 0,
    'Pro101 Certificates Earned': 0,
}

# Create new Pro101 rows
new_pro101_rows = []

for idx, row in rows_to_split.iterrows():
    pro101_row = row.copy()
    for field, value in pro101_overrides.items():
        pro101_row[field] = value
    new_pro101_rows.append(pro101_row)

created_df = pd.DataFrame(new_pro101_rows)

# LOG: original rows and created Pro101 rows
if len(new_pro101_rows) > 0:
    log_created(rows_to_split, created_df, "Separate Pro101 Cert from non-Pro101 event", "split into original event and new Pro101 row")

# Add new Pro101 rows to dfmain (index is renumbered from 0)
dfmain = pd.concat([dfmain, created_df], ignore_index=True)

# Completed Pro101 training rows that still show zero completed interactions
# are counted as one completed interaction each.
pro101_completed_wrong = dfmain[
    (dfmain['Placement Status'] == 'Completed') &
    (dfmain['WBL Opportunity Type'] == 'Professionalism 101 Training') &
    (dfmain['Complete Student Interactions'] == 0)
]

dfmain.loc[pro101_completed_wrong.index, 'Complete Student Interactions'] = 1

# Drop Pro101 column (no longer needed)
dfmain = dfmain.drop(columns=['Pro101 Certificates Earned'])

print(f"Created {len(new_pro101_rows)} new Pro101 rows")
dfmain.shape

  Logged 61 created rows (with 61 source rows): Separate Pro101 Cert from non-Pro101 event
Created 61 new Pro101 rows


(10127, 22)

# Placement Status & Numeric Columns Mismatch

### Fix Double-Counting (LOG: original rows only)

In [None]:
# Redefine numeric columns
numeric_cols = [
    'Complete Student Trainings',
    'Complete Staff Trainings',
    'Complete Student Interactions',
    'Complete Student Internships',
    'Internships in Progress',
    'Pending Student Interactions',
    'Declined Student Interactions',
    'Cancelled Student Interactions',
]

# Fix double-counting: rows with values in multiple numeric columns.
# The per-row count is kept as a standalone Series instead of a temporary
# dfmain column, so the helper never ends up inside the logged snapshot
# (and therefore never becomes a stray column of changes_log).
num_cols_with_values = (dfmain[numeric_cols] > 0).sum(axis=1)
rows_with_multiple_mask = num_cols_with_values > 1

if rows_with_multiple_mask.sum() > 0:
    # LOG: original rows only
    original_rows = dfmain[rows_with_multiple_mask].copy()
    log_original_only(original_rows, "Fix student trainings and interactions double-counting", "zeroed 'Complete Student Interactions'")

# Zero out Complete Student Interactions for rows with Trainings + Interactions
# NOTE(review): the mask selects ANY row with more than one positive count,
# not only Trainings + Interactions combinations — confirm that is intended.
dfmain.loc[rows_with_multiple_mask, 'Complete Student Interactions'] = 0

# Create a column for numeric sum (computed after the zeroing above)
dfmain['_numeric_sum'] = dfmain[numeric_cols].sum(axis=1)

  Logged 24 original rows: Fix student trainings and interactions double-counting


### Fill All-Zero Rows (LOG: original rows only, Completed status)

In [None]:
# Find all-zero rows
all_zeros = (dfmain[numeric_cols] == 0).all(axis=1)

# LOG: original all-zero rows - ONLY COMPLETED STATUS
all_zeros_completed = all_zeros & (dfmain['Placement Status Category'] == 'Completed')
if all_zeros_completed.sum() > 0:
    original_all_zero_rows = dfmain[all_zeros_completed].copy()
    log_original_only(original_all_zero_rows, "All-zero rows", "filled with min value based on WBL type")

# Calculate minimum numeric sum by WBL Opportunity Type (excluding zeros)
wbl_minimums = dfmain[dfmain['_numeric_sum'] > 0].groupby('WBL Opportunity Type')['_numeric_sum'].min()

# Get WBL types that have all-zero rows
wbl_types_with_zeros = dfmain[all_zeros]['WBL Opportunity Type'].unique()

# Missing types are filtered out BEFORE sorting: sorting a mix of strings and
# NaN raises TypeError, so a notna check inside the loop would come too late.
named_wbl_types = sorted(t for t in wbl_types_with_zeros if pd.notna(t))

print("Minimum values by WBL Opportunity Type (for types with all-zero rows):")
for wbl_type in named_wbl_types:
    # Types with no non-zero row anywhere fall back to 1
    min_val = wbl_minimums.get(wbl_type, 1)
    print(f"  {wbl_type}: {min_val}")

# Show value counts of WBL Opportunity Type for all-zero rows
print("\nWBL Opportunity Type breakdown for all-zero rows:")
print(dfmain[all_zeros]['WBL Opportunity Type'].value_counts())
print(f"\nTotal all-zero rows: {all_zeros.sum()}")

# WBL Opportunity Type to numeric column mapping (Completed)
wbl_to_column_map_completed = {
    'Staff Training': 'Complete Staff Trainings',
    'Regional Advisory Meeting': 'Complete Staff Trainings',
    'Site Visit - Staff': 'Complete Staff Trainings',
    'Student Training': 'Complete Student Trainings',
    'Professionalism 101 Training': 'Complete Student Interactions',
    'Informational Interview Video': 'Complete Student Interactions',
    'Career Story Video': 'Complete Student Interactions',
    'e-WBL Informational Interview': 'Complete Student Interactions',
    'e-WBL Class Presentation': 'Complete Student Interactions',
    'Job Fair': 'Complete Student Interactions',
    'Class/Group Mentorship': 'Complete Student Interactions',
    'Industry Sponsored Project': 'Complete Student Interactions',
    'Class Presentation': 'Complete Student Interactions',
    'Job Shadow': 'Complete Student Interactions',
    'Site Visit': 'Complete Student Interactions',
    'Speakers Bureau': 'Complete Student Interactions',
    'Event': 'Complete Student Interactions',
    'Individual Mentorship': 'Complete Student Interactions',
    'Paid Job': 'Complete Student Interactions',
    'Internship 60': 'Complete Student Internships',
    'Internship 120': 'Complete Student Internships',
    'Internship 320': 'Complete Student Internships',
    'Apprenticeship': 'Complete Student Internships',
}

# WBL Opportunity Type to numeric column mapping (Pending)
wbl_to_column_map_pending = {
    'Professionalism 101 Training': 'Pending Student Interactions',
    'Career Story Video': 'Pending Student Interactions',
    'e-WBL Informational Interview': 'Pending Student Interactions',
    'Industry Sponsored Project': 'Pending Student Interactions',
    'Class Presentation': 'Pending Student Interactions',
    'Job Shadow': 'Pending Student Interactions',
    'Site Visit': 'Pending Student Interactions',
    'Speakers Bureau': 'Pending Student Interactions',
    'Event': 'Pending Student Interactions',
    'Internship 60': 'Internships in Progress',
    'Internship 120': 'Internships in Progress',
    'Internship 320': 'Internships in Progress',
    'Apprenticeship': 'Internships in Progress',
}

# Fill all-zero rows with minimum values.
# The target column depends on the row's status category; Completed/Pending
# rows whose WBL type is not in the corresponding map are left at zero.
for idx in dfmain[all_zeros].index:
    row = dfmain.loc[idx]
    wbl_type = row['WBL Opportunity Type']
    category = row['Placement Status Category']

    # Get minimum value for this WBL type (default to 1 if no minimum available)
    min_value = wbl_minimums.get(wbl_type, 1)

    if category == 'Completed':
        if wbl_type in wbl_to_column_map_completed:
            dfmain.loc[idx, wbl_to_column_map_completed[wbl_type]] = min_value
    elif category == 'Pending':
        if wbl_type in wbl_to_column_map_pending:
            dfmain.loc[idx, wbl_to_column_map_pending[wbl_type]] = min_value
    elif category == 'Declined':
        dfmain.loc[idx, 'Declined Student Interactions'] = min_value
    elif category == 'Cancelled':
        dfmain.loc[idx, 'Cancelled Student Interactions'] = min_value

# Drop temporary numeric sum column
dfmain = dfmain.drop(columns=['_numeric_sum'])

# Verify
all_zeros_after = (dfmain[numeric_cols] == 0).all(axis=1)
print(f"\nAll-zero rows before filling: {all_zeros.sum()}")
print(f"All-zero rows after filling: {all_zeros_after.sum()}")

  Logged 222 original rows: All-zero rows
Minimum values by WBL Opportunity Type (for types with all-zero rows):
  Apprenticeship: 1.0
  Career Story Video: 1.0
  Class Presentation: 1.0
  Event: 1.0
  Industry Sponsored Project: 1.0
  Internship 60: 1.0
  Job Shadow: 1.0
  Professionalism 101 Training: 1.0
  Regional Advisory Meeting: 1
  Site Visit: 1.0
  Site Visit - Staff: 1
  Speakers Bureau: 10.0
  Student Training: 1.0
  e-WBL Class Presentation: 1.0
  e-WBL Informational Interview: 1.0

WBL Opportunity Type breakdown for all-zero rows:
WBL Opportunity Type
Speakers Bureau                  489
Professionalism 101 Training     141
Site Visit                       134
Class Presentation                98
e-WBL Class Presentation          91
Internship 60                     62
Regional Advisory Meeting         42
Student Training                   9
Job Shadow                         8
Event                              5
e-WBL Informational Interview      5
Site Visit - Staff    

# Duplicate Handling & Pending Events

### Duplicate Detection (LOG: kept and dropped duplicates)

In [None]:
#  Log a single duplicate pair - the kept row AND its dropped duplicate. Both rows share the same _log_id.

def log_duplicate_pair(kept_row, dropped_row, reason, description):
    """
    Append one kept/dropped duplicate pair to the global ``changes_log``.

    Parameters
    ----------
    kept_row : Series
        The row that stays in the dataset; logged with ``_row_version == 'kept'``.
    dropped_row : Series
        The duplicate being removed; logged with ``_row_version == 'dropped'``.
    reason, description : str
        Joined as ``"<reason> - <description>"`` in ``_change_reason_description``,
        the same format used by ``log_duplicates``.

    Both entries receive the current ``_log_id``; the counter is then advanced
    by one so the next pair gets a fresh id.
    """
    global changes_log, current_log_id

    label = f"{reason} - {description}"

    # Log kept row (Series -> one-row frame)
    kept_entry = kept_row.to_frame().T.copy()
    kept_entry['_change_reason_description'] = label
    kept_entry['_row_version'] = 'kept'
    kept_entry['_log_id'] = current_log_id

    # Log dropped row with SAME ID
    dropped_entry = dropped_row.to_frame().T.copy()
    dropped_entry['_change_reason_description'] = label
    dropped_entry['_row_version'] = 'dropped'
    dropped_entry['_log_id'] = current_log_id

    # Without this append the two entries would be built and then discarded.
    changes_log = pd.concat([changes_log, kept_entry, dropped_entry], ignore_index=True)

    current_log_id += 1  # Increment AFTER both are assigned the same ID

In [None]:
# Define base matching fields for duplicate detection.
# This list drives the match condition below, so adding/removing a field here
# is the only change needed.
base_match_fields = [
    'Business Champion Name',
    'Student Sponsor Name',
    'WBL Opportunity Type',
    'District',
    'Event Title',
    'School or Program Site'
]

# Create temporary column for numeric sum
dfmain['_numeric_sum'] = dfmain[numeric_cols].sum(axis=1)

# Find duplicates and decide which to keep
rows_to_drop = set()  # set gives O(1) membership checks inside the loop
duplicate_pairs = []  # Store (kept_idx, dropped_idx) pairs

for idx in dfmain.index:
    if idx in rows_to_drop:
        continue

    row = dfmain.loc[idx]

    # Look for matches in earlier rows: not already dropped, same engagement
    # total, and sharing at least one of the two dates.
    match_mask = (
        (dfmain.index < idx) &
        (~dfmain.index.isin(rows_to_drop)) &
        (dfmain['_numeric_sum'] == row['_numeric_sum']) &
        (
            (dfmain['Initiation Date'] == row['Initiation Date']) |
            (dfmain['Derived Event Date'] == row['Derived Event Date'])
        )
    )
    # Every base field must be equal. NaN never equals NaN, so rows with a
    # missing value in any base field are never treated as duplicates.
    for field in base_match_fields:
        match_mask &= (dfmain[field] == row[field])

    matches = dfmain[match_mask]

    if len(matches) > 0:
        # Compare against the earliest surviving match only
        orig_idx = matches.index[0]
        orig_status = dfmain.loc[orig_idx, 'Placement Status']
        dup_status = row['Placement Status']

        # Determine which to keep
        keep_idx = None
        drop_idx = None

        # Rule 1: Prefer 'Completed' status
        if orig_status == 'Completed' and dup_status != 'Completed':
            keep_idx, drop_idx = orig_idx, idx
        elif dup_status == 'Completed' and orig_status != 'Completed':
            keep_idx, drop_idx = idx, orig_idx
        else:
            # Rule 2: Choose the one with least nulls
            orig_null_count = dfmain.loc[orig_idx].isna().sum()
            dup_null_count = row.isna().sum()

            if orig_null_count < dup_null_count:
                keep_idx, drop_idx = orig_idx, idx
            elif dup_null_count < orig_null_count:
                keep_idx, drop_idx = idx, orig_idx
            else:
                # Rule 3: Keep the most recent based on Derived Event Date
                # (ties and two missing dates fall back to the earlier row)
                orig_date = dfmain.loc[orig_idx, 'Derived Event Date']
                dup_date = row['Derived Event Date']

                if pd.isna(orig_date) and pd.isna(dup_date):
                    keep_idx, drop_idx = orig_idx, idx
                elif pd.isna(orig_date):
                    keep_idx, drop_idx = idx, orig_idx
                elif pd.isna(dup_date):
                    keep_idx, drop_idx = orig_idx, idx
                elif dup_date > orig_date:
                    keep_idx, drop_idx = idx, orig_idx
                else:
                    keep_idx, drop_idx = orig_idx, idx

        rows_to_drop.add(drop_idx)
        # NOTE(review): a row kept in one pair can still be dropped by a later
        # pair (e.g. a later 'Completed' match); the log then shows it in both roles.
        duplicate_pairs.append((keep_idx, drop_idx))

# Later cells report len(rows_to_drop); expose it as a list of unique labels
rows_to_drop = list(rows_to_drop)

# LOG: each duplicate pair with the same _log_id
# (must happen before the drop so both rows can still be looked up)
if len(duplicate_pairs) > 0:
    print(f"  Logging {len(duplicate_pairs)} duplicate pairs...")
    for kept_idx, dropped_idx in duplicate_pairs:
        kept_row = dfmain.loc[kept_idx]
        dropped_row = dfmain.loc[dropped_idx]
        log_duplicate_pair(kept_row, dropped_row, "Duplicates", "identified using business name, student/sponsor, WBL type, district, event title, school, engagement count, initiation date/derived event date")
    print(f"  Logged {len(duplicate_pairs)} duplicate pairs: Duplicates")

# Drop duplicates
dfmain = dfmain.drop(rows_to_drop)

# Drop temporary numeric sum column
dfmain = dfmain.drop(columns=['_numeric_sum'])

  Logging 209 duplicate pairs...
  Logged 209 duplicate pairs: Duplicates


### Fix Internship Rows (LOG: original rows only; the 2024-2025 'Internship In Process' fix is not logged)

In [None]:
# 2019-2020 internships that were filed as Declined although students completed
# them: flip them to Completed and move the count into the internship column.
is_declined_2019_internship = (
    dfmain['Placement Status Category'].eq('Declined')
    & dfmain['School Year'].eq('2019-2020')
    & dfmain['WBL Opportunity Type'].isin(['Internship 60', 'Internship 120', 'Internship 320'])
    & dfmain['Complete Student Interactions'].gt(0)
)
declined_should_be_completed = dfmain.index[is_declined_2019_internship]

if len(declined_should_be_completed) > 0:
    # LOG: snapshot of the rows before they are changed
    original_rows = dfmain.loc[declined_should_be_completed].copy()
    log_original_only(original_rows, "2019-2020 completed Internship rows marked as 'Declined'", "marked as 'Completed'")

    # Both status columns become 'Completed'
    for status_col in ('Placement Status', 'Placement Status Category'):
        dfmain.loc[declined_should_be_completed, status_col] = 'Completed'

    # Move the count: interactions -> internships, then clear the source column
    moved_counts = dfmain.loc[declined_should_be_completed, 'Complete Student Interactions']
    dfmain.loc[declined_should_be_completed, 'Complete Student Internships'] = moved_counts
    dfmain.loc[declined_should_be_completed, 'Complete Student Interactions'] = 0

# 2024-2025 rows still marked 'Internship In Process' are treated as Completed (NOT LOGGED)
internship_2024_25 = (
    dfmain['Placement Status'].eq('Internship In Process')
    & dfmain['School Year'].eq('2024-2025')
)
for status_col in ('Placement Status', 'Placement Status Category'):
    dfmain.loc[internship_2024_25, status_col] = 'Completed'

# Their in-progress internship counts become completed internships
in_progress_counts = dfmain.loc[internship_2024_25, 'Internships in Progress']
dfmain.loc[internship_2024_25, 'Complete Student Internships'] = in_progress_counts
dfmain.loc[internship_2024_25, 'Internships in Progress'] = 0

### Reclassify Pending Statuses (LOG: original rows only)

In [None]:
# Reclassify remaining pending statuses as "Declined - Unfinished"
pending_statuses = [
    'Initial Contact Made', 'Pending-Scheduling', 'Scheduled Interview',
    'Internship In Process', 'Scheduled Event (Pending Completion)'
]

# Computed once, BEFORE the status is overwritten. Every modification below is
# restricted to this mask so that exactly the rows that get logged get changed;
# a row that already carried 'Declined - Unfinished' keeps its declined count.
pending_mask = dfmain['Placement Status'].isin(pending_statuses)

if pending_mask.sum() > 0:
    # LOG: original rows only
    original_pending_rows = dfmain[pending_mask].copy()
    log_original_only(original_pending_rows, "Reclassify remaining pending statuses to 'Declined'", "marked as 'Declined - Unfinished'")

dfmain.loc[pending_mask, 'Placement Status'] = 'Declined - Unfinished'
# Category is keyed on the status label so every 'Declined - Unfinished' row is categorized consistently
dfmain.loc[dfmain['Placement Status'] == 'Declined - Unfinished', 'Placement Status Category'] = 'Declined'

# Transfer pending counts to declined counts for reclassified rows
dfmain.loc[pending_mask, 'Declined Student Interactions'] = (
    dfmain.loc[pending_mask, 'Pending Student Interactions'] +
    dfmain.loc[pending_mask, 'Internships in Progress']
)

# Drop the now-irrelevant pending columns
# (no need to zero them first - they are removed entirely)
dfmain = dfmain.drop(columns=['Pending Student Interactions', 'Internships in Progress'])

# Verify
# rows_to_drop comes from the duplicate detection cell above
print(f"Rows dropped as duplicates: {len(rows_to_drop)}")
print(f"Final row count: {len(dfmain)}")
print(f"\nPlacement Status counts:")
print(dfmain['Placement Status'].value_counts())

  Logged 38 original rows: Reclassify remaining pending statuses to 'Declined'
Rows dropped as duplicates: 209
Final row count: 9918

Placement Status counts:
Placement Status
Completed                           7644
Declined-Applicant                   475
Declined - Business Unresponsive     225
Declined/Cancelled-Other             224
Declined-Business Scheduling         220
Declined-Business                    210
Declined- Student Applicant          181
Declined - Student Profile           176
Declined - Student Other             113
Declined - Student Unresponsive       90
Declined - Staff Scheduling           76
Declined-Intern NOT Selected          66
Cancelled-COVID                       63
Declined - Unfinished                 38
Declined - Staff Applicant            35
Declined - Staff Unresponsive         26
Declined-Opportunity FULL             24
Cancelled-Weather                     17
Terminated                            14
Cancelled-Illness                      1
Name

# Standardize District Names and Reorganize Column Structure

In [None]:
# Lookup: short district code as it appears in the data -> full display name.
# Values not listed here are left as they are by .replace().
district_mapping = {
    'D11': 'Colorado Springs (D11)',
    'D20': 'Academy (D20)',
    'D49': 'El Paso County (D49)',
    'D2': 'Harrison (D2)',
    'D3': 'Widefield (D3)',
    'D8': 'Fountain-Fort Carson (D8)',
    'D12': 'Cheyenne Mountain (D12)',
    'D14': 'Manitou Springs (D14)',
    'D38': 'Lewis-Palmer (D38)',
    'CEC-CS': 'Colorado Springs Early College (CEC-CS)',
    'BLR': 'Banning Lewis Ranch (BLR)',
    'WPSD': 'Woodland Park (WPSD)',
    'Ellicott': 'Ellicott (D22)',
    'Peyton': 'Peyton (D23JT)',
    'Calhan': 'Calhan (RJ-1)',
    'Atlas Prep': 'Atlas Preparatory',
    'CPA/PPOS': 'CO Digital BOCES (CPA/PPOS)',
    'PTEC': 'Power Technical (PTEC)',
    'Goal H.S.': 'Goal High School',
    'Mon Impact': 'Monumental Impact',
    'Peak Ed': 'Peak Education',
    'Miami-Yoder': 'Miami-Yoder (JT-60)',
    'ECA': 'Evangel Christian Academy',
    'MET': 'Mountain Employment Training',
    'TCA': 'The Classical Academy',
    'CCV': 'Cripple Creek-Victor (RE-1)',
    'DYS': 'Division of Youth Services',
    'Vanguard': 'Vanguard School',
    'Homeschool': 'Homeschool',
    'Various': 'Various',
    'BBBS': 'Big Brothers Big Sisters',
    'Roundup': 'Roundup School'
}

# Final column layout: identifying fields first, then counts, then raw dates.
# Any column not named here is dropped by the selection below.
column_order = [
    'School Year',
    'Derived Event Date',
    'Placement Status Category',
    'Placement Status',
    'WBL Opportunity Type',
    'Event Title',
    'Business Champion Name',
    'Student Sponsor Name',
    'District',
    'School or Program Site',
    'Complete Student Interactions',
    'Complete Student Trainings',
    'Complete Student Internships',
    'Declined Student Interactions',
    'Cancelled Student Interactions',
    'Complete Staff Trainings',
    'Initiation Date',
    'Status Update Date',
    'Event Date or Start Date',
]

# Apply the district names, then keep only the ordered columns
standardized_districts = dfmain['District'].replace(district_mapping)
dfmain['District'] = standardized_districts

dfmain = dfmain.loc[:, column_order]

# Export

### Change Log Summary & Export

In [None]:
summary_rule = "=" * 70
print(summary_rule)
print("CHANGES LOG SUMMARY")
print(summary_rule)

print(f"\nTotal rows logged: {len(changes_log):,}")

# One frequency table per tracking column
for heading, tracking_col in (
    ("\nChanges by Reason:", '_change_reason_description'),
    ("\nChanges by Row Version:", '_row_version'),
):
    print(heading)
    print(changes_log[tracking_col].value_counts().to_string())

# Rows that share a _log_id belong together, so sort on it first
changes_log_sorted = changes_log.sort_values(by=['_log_id', '_row_version'])

# Preferred column layout for the exported log: tracking columns, then data columns
log_column_order = [
    '_log_id',
    '_change_reason_description',
    '_row_version',
    'School Year',
    'Derived Event Date',
    'Placement Status Category',
    'Placement Status',
    'WBL Opportunity Type',
    'Event Title',
    'Business Champion Name',
    'Student Sponsor Name',
    'District',
    'School or Program Site',
    'Complete Student Interactions',
    'Complete Student Trainings',
    'Complete Student Internships',
    'Declined Student Interactions',
    'Cancelled Student Interactions',
    'Complete Staff Trainings',
    'Initiation Date',
    'Status Update Date',
    'Event Date or Start Date',
    'PPBEA Notes'
]

# Columns missing from the log are skipped rather than raising a KeyError
logged_columns = set(changes_log_sorted.columns)
existing_cols = [name for name in log_column_order if name in logged_columns]
changes_log_export = changes_log_sorted.loc[:, existing_cols]

# Write the log as CSV into the "Cleaned" folder on Drive, creating it if needed
import os
changes_output_path = '/content/drive/MyDrive/Work/BEA/2025 BEA Data Project Shared Folder/Data/(Main) Data Sources/Existing/PPBEA Pipeline/Cleaned/'
os.makedirs(changes_output_path, exist_ok=True)

changes_file = os.path.join(changes_output_path, 'PPBEA_Pipeline_Changes_Log.csv')
changes_log_export.to_csv(changes_file, index=False)

print(f"\nChanges log exported to: {changes_file}")
print(f"Total rows in log: {len(changes_log):,}")

CHANGES LOG SUMMARY

Total rows logged: 514

Changes by Reason:
_change_reason_description
All-zero rows - filled with min value based on WBL type                                            222
Separate Pro101 Cert from non-Pro101 event - split into original event and new Pro101 row          122
Summary/empty rows - dropped                                                                        96
Reclassify remaining pending statuses to 'Declined' - marked as 'Declined - Unfinished'             38
Fix student trainings and interactions double-counting - zeroed 'Complete Student Interactions'     24
Unrealistic dates - set to null in Status Update Date                                                7
Unrealistic dates - set to null in Initiation Date                                                   4
One-off WBL type - dropped                                                                           1

Changes by Row Version:
_row_version
original    356
dropped      97
created      61

# Final Data Summary

In [None]:
section_rule = "=" * 60
print(section_rule)
print("FINAL DATASET SUMMARY")
print(section_rule)

# Size
n_rows, n_columns = dfmain.shape
print(f"\nSize: {n_rows:,} rows × {n_columns} columns")

# Date Range (earliest and latest Derived Event Date)
event_dates = dfmain['Derived Event Date']
first_event = event_dates.min().strftime('%B %d, %Y')
last_event = event_dates.max().strftime('%B %d, %Y')
print(f"\nDate Range: {first_event} - {last_event}")

# Records by Year, in chronological order
print("\nRecords by Year:")
year_counts = dfmain['School Year'].value_counts().sort_index()
for school_year, n_records in year_counts.items():
    print(f"  {school_year}: {n_records:,}")

# Status Distribution: once by category, once by detailed status,
# each as count plus share of all rows
status_counts = dfmain['Placement Status Category'].value_counts()
detailed_status = dfmain['Placement Status'].value_counts()
for heading, distribution in (
    ("\nStatus Distribution (by Category):", status_counts),
    ("\nStatus Distribution (detailed):", detailed_status),
):
    print(heading)
    for label, n_records in distribution.items():
        share = n_records / len(dfmain) * 100
        print(f"  {label}: {n_records:,} ({share:.1f}%)")

# Student Engagement Metrics
print("\nStudent Engagement Metrics:")
completed_count_cols = (
    'Complete Student Interactions',
    'Complete Student Trainings',
    'Complete Student Internships',
)
total_completed = sum(dfmain[col].sum() for col in completed_count_cols)
total_declined = dfmain['Declined Student Interactions'].sum()
total_cancelled = dfmain['Cancelled Student Interactions'].sum()
grand_total = total_completed + total_declined + total_cancelled

print(f"  Total Completed: {total_completed:,.0f}")
print(f"  Total Declined: {total_declined:,.0f}")
print(f"  Total Cancelled: {total_cancelled:,.0f}")
print(f"  Grand Total: {grand_total:,.0f} student engagements")

# Average students per completed event (0 when there are no completed events)
completed_events = dfmain[dfmain['Placement Status Category'] == 'Completed']
if len(completed_events) > 0:
    avg_students = total_completed / len(completed_events)
else:
    avg_students = 0
print(f"  Average Students per Completed Event: {avg_students:.1f}")

# Staff Metrics
print("\nStaff Metrics:")
staff_trainings = dfmain['Complete Staff Trainings'].sum()
print(f"  Staff Trainings: {staff_trainings:,.0f}")

# Top 5 rankings by number of rows: districts, WBL types, business partners
top_districts = dfmain['District'].value_counts().head(5)
top_wbl = dfmain['WBL Opportunity Type'].value_counts().head(5)
top_business = dfmain['Business Champion Name'].value_counts().head(5)
for heading, ranking in (
    ("\nTop 5 Districts:", top_districts),
    ("\nTop 5 WBL Opportunity Types:", top_wbl),
    ("\nTop 5 Business Partners:", top_business),
):
    print(heading)
    for rank, (name, n_events) in enumerate(ranking.items(), start=1):
        print(f"  {rank}. {name}: {n_events:,} events")

FINAL DATASET SUMMARY

Size: 9,918 rows × 19 columns

Date Range: January 07, 2019 - September 22, 2025

Records by Year:
  2019-2020: 309
  2020-2021: 1,926
  2021-2022: 1,993
  2022-2023: 2,743
  2023-2024: 1,502
  2024-2025: 1,445

Status Distribution (by Category):
  Completed: 7,644 (77.1%)
  Declined: 2,193 (22.1%)
  Cancelled: 81 (0.8%)

Status Distribution (detailed):
  Completed: 7,644 (77.1%)
  Declined-Applicant: 475 (4.8%)
  Declined - Business Unresponsive: 225 (2.3%)
  Declined/Cancelled-Other: 224 (2.3%)
  Declined-Business Scheduling: 220 (2.2%)
  Declined-Business: 210 (2.1%)
  Declined- Student Applicant: 181 (1.8%)
  Declined - Student Profile: 176 (1.8%)
  Declined - Student Other: 113 (1.1%)
  Declined - Student Unresponsive: 90 (0.9%)
  Declined - Staff Scheduling: 76 (0.8%)
  Declined-Intern NOT Selected: 66 (0.7%)
  Cancelled-COVID: 63 (0.6%)
  Declined - Unfinished: 38 (0.4%)
  Declined - Staff Applicant: 35 (0.4%)
  Declined - Staff Unresponsive: 26 (0.3%)
  D

### Reconciliation Summary

In [None]:
# Compare our cleaned "completed" totals per school year with the figures the
# company reported, and print the absolute and percentage differences.
print("=" * 70)
print("RECONCILIATION SUMMARY")
print("=" * 70)

# Company-reported completed engagements per school year
company_reported = {
    '2019-2020': 3233,
    '2020-2021': 4056,
    '2021-2022': 6787,
    '2022-2023': 9815,
    '2023-2024': 11865,
    '2024-2025': 14135
}

# Calculate final numbers
complete_cols_final = ['Complete Student Interactions', 'Complete Student Trainings', 'Complete Student Internships']

print(f"\n{'Year':<15} {'Company':<12} {'Our Final':<12} {'Difference':<12} {'% Diff':<10}")
print("-" * 70)

total_company = 0
total_ours = 0

# dropna: a missing School Year cannot be sorted alongside strings
for year in sorted(dfmain['School Year'].dropna().unique()):
    # Only rows in the 'Completed' category count towards our total
    year_df = dfmain[(dfmain['School Year'] == year) & (dfmain['Placement Status Category'] == 'Completed')]
    our_complete = year_df[complete_cols_final].sum().sum()

    # Years without a company figure are treated as 0 reported
    company_num = company_reported.get(year, 0)
    diff = our_complete - company_num
    pct_diff = (diff / company_num * 100) if company_num > 0 else 0

    total_company += company_num
    total_ours += our_complete

    # Attach the '%' before padding so the sign, number and '%' stay together
    pct_text = f"{pct_diff:+.1f}%"
    print(f"{year:<15} {company_num:<12,} {our_complete:<12,.0f} {diff:<+12,.0f} {pct_text:<10}")

print("-" * 70)
# Guard against an empty frame / no reported figures
total_pct = ((total_ours - total_company) / total_company * 100) if total_company > 0 else 0
total_pct_text = f"{total_pct:+.1f}%"
print(f"{'TOTAL':<15} {total_company:<12,} {total_ours:<12,.0f} {total_ours - total_company:<+12,.0f} {total_pct_text:<10}")

print("\n" + "=" * 70)

RECONCILIATION SUMMARY

Year            Company      Our Final    Difference   % Diff    
----------------------------------------------------------------------
2019-2020       3,233        3,201        -32          -1.0      %
2020-2021       4,056        3,983        -73          -1.8      %
2021-2022       6,787        6,955        +168         +2.5      %
2022-2023       9,815        9,936        +121         +1.2      %
2023-2024       11,865       11,980       +115         +1.0      %
2024-2025       14,135       14,779       +644         +4.6      %
----------------------------------------------------------------------
TOTAL           49,891       50,834       +943         +1.9      %

