<a href="https://colab.research.google.com/github/caseyeaston/BEA_PipelineEngagementAnalysis/blob/main/BEA_PipelineEngagementCleaningFinal.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Main

In [1]:
# Mount Google Drive at /content/drive so the CSV paths used below
# (which all live under /content/drive/MyDrive/...) can be read and written.
# `google.colab` is only importable inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Import libraries
import pandas as pd
import numpy as np

# Display settings: no limit on columns, at most 100 rows, width left to pandas
display_options = {
    'display.max_columns': None,
    'display.max_rows': 100,
    'display.width': None,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

In [3]:
# Folder holding one engagement CSV per school year
base_path = '/content/drive/MyDrive/Work/BEA/2025 BEA Data Project Shared Folder/Data/(Main) Data Sources/Existing/PPBEA Pipeline/CSVs/'

# School-year labels '2019-2020' ... '2024-2025', in chronological order
school_years = [f'{start}-{start + 1}' for start in range(2019, 2025)]

# Map each school-year label to its CSV path
file_paths = {
    school_year: f'{base_path}{school_year}_PPBEA Pipeline_Engagement.csv'
    for school_year in school_years
}

# Load every yearly CSV, discard rows that are entirely empty, and tag each
# remaining row with the school year of the file it came from
dfs = {
    year: pd.read_csv(path).dropna(how='all').assign(**{'School Year': year})
    for year, path in file_paths.items()
}

# Stack the yearly frames into one frame with a fresh 0..n-1 index
dfmain = pd.concat(dfs.values(), ignore_index=True)

In [4]:
# The 'PPBEA Member' column is referred to as 'District' from here on
dfmain = dfmain.rename({'PPBEA Member': 'District'}, axis='columns')

In [5]:
# Columns named after individual districts / member organisations
district_columns = [
    'Calhan District RJ-1', 'Harrison District 2', 'Widefield District 3',
    'Fountain Ft.Carson District 8', 'Colorado Springs District 11',
    'Cheyenne Mountain District 12', 'Manitou Springs District 14',
    'Academy District 20', 'Ellicott District 22', 'Peyton District 23JT',
    'Lewis Palmer District 38', 'El Paso County District 49',
    'Colorado Springs Early College (CSEC)', 'CO Digital BOCES PPOS & CPA',
    'Eastlake High School', 'Banning Lewis Ranch', 'Atlas Prep',
    'Woodland Park School District',
]

# Contact, staffing, follow-up and free-text note columns not kept in the cleaned data
other_unwanted_columns = [
    'Career Rep Email', 'Follow-up Task: ', 'Employer post Internship',
    'Sponsor Email', 'Placed into Employment Post Internship',
    'Staff Interactions with Businesses', 'Career Rep First Name',
    'Career Rep Last Name', 'Opp Number', 'Task Number',
    'PPBEA Staff Assigned', 'Next Action', 'PPBEA Notes',
    'Notes: Student Name, Duration, School Name, Sponsor Name, Teacher Name, Flags',
]

# Full drop list; ' ' and 'Unnamed: 24' are the header-less columns
columns_to_drop = [' ', *district_columns, 'Unnamed: 24', *other_unwanted_columns]

# drop() raises KeyError if any listed column is absent, so a schema change is noticed
dfmain = dfmain.drop(columns=columns_to_drop)

In [6]:
# The Pro101 certificate count appears under two different headers;
# keep the short one and back-fill its gaps from the long one.
pro101_column = 'Pro101 Certificates Earned'
pro101_alias_column = 'Professionalism 101 Certificates Earned'

dfmain[pro101_column] = dfmain[pro101_column].fillna(dfmain[pro101_alias_column])
dfmain = dfmain.drop(columns=[pro101_alias_column])

In [7]:
# Count columns that must be numeric
numeric_columns = [
    'Complete Student Trainings',
    'Complete Staff Trainings',
    'Complete Student Interactions',
    'Complete Student Internships',
    'Internships in Progress',
    'Pending Student Interactions',
    'Declined or Cancelled Student Interactions',
    'Pro101 Certificates Earned'
]

# Parse each column as a number (unparseable values become NaN), then treat
# every missing value as a count of 0
for count_column in numeric_columns:
    parsed = pd.to_numeric(dfmain[count_column], errors='coerce')
    dfmain[count_column] = parsed.fillna(0)

In [8]:
# Remove summary and empty rows:
#   - keep rows whose 'Complete Student Interactions' is at most 2000 (or missing)
#   - keep rows that have an 'Event Title'
interaction_counts = dfmain['Complete Student Interactions']
plausible_count = interaction_counts.le(2000) | interaction_counts.isna()
has_event_title = dfmain['Event Title'].notna()

dfmain = dfmain[plausible_count & has_event_title]

In [9]:
# Parse the three date columns as datetimes; values that cannot be parsed become NaT
for date_column in ('Initiation Date', 'Status Update Date', 'Event Date or Start Date'):
    dfmain[date_column] = pd.to_datetime(dfmain[date_column], errors='coerce')

In [10]:
# Sanity check: (rows, columns) of the combined frame after the column drops,
# numeric/date conversion and summary-row filtering above
dfmain.shape

(10069, 19)

# Date Errors and Nulls

In [11]:
# Null out unrealistic dates (anything earlier than 1 January 2018)
date_cols = ['Initiation Date', 'Status Update Date', 'Event Date or Start Date']
cutoff_date = pd.Timestamp('2018-01-01')

for col in date_cols:
    too_early = dfmain[col].lt(cutoff_date)  # NaT compares False, so it is left alone
    n_too_early = too_early.sum()
    if n_too_early > 0:
        print(f"Setting {n_too_early} unrealistic dates to null in {col}")
        dfmain.loc[too_early, col] = pd.NaT

# Cascade fill for the two bookkeeping dates: prefer the event date, then the
# other bookkeeping date. The second line sees the already-filled Initiation Date.
event_dates = dfmain['Event Date or Start Date']
dfmain['Initiation Date'] = (
    dfmain['Initiation Date'].fillna(event_dates).fillna(dfmain['Status Update Date'])
)
dfmain['Status Update Date'] = (
    dfmain['Status Update Date'].fillna(event_dates).fillna(dfmain['Initiation Date'])
)

# Derived Event Date = first available of:
#   Event Date or Start Date → Status Update Date → Initiation Date
derived_event_date = event_dates.fillna(dfmain['Status Update Date'])
derived_event_date = derived_event_date.fillna(dfmain['Initiation Date'])
dfmain['Derived Event Date'] = derived_event_date

Setting 4 unrealistic dates to null in Initiation Date
Setting 7 unrealistic dates to null in Status Update Date


# Pro101

In [12]:
# NOTE: Exploratory check, intentionally disabled (every line below is commented out).
# It depends on the third-party `rapidfuzz` package and reads a 'Student Sponsor Name'
# column; that name is only introduced by the rename in the "Duplicates/Pending" cell
# further down, so re-enabling this cell at its current position would likely raise a
# KeyError unless the rename is moved earlier.
# NOTE(review): presumably this is how the two row labels dropped at the top of the
# next cell were identified — confirm before deleting this cell.

# from rapidfuzz import fuzz

# # Extract student name (before "/") from Student Sponsor Name
# def extract_student_name(name):
#     if pd.isna(name):
#         return ''
#     name_str = str(name)
#     if '/' in name_str:
#         return name_str.split('/')[0].strip()
#     else:
#         return name_str.strip()

# # Add temporary column for student names
# dfmain['Student Name'] = dfmain['Student Sponsor Name'].apply(extract_student_name)

# # Find rows with Pro101 cert earned during other events
# pro101_during_other_event = dfmain[
#     (dfmain['Pro101 Certificates Earned'] > 0) &
#     (dfmain['WBL Opportunity Type'] != 'Professionalism 101 Training')
# ]

# print(f"Rows with Pro101 cert earned during OTHER events: {len(pro101_during_other_event)}")

# # Check if any already have matching standalone Pro101 rows
# potential_duplicates = []

# for idx, row in pro101_during_other_event.iterrows():
#     # Get student name from this row
#     student_name = row['Student Name']

#     if not student_name:  # Skip if no student name
#         continue

#     # Look for Pro101 Training rows with fuzzy match on student name
#     pro101_rows = dfmain[dfmain['WBL Opportunity Type'] == 'Professionalism 101 Training']

#     for pro101_idx, pro101_row in pro101_rows.iterrows():
#         pro101_student_name = pro101_row['Student Name']

#         if not pro101_student_name:
#             continue

#         # Fuzzy match on student names
#         similarity = fuzz.ratio(student_name.lower(), pro101_student_name.lower())

#         if similarity >= 85:  # 85% threshold
#             potential_duplicates.append((idx, pro101_idx, similarity))
#             break  # Found a match, move to next row

# print(f"\nRows that already have standalone Pro101 records: {len(potential_duplicates)}")

# if len(potential_duplicates) > 0:
#     print("\nSample matches (first 10):")
#     for orig_idx, pro101_idx, similarity in potential_duplicates[:10]:
#         print(f"\nOriginal event row {orig_idx} matches Pro101 row {pro101_idx} (similarity: {similarity}%)")
#         print(f"  Original: {dfmain.loc[orig_idx, 'Student Name']} - {dfmain.loc[orig_idx, 'WBL Opportunity Type']}")
#         print(f"  Pro101:   {dfmain.loc[pro101_idx, 'Student Name']} - {dfmain.loc[pro101_idx, 'WBL Opportunity Type']}")

In [13]:
# Remove the 2 existing standalone Pro101 rows; they are recreated below from
# the events they belong to.
# NOTE(review): 1098 and 540 are hard-coded index labels, so they are only valid
# for the current input files and the exact filtering done above — re-verify if
# either changes.
dfmain = dfmain.drop(index=[1098, 540])

# Rows where a Pro101 certificate was earned during some other kind of event
earned_certificate = dfmain['Pro101 Certificates Earned'] > 0
is_pro101_event = dfmain['WBL Opportunity Type'] == 'Professionalism 101 Training'
rows_to_split = dfmain[earned_certificate & ~is_pro101_event].copy()

# Field values that turn a copy of the source row into a standalone Pro101 row.
# Every field not listed here is inherited unchanged from the source row.
pro101_overrides = {
    'Placement Status': 'Completed',
    'Placement Status Category': 'Completed',
    'Business Champion Name': 'PPBEA',
    'Event Title': 'PPBEA Professionalism 101 Course',
    'WBL Opportunity Type': 'Professionalism 101 Training',
    'Complete Student Interactions': 1,
    'Complete Student Trainings': 0,
    'Complete Staff Trainings': 0,
    'Complete Student Internships': 0,
    'Declined or Cancelled Student Interactions': 0,
    'Pro101 Certificates Earned': 0,
}

# Build one new Pro101 row per source row
new_pro101_rows = []
for _, source_row in rows_to_split.iterrows():
    pro101_row = source_row.copy()
    for field_name, field_value in pro101_overrides.items():
        pro101_row[field_name] = field_value
    new_pro101_rows.append(pro101_row)

# Append the new rows and renumber the index 0..n-1
dfmain = pd.concat([dfmain, pd.DataFrame(new_pro101_rows)], ignore_index=True)

# A completed Pro101 row must count as one student interaction; fix any that show 0
needs_interaction_fix = (
    (dfmain['Placement Status'] == 'Completed')
    & (dfmain['WBL Opportunity Type'] == 'Professionalism 101 Training')
    & (dfmain['Complete Student Interactions'] == 0)
)
dfmain.loc[needs_interaction_fix, 'Complete Student Interactions'] = 1

# The certificate counter has served its purpose
dfmain = dfmain.drop(columns=['Pro101 Certificates Earned'])

print(f"Created {len(new_pro101_rows)} new Pro101 rows")
dfmain.shape

Created 61 new Pro101 rows


(10128, 20)

# Duplicates/Pending

In [14]:
# Standardize WBL Opportunity Type spelling
dfmain['WBL Opportunity Type'] = dfmain['WBL Opportunity Type'].replace(
    "Speaker's Bureau", "Speakers Bureau"
)

# Drop the one-off WBL type; take an explicit copy so later column
# assignments operate on an independent frame
not_one_off_type = dfmain['WBL Opportunity Type'] != 'Jobs/Training/Apprenticeship'
dfmain = dfmain[not_one_off_type].copy()

# Give the student/sponsor column a short, single-line name
dfmain = dfmain.rename(columns={'Student and Sponsor\nor School POC Name': 'Student Sponsor Name'})

# Strip leading/trailing whitespace and newlines from the free-text name columns
for text_column in ['Business Champion Name', 'Student Sponsor Name']:
    dfmain[text_column] = dfmain[text_column].str.strip()

# Fields that identify a duplicate. Completed rows are keyed on the derived
# event date; all other rows are keyed on the initiation date instead.
shared_match_fields = [
    'Business Champion Name', 'Student Sponsor Name',
    'WBL Opportunity Type', 'District', 'Event Title', 'School or Program Site'
]
completed_match_fields = ['Derived Event Date', *shared_match_fields]
pending_declined_match_fields = ['Initiation Date', *shared_match_fields]

# Completed rows: flag repeats (the first occurrence is kept) and drop them
completed_rows = dfmain.loc[dfmain['Placement Status'] == 'Completed']
completed_dupes = completed_rows.duplicated(subset=completed_match_fields, keep='first')
completed_dupe_indices = completed_dupes[completed_dupes].index
dfmain = dfmain.drop(index=completed_dupe_indices)

# All remaining non-Completed rows: same procedure with the other key
non_completed_rows = dfmain.loc[dfmain['Placement Status'] != 'Completed']
non_completed_dupes = non_completed_rows.duplicated(
    subset=pending_declined_match_fields, keep='first'
)
non_completed_dupe_indices = non_completed_dupes[non_completed_dupes].index
dfmain = dfmain.drop(index=non_completed_dupe_indices)

# Mark 'Internship In Process' rows from 2024-2025 as 'Completed'
is_in_process = dfmain['Placement Status'].eq('Internship In Process')
is_2024_25 = dfmain['School Year'].eq('2024-2025')
internship_2024_25 = is_in_process & is_2024_25

dfmain.loc[internship_2024_25, 'Placement Status'] = 'Completed'

# Move the in-progress internship count into the completed-internship column
# for those rows (this replaces whatever the completed column held) and reset
# the in-progress count
in_progress_counts = dfmain.loc[internship_2024_25, 'Internships in Progress'].copy()
dfmain.loc[internship_2024_25, 'Complete Student Internships'] = in_progress_counts
dfmain.loc[internship_2024_25, 'Internships in Progress'] = 0

# Reclassify remaining pending statuses as "Declined - Unfinished".
# The row mask is captured once, BEFORE relabelling, so the count transfer below
# touches only rows that were actually pending. Re-selecting by the new label
# afterwards would also overwrite the declined count of any row that already
# carried the 'Declined - Unfinished' status.
pending_statuses = [
    'Initial Contact Made', 'Pending-Scheduling', 'Scheduled Interview',
    'Internship In Process', 'Scheduled Event (Pending Completion)'
]
reclassified = dfmain['Placement Status'].isin(pending_statuses)
dfmain.loc[reclassified, 'Placement Status'] = 'Declined - Unfinished'

# Transfer pending counts to declined counts for the reclassified rows.
# The declined count is replaced by (pending + in-progress), not added to.
dfmain.loc[reclassified, 'Declined or Cancelled Student Interactions'] = (
    dfmain.loc[reclassified, 'Pending Student Interactions']
    + dfmain.loc[reclassified, 'Internships in Progress']
)

# Drop the now-irrelevant pending columns (zeroing them first would be a no-op,
# since the columns are removed entirely)
dfmain = dfmain.drop(columns=['Pending Student Interactions', 'Internships in Progress'])

# Verify: row count, number of 2024-2025 internships promoted to Completed,
# and the resulting distribution of placement statuses
n_promoted_internships = internship_2024_25.sum()

print(f"Final row count: {dfmain.shape[0]}")
print(f"\n2024-2025 Internships now Completed: {n_promoted_internships}")
print("\nPlacement Status counts:")
print(dfmain['Placement Status'].value_counts())

Final row count: 10085

2024-2025 Internships now Completed: 59

Placement Status counts:
Placement Status
Completed                           7660
Declined-Applicant                   475
Declined/Cancelled-Other             222
Declined - Business Unresponsive     219
Declined - Unfinished                218
Declined-Business Scheduling         217
Declined-Business                    210
Declined - Student Profile           176
Declined- Student Applicant          173
Declined - Student Other             111
Declined - Student Unresponsive       85
Declined - Staff Scheduling           75
Declined-Intern NOT Selected          66
Cancelled-COVID                       63
Declined - Staff Applicant            35
Declined - Staff Unresponsive         26
Declined-Opportunity FULL             24
Cancelled-Weather                     17
Terminated                            12
Cancelled-Illness                      1
Name: count, dtype: int64


# 'Placement Status' Parent Column

In [None]:
# Create parent category column
def categorize_placement_status(status):
    """Collapse a detailed placement status into its parent category.

    Returns 'Completed' for the exact status 'Completed', 'Cancelled' for the
    three 'Cancelled-…' statuses listed below, and 'Declined' for every other
    value (including missing values and unrecognised statuses).
    """
    cancelled_statuses = ('Cancelled-COVID', 'Cancelled-Weather', 'Cancelled-Illness')

    if status == 'Completed':
        return 'Completed'
    if status in cancelled_statuses:
        return 'Cancelled'
    return 'Declined'

# Derive the parent category for every row from its detailed placement status
placement_status = dfmain['Placement Status']
dfmain['Placement Status Category'] = placement_status.map(categorize_placement_status)

# Verify
dfmain['Placement Status Category'].value_counts()

Unnamed: 0_level_0,count
Placement Status Category,Unnamed: 1_level_1
Completed,7601
Declined,2344
Cancelled,81


# Placement Status & Numeric Columns Mismatch

In [None]:
# Define numeric columns
numeric_cols = [
    'Complete Student Trainings',
    'Complete Staff Trainings',
    'Complete Student Interactions',
    'Complete Student Internships',
    'Declined or Cancelled Student Interactions'
]

# Fix double-counting: find rows with a positive value in more than one count column
positive_column_count = dfmain[numeric_cols].gt(0).sum(axis=1)
rows_with_multiple = positive_column_count > 1

# Zero out Complete Student Interactions on every such row.
# NOTE(review): the intent was rows with Trainings + Interactions, but the mask
# matches ANY combination of two or more positive count columns — confirm that
# is what is wanted.
dfmain.loc[rows_with_multiple, 'Complete Student Interactions'] = 0

# Create Cancelled column and split Declined/Cancelled.
# The new column is initialised with 0.0 (float) rather than the integer 0 so
# that it has the same dtype as the other count columns, which are float after
# the numeric conversion. With an integer column the exported CSV mixes "0" and
# "0.0", and assigning float counts into an int64 column is a lossy-setitem
# hazard on recent pandas versions.
dfmain['Cancelled Student Interactions'] = 0.0
dfmain = dfmain.rename(columns={
    'Declined or Cancelled Student Interactions': 'Declined Student Interactions'
})

# Move Cancelled values to correct column based on Placement Status Category
cancelled_mask = dfmain['Placement Status Category'] == 'Cancelled'
dfmain.loc[cancelled_mask, 'Cancelled Student Interactions'] = dfmain.loc[cancelled_mask, 'Declined Student Interactions']
dfmain.loc[cancelled_mask, 'Declined Student Interactions'] = 0

# Update numeric_cols with new column names
numeric_cols = [
    'Complete Student Trainings',
    'Complete Staff Trainings',
    'Complete Student Interactions',
    'Complete Student Internships',
    'Declined Student Interactions',
    'Cancelled Student Interactions'
]

# Find all-zero rows (every count column is 0); the mask is computed once, up front
all_zeros = dfmain[numeric_cols].eq(0).all(axis=1)

# WBL opportunity types grouped by the count column a completed event belongs in
wbl_types_by_column = {
    'Complete Staff Trainings': [
        'Staff Training', 'Regional Advisory Meeting', 'Site Visit - Staff',
    ],
    'Complete Student Trainings': [
        'Student Training',
    ],
    'Complete Student Interactions': [
        'Professionalism 101 Training', 'Informational Interview Video',
        'Career Story Video', 'e-WBL Informational Interview',
        'e-WBL Class Presentation', 'Job Fair', 'Class/Group Mentorship',
        'Industry Sponsored Project', 'Class Presentation', 'Job Shadow',
        'Site Visit', 'Speakers Bureau', 'Event', 'Individual Mentorship',
        'Paid Job',
    ],
    'Complete Student Internships': [
        'Internship 60', 'Internship 120', 'Internship 320', 'Apprenticeship',
    ],
}

# WBL Opportunity Type to numeric column mapping (flat lookup form)
wbl_to_column_map = {
    wbl_type: count_column
    for count_column, wbl_types in wbl_types_by_column.items()
    for wbl_type in wbl_types
}

# Fill all-zero rows with a count of 1 in the column matching their outcome
status_category = dfmain['Placement Status Category']

# Completed: the target column depends on the WBL type; types missing from the
# mapping (including missing values) are left untouched
zero_completed = all_zeros & status_category.eq('Completed')
for count_column, wbl_types in wbl_types_by_column.items():
    matches_type = dfmain['WBL Opportunity Type'].isin(wbl_types)
    dfmain.loc[zero_completed & matches_type, count_column] = 1

# Declined / Cancelled: always the matching outcome column
dfmain.loc[all_zeros & status_category.eq('Declined'), 'Declined Student Interactions'] = 1
dfmain.loc[all_zeros & status_category.eq('Cancelled'), 'Cancelled Student Interactions'] = 1

# Fix Placement Status & Numeric Column mismatches

# Two rows recorded as Completed that should be Declined (per the PPBEA Notes).
# NOTE(review): 3989 and 8420 are hard-coded index labels tied to the current
# input files and the row order produced by the cells above — re-verify if
# either changes. Only the status fields are changed here, not the counts.
completed_should_be_declined = [3989, 8420]
for status_column, corrected_value in (
    ('Placement Status', 'Declined- Student Applicant'),
    ('Placement Status Category', 'Declined'),
):
    dfmain.loc[completed_should_be_declined, status_column] = corrected_value

# 2019-2020 internship rows labelled 'Declined - Unfinished' that nevertheless
# carry a positive completed-interaction count: treat them as Completed
internship_types = ['Internship 60', 'Internship 120', 'Internship 320']
is_mislabelled_internship = (
    dfmain['Placement Status'].eq('Declined - Unfinished')
    & dfmain['School Year'].eq('2019-2020')
    & dfmain['WBL Opportunity Type'].isin(internship_types)
    & dfmain['Complete Student Interactions'].gt(0)
)
declined_should_be_completed = is_mislabelled_internship[is_mislabelled_internship].index

for status_column in ('Placement Status', 'Placement Status Category'):
    dfmain.loc[declined_should_be_completed, status_column] = 'Completed'

# Their count belongs in the internship column, not the interaction column
interaction_counts_to_move = dfmain.loc[declined_should_be_completed, 'Complete Student Interactions'].copy()
dfmain.loc[declined_should_be_completed, 'Complete Student Internships'] = interaction_counts_to_move
dfmain.loc[declined_should_be_completed, 'Complete Student Interactions'] = 0

# Standardize District Names and Reorganize Column Structure

In [None]:
# Standardize District Names

# Short codes of the form 'D<number>' → full name with the code in parentheses
numbered_district_names = {
    'D11': 'Colorado Springs (D11)',
    'D20': 'Academy (D20)',
    'D49': 'El Paso County (D49)',
    'D2': 'Harrison (D2)',
    'D3': 'Widefield (D3)',
    'D8': 'Fountain-Fort Carson (D8)',
    'D12': 'Cheyenne Mountain (D12)',
    'D14': 'Manitou Springs (D14)',
    'D38': 'Lewis-Palmer (D38)',
}

# All remaining abbreviations / short names → display name
other_district_names = {
    'CEC-CS': 'Colorado Springs Early College (CEC-CS)',
    'BLR': 'Banning Lewis Ranch (BLR)',
    'WPSD': 'Woodland Park (WPSD)',
    'Ellicott': 'Ellicott (D22)',
    'Peyton': 'Peyton (D23JT)',
    'Calhan': 'Calhan (RJ-1)',
    'Atlas Prep': 'Atlas Preparatory',
    'CPA/PPOS': 'CO Digital BOCES (CPA/PPOS)',
    'PTEC': 'Power Technical (PTEC)',
    'Goal H.S.': 'Goal High School',
    'Mon Impact': 'Monumental Impact',
    'Peak Ed': 'Peak Education',
    'Miami-Yoder': 'Miami-Yoder (JT-60)',
    'ECA': 'Evangel Christian Academy',
    'MET': 'Mountain Employment Training',
    'TCA': 'The Classical Academy',
    'CCV': 'Cripple Creek-Victor (RE-1)',
    'DYS': 'Division of Youth Services',
    'Vanguard': 'Vanguard School',
    'Homeschool': 'Homeschool',
    'Various': 'Various',
    'BBBS': 'Big Brothers Big Sisters',
    'Roundup': 'Roundup School',
}

district_mapping = {**numbered_district_names, **other_district_names}

# Whole-value replacement: only cells exactly equal to a key are changed,
# everything else (including missing values) is left as is
dfmain['District'] = dfmain['District'].replace(district_mapping)

# Reorganize Column Structure

# Columns describing the event and its outcome
descriptive_columns = [
    'School Year',
    'Derived Event Date',
    'Event Title',
    'WBL Opportunity Type',
    'Placement Status Category',
    'Placement Status',
    'Business Champion Name',
    'Student Sponsor Name',
    'District',
    'School or Program Site',
]

# Count columns
count_columns = [
    'Complete Student Interactions',
    'Complete Student Trainings',
    'Complete Student Internships',
    'Declined Student Interactions',
    'Cancelled Student Interactions',
    'Complete Staff Trainings',
]

# Original date columns, kept at the end for reference
source_date_columns = [
    'Initiation Date',
    'Status Update Date',
    'Event Date or Start Date',
]

column_order = descriptive_columns + count_columns + source_date_columns

# Select exactly these columns in this order (raises KeyError if one is missing)
dfmain = dfmain.loc[:, column_order]

In [None]:
# Preview the first rows of the cleaned, reordered frame
dfmain.head()

Unnamed: 0,School Year,Derived Event Date,Event Title,WBL Opportunity Type,Placement Status Category,Placement Status,Business Champion Name,Student Sponsor Name,District,School or Program Site,Complete Student Interactions,Complete Student Trainings,Complete Student Internships,Declined Student Interactions,Cancelled Student Interactions,Complete Staff Trainings,Initiation Date,Status Update Date,Event Date or Start Date
0,2019-2020,2019-01-07,Advantage Manufacturing Class Presentation,Class Presentation,Declined,Declined-Business,Advantage Manufacturing,D8 Emily Sherwood,Fountain-Fort Carson (D8),,0.0,0.0,0.0,25.0,0,0.0,2019-12-17,2020-01-09,2019-01-07
1,2019-2020,2019-01-07,JPM Prototype Advanced Manufacturing Class Pre...,Class Presentation,Declined,Declined-Business,JPM Prototype,D8 Emily Sherwood,Fountain-Fort Carson (D8),,0.0,0.0,0.0,25.0,0,0.0,2019-12-17,2020-01-09,2019-01-07
2,2019-2020,2019-01-07,Techno-Chaos Robotics and Automation Class Pre...,Class Presentation,Completed,Completed,Techno Chaos,D8 Emily Sherwood,Fountain-Fort Carson (D8),,25.0,0.0,0.0,0.0,0,0.0,2019-12-17,2020-01-09,2019-01-07
3,2019-2020,2019-07-01,Town of Monument Administration Internship,Internship 120,Completed,Completed,Town of Monument,"Mark Hyatt, Falcon Aero Lab",El Paso County (D49),,1.0,0.0,0.0,0.0,0,0.0,2019-07-01,2019-12-02,2019-07-01
4,2019-2020,2019-08-02,BalSeal Robotic Focused Site Visit,Site Visit,Completed,Completed,BalSeal Engineering,,Lewis-Palmer (D38),,18.0,0.0,0.0,0.0,0,0.0,2019-07-01,2019-07-24,2019-08-02


# Final Data Summary

In [None]:
def _print_share_table(heading, counts, total_rows):
    """Print ``heading`` followed by each label's count and its percentage of ``total_rows``."""
    print(heading)
    for label, n in counts.items():
        share = n / total_rows * 100
        print(f"  {label}: {n:,} ({share:.1f}%)")


def _print_top_values(heading, counts):
    """Print ``heading`` followed by the entries of ``counts`` as a numbered list."""
    print(heading)
    for rank, (label, n) in enumerate(counts.items(), 1):
        print(f"  {rank}. {label}: {n:,} events")


banner = "=" * 60
print(banner)
print("FINAL DATASET SUMMARY")
print(banner)

# Size
n_rows, n_columns = dfmain.shape
print(f"\nSize: {n_rows:,} rows × {n_columns} columns")

# Date Range
first_event = dfmain['Derived Event Date'].min().strftime('%B %d, %Y')
last_event = dfmain['Derived Event Date'].max().strftime('%B %d, %Y')
print(f"\nDate Range: {first_event} - {last_event}")

# Records by Year
print("\nRecords by Year:")
year_counts = dfmain['School Year'].value_counts().sort_index()
for school_year, n_records in year_counts.items():
    print(f"  {school_year}: {n_records:,}")

# Status Distribution (by Category)
status_counts = dfmain['Placement Status Category'].value_counts()
_print_share_table("\nStatus Distribution (by Category):", status_counts, len(dfmain))

# Status Distribution (detailed)
detailed_status = dfmain['Placement Status'].value_counts()
_print_share_table("\nStatus Distribution (detailed):", detailed_status, len(dfmain))

# Student Engagement Metrics
print("\nStudent Engagement Metrics:")
completed_count_columns = [
    'Complete Student Interactions',
    'Complete Student Trainings',
    'Complete Student Internships',
]
total_completed = sum(dfmain[column].sum() for column in completed_count_columns)
total_declined = dfmain['Declined Student Interactions'].sum()
total_cancelled = dfmain['Cancelled Student Interactions'].sum()
grand_total = total_completed + total_declined + total_cancelled

print(f"  Total Completed: {total_completed:,.0f}")
print(f"  Total Declined: {total_declined:,.0f}")
print(f"  Total Cancelled: {total_cancelled:,.0f}")
print(f"  Grand Total: {grand_total:,.0f} student engagements")

# Average students per completed event (0 when there are no completed events)
completed_events = dfmain[dfmain['Placement Status Category'] == 'Completed']
n_completed_events = len(completed_events)
if n_completed_events > 0:
    avg_students = total_completed / n_completed_events
else:
    avg_students = 0
print(f"  Average Students per Completed Event: {avg_students:.1f}")

# Staff Metrics
print("\nStaff Metrics:")
staff_trainings_total = dfmain['Complete Staff Trainings'].sum()
print(f"  Staff Trainings: {staff_trainings_total:,.0f}")

# Top 5 Districts
top_districts = dfmain['District'].value_counts().head(5)
_print_top_values("\nTop 5 Districts:", top_districts)

# Top 5 WBL Types
top_wbl = dfmain['WBL Opportunity Type'].value_counts().head(5)
_print_top_values("\nTop 5 WBL Opportunity Types:", top_wbl)

# Top 5 Businesses
top_business = dfmain['Business Champion Name'].value_counts().head(5)
_print_top_values("\nTop 5 Business Partners:", top_business)

print("\n" + banner)

FINAL DATASET SUMMARY

Size: 10,085 rows × 19 columns

Date Range: January 07, 2019 - September 22, 2025

Records by Year:
  2019-2020: 331
  2020-2021: 1,927
  2021-2022: 2,073
  2022-2023: 2,754
  2023-2024: 1,559
  2024-2025: 1,441

Status Distribution (by Category):
  Completed: 7,686 (76.2%)
  Declined: 2,318 (23.0%)
  Cancelled: 81 (0.8%)

Status Distribution (detailed):
  Completed: 7,686 (76.2%)
  Declined-Applicant: 475 (4.7%)
  Declined/Cancelled-Other: 222 (2.2%)
  Declined - Business Unresponsive: 219 (2.2%)
  Declined-Business Scheduling: 217 (2.2%)
  Declined-Business: 210 (2.1%)
  Declined - Unfinished: 190 (1.9%)
  Declined - Student Profile: 176 (1.7%)
  Declined- Student Applicant: 175 (1.7%)
  Declined - Student Other: 111 (1.1%)
  Declined - Student Unresponsive: 85 (0.8%)
  Declined - Staff Scheduling: 75 (0.7%)
  Declined-Intern NOT Selected: 66 (0.7%)
  Cancelled-COVID: 63 (0.6%)
  Declined - Staff Applicant: 35 (0.3%)
  Declined - Staff Unresponsive: 26 (0.3%)
 

# Export

In [None]:
import os

# Destination folder and file for the cleaned dataset
output_path = '/content/drive/MyDrive/Work/BEA/2025 BEA Data Project Shared Folder/Data/(Main) Data Sources/Existing/PPBEA Pipeline/Cleaned/'
output_file = f'{output_path}PPBEA_Pipeline_2019-2025_Cleaned.csv'

# Create the folder if it does not exist yet, then write the CSV without the index
os.makedirs(output_path, exist_ok=True)
dfmain.to_csv(output_file, index=False)

final_shape = dfmain.shape
print(f"Cleaned data saved to: {output_file}")
print(f"Final shape: {final_shape}")

Cleaned data saved to: /content/drive/MyDrive/Work/BEA/2025 BEA Data Project Shared Folder/Data/(Main) Data Sources/Existing/PPBEA Pipeline/Cleaned/PPBEA_Pipeline_2019-2025_Cleaned.csv
Final shape: (10085, 19)


#