In [None]:
import pandas as pd  # type: ignore

# One-time conversion, kept disabled for provenance: these lines export the first
# sheet of the Excel workbook to the CSV path that is read below (presumably how
# that CSV was produced). Uncomment them to regenerate the CSV from the workbook.
#excel_source = pd.read_excel('processed_data/master_CMS.xlsx', sheet_name=0)

#excel_source.to_csv('processed_data/times_df.csv', index=False)
#print("DataFrame saved as 'processed_data/times_df.csv'")

# Load the CSV into the working DataFrame used by the cells that follow.
times_df = pd.read_csv('processed_data/times_df.csv')

# Display basic information about the DataFrame
print(f"Shape: {times_df.shape}")
print(f"Columns: {list(times_df.columns)}")
# Trailing expression: the notebook renders the first rows as this cell's output.
times_df.head()


In [None]:
# List the distinct values of the SHEET, SEX and EVENT columns, one heading each
unique_value_reports = [
    ("Unique values in 'sheet' column:", 'SHEET'),
    ("\nUnique values in 'sex' column:", 'SEX'),
    ("\nUnique values in 'event' column:", 'EVENT'),
]
for heading, column in unique_value_reports:
    print(heading)
    print(times_df[column].unique())


In [None]:
# Extract and categorize unique event types
# Missing EVENT values are dropped first: a NaN is a float, and the substring
# tests used to categorize events (e.g. 'FREE' in e) raise TypeError on a float.
events = times_df['EVENT'].dropna().unique()

# Sort key for ordering events by their leading distance
def numeric_sort_key(event):
    """Return the integer that ``event`` starts with, or 0 if it has no leading digits.

    Intended as a ``sorted`` key so that, for example, '50 FREE' orders before
    '100 FREE' instead of being compared as text.
    """
    import re
    leading_number = re.match(r'(\d+)', event)
    return int(leading_number.group(1)) if leading_number else 0

# Sort key for diving events (1-meter before 3-meter, 6 dives before 11 dives)
def diving_sort_key(event):
    """Return a ``(board_rank, dives_rank, event)`` sort key for a diving event.

    board_rank is 0 for '1-METER', 1 for '3-METER' and 2 otherwise.
    dives_rank is 0 for '6 dives', 1 for '11 dives' and 2 when neither appears.
    The event name itself is the final tie-breaker.
    """
    board_rank = next(
        (rank for rank, board in enumerate(('1-METER', '3-METER')) if board in event),
        2,
    )
    dives_rank = next(
        (rank for rank, dive_count in enumerate(('6 dives', '11 dives')) if dive_count in event),
        2,
    )
    return (board_rank, dives_rank, event)

# Custom sorting function for relay events (by stroke type, then distance)
def relay_sort_key(event):
    """Return a ``(stroke_rank, distance, event)`` sort key for a relay event.

    The stroke is read from the relay's own name, i.e. the text before the
    first 'RELAY'. Searching the whole string instead lets a leg description
    such as '(200 BACK-150 BR-100 FL-50 FS)' make a medley relay match 'BACK'
    and sort ahead of the other medley relays.

    Args:
        event: Event name, e.g. '400 MEDLEY RELAY'.

    Returns:
        Tuple of the stroke rank (FREE=0, BACK=1, BREAST=2, FLY=3, MEDLEY=4,
        no stroke found=5), ``numeric_sort_key(event)``, and the event name
        itself as the final tie-breaker.
    """
    stroke_order = {'FREE': 0, 'BACK': 1, 'BREAST': 2, 'FLY': 3, 'MEDLEY': 4}
    # When 'RELAY' is absent, split() returns the whole string unchanged.
    relay_name = event.split('RELAY', 1)[0]
    # Prefer the relay's name; fall back to the full string so an event whose
    # stroke only appears after 'RELAY' keeps the rank it had before.
    for text in (relay_name, event):
        for stroke, order in stroke_order.items():
            if stroke in text:
                return (order, numeric_sort_key(event), event)
    return (5, numeric_sort_key(event), event)

# Sort key for split ("Spl.") events: stroke first, then distance
def spl_sort_key(event):
    """Return a ``(stroke_rank, distance, event)`` sort key for a split event.

    stroke_rank is the position of the first of FREE, BACK, BREAST, FLY found
    in ``event`` (checked in that order), or 4 when none is present. The
    distance comes from ``numeric_sort_key`` and the name breaks remaining ties.
    """
    stroke_names = ('FREE', 'BACK', 'BREAST', 'FLY')
    stroke_rank = next(
        (rank for rank, stroke in enumerate(stroke_names) if stroke in event),
        4,
    )
    return (stroke_rank, numeric_sort_key(event), event)

# Individual stroke events, ordered by distance. Names containing 'RELAY' or
# 'Spl.' are excluded here because they are collected separately below.
free_events = sorted(
    (name for name in events if 'FREE' in name and not ('RELAY' in name or 'Spl.' in name)),
    key=numeric_sort_key,
)
back_events = sorted(
    (name for name in events if 'BACK' in name and not ('RELAY' in name or 'Spl.' in name)),
    key=numeric_sort_key,
)
breast_events = sorted(
    (name for name in events if 'BREAST' in name and not ('RELAY' in name or 'Spl.' in name)),
    key=numeric_sort_key,
)
fly_events = sorted(
    (name for name in events if 'FLY' in name and not ('RELAY' in name or 'Spl.' in name)),
    key=numeric_sort_key,
)
im_events = sorted(
    (name for name in events if 'IM' in name and not ('RELAY' in name or 'Spl.' in name)),
    key=numeric_sort_key,
)

# Diving events are recognised by 'METER' in the name.
diving_events = sorted((name for name in events if 'METER' in name), key=diving_sort_key)

# Relay events, leaving out relay splits (names that also contain 'Spl.').
relay_events = sorted(
    (name for name in events if 'RELAY' in name and 'Spl.' not in name),
    key=relay_sort_key,
)

# Every split event, including relay splits.
spl_events = sorted((name for name in events if 'Spl.' in name), key=spl_sort_key)

# Print each category as a heading followed by its events, indented two spaces
category_listing = [
    ("FREE:", free_events),
    ("\nBACK:", back_events),
    ("\nBREAST:", breast_events),
    ("\nFLY:", fly_events),
    ("\nIM:", im_events),
    ("\nDIVING (METER):", diving_events),
    ("\nRELAY:", relay_events),
    ("\nSpl.:", spl_events),
]
for heading, category_events in category_listing:
    print(heading)
    for event in category_events:
        print(f"  {event}")


In [45]:
# Desired output order of the workbook sheets (values of the SHEET column)
sheet_order = [
    'CMS All Time Top 10',
    'CMS Axelrood Pool Records',
    'CMS Frosh Swimming & Diving Records',
    'Development of Team Records (October 2001 to March 2025)',
    'CMS at UCSD',
    'CMS at Cal Baptist Distance Meet',
    'CMS at PP',
    'CMS at PP Combined',
    'CMS SCIAC Champions',
    'SCIAC All Time Top 10 Performers',
    'SCIAC Records',
    'NCAA TOP 20',
]

# Desired output order of the SEX column values
sex_order = [
    'Athena',
    'Stag',
    'Women',
    'Men',
]

# Ordered list of all events (values of the EVENT column), grouped by category.
# Every entry needs its own trailing comma: adjacent string literals with no
# comma between them are silently joined into a single string, which removes
# both events from the list.
event_order = [
    # FREE
    '50 FREE',
    '100 FREE',
    '200 FREE',
    '500 FREE',
    '1000 FREE',
    '1650 FREE',

    # BACK
    '50 BACK',
    '100 BACK',
    '200 BACK',
    '300 BACK',

    # BREAST
    '50 BREAST',
    '100 BREAST',
    '200 BREAST',
    '300 BREAST',

    # FLY
    '100 FLY',
    '200 FLY',
    '300 FLY',

    # IM
    '200 IM',
    '300 IM',
    '400 IM',

    # DIVING (METER)
    '1-METER (6 dives)',
    '1-METER (11 dives)',
    '1-METER',
    '3-METER (6 dives)',
    '3-METER (11 dives)',
    '3-METER',

    # RELAY
    '200 FREE RELAY',
    '400 FREE RELAY',
    '500 FREE RELAY- (50-100-150-200)',
    '800 FREE RELAY',
    '200 MEDLEY RELAY',
    '400 MEDLEY RELAY',
    '500 MEDLEY RELAY - (200 BACK-150 BR-100 FL-50 FS)',

    # Spl.
    '50 FREE - RELAY Spl.',
    '50 FREE Spl.',
    '100 FREE - RELAY Spl.',
    '100 FREE Spl.',
    '200 FREE - RELAY Spl.',
    '200 FREE Spl.',
    '50 BACK - RELAY Spl.',
    '50 BACK Spl.',
    '50 BREAST - RELAY Spl.',
    '50 BREAST Spl.',
    '100 BREAST - RELAY Spl.',
    '100 BREAST Spl.',
    '50 FLY - RELAY Spl.',
    '50 FLY Spl.',
    '100 FLY - RELAY Spl.',
    '100 FLY Spl.',
]

In [47]:
# Debug: show the distinct SHEET and SEX values that the ordering lists must cover
debug_reports = [
    ("\nUnique SHEET values:", 'SHEET'),
    ("\nUnique SEX values:", 'SEX'),
]
for heading, column in debug_reports:
    print(heading)
    print(times_df[column].unique())

# Hierarchical sort key: sheet first, then sex, then event
def create_sort_key(row):
    """Return the ``(sheet_idx, sex_idx, event_idx)`` tuple used to order ``row``.

    Each element is the position of the row's SHEET / SEX / EVENT value in
    ``sheet_order`` / ``sex_order`` / ``event_order``. A value missing from its
    list gets that list's length, so it ranks after every listed value.
    """
    def rank_in(ordering, value):
        # list.index raises ValueError for a value that is not in the list.
        try:
            return ordering.index(value)
        except ValueError:
            return len(ordering)

    return (
        rank_in(sheet_order, row['SHEET']),
        rank_in(sex_order, row['SEX']),
        rank_in(event_order, row['EVENT']),
    )

# Apply the sorting: work on a copy so times_df itself is left untouched
ordered_df = times_df.copy()
ordered_df['sort_key'] = ordered_df.apply(create_sort_key, axis=1)

# Debug: Show some sort keys
print("\nSample sort keys:")
print(ordered_df[['SHEET', 'SEX', 'EVENT', 'sort_key']].head(10))

# Many rows share the same (sheet, sex, event) key. sort_values defaults to a
# sort that is not stable, which may reorder those tied rows, so a stable
# algorithm is requested to keep them in their original relative order.
ordered_df = ordered_df.sort_values('sort_key', kind='mergesort').drop('sort_key', axis=1)

# Save to CSV
ordered_df.to_csv('processed_data/ordered_times.csv', index=False)

print(f"\nCreated ordered_times.csv with {len(ordered_df)} rows")
print("First few rows:")
# Use the actual column names: the first column whose name contains 'TIME'
# (case-insensitive) is shown as the time column in the preview.
time_col = [col for col in ordered_df.columns if 'TIME' in col.upper()][0]
print(ordered_df[['SHEET', 'SEX', 'EVENT', time_col, 'NAME']].head(10))



Unique SHEET values:
['CMS at PP' 'CMS Axelrood Pool Records'
 'CMS Frosh Swimming & Diving Records' 'NCAA TOP 20' 'CMS at UCSD'
 'CMS SCIAC Champions' 'CMS at PP Combined' 'CMS All Time Top 10'
 'SCIAC All Time Top 10 Performers' 'CMS at Cal Baptist Distance Meet'
 'Development of Team Records (October 2001 to March 2025)'
 'SCIAC Records']

Unique SEX values:
['Athena' 'Stag' 'Women' 'Men']

Sample sort keys:
       SHEET     SEX    EVENT   sort_key
0  CMS at PP  Athena  50 FREE  (6, 0, 0)
1  CMS at PP  Athena  50 FREE  (6, 0, 0)
2  CMS at PP  Athena  50 FREE  (6, 0, 0)
3  CMS at PP  Athena  50 FREE  (6, 0, 0)
4  CMS at PP  Athena  50 FREE  (6, 0, 0)
5  CMS at PP  Athena  50 FREE  (6, 0, 0)
6  CMS at PP  Athena  50 FREE  (6, 0, 0)
7  CMS at PP  Athena  50 FREE  (6, 0, 0)
8  CMS at PP  Athena  50 FREE  (6, 0, 0)
9  CMS at PP  Athena  50 FREE  (6, 0, 0)

Created ordered_times.csv with 4173 rows
First few rows:
                    SHEET     SEX    EVENT   TIME              NAME
2417  C