# ============================================================================
# SWIMMING FINA BIAS ANALYSIS - JUPYTER NOTEBOOK
# Testing the hypothesis that breaststroke events have inflated FINA points
# ============================================================================

# %% [markdown]
# # Swimming FINA Bias Analysis
# 
# This notebook tests the hypothesis that certain swimming events (particularly breaststroke)
# have systematically inflated FINA points compared to their actual competitive difficulty.
# 
# ## Research Question
# Do breaststroke events receive inflated FINA points relative to their percentile performance
# compared to other strokes like freestyle?
# 
# ## Methodology
# - Stratified random sampling across NCAA Divisions 1-3 and NAIA
# - Focus on men's SCY events: 100/200 Breast, 100/200 Free
# - Statistical comparison of FINA points between strokes (Mann-Whitney U test, Cohen's d), with within-event percentile ranks computed alongside

In [2]:
# %% Cell 1: Setup and Imports
import warnings
warnings.filterwarnings('ignore')

# Core libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import time
import random
from datetime import datetime

# SwimCloud scraping
try:
    from SwimScraper import SwimScraper as ss
    print("✓ SwimScraper imported successfully")
except ImportError:
    print("❌ SwimScraper not found. Install with: pip install SwimScraper")
    raise

# Set random seeds for reproducibility
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

print(f"Analysis started: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

✓ SwimScraper imported successfully
Analysis started: 2025-08-29 19:53:09


In [3]:
# %% Cell 2: Configuration and Constants (REPLACEMENT)

# Base times (in seconds) for men's SCY events, keyed by event name.
# Per the notes on each entry these are NCAA record swims used as the base
# time "B" in the points formula; replace with official values if required.
FINA_BASE_TIMES = {
    '50 Free': 17.63,      # Caeleb Dressel — NCAA DI record
    '100 Free': 39.83,     # Jordan Crooks — NCAA DI record
    '200 Free': 88.33,     # Luke Hobson — 1:28.33 = 88.33 s
    '100 Breast': 49.51,   # Julian Smith — NCAA record
    '200 Breast': 106.35,  # Leon Marchand — NCAA record (1:46.35 = 106.35 s)
    '100 Back': 43.20,     # Hubi Kos — NCAA record
    '100 Fly': 42.80,      # Caeleb Dressel — NCAA record
    '200 IM': 96.34        # Leon Marchand — NCAA record (1:36.34 = 96.34 s)
}


# Analysis configuration
TARGET_SAMPLE_SIZE = 50   # Total swimmers to sample — change this to rerun at different scales
PRIORITY_EVENTS = ['100 Breast', '200 Breast', '100 Free', '200 Free']  # events whose times are collected per swimmer
CURRENT_YEAR = 2024       # season passed to the roster and ranking lookups

# Minimum quota per division, so small divisions are not sampled at zero
MIN_PER_DIVISION = 10
DEFAULT_ROSTER_SIZE = 25   # roster size assumed when a roster lookup returns nothing
D1_TIERS = 3               # intended number of D1 rank tiers; NOTE(review): not referenced in the cells shown here

print(f"Target sample size: {TARGET_SAMPLE_SIZE}")
print(f"Priority events: {PRIORITY_EVENTS}")
print("Division allocation will be computed dynamically based on rosters")


Target sample size: 50
Priority events: ['100 Breast', '200 Breast', '100 Free', '200 Free']
Division allocation will be computed dynamically based on rosters


In [4]:
# %% Cell 3: Helper Functions
def time_to_seconds(time_str):
    """Convert a swimming time to seconds.

    Accepts plain seconds (``"52.31"`` or a number), ``MM:SS.ss`` and
    ``HH:MM:SS.ss``. Colon-separated fields are folded left to right in
    base 60, so every field contributes to the result.

    Args:
        time_str: Time as a string or number.

    Returns:
        float: The time in seconds, or the sentinel ``9999.99`` when the
        value cannot be parsed (callers treat anything >= 999 as invalid).
    """
    try:
        text = str(time_str)
        if ':' not in text:
            return float(time_str)

        total = 0.0
        for field in text.split(':'):
            total = total * 60 + float(field)
        return total
    except (TypeError, ValueError):
        # Only parsing failures map to the sentinel; KeyboardInterrupt and
        # other non-parsing errors are no longer swallowed.
        return 9999.99

def calculate_fina_points(time_seconds, event):
    """Score a swim with the points formula ``1000 * (B / T) ** 3``.

    ``B`` is the base time looked up in ``FINA_BASE_TIMES`` for ``event`` and
    ``T`` is ``time_seconds``.

    Returns:
        The points rounded to two decimals, or ``None`` when the event has no
        base time or the time is non-positive or at/above the 999 s cutoff.
    """
    if event not in FINA_BASE_TIMES:
        return None
    if time_seconds <= 0 or time_seconds >= 999:
        return None

    ratio = FINA_BASE_TIMES[event] / time_seconds
    return round(1000 * ratio ** 3, 2)

def safe_api_call(func, *args, **kwargs):
    """Call ``func(*args, **kwargs)``, logging failures instead of raising.

    Every attempt, successful or not, is followed by a 0.2 s pause so that
    a run of failing requests is throttled the same way as successful ones.

    Args:
        func: Callable to invoke.
        *args, **kwargs: Forwarded to ``func`` unchanged.

    Returns:
        Whatever ``func`` returns, or ``None`` if it raised an ``Exception``.
    """
    try:
        return func(*args, **kwargs)
    except Exception as e:
        # Callables such as functools.partial or callable objects have no
        # __name__; fall back to repr so logging can never raise itself.
        label = getattr(func, '__name__', repr(func))
        print(f"[ERROR] {label} failed. args={args}, kwargs={kwargs} -> {e}")
        return None
    finally:
        time.sleep(0.2)  # prevent SwimCloud blocking

# --- PATCH: collect swimmer data safely ---
def collect_swimmer_data(swimmer, team_name, team_id, division):
    """Look up one swimmer's fastest time in each priority event.

    Args:
        swimmer: Roster entry dict; ``swimmer_ID`` and ``swimmer_name`` are read.
        team_name, team_id, division: Copied into the returned record.

    Returns:
        A dict with the swimmer/team fields plus ``times`` (event -> details of
        the fastest returned swim), or ``None`` if the swimmer object is
        invalid or no usable time was found for any event.
    """
    if not (swimmer and isinstance(swimmer, dict)):
        print(f"  [WARN] Invalid swimmer object on {team_name} ({team_id})")
        return None

    swimmer_id = swimmer.get('swimmer_ID')
    swimmer_name = swimmer.get('swimmer_name', 'Unknown')
    best_by_event = {}

    for event in PRIORITY_EVENTS:
        results = safe_api_call(ss.getSwimmerTimes, swimmer_id, event)
        if not results:
            print(f"    [INFO] No times for {swimmer_name} ({event})")
            continue

        try:
            # Fastest of everything returned; entries without a 'time' sort last.
            fastest = min(
                results,
                key=lambda entry: time_to_seconds(entry.get('time', '99:99.99')),
            )
            seconds = time_to_seconds(fastest.get('time'))
            if not seconds < 999:
                continue  # sentinel / unparsable time
            best_by_event[event] = {
                'time_str': fastest.get('time'),
                'time_seconds': seconds,
                'meet_name': fastest.get('meet_name', ''),
                'year': fastest.get('year', ''),
            }
        except Exception as e:
            print(f"    [ERROR] Processing times failed for {swimmer_name} ({event}): {e}")
            continue

    if not best_by_event:
        return None

    return {
        'swimmer_ID': swimmer_id,
        'swimmer_name': swimmer_name,
        'team_name': team_name,
        'team_ID': team_id,
        'division': division,
        'times': best_by_event,
    }

print("✓ Helper functions defined")


✓ Helper functions defined


In [5]:
# %% Cell 4: Data Collection Functions (REPLACEMENT)

def get_division_rosters(year=CURRENT_YEAR, gender="M"):
    """List the teams of each division together with a roster-size estimate.

    For every division the team list is requested, then each team's roster is
    requested once to measure its size. Teams whose roster lookup returns
    nothing (or an empty roster) are given ``DEFAULT_ROSTER_SIZE``.

    Returns:
        dict: division name -> list of ``{"team_ID", "team_name", "roster_size"}``.
        A division whose team lookup fails maps to an empty list.
    """
    divisions = {}

    for div_name in ('Division 1', 'Division 2', 'Division 3', 'NAIA'):
        entries = divisions.setdefault(div_name, [])
        team_list = safe_api_call(ss.getCollegeTeams, division_names=[div_name])

        for team in team_list or ():
            team_id = team.get('team_ID')
            team_name = team.get('team_name')
            roster = safe_api_call(
                ss.getRoster, team=team_name, team_ID=team_id, gender=gender, year=year
            )
            entries.append({
                "team_ID": team_id,
                "team_name": team_name,
                "roster_size": len(roster) if roster else DEFAULT_ROSTER_SIZE,
            })

    return divisions


def compute_division_targets(divisions, total_sample_size=TARGET_SAMPLE_SIZE, min_per_division=MIN_PER_DIVISION):
    """Compute per-division sample quotas proportional to estimated roster sizes.

    Each division starts from its proportional share of ``total_sample_size``
    (never below ``min_per_division``). The quotas are then nudged one unit at
    a time, largest division first, until they sum to ``total_sample_size``.

    Args:
        divisions: division name -> list of team dicts with ``roster_size``.
        total_sample_size: Desired total across all divisions.
        min_per_division: Floor applied to every division's quota.

    Returns:
        dict: division name -> quota. The quotas sum to ``total_sample_size``
        whenever that is achievable; if the per-division minimums alone
        exceed the total (or there are no divisions), the minimums win and a
        warning is printed instead of looping forever.
    """
    est_counts = {div: sum(t['roster_size'] for t in teams) for div, teams in divisions.items()}
    total = sum(est_counts.values()) or 1

    targets = {
        div: max(int(round((count/total) * total_sample_size)), min_per_division)
        for div, count in est_counts.items()
    }

    # Roster estimates do not change during normalisation, so sort once.
    largest_first = sorted(targets, key=lambda d: est_counts[d], reverse=True)

    # normalize so sum matches total_sample_size
    diff = total_sample_size - sum(targets.values())
    while diff != 0:
        changed = False
        for div in largest_first:
            if diff > 0:
                targets[div] += 1
                diff -= 1
                changed = True
            elif diff < 0 and targets[div] > min_per_division:
                targets[div] -= 1
                diff += 1
                changed = True
            if diff == 0:
                break
        if not changed:
            # Nothing can be adjusted any further (every division is already
            # at its minimum, or there are no divisions): stop instead of
            # spinning forever.
            print(f"[WARN] Could not normalise division targets to {total_sample_size}; "
                  f"off by {-diff}")
            break

    print("Division targets:", targets)
    return targets


def allocate_team_samples(teams, div_target):
    """Split a division quota across teams in proportion to roster size.

    Uses largest-remainder apportionment: every team first gets the whole
    part of its proportional share, and the slots left over go to the teams
    with the largest fractional remainders (ties keep list order). This keeps
    the leftover slots tied to team size instead of handing them to whichever
    teams happen to come first in the list.

    Args:
        teams: Team dicts, each with a numeric ``roster_size``.
        div_target: Number of swimmers to allocate across ``teams``.

    Returns:
        list[tuple[dict, int]]: ``(team, count)`` pairs in input order. A
        count never exceeds the team's ``roster_size``, so the counts can sum
        to less than ``div_target`` when rosters are too small.
    """
    if not teams:
        return []
    if div_target <= 0:
        return [(t, 0) for t in teams]

    total_roster = sum(t['roster_size'] for t in teams) or 1
    shares = [(t['roster_size'] / total_roster) * div_target for t in teams]
    counts = [int(share) for share in shares]  # shares are >= 0, so int() floors

    # Hand out what flooring left behind, biggest fractional part first.
    leftover = max(div_target - sum(counts), 0)
    by_remainder = sorted(
        range(len(teams)),
        key=lambda i: shares[i] - counts[i],
        reverse=True,
    )
    for i in by_remainder[:leftover]:
        counts[i] += 1

    return [(t, min(count, t['roster_size'])) for t, count in zip(teams, counts)]


In [6]:
# %% Cell 5: Main Data Collection (REPLACEMENT WITH AUTO STEP SIZE)

def run_data_collection_proportional():
    """Collect swimmer records for every division according to its quota.

    Steps, as implemented below:
      1. Fetch team lists / roster-size estimates for all divisions and turn
         them into per-division quotas.
      2. For Division 1, pick every k-th team from the team rankings list and
         allocate the quota across those teams; for the other divisions,
         allocate across all teams returned for the division.
      3. For each team with a non-zero allocation, fetch its roster, draw a
         random sample of swimmers and collect their event times, stopping
         once the division quota is reached.

    Returns:
        list: The swimmer records returned by ``collect_swimmer_data`` for
        all divisions (swimmers without usable times are dropped).
    """
    print("="*60)
    print("STARTING PROPORTIONAL DATA COLLECTION")
    print("="*60)

    start_time = time.time()
    all_swimmers = []

    # get rosters for each division
    divisions = get_division_rosters()
    division_targets = compute_division_targets(divisions, TARGET_SAMPLE_SIZE)

    for div, div_target in division_targets.items():
        teams = divisions[div]
        # A division with no teams or no quota contributes nothing.
        if not teams or div_target <= 0:
            continue

        print(f"\n--- Collecting {div_target} swimmers from {div} ---")

        if div == "Division 1":
            # RANKED LIST OF TEAMS
            # NOTE(review): the call passes only gender and year; presumably the
            # list it returns is ordered by rank — confirm against SwimScraper.
            ranked_teams = safe_api_call(ss.getTeamRankingsList, gender="M", year=CURRENT_YEAR)
            if not ranked_teams:
                # Without rankings the whole division is skipped, not sampled another way.
                print("[WARN] Could not get Division 1 rankings, skipping")
                continue

            n_ranked = len(ranked_teams)

            # Estimate how many D1 teams to include (~10 swimmers/team)
            est_teams_needed = max(1, div_target // 10)
            step_size = max(1, n_ranked // est_teams_needed)
            print(f"D1 stratified step size = {step_size} (target ~{est_teams_needed} teams)")

            # Select every kth team
            d1_selected_teams = ranked_teams[::step_size]

            # Convert into team entries with roster sizes
            d1_team_entries = []
            for t in d1_selected_teams:
                if not t or not isinstance(t, dict):
                    print(f"  [WARN] Invalid team object in D1 rankings, skipping")
                    continue

                team_id, team_name = t.get("team_ID"), t.get("team_name")
                if not team_id or not team_name:
                    print(f"  [WARN] Missing team_id/team_name in ranking entry: {t}")
                    continue

                # This roster is fetched only to measure its size; it is fetched
                # again further down when swimmers are actually sampled.
                roster = safe_api_call(
    ss.getRoster, team_ID=team_id, gender='M', year=CURRENT_YEAR
)
                # Only a failed call (None) falls back to the default size; an
                # empty roster is kept as size 0.
                if roster is None:
                    print(f"  [INFO] Empty or failed roster fetch for {team_name} ({team_id}), using default size")
                    roster_size = DEFAULT_ROSTER_SIZE
                else:
                    roster_size = len(roster)

                d1_team_entries.append({
                    "team_ID": team_id,
                    "team_name": team_name,
                    "roster_size": roster_size
                })

            if not d1_team_entries:
                print("[WARN] No valid D1 team entries, skipping division")
                continue

            # Allocate swimmers proportional to roster size within the sampled teams
            team_allocs = allocate_team_samples(d1_team_entries, div_target)

        else:
            # For D2, D3, and NAIA → proportional allocation
            team_allocs = allocate_team_samples(teams, div_target)

        # Collect swimmers according to team_allocs
        div_swimmers = []
        for team_entry, n in team_allocs:
            if n <= 0:
                continue

            # Accept either key spelling for the team id / name.
            team_id = team_entry.get("team_ID") or team_entry.get("teamID")
            team_name = team_entry.get("team_name") or team_entry.get("teamName", f"Team_{team_id}")

            if not team_id:
                print(f"  [WARN] Missing team_id in entry: {team_entry}")
                continue

            roster = safe_api_call(ss.getRoster, team_ID=team_id, gender="M", year=CURRENT_YEAR)
            if not roster:
                print(f"  [SKIP] No roster for {team_name} ({team_id})")
                continue

            # Draw without replacement; never ask for more swimmers than the roster holds.
            chosen = random.sample(roster, min(n, len(roster)))
            for swimmer in chosen:
                swimmer_data = collect_swimmer_data(swimmer, team_name, team_id, div)
                # Swimmers with no usable times come back as None and are not counted.
                if swimmer_data:
                    div_swimmers.append(swimmer_data)
                    if len(div_swimmers) >= div_target:
                        break
            # Propagate the inner break: stop visiting teams once the quota is met.
            if len(div_swimmers) >= div_target:
                break

        print(f"✓ {div} complete: {len(div_swimmers)} swimmers")
        all_swimmers.extend(div_swimmers)

    elapsed = (time.time() - start_time) / 60
    print(f"\nTotal swimmers collected: {len(all_swimmers)} in {elapsed:.1f} min")
    return all_swimmers

print("Ready to start proportional data collection. Run the next cell to begin.")


Ready to start proportional data collection. Run the next cell to begin.


In [7]:
# %% Cell 6: Execute Data Collection (REPLACEMENT)

# Runs the full collection. This issues one throttled lookup per team roster
# and per swimmer/event, so expect it to take a while; the result is the
# input to every later cell.
swimmers_raw_data = run_data_collection_proportional()

print(f"\n🎉 Collection finished! Got data for {len(swimmers_raw_data)} swimmers")



STARTING PROPORTIONAL DATA COLLECTION
[ERROR] getRoster failed. args=(), kwargs={'team': 'American University', 'team_ID': 214, 'gender': 'M', 'year': 2024} -> 'NoneType' object has no attribute 'text'
[ERROR] getRoster failed. args=(), kwargs={'team': 'Arizona State University', 'team_ID': 87, 'gender': 'M', 'year': 2024} -> 'NoneType' object has no attribute 'text'
[ERROR] getRoster failed. args=(), kwargs={'team': 'Auburn University', 'team_ID': 127, 'gender': 'M', 'year': 2024} -> 'NoneType' object has no attribute 'text'
[ERROR] getRoster failed. args=(), kwargs={'team': 'Ball State University', 'team_ID': 221, 'gender': 'M', 'year': 2024} -> 'NoneType' object has no attribute 'text'
[ERROR] getRoster failed. args=(), kwargs={'team': 'Bellarmine University', 'team_ID': 10002386, 'gender': 'M', 'year': 2024} -> 'NoneType' object has no attribute 'text'
[ERROR] getRoster failed. args=(), kwargs={'team': 'Binghamton University', 'team_ID': 383, 'gender': 'M', 'year': 2024} -> 'NoneTy

KeyboardInterrupt: 

In [None]:
# %% Cell 7: Data Processing and Analysis
def process_raw_data(swimmers_raw_data):
    """Flatten raw swimmer records into one row per swimmer-event.

    Each row carries the swimmer/team fields, the time, its points from
    ``calculate_fina_points`` and a within-event percentile (the percentage
    of sampled times in the same event that are slower than this one).
    Times that do not yield positive points are dropped.

    Args:
        swimmers_raw_data: Iterable of swimmer dicts as produced by
            ``collect_swimmer_data`` (each with a ``times`` mapping).

    Returns:
        pandas.DataFrame: One row per scored swim. The columns are always
        present, even when no rows survive, so downstream column access
        works on an empty sample.
    """
    columns = [
        'swimmer_ID', 'swimmer_name', 'team_name', 'division', 'event',
        'time_str', 'time_seconds', 'fina_points', 'meet_name', 'year',
        'percentile',
    ]

    print("Processing raw data into analysis format...")

    analysis_records = []
    event_times = {}  # event -> list of all scored times, for percentiles

    # Flatten data structure
    for swimmer in swimmers_raw_data:
        for event, time_data in swimmer['times'].items():
            time_seconds = time_data['time_seconds']
            fina_points = calculate_fina_points(time_seconds, event)

            if fina_points and fina_points > 0:
                analysis_records.append({
                    'swimmer_ID': swimmer['swimmer_ID'],
                    'swimmer_name': swimmer['swimmer_name'],
                    'team_name': swimmer['team_name'],
                    'division': swimmer['division'],
                    'event': event,
                    'time_str': time_data['time_str'],
                    'time_seconds': time_seconds,
                    'fina_points': fina_points,
                    'meet_name': time_data.get('meet_name', ''),
                    'year': time_data.get('year', '')
                })
                # setdefault keeps this independent of which events were configured.
                event_times.setdefault(event, []).append(time_seconds)

    # Calculate percentiles within each event
    for record in analysis_records:
        field = event_times[record['event']]

        if len(field) > 1:
            # Percentile = percentage of swimmers this time beats
            beats_count = sum(1 for t in field if t > record['time_seconds'])
            record['percentile'] = round((beats_count / len(field)) * 100, 2)
        else:
            # A lone time has nothing to be ranked against.
            record['percentile'] = 50.0

    df = pd.DataFrame(analysis_records, columns=columns)
    print(f"✓ Processed {len(df)} swimmer-event records")
    return df

# Build the analysis table (one row per swimmer-event) from the raw collection
df_analysis = process_raw_data(swimmers_raw_data)

# Quick data summary: row count, distinct swimmers, events and rows per division
print("\n" + "=" * 50)
print("DATA SUMMARY")
print("=" * 50)
print(f"Total records: {len(df_analysis)}")
print(f"Unique swimmers: {df_analysis['swimmer_ID'].nunique()}")
print(f"Events: {list(df_analysis['event'].unique())}")
print(f"Divisions: {df_analysis['division'].value_counts().to_dict()}")

In [None]:
# %% Cell 8: Statistical Analysis
def run_statistical_analysis(df):
    """Summarise points per event and test breaststroke against freestyle.

    Prints a per-event ranking by median points. When both strokes have more
    than 10 records it also prints a one-sided Mann-Whitney U test
    (breaststroke > freestyle) on the pooled points and Cohen's d, labelled
    with the conventional cutoffs (0.2 small, 0.5 medium, 0.8 large).

    Args:
        df: DataFrame with at least ``event``, ``fina_points`` and
            ``percentile`` columns.

    Returns:
        pandas.DataFrame: Per-event statistics sorted by ``median_fina``
        descending, with columns ``event, median_fina, mean_fina, std_fina,
        count, median_percentile, mean_percentile``.
    """
    print("RUNNING STATISTICAL ANALYSIS")
    print("=" * 50)

    # Event bias ranking
    event_stats = df.groupby('event').agg({
        'fina_points': ['median', 'mean', 'std', 'count'],
        'percentile': ['median', 'mean']
    }).round(2)

    # Flatten the two-level column index produced by the multi-aggregation.
    event_stats.columns = ['median_fina', 'mean_fina', 'std_fina', 'count',
                          'median_percentile', 'mean_percentile']
    event_stats = event_stats.reset_index().sort_values('median_fina', ascending=False)

    print("EVENT BIAS RANKING (by median FINA points):")
    print(event_stats[['event', 'median_fina', 'count', 'median_percentile']])

    # Breaststroke vs Freestyle hypothesis test
    breast_points = df[df['event'].str.contains('Breast')]['fina_points']
    free_points = df[df['event'].str.contains('Free')]['fina_points']

    print(f"\nBREASTSTROKE HYPOTHESIS TEST:")
    print(f"Breaststroke records: {len(breast_points)}")
    print(f"Freestyle records: {len(free_points)}")

    if len(breast_points) > 10 and len(free_points) > 10:
        breast_median = breast_points.median()
        free_median = free_points.median()

        print(f"Breaststroke median FINA: {breast_median:.1f}")
        print(f"Freestyle median FINA: {free_median:.1f}")
        print(f"Difference: {breast_median - free_median:.1f} points")

        # Statistical significance test (one-sided: breaststroke scores higher)
        stat, p_value = stats.mannwhitneyu(
            breast_points,
            free_points,
            alternative='greater'
        )

        print(f"Mann-Whitney U test (breast > free): p = {p_value:.6f}")

        if p_value < 0.05:
            print("✓ SIGNIFICANT: Breaststroke FINA points are inflated!")
        else:
            print("✗ No significant evidence of breaststroke inflation")

        # Effect size (Cohen's d) using the pooled standard deviation
        n_breast, n_free = len(breast_points), len(free_points)
        pooled_std = np.sqrt(((n_breast - 1) * breast_points.var() +
                              (n_free - 1) * free_points.var()) /
                             (n_breast + n_free - 2))

        if pooled_std > 0:
            cohens_d = (breast_points.mean() - free_points.mean()) / pooled_std
            print(f"Effect size (Cohen's d): {cohens_d:.3f}")

            # Conventional benchmarks: 0.2 small, 0.5 medium, 0.8 large.
            magnitude = abs(cohens_d)
            if magnitude >= 0.8:
                print("✓ Large effect size - practically significant difference")
            elif magnitude >= 0.5:
                print("✓ Medium effect size - moderate difference")
            elif magnitude >= 0.2:
                print("Small effect size")
            else:
                print("Negligible effect size")
        else:
            # Identical values in both groups: d would be a division by zero.
            print("Effect size (Cohen's d): undefined (no variance in the data)")

    return event_stats

# Run the analysis on the processed table; keeps the per-event statistics for the plots and report
event_statistics = run_statistical_analysis(df_analysis)

In [None]:
# %% Cell 9: Visualization
def create_analysis_plots(df, event_stats):
    """Draw the six-panel summary figure, save it and show it.

    Panels: (1) median points per event, (2) breaststroke vs freestyle
    box plot, (3) points vs percentile scatter per event, (4) sample share
    per division, (5) sample size per event, (6) 100 Breast vs 100 Free time
    histograms.

    Args:
        df: Analysis table with ``event``, ``fina_points``, ``percentile``,
            ``division`` and ``time_seconds`` columns.
        event_stats: Per-event statistics with ``event`` and ``median_fina``.

    Side effects:
        Writes ``fina_bias_analysis_complete.png`` to the working directory
        and resets the Matplotlib style to ``'default'``.
    """

    plt.style.use('default')
    fig, axes = plt.subplots(2, 3, figsize=(18, 12))
    fig.suptitle('Swimming FINA Bias Analysis Results', fontsize=16, fontweight='bold')

    # 1. Event bias ranking
    axes[0,0].bar(range(len(event_stats)), event_stats['median_fina'],
                  color=['lightcoral' if 'Breast' in event else 'lightblue'
                         for event in event_stats['event']])
    axes[0,0].set_xticks(range(len(event_stats)))
    axes[0,0].set_xticklabels(event_stats['event'], rotation=45, ha='right')
    axes[0,0].set_title('Event Bias Ranking\n(Median FINA Points)')
    axes[0,0].set_ylabel('Median FINA Points')
    axes[0,0].grid(True, alpha=0.3)

    # 2. Breaststroke vs Freestyle boxplot
    breast_data = df[df['event'].str.contains('Breast')]['fina_points']
    free_data = df[df['event'].str.contains('Free')]['fina_points']

    # Tick labels are set on the axis rather than through boxplot's `labels`
    # argument, which Matplotlib 3.9 deprecated in favour of `tick_labels`;
    # this form works on both older and newer releases.
    axes[0,1].boxplot([breast_data, free_data],
                      patch_artist=True,
                      boxprops=dict(facecolor='lightcoral'),
                      medianprops=dict(color='red', linewidth=2))
    axes[0,1].set_xticklabels(['Breaststroke', 'Freestyle'])
    axes[0,1].set_title('Breaststroke vs Freestyle\nFINA Points Distribution')
    axes[0,1].set_ylabel('FINA Points')
    axes[0,1].grid(True, alpha=0.3)

    # 3. FINA points vs Percentile scatter
    colors = {'100 Breast': 'red', '200 Breast': 'darkred',
              '100 Free': 'blue', '200 Free': 'darkblue'}

    for event in df['event'].unique():
        event_data = df[df['event'] == event]
        # Events with five or fewer points are left off the scatter.
        if len(event_data) > 5:
            axes[0,2].scatter(event_data['percentile'], event_data['fina_points'],
                            alpha=0.6, label=event, s=30, c=colors.get(event, 'gray'))

    axes[0,2].set_xlabel('Percentile Rank')
    axes[0,2].set_ylabel('FINA Points')
    axes[0,2].set_title('FINA Points vs Percentile\nby Event')
    axes[0,2].legend(fontsize=8)
    axes[0,2].grid(True, alpha=0.3)

    # 4. Division distribution
    div_counts = df['division'].value_counts()
    colors_div = plt.cm.Set3(np.linspace(0, 1, len(div_counts)))
    axes[1,0].pie(div_counts.values, labels=div_counts.index, autopct='%1.1f%%',
                  colors=colors_div)
    axes[1,0].set_title('Sample Distribution\nby Division')

    # 5. Sample sizes by event
    event_counts = df['event'].value_counts()
    bars = axes[1,1].bar(range(len(event_counts)), event_counts.values,
                        color=['lightcoral' if 'Breast' in event else 'lightblue'
                               for event in event_counts.index])
    axes[1,1].set_xticks(range(len(event_counts)))
    axes[1,1].set_xticklabels(event_counts.index, rotation=45, ha='right')
    axes[1,1].set_title('Sample Sizes by Event')
    axes[1,1].set_ylabel('Number of Swimmers')

    # Add value labels on bars
    for bar in bars:
        height = bar.get_height()
        axes[1,1].text(bar.get_x() + bar.get_width()/2., height + 1,
                       f'{int(height)}', ha='center', va='bottom', fontsize=10)

    # 6. Time distribution comparison
    df_breast_100 = df[df['event'] == '100 Breast']['time_seconds']
    df_free_100 = df[df['event'] == '100 Free']['time_seconds']

    axes[1,2].hist(df_breast_100, alpha=0.7, label='100 Breast',
                   color='lightcoral', bins=15)
    axes[1,2].hist(df_free_100, alpha=0.7, label='100 Free',
                   color='lightblue', bins=15)
    axes[1,2].set_xlabel('Time (seconds)')
    axes[1,2].set_ylabel('Frequency')
    axes[1,2].set_title('Time Distribution\n100 Breast vs 100 Free')
    axes[1,2].legend()
    axes[1,2].grid(True, alpha=0.3)

    plt.tight_layout()
    plt.savefig('fina_bias_analysis_complete.png', dpi=300, bbox_inches='tight')
    plt.show()

# Create the visualizations; this also writes fina_bias_analysis_complete.png
create_analysis_plots(df_analysis, event_statistics)

In [None]:
# %% Cell 10: Save Results and Summary
def save_results_and_summary(df, event_stats, swimmers_raw):
    """Write the result files and build the plain-text summary report.

    Files written to the working directory:
      - ``swimming_fina_analysis.csv``: the full analysis table.
      - ``event_bias_statistics.csv``: the per-event statistics.
      - ``analysis_summary_report.txt``: the report returned by this
        function, written as UTF-8 because it contains ``✓``/``✗``.

    The hypothesis section repeats the one-sided Mann-Whitney U test and
    Cohen's d when both strokes have more than 10 records. Otherwise the
    conclusion states that the hypothesis was not tested, rather than
    reporting a negative result that was never computed.

    Args:
        df: Analysis table with ``swimmer_ID``, ``event``, ``division`` and
            ``fina_points`` columns.
        event_stats: Per-event statistics with ``event``, ``median_fina``
            and ``count`` columns.
        swimmers_raw: Unused; kept so existing calls keep working.

    Returns:
        str: The full report text.
    """

    # Save main datasets
    df.to_csv('swimming_fina_analysis.csv', index=False)
    event_stats.to_csv('event_bias_statistics.csv', index=False)

    breast_data = df[df['event'].str.contains('Breast')]['fina_points']
    free_data = df[df['event'].str.contains('Free')]['fina_points']
    enough_data = len(breast_data) > 10 and len(free_data) > 10

    # Create summary report
    summary_report = f"""
SWIMMING FINA BIAS ANALYSIS - SUMMARY REPORT
Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
============================================

SAMPLE CHARACTERISTICS:
- Total swimmer-event records: {len(df)}
- Unique swimmers: {df['swimmer_ID'].nunique()}
- Events analyzed: {', '.join(df['event'].unique())}
- Divisions represented: {', '.join(df['division'].unique())}

DIVISION BREAKDOWN:
{df['division'].value_counts().to_string()}

EVENT SAMPLE SIZES:
{df['event'].value_counts().to_string()}

KEY FINDINGS:
============

EVENT BIAS RANKING (by median FINA points):
{event_stats[['event', 'median_fina', 'count']].to_string(index=False)}

HYPOTHESIS TEST RESULTS:
{'-' * 30}
"""

    # Add hypothesis test results (computed once and reused for the conclusion)
    if enough_data:
        _, p_value = stats.mannwhitneyu(breast_data, free_data, alternative='greater')
        significant = p_value < 0.05

        pooled_std = np.sqrt(((len(breast_data) - 1) * breast_data.var() +
                              (len(free_data) - 1) * free_data.var()) /
                             (len(breast_data) + len(free_data) - 2))
        cohens_d = (breast_data.mean() - free_data.mean()) / pooled_std

        summary_report += f"""
Breaststroke median FINA: {breast_data.median():.1f}
Freestyle median FINA: {free_data.median():.1f}
Difference: {breast_data.median() - free_data.median():.1f} points

Statistical Test (Mann-Whitney U):
P-value: {p_value:.6f}
Result: {'SIGNIFICANT BIAS DETECTED' if significant else 'NO SIGNIFICANT BIAS'}

Effect Size (Cohen's d): {cohens_d:.3f}
"""
        if significant:
            conclusion = '✓ HYPOTHESIS SUPPORTED: Breaststroke events show inflated FINA points'
        else:
            conclusion = '✗ HYPOTHESIS NOT SUPPORTED: No significant bias detected'
    else:
        conclusion = ('✗ HYPOTHESIS NOT TESTED: Insufficient data '
                      '(need more than 10 records per stroke)')

    summary_report += f"""

CONCLUSIONS:
============
{conclusion}

FILES GENERATED:
- swimming_fina_analysis.csv (detailed results)
- event_bias_statistics.csv (summary statistics)
- fina_bias_analysis_complete.png (visualizations)
- analysis_summary_report.txt (this report)
"""

    # Save summary report; explicit UTF-8 so the check/cross marks can be
    # written regardless of the platform's default encoding.
    with open('analysis_summary_report.txt', 'w', encoding='utf-8') as f:
        f.write(summary_report)

    print("Results saved successfully!")
    print("\nFiles created:")
    print("- swimming_fina_analysis.csv")
    print("- event_bias_statistics.csv")
    print("- fina_bias_analysis_complete.png")
    print("- analysis_summary_report.txt")

    return summary_report

# Save all results (two CSV files and the text report) and echo the report
final_summary = save_results_and_summary(df_analysis, event_statistics, swimmers_raw_data)
print("\n" + "=" * 60)
print("ANALYSIS COMPLETE!")
print("=" * 60)
print(final_summary)

# %% [markdown]
# ## Analysis Complete!
# 
# Once every cell has run to completion, this notebook has:
# 1. ✅ Collected stratified sample data from NCAA/NAIA divisions
# 2. ✅ Calculated FINA points and percentiles for each swimmer-event
# 3. ✅ Tested the breaststroke inflation hypothesis
# 4. ✅ Generated comprehensive visualizations
# 5. ✅ Saved results for further analysis
# 
# ### Key Takeaways:
# - Review the statistical test results above
# - Check the visualizations for patterns
# - Examine the saved CSV files for detailed data
# 
# ### Next Steps:
# - Validate results with official FINA base times
# - Expand analysis to include more events
# - Consider temporal trends in bias patterns