# 02 - Data Combining

This notebook combines the Kaggle CSVs listed below into a single master table `master_races.csv` with one row per (raceId, driverId) for all races from 1994 onwards.

**Key Features:**
- Combines results, races, circuits, drivers, constructors, qualifying, standings, and sprint results
- Intelligent column deduplication (removes duplicates if data is identical, prefixes if different)
- Adds placeholder columns for future FastF1 features
- Includes FastF1 data preview (for exploration, not combination)

**Output:** `data/processed/master_races.csv`


In [None]:
from pathlib import Path
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# Locate the project root from the current working directory. When the
# notebook is started from inside notebooks/, step up one level so that the
# data/ paths below resolve the same way as when started from the root.
PROJECT_ROOT = Path(".").resolve()
PROJECT_ROOT = PROJECT_ROOT.parent if PROJECT_ROOT.name == 'notebooks' else PROJECT_ROOT

# Input folders (raw data) and the output folder (created on demand)
KAGGLE_ROOT = PROJECT_ROOT.joinpath("data", "raw", "kaggle")
FASTF1_ROOT = PROJECT_ROOT.joinpath("data", "raw", "fastf1_2018plus")
PROCESSED_ROOT = PROJECT_ROOT.joinpath("data", "processed")
PROCESSED_ROOT.mkdir(parents=True, exist_ok=True)

# Echo the resolved locations, one per line
print(
    f"Kaggle data: {KAGGLE_ROOT}",
    f"FastF1 data: {FASTF1_ROOT}",
    f"Output: {PROCESSED_ROOT}",
    sep="\n",
)


Kaggle data: C:\Users\erikv\Downloads\F1\data\raw\kaggle
FastF1 data: C:\Users\erikv\Downloads\F1\data\raw\fastf1_2018plus
Output: C:\Users\erikv\Downloads\F1\data\processed


## 1. Data Loading & FastF1 Preview

Load all Kaggle CSVs and preview FastF1 data structure (for exploration only).


In [4]:
# Modular function to load Kaggle CSV
def load_kaggle_csv(name, path=None):
    """Load one Kaggle CSV into a DataFrame tagged with its origin.

    Parameters:
    - name: Dataset name. Used to build the default file name (`<name>.csv`
      under KAGGLE_ROOT), in the warning message, and stored in
      `df.attrs['name']`.
    - path: Optional explicit location of the CSV, as a `str` or path-like
      object. Defaults to `KAGGLE_ROOT / f"{name}.csv"`.

    Returns:
    - DataFrame with `attrs['source'] == 'kaggle'` and `attrs['name'] == name`,
      or None (after printing a warning) when the file does not exist.
    """
    # Coerce to Path so callers may pass a plain string; calling .exists()
    # directly on a str would raise AttributeError.
    path = KAGGLE_ROOT / f"{name}.csv" if path is None else Path(path)
    if not path.exists():
        print(f"Warning: {name}.csv not found at {path}")
        return None
    df = pd.read_csv(path)
    df.attrs['source'] = 'kaggle'
    df.attrs['name'] = name
    return df

# Datasets to load. Each entry maps the key used in `kaggle_data` to the name
# handed to load_kaggle_csv; here the two are always the same.
kaggle_files = {
    dataset: dataset
    for dataset in (
        'races',
        'results',
        'drivers',
        'constructors',
        'circuits',
        'qualifying',
        'driver_standings',
        'constructor_standings',
        'constructor_results',
        'sprint_results',
    )
}

# Keep only the datasets that were actually found on disk
kaggle_data = {}
for key, filename in kaggle_files.items():
    df = load_kaggle_csv(filename)
    if df is None:
        # load_kaggle_csv has already printed a warning for this file
        continue
    kaggle_data[key] = df
    print(f"Loaded {key}: {df.shape[0]:,} rows, {df.shape[1]} columns")

print(f"\nTotal Kaggle datasets loaded: {len(kaggle_data)}")


Loaded races: 1,125 rows, 18 columns
Loaded results: 26,759 rows, 18 columns
Loaded drivers: 861 rows, 9 columns
Loaded constructors: 212 rows, 5 columns
Loaded circuits: 77 rows, 9 columns
Loaded qualifying: 10,494 rows, 9 columns
Loaded driver_standings: 34,863 rows, 7 columns
Loaded constructor_standings: 13,391 rows, 7 columns
Loaded constructor_results: 12,625 rows, 5 columns
Loaded sprint_results: 360 rows, 16 columns

Total Kaggle datasets loaded: 10


In [None]:
# Peek at the FastF1 exports (exploration only; nothing here feeds the merge)
print("FastF1 Data Preview (2018-2024):")
print("=" * 60)

fastf1_years = range(2018, 2025)
fastf1_datasets = ['RESULTS', 'LAPS', 'TELEMETRY', 'WEATHER']

for dataset in fastf1_datasets:
    print(f"\n{dataset} Dataset:")
    # Column names and one sample row are printed once per dataset, taken
    # from the first yearly file that loads.
    sample_loaded = False
    for year in fastf1_years:
        path = FASTF1_ROOT / f"ALL_{dataset}_{year}.csv"
        if not path.exists():
            print(f"  {year}: File not found")
            continue
        try:
            # Only the first five rows are read; this is a structure preview
            df = pd.read_csv(path, nrows=5, low_memory=False)
            if not sample_loaded:
                print(f"  Columns ({len(df.columns)}): {list(df.columns)[:10]}...")
                print(f"  Sample row:")
                print(f"    {df.iloc[0].to_dict()}")
                sample_loaded = True
            print(f"  {year}: File exists")
        except Exception as e:
            # Best-effort preview: report the problem and move on to the next year
            print(f"  {year}: Error loading - {e}")

    if not sample_loaded:
        print(f"  No data files found for {dataset}")

print("\nNote: FastF1 data will be explored separately for feature extraction.")


## 2. Intelligent Column Deduplication Function

This function handles duplicate column names AFTER merging:
- **Merge first** with temporary suffixes to preserve all rows
- **Then compare** columns to detect duplicates
- If data is identical: drop duplicate column
- If data differs: prefix with source CSV name

**Why merge first?** This ensures we compare all rows (including those that don't match in both datasets) rather than only comparing rows that match in a pre-merge inner join.


In [5]:
def _columns_hold_same_data(left, right):
    """Return True when two columns agree on every row (NaN matches only NaN).

    The columns count as the same data only if they are missing on exactly the
    same rows and compare equal on every row where both hold a value.
    """
    left_missing = left.isna()
    right_missing = right.isna()
    if (left_missing != right_missing).any():
        return False
    present = ~left_missing
    return bool((left[present] == right[present]).all())


def handle_duplicate_columns_after_merge(merged_df, source_name, temp_suffix='_temp'):
    """
    Resolve column-name conflicts left behind by a suffixed merge.

    Works on already-merged data so that every row is considered. Each column
    ending in `temp_suffix` is compared with its un-suffixed counterpart:
    identical data means the suffixed copy is dropped, anything else means it
    is renamed to `<source_name>_<original name>`.

    Parameters:
    - merged_df: DataFrame after merge (with suffixed columns)
    - source_name: Name of source CSV (for prefixing different columns)
    - temp_suffix: Suffix used during merge (default '_temp'); must be non-empty

    Returns:
    - cleaned_df: DataFrame with duplicates handled (identical dropped, different prefixed)
    - dropped_cols: List of columns that were dropped (identical data)
    - renamed_cols: Dict of columns that were renamed (different data)

    Raises:
    - ValueError: if `temp_suffix` is empty (every column would match it).
    """
    if merged_df is None or merged_df.empty:
        return merged_df, [], {}
    if not temp_suffix:
        raise ValueError("temp_suffix must be a non-empty string")

    # Names that are already in use; new names must not collide with them
    taken_names = set(merged_df.columns)
    columns_to_drop = []
    rename_dict = {}

    for temp_col in merged_df.columns:
        # Non-string labels cannot carry the suffix, so they are never conflicts
        if not (isinstance(temp_col, str) and temp_col.endswith(temp_suffix)):
            continue
        orig_col = temp_col[:-len(temp_suffix)]

        # Same data as the un-suffixed column: the suffixed copy is redundant
        if orig_col in merged_df.columns and _columns_hold_same_data(
            merged_df[orig_col], merged_df[temp_col]
        ):
            columns_to_drop.append(temp_col)
            continue

        new_name = f"{source_name}_{orig_col}"

        # The prefixed name may already exist, e.g. when the same source is
        # merged a second time. If it already holds this data, drop the copy
        # instead of creating a duplicate column label.
        if new_name in merged_df.columns and _columns_hold_same_data(
            merged_df[new_name], merged_df[temp_col]
        ):
            columns_to_drop.append(temp_col)
            continue

        # Otherwise pick the first free name: <new_name>, <new_name>_2, ...
        candidate = new_name
        counter = 2
        while candidate in taken_names:
            candidate = f"{new_name}_{counter}"
            counter += 1
        taken_names.add(candidate)
        rename_dict[temp_col] = candidate

    # Apply dropping and renaming
    cleaned_df = merged_df.drop(columns=columns_to_drop)
    cleaned_df = cleaned_df.rename(columns=rename_dict)

    return cleaned_df, columns_to_drop, rename_dict

# Confirm that the helper above has been defined (two lines of output)
print(
    "Column deduplication function created.",
    "This function handles duplicates AFTER merging to ensure all rows are considered.",
    sep="\n",
)


Column deduplication function created.
This function handles duplicates AFTER merging to ensure all rows are considered.


In [6]:
# Start with results.csv as base: every later table is left-merged onto it
if 'results' not in kaggle_data:
    raise ValueError("results.csv not found!")

master = kaggle_data['results'].copy()
print(f"Base table (results): {master.shape[0]:,} rows, {master.shape[1]} columns")

# Race metadata supplies `year`, which drives the 1994+ filter
if 'races' in kaggle_data:
    races = kaggle_data['races'].copy()
    races['date'] = pd.to_datetime(races['date'], errors='coerce')

    # Merge races to get year (plus round, circuit, date and race name)
    master = master.merge(
        races[['raceId', 'year', 'round', 'circuitId', 'date', 'name']],
        on='raceId',
        how='left'
    )

    # Filter to 1994+
    master = master[master['year'] >= 1994].copy()
    print(f"After filtering to 1994+: {master.shape[0]:,} rows")

    # Circuits join on circuitId, which only exists once races has been merged,
    # so this merge stays inside the races branch.
    if 'circuits' in kaggle_data:
        circuits = kaggle_data['circuits'].copy()
        # Rename 'name' column from circuits to avoid conflict with races 'name'
        circuits = circuits.rename(columns={'name': 'circuit_name'})
        master = master.merge(
            circuits,
            on='circuitId',
            how='left',
            suffixes=('', '_circuit')
        )
        print(f"After merging circuits: {master.shape[0]:,} rows, {master.shape[1]} columns")
else:
    # Without races there is no `year` column, so the table cannot be limited
    # to 1994+; say so instead of continuing silently.
    print("WARNING: races.csv not loaded - 1994+ filter and race/circuit columns were skipped")

# Drivers and constructors join on driverId / constructorId, which come from
# results.csv itself, so these merges do not depend on races being present.
if 'drivers' in kaggle_data:
    drivers = kaggle_data['drivers'].copy()
    master = master.merge(
        drivers,
        on='driverId',
        how='left',
        suffixes=('', '_driver')
    )
    print(f"After merging drivers: {master.shape[0]:,} rows, {master.shape[1]} columns")

if 'constructors' in kaggle_data:
    constructors = kaggle_data['constructors'].copy()
    master = master.merge(
        constructors,
        on='constructorId',
        how='left',
        suffixes=('', '_constructor')
    )
    print(f"After merging constructors: {master.shape[0]:,} rows, {master.shape[1]} columns")

print(f"\nBase table created: {master.shape}")
print(f"Columns: {list(master.columns)[:10]}... ({len(master.columns)} total)")


Base table (results): 26,759 rows, 18 columns
After filtering to 1994+: 12,358 rows
After merging circuits: 12,358 rows, 31 columns
After merging drivers: 12,358 rows, 39 columns
After merging constructors: 12,358 rows, 43 columns

Base table created: (12358, 43)
Columns: ['resultId', 'raceId', 'driverId', 'constructorId', 'number', 'grid', 'position', 'positionText', 'positionOrder', 'points']... (43 total)


In [7]:
def _merge_with_dedup(base_df, right_df, keys, source_name, temp_suffix='_temp'):
    """Left-merge `right_df` onto `base_df`, then resolve column-name clashes.

    Parameters:
    - base_df: Table to extend; every one of its rows is kept (left join)
    - right_df: Table whose columns are added
    - keys: List of column names to join on
    - source_name: Name of the source CSV, used in messages and as the prefix
      for clashing columns that hold different data
    - temp_suffix: Temporary suffix given to clashing right-hand columns

    Returns:
    - (merged_df, dropped_cols, renamed_cols) exactly as returned by
      handle_duplicate_columns_after_merge

    Prints the resulting shape, a warning if the merge changed the row count,
    and which clashing columns were dropped or renamed.
    """
    rows_before = len(base_df)

    # Merge first with the temporary suffix so that no rows are lost
    merged_df = base_df.merge(
        right_df,
        on=keys,
        how='left',
        suffixes=('', temp_suffix)
    )

    # Then decide, per clashing column, whether to drop or rename it
    merged_df, dropped, renamed = handle_duplicate_columns_after_merge(
        merged_df, source_name, temp_suffix=temp_suffix
    )

    print(f"After merging {source_name}: {merged_df.shape[0]:,} rows, {merged_df.shape[1]} columns")
    # A left join can only add rows, which happens when the right-hand table
    # holds several rows for one key combination.
    if len(merged_df) != rows_before:
        print(f"  WARNING: row count changed from {rows_before:,} to {len(merged_df):,} "
              f"- {source_name} has repeated {keys} keys")
    if dropped:
        # Strip only the trailing suffix; str.replace would also remove the
        # same text from the middle of a column name.
        print(f"  Dropped identical columns: {[col[:-len(temp_suffix)] for col in dropped]}")
    if renamed:
        print(f"  Renamed different columns: {list(renamed.values())[:3]}...")

    return merged_df, dropped, renamed


# Merge driver_standings
if 'driver_standings' in kaggle_data:
    driver_standings = kaggle_data['driver_standings'].copy()
    master, dropped_cols, renamed_cols = _merge_with_dedup(
        master, driver_standings, ['raceId', 'driverId'], 'driver_standings'
    )

# Merge constructor_standings
if 'constructor_standings' in kaggle_data:
    constructor_standings = kaggle_data['constructor_standings'].copy()
    master, dropped_cols, renamed_cols = _merge_with_dedup(
        master, constructor_standings, ['raceId', 'constructorId'], 'constructor_standings'
    )

# Merge constructor_results
if 'constructor_results' in kaggle_data:
    constructor_results = kaggle_data['constructor_results'].copy()
    master, dropped_cols, renamed_cols = _merge_with_dedup(
        master, constructor_results, ['raceId', 'constructorId'], 'constructor_results'
    )

# Merge qualifying
if 'qualifying' in kaggle_data:
    qualifying = kaggle_data['qualifying'].copy()
    master, dropped_cols, renamed_cols = _merge_with_dedup(
        master, qualifying, ['raceId', 'driverId'], 'qualifying'
    )

# Merge sprint_results (prefix all columns)
if 'sprint_results' in kaggle_data:
    sprint_results = kaggle_data['sprint_results'].copy()

    # Every sprint column except the merge keys is prefixed up front, so no
    # clash detection is needed for this table.
    merge_keys = ['raceId', 'driverId']
    sprint_rename_dict = {
        col: f"sprint_results_{col}"
        for col in sprint_results.columns
        if col not in merge_keys
    }
    sprint_results_renamed = sprint_results.rename(columns=sprint_rename_dict)

    sprint_rows_before = len(master)
    master = master.merge(
        sprint_results_renamed,
        on=['raceId', 'driverId'],
        how='left',
        suffixes=('', '_sprint')
    )
    print(f"After merging sprint_results: {master.shape[0]:,} rows, {master.shape[1]} columns")
    if len(master) != sprint_rows_before:
        print(f"  WARNING: row count changed from {sprint_rows_before:,} to {len(master):,} "
              f"- sprint_results has repeated {merge_keys} keys")
    print(f"  Renamed columns: {list(sprint_rename_dict.values())[:5]}...")

print(f"\nMaster table after all merges: {master.shape}")


After merging driver_standings: 12,358 rows, 48 columns
  Renamed different columns: ['driver_standings_points', 'driver_standings_position', 'driver_standings_positionText']...
After merging constructor_standings: 12,358 rows, 53 columns
  Renamed different columns: ['constructor_standings_points', 'constructor_standings_position', 'constructor_standings_positionText']...
After merging constructor_results: 12,358 rows, 56 columns
  Renamed different columns: ['constructor_results_points']...
After merging qualifying: 12,358 rows, 63 columns
  Renamed different columns: ['qualifying_constructorId', 'qualifying_number', 'qualifying_position']...
After merging sprint_results: 12,358 rows, 77 columns
  Renamed columns: ['sprint_results_resultId', 'sprint_results_constructorId', 'sprint_results_number', 'sprint_results_grid', 'sprint_results_position']...

Master table after all merges: (12358, 77)


**Expected Output:**
- Row counts after each merge (should remain constant - one row per raceId+driverId)
- Column counts increasing as we add data from each source
- Messages about dropped/renamed columns
- **What to look for:**
  - **Row count:** Should stay the same after every merge (12,358 rows for the 1994+ data in the run above) - merges are left joins on existing keys
  - **Dropped columns:** These had identical data to existing columns (e.g., `points` might be identical in results and driver_standings)
  - **Renamed columns:** These had different data, so we prefix them with the source name (e.g., `driver_standings_points`, while the `points` column from results keeps its original name)
  - **Good signs:** 
    - Row count remains stable
    - Appropriate columns are dropped/renamed based on data comparison
  - **Warning signs:**
    - Row count changes unexpectedly (indicates merge key issues)
    - Many columns being renamed when they should be identical (data quality issue)


## 5. Placeholder Columns for Future Features

Add placeholder columns for FastF1 features that will be extracted later.


In [8]:
# Columns reserved for features that are not computed in this notebook.
# Keys are the column names; values describe where each feature is meant to
# come from and are only used in the printout below.
placeholder_features = dict(
    lap_time_variance='Variance in lap times (from FastF1 ALL_LAPS)',
    throttle_variance='Variance in throttle usage (from FastF1 ALL_TELEMETRY)',
    overtake_attempts='Number of overtake attempts (from telemetry analysis)',
    avg_pit_stops='Average pit stops per race (from pit_stops.csv aggregation)',
)

# Create each placeholder as an all-NaN column
for col_name in placeholder_features:
    master[col_name] = np.nan

print("Added placeholder columns:")
for col_name, description in placeholder_features.items():
    print(f"  - {col_name}: {description}")

print(f"\nMaster table with placeholders: {master.shape}")


Added placeholder columns:
  - lap_time_variance: Variance in lap times (from FastF1 ALL_LAPS)
  - throttle_variance: Variance in throttle usage (from FastF1 ALL_TELEMETRY)
  - overtake_attempts: Number of overtake attempts (from telemetry analysis)
  - avg_pit_stops: Average pit stops per race (from pit_stops.csv aggregation)

Master table with placeholders: (12358, 81)


## 6. Data Validation

Verify data integrity: one row per (raceId, driverId), no unexpected duplicates, date ranges valid.


In [9]:
def _coverage_pct(column):
    """Percentage of rows in `master` where `column` is not null."""
    return master[column].notna().sum() / len(master) * 100


# Each (raceId, driverId) pair should appear exactly once
duplicate_check = master.groupby(['raceId', 'driverId']).size()
duplicates = duplicate_check.loc[duplicate_check > 1]
if duplicates.empty:
    print("✓ No duplicate (raceId, driverId) combinations found")
else:
    print(f"WARNING: Found {len(duplicates)} duplicate (raceId, driverId) combinations:")
    print(duplicates.head(10))

# Date and season span; unparseable dates become NaT and are counted
if 'date' in master.columns:
    master['date'] = pd.to_datetime(master['date'], errors='coerce')
    print(f"\nDate range: {master['date'].min()} to {master['date'].max()}")
    print(f"Years: {master['year'].min()} - {master['year'].max()}")
    invalid_dates = master['date'].isna().sum()
    if invalid_dates > 0:
        print(f"WARNING: {invalid_dates} rows with invalid dates")

# Share of rows that found a match in each merged table, judged by whether
# that table's id column is filled in
print(f"\nMerge success rates:")
if 'driverStandingsId' in master.columns:
    driver_standings_coverage = _coverage_pct('driverStandingsId')
    print(f"  driver_standings: {driver_standings_coverage:.1f}%")
if 'constructorStandingsId' in master.columns:
    constructor_standings_coverage = _coverage_pct('constructorStandingsId')
    print(f"  constructor_standings: {constructor_standings_coverage:.1f}%")
if 'qualifyId' in master.columns:
    qualifying_coverage = _coverage_pct('qualifyId')
    print(f"  qualifying: {qualifying_coverage:.1f}%")
if 'sprint_results_resultId' in master.columns:
    sprint_coverage = _coverage_pct('sprint_results_resultId')
    print(f"  sprint_results: {sprint_coverage:.1f}%")

print(f"\nFinal master table: {master.shape[0]:,} rows, {master.shape[1]} columns")


✓ No duplicate (raceId, driverId) combinations found

Date range: 1994-03-27 00:00:00 to 2024-12-08 00:00:00
Years: 1994 - 2024

Merge success rates:
  driver_standings: 97.5%
  constructor_standings: 99.0%
  qualifying: 84.9%
  sprint_results: 2.9%

Final master table: 12,358 rows, 81 columns


## 7. Target Variable Creation

Create the target variable: podium (1 if positionOrder <= 3, else 0).


In [10]:
# Target: 1 when the driver's positionOrder is 3 or lower, otherwise 0
master['podium'] = master['positionOrder'].le(3).astype(int)

# Class balance of the target
print("Target Variable Distribution:")
print(master['podium'].value_counts())
print(f"\nPodium rate: {master['podium'].mean():.2%}")

# Per-season breakdown: podium rows, total rows and their ratio
if 'year' in master.columns:
    podium_by_year = master.groupby('year')['podium'].agg(
        podiums='sum',
        total='count',
        podium_rate='mean',
    )
    print(f"\nPodium rate by year (sample):")
    print(podium_by_year.tail(10).to_string())


Target Variable Distribution:
podium
0    10627
1     1731
Name: count, dtype: int64

Podium rate: 14.01%

Podium rate by year (sample):
      podiums  total  podium_rate
year                             
2015       57    378     0.150794
2016       63    462     0.136364
2017       60    400     0.150000
2018       63    420     0.150000
2019       63    420     0.150000
2020       51    340     0.150000
2021       66    440     0.150000
2022       66    440     0.150000
2023       66    440     0.150000
2024       72    479     0.150313


## 8. Final Master Table Export

Save the master table and generate schema documentation.


In [11]:
# Write the master table to the processed-data folder
output_path = PROCESSED_ROOT / "master_races.csv"
master.to_csv(output_path, index=False)

row_total, column_total = master.shape
print(f"Master table saved to: {output_path}")
print(f"  Rows: {row_total:,}")
print(f"  Columns: {column_total}")

# First three rows, printed in full width
print(f"\nSample rows:")
print(master.head(3).to_string())

# Column counts by broad dtype family
numeric_total = master.select_dtypes(include=[np.number]).shape[1]
categorical_total = master.select_dtypes(include=['object']).shape[1]
date_total = master.select_dtypes(include=['datetime64']).shape[1]
print(f"\nColumn summary:")
print(f"  Total columns: {len(master.columns)}")
print(f"  Numeric columns: {numeric_total}")
print(f"  Categorical columns: {categorical_total}")
print(f"  Date columns: {date_total}")

# In-memory size, including the contents of object columns
memory_mb = master.memory_usage(deep=True).sum() / 1024**2
print(f"\nMemory usage: {memory_mb:.2f} MB")


Master table saved to: C:\Users\erikv\Downloads\F1\data\processed\master_races.csv
  Rows: 12,358
  Columns: 82

Sample rows:
   resultId  raceId  driverId  constructorId number  grid position positionText  positionOrder  points  laps         time milliseconds fastestLap rank fastestLapTime fastestLapSpeed  statusId  year  round  circuitId       date                   name   circuitRef                    circuit_name   location    country      lat      lng  alt                                                        url driverRef number_driver code forename   surname         dob nationality                                   url_driver constructorRef name_constructor nationality_constructor                                               url_constructor  driverStandingsId  driver_standings_points  driver_standings_position driver_standings_positionText  wins  constructorStandingsId  constructor_standings_points  constructor_standings_position constructor_standings_positionText  constructor

In [12]:
# Generate schema documentation
schema_doc = {
    'table_name': 'master_races',
    'description': 'Combined F1 race data (1994+) with one row per (raceId, driverId)',
    'row_count': len(master),
    'column_count': len(master.columns),
    'columns': {}
}

# Group columns by source, using the names the columns carry in `master`
# AFTER merging. Columns that clashed with an earlier table were suffixed
# (`_driver`, `_constructor`) or prefixed with the source name, so the raw CSV
# names (`points`, `position`, `name`, ...) belong to the table merged first.
# Names that are absent from `master` (e.g. a clashing column that was dropped
# as identical) are skipped below.
column_sources = {
    'results': ['resultId', 'raceId', 'driverId', 'constructorId', 'number', 'grid',
                'position', 'positionText', 'positionOrder', 'points', 'laps', 'time',
                'milliseconds', 'fastestLap', 'rank', 'fastestLapTime',
                'fastestLapSpeed', 'statusId'],
    'races': ['year', 'round', 'circuitId', 'date', 'name'],
    'circuits': ['circuitRef', 'circuit_name', 'location', 'country', 'lat', 'lng',
                 'alt', 'url'],
    'drivers': ['driverRef', 'number_driver', 'code', 'forename', 'surname', 'dob',
                'nationality', 'url_driver'],
    'constructors': ['constructorRef', 'name_constructor', 'nationality_constructor',
                     'url_constructor'],
    'driver_standings': ['driverStandingsId', 'driver_standings_points',
                         'driver_standings_position', 'driver_standings_positionText',
                         'wins'],
    'constructor_standings': ['constructorStandingsId', 'constructor_standings_points',
                              'constructor_standings_position',
                              'constructor_standings_positionText',
                              'constructor_standings_wins'],
    'constructor_results': ['constructorResultsId', 'constructor_results_points', 'status'],
    'qualifying': ['qualifyId', 'qualifying_constructorId', 'qualifying_number',
                   'qualifying_position', 'q1', 'q2', 'q3'],
    'sprint_results': [col for col in master.columns if col.startswith('sprint_results_')],
    'placeholders': ['lap_time_variance', 'throttle_variance', 'overtake_attempts', 'avg_pit_stops'],
    'target': ['podium']
}

# Catch-all so that every column of `master` is documented even if the
# data-dependent renaming produced a name that is not listed above.
attributed_cols = {col for cols in column_sources.values() for col in cols}
unattributed_cols = [col for col in master.columns if col not in attributed_cols]
if unattributed_cols:
    column_sources['other'] = unattributed_cols

total_rows = len(master)
for source, cols in column_sources.items():
    for col in cols:
        # First source listing a column owns it; never overwrite an entry
        if col in master.columns and col not in schema_doc['columns']:
            null_count = int(master[col].isnull().sum())
            schema_doc['columns'][col] = {
                'source': source,
                'dtype': str(master[col].dtype),
                'null_count': null_count,
                # Guard against an empty table (0 / 0)
                'null_pct': float(null_count / total_rows * 100) if total_rows else 0.0
            }

# Save schema
import json  # not used in this cell; kept in case other cells rely on it
schema_path = PROCESSED_ROOT / "master_races_schema.md"
# Explicit encoding so the file is written the same way on every platform
with open(schema_path, 'w', encoding='utf-8') as f:
    f.write("# Master Races Schema\n\n")
    f.write(f"- **Rows**: {schema_doc['row_count']:,}\n")
    f.write(f"- **Columns**: {schema_doc['column_count']}\n")
    f.write(f"- **Description**: {schema_doc['description']}\n\n")
    f.write("## Column Sources\n\n")
    for source, cols in column_sources.items():
        actual_cols = [c for c in cols if c in master.columns]
        if actual_cols:
            f.write(f"### {source}\n")
            for col in actual_cols[:10]:  # Show first 10
                info = schema_doc['columns'][col]
                f.write(f"- `{col}` ({info['dtype']}): {info['null_pct']:.1f}% null\n")
            if len(actual_cols) > 10:
                f.write(f"- ... and {len(actual_cols) - 10} more\n")
            f.write("\n")

print(f"Schema documentation saved to: {schema_path}")


Schema documentation saved to: C:\Users\erikv\Downloads\F1\data\processed\master_races_schema.md
