# 01 - Data Verification

This notebook performs comprehensive data quality and consistency verification for both Kaggle (Ergast) and FastF1 datasets. It checks:

1. **Completeness**: Year coverage, race coverage, driver/constructor metadata
2. **Data Quality**: Missing values, outliers, data types, range validation
3. **Cross-Source Consistency**: Matching between Kaggle and FastF1 data (2018-2024 overlap)

The goal is to identify any data issues before combining datasets in the next notebook.

**How to use this notebook:**
- Run each cell sequentially
- After each code cell, read the markdown note that follows to understand what the output means
- Look for warning signs mentioned in the notes
- The final report summarizes all findings for future reference


In [6]:
from pathlib import Path
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# Set up paths using pathlib.Path for proper path joining
PROJECT_ROOT = Path(r"C:\Users\erikv\Downloads\F1")
KAGGLE_ROOT = PROJECT_ROOT / "data" / "raw" / "kaggle"
FASTF1_ROOT = PROJECT_ROOT / "data" / "raw" / "fastf1_2018plus"
PROCESSED_ROOT = PROJECT_ROOT / "data" / "processed"
PROCESSED_ROOT.mkdir(parents=True, exist_ok=True)

print(f"Kaggle data: {KAGGLE_ROOT}")
print(f"FastF1 data: {FASTF1_ROOT}")


Kaggle data: C:\Users\erikv\Downloads\F1\data\raw\kaggle
FastF1 data: C:\Users\erikv\Downloads\F1\data\raw\fastf1_2018plus


## 1. Data Loading & Overview

Load all Kaggle CSVs and FastF1 CSVs, display basic statistics.


In [None]:
# Modular function to load Kaggle CSV
def load_kaggle_csv(name, path=None):
    """Load a Kaggle CSV file and return DataFrame with source tracking."""
    if path is None:
        path = KAGGLE_ROOT / f"{name}.csv"
    if not path.exists():
        print(f"Warning: {name}.csv not found at {path}")
        return None
    df = pd.read_csv(path)
    df.attrs['source'] = 'kaggle'
    df.attrs['name'] = name
    return df

# Load all Kaggle CSVs
kaggle_files = {
    'races': 'races',
    'results': 'results',
    'drivers': 'drivers',
    'constructors': 'constructors',
    'circuits': 'circuits',
    'qualifying': 'qualifying',
    'driver_standings': 'driver_standings',
    'constructor_standings': 'constructor_standings',
    'constructor_results': 'constructor_results',
    'sprint_results': 'sprint_results',
    'pit_stops': 'pit_stops',
    'lap_times': 'lap_times'
}

kaggle_data = {}
for key, filename in kaggle_files.items():
    df = load_kaggle_csv(filename)
    if df is not None:
        kaggle_data[key] = df
        print(f"Loaded {key}: {df.shape[0]:,} rows, {df.shape[1]} columns")

print(f"\nTotal Kaggle datasets loaded: {len(kaggle_data)}")


Loaded races: 1,125 rows, 18 columns
Loaded results: 26,759 rows, 18 columns
Loaded drivers: 861 rows, 9 columns
Loaded constructors: 212 rows, 5 columns
Loaded circuits: 77 rows, 9 columns
Loaded qualifying: 10,494 rows, 9 columns
Loaded driver_standings: 34,863 rows, 7 columns
Loaded constructor_standings: 13,391 rows, 7 columns
Loaded constructor_results: 12,625 rows, 5 columns
Loaded sprint_results: 360 rows, 16 columns
Loaded pit_stops: 11,371 rows, 7 columns
Loaded lap_times: 589,081 rows, 6 columns

Total Kaggle datasets loaded: 12


In [None]:
# Load FastF1 CSVs (2018-2024)
fastf1_years = range(2018, 2025)
fastf1_datasets = ['RESULTS', 'LAPS', 'TELEMETRY', 'WEATHER']

fastf1_data = {}
for year in fastf1_years:
    fastf1_data[year] = {}
    for dataset in fastf1_datasets:
        path = FASTF1_ROOT / f"ALL_{dataset}_{year}.csv"
        if path.exists():
            try:
                df = pd.read_csv(path, low_memory=False)
                fastf1_data[year][dataset] = df
                print(f"FastF1 {year} {dataset}: {df.shape[0]:,} rows, {df.shape[1]} columns")
            except Exception as e:
                print(f"Error loading FastF1 {year} {dataset}: {e}")
        else:
            print(f"FastF1 {year} {dataset}: File not found")

print(f"\nFastF1 data loaded for years: {list(fastf1_data.keys())}")


FastF1 2018 RESULTS: 2,100 rows, 25 columns
FastF1 2018 LAPS: 58,002 rows, 34 columns


KeyboardInterrupt: 

## 2. Completeness Checks

Check year coverage, race coverage, and metadata completeness.


In [None]:
# Parameterized completeness check function
def check_completeness(df, name, key_cols=None):
    """Check completeness of a dataset. Returns summary dictionary."""
    if df is None or df.empty:
        return {'name': name, 'status': 'empty', 'rows': 0}
    
    summary = {
        'name': name,
        'rows': len(df),
        'columns': len(df.columns),
        'missing_pct': df.isnull().sum().sum() / (len(df) * len(df.columns)) * 100
    }
    
    # Check key columns if provided
    if key_cols:
        missing_keys = []
        for col in key_cols:
            if col in df.columns:
                missing_count = df[col].isnull().sum()
                summary[f'{col}_missing'] = missing_count
                summary[f'{col}_missing_pct'] = missing_count / len(df) * 100
            else:
                missing_keys.append(col)
        if missing_keys:
            summary['missing_key_cols'] = missing_keys
    
    return summary

# Check completeness for all Kaggle datasets
completeness_results = []
for name, df in kaggle_data.items():
    # Define key columns for each dataset
    key_cols_map = {
        'races': ['raceId', 'year', 'round', 'circuitId'],
        'results': ['resultId', 'raceId', 'driverId', 'constructorId'],
        'drivers': ['driverId'],
        'constructors': ['constructorId'],
        'circuits': ['circuitId'],
        'qualifying': ['qualifyId', 'raceId', 'driverId'],
        'driver_standings': ['driverStandingsId', 'raceId', 'driverId'],
        'constructor_standings': ['constructorStandingsId', 'raceId', 'constructorId'],
        'constructor_results': ['constructorResultsId', 'raceId', 'constructorId'],
        'sprint_results': ['resultId', 'raceId', 'driverId']
    }
    key_cols = key_cols_map.get(name, [])
    result = check_completeness(df, name, key_cols)
    completeness_results.append(result)

completeness_df = pd.DataFrame(completeness_results)
print("Kaggle Data Completeness Summary:")
print(completeness_df.to_string(index=False))


Kaggle Data Completeness Summary:
                 name   rows  columns  missing_pct  raceId_missing  raceId_missing_pct  year_missing  year_missing_pct  round_missing  round_missing_pct  circuitId_missing  circuitId_missing_pct  resultId_missing  resultId_missing_pct  driverId_missing  driverId_missing_pct  constructorId_missing  constructorId_missing_pct  qualifyId_missing  qualifyId_missing_pct  driverStandingsId_missing  driverStandingsId_missing_pct  constructorStandingsId_missing  constructorStandingsId_missing_pct  constructorResultsId_missing  constructorResultsId_missing_pct
                races   1125       18     0.000000             0.0                 0.0           0.0               0.0            0.0                0.0                0.0                    0.0               NaN                   NaN               NaN                   NaN                    NaN                        NaN                NaN                    NaN                        NaN                

In [11]:
# Year coverage check
if 'races' in kaggle_data:
    races = kaggle_data['races']
    races['date'] = pd.to_datetime(races['date'], errors='coerce')
    year_coverage = races.groupby('year').agg({
        'raceId': 'count',
        'round': 'max'
    }).rename(columns={'raceId': 'num_races'})
    
    print("Year Coverage (Kaggle):")
    print(f"  Years: {races['year'].min()} - {races['year'].max()}")
    print(f"  Total races: {len(races)}")
    print(f"  Years >= 1994: {len(races[races['year'] >= 1994])} races")
    print(f"\nRaces per year (1994-2024):")
    print(year_coverage[year_coverage.index >= 1994].to_string())
    
    # Check for missing rounds
    missing_rounds = []
    for year in range(1994, 2025):
        year_races = races[races['year'] == year]
        if len(year_races) > 0:
            expected_rounds = set(range(1, year_races['round'].max() + 1))
            actual_rounds = set(year_races['round'].unique())
            missing = expected_rounds - actual_rounds
            if missing:
                missing_rounds.append({'year': year, 'missing_rounds': sorted(missing)})
    
    if missing_rounds:
        print(f"\nMissing rounds detected: {missing_rounds}")
    else:
        print("\nNo missing rounds detected (all rounds sequential)")


Year Coverage (Kaggle):
  Years: 1950 - 2024
  Total races: 1125
  Years >= 1994: 577 races

Races per year (1994-2024):
      num_races  round
year                  
1994         16     16
1995         17     17
1996         16     16
1997         17     17
1998         16     16
1999         16     16
2000         17     17
2001         17     17
2002         17     17
2003         16     16
2004         18     18
2005         19     19
2006         18     18
2007         17     17
2008         18     18
2009         17     17
2010         19     19
2011         19     19
2012         20     20
2013         19     19
2014         19     19
2015         19     19
2016         21     21
2017         20     20
2018         21     21
2019         21     21
2020         17     17
2021         22     22
2022         22     22
2023         22     22
2024         24     24

No missing rounds detected (all rounds sequential)


In [12]:
# Driver and Constructor coverage
if 'drivers' in kaggle_data:
    drivers = kaggle_data['drivers']
    print(f"Driver Coverage:")
    print(f"  Total drivers: {len(drivers)}")
    print(f"  Missing forename: {drivers['forename'].isnull().sum()}")
    print(f"  Missing surname: {drivers['surname'].isnull().sum()}")
    print(f"  Missing nationality: {drivers['nationality'].isnull().sum()}")

if 'constructors' in kaggle_data:
    constructors = kaggle_data['constructors']
    print(f"\nConstructor Coverage:")
    print(f"  Total constructors: {len(constructors)}")
    print(f"  Missing name: {constructors['name'].isnull().sum()}")
    print(f"  Missing nationality: {constructors['nationality'].isnull().sum()}")
# FastF1 coverage check
print(f"\nFastF1 Coverage (2018-2024):")
for year in fastf1_years:
    if year in fastf1_data:
        datasets_present = list(fastf1_data[year].keys())
        print(f"  {year}: {', '.join(datasets_present)} ({len(datasets_present)}/4)")
    else:
        print(f"  {year}: No data")



Driver Coverage:
  Total drivers: 861
  Missing forename: 0
  Missing surname: 0
  Missing nationality: 0

Constructor Coverage:
  Total constructors: 212
  Missing name: 0
  Missing nationality: 0

FastF1 Coverage (2018-2024):
  2018: RESULTS, LAPS (2/4)
  2019: No data
  2020: No data
  2021: No data
  2022: No data
  2023: No data
  2024: No data


## 3. Data Quality Checks

Check missing values, outliers, data types, and range validation.


In [13]:
# Missing values analysis
print("Missing Values Analysis (Top 10 columns per dataset):")
for name, df in kaggle_data.items():
    if df is not None and not df.empty:
        missing_pct = df.isnull().sum() / len(df) * 100
        missing_pct = missing_pct[missing_pct > 0].sort_values(ascending=False)
        if len(missing_pct) > 0:
            print(f"\n{name}:")
            print(missing_pct.head(10).to_string())


Missing Values Analysis (Top 10 columns per dataset):

qualifying:
q3    0.438346
q2    0.209644


In [14]:
# Outlier detection for key numeric columns
if 'results' in kaggle_data:
    results = kaggle_data['results']
    print("Results Data Quality:")
    print(f"  Position range: {results['positionOrder'].min()} - {results['positionOrder'].max()}")
    print(f"  Points range: {results['points'].min()} - {results['points'].max()}")
    print(f"  Laps range: {results['laps'].min()} - {results['laps'].max()}")
    
    # Check for outliers
    if 'fastestLapSpeed' in results.columns:
        speed_outliers = results[results['fastestLapSpeed'] > 400]  # Unrealistic speeds
        if len(speed_outliers) > 0:
            print(f"  Warning: {len(speed_outliers)} rows with speed > 400 km/h")
    
    # Check positionOrder consistency
    position_issues = results[results['positionOrder'] <= 0]
    if len(position_issues) > 0:
        print(f"  Warning: {len(position_issues)} rows with invalid positionOrder")

# Data type validation
print("\nData Type Validation:")
if 'races' in kaggle_data:
    races = kaggle_data['races']
    if 'date' in races.columns:
        date_parsed = pd.to_datetime(races['date'], errors='coerce')
        invalid_dates = date_parsed.isnull().sum() - races['date'].isnull().sum()
        if invalid_dates > 0:
            print(f"  Warning: {invalid_dates} invalid dates in races.csv")


Results Data Quality:
  Position range: 1 - 39
  Points range: 0.0 - 50.0
  Laps range: 0 - 200


TypeError: '>' not supported between instances of 'str' and 'int'

## 4. Cross-Source Consistency (2018-2024)

Compare Kaggle and FastF1 data for overlapping years to ensure consistency.


In [None]:
# Race matching between Kaggle and FastF1
if 'races' in kaggle_data and 2018 in fastf1_data:
    races = kaggle_data['races']
    races_2018plus = races[races['year'] >= 2018].copy()
    
    # Normalize event names for matching
    def normalize_event_name(name):
        """Normalize event name for matching."""
        if pd.isna(name):
            return None
        name = str(name).lower()
        # Remove "Grand Prix" and common variations
        name = name.replace('grand prix', '').strip()
        name = name.replace('gp', '').strip()
        return name
    
    races_2018plus['event_normalized'] = races_2018plus['name'].apply(normalize_event_name)
    
    # Get FastF1 event names
    fastf1_events = set()
    for year in range(2018, 2025):
        if year in fastf1_data and 'RESULTS' in fastf1_data[year]:
            events = fastf1_data[year]['RESULTS']['Event'].unique()
            for event in events:
                fastf1_events.add((year, normalize_event_name(event)))
    
    # Match races
    matched = 0
    unmatched = []
    for _, race in races_2018plus.iterrows():
        key = (race['year'], normalize_event_name(race['name']))
        if key in fastf1_events:
            matched += 1
        else:
            unmatched.append(race[['year', 'name', 'round']])
    
    print(f"Race Matching (2018-2024):")
    print(f"  Kaggle races: {len(races_2018plus)}")
    print(f"  Matched to FastF1: {matched}")
    print(f"  Unmatched: {len(unmatched)}")
    if unmatched:
        print(f"\nUnmatched races:")
        print(pd.DataFrame(unmatched).to_string(index=False))


In [None]:
# Result consistency check (compare finishing positions)
if 'results' in kaggle_data and 2018 in fastf1_data:
    results = kaggle_data['results']
    races_2018 = results.merge(
        kaggle_data['races'][['raceId', 'year', 'name']],
        on='raceId',
        how='left'
    )
    races_2018 = races_2018[races_2018['year'] >= 2018]
    
    # Sample check: Compare a few races
    print("Result Consistency Check (Sample):")
    sample_races = races_2018['raceId'].unique()[:3]
    
    for race_id in sample_races:
        race_info = races_2018[races_2018['raceId'] == race_id].iloc[0]
        year = race_info['year']
        event_name = race_info['name']
        
        # Get Kaggle results
        kaggle_race_results = races_2018[races_2018['raceId'] == race_id].sort_values('positionOrder')
        
        # Get FastF1 results if available
        if year in fastf1_data and 'RESULTS' in fastf1_data[year]:
            fastf1_results = fastf1_data[year]['RESULTS']
            fastf1_race = fastf1_results[
                (fastf1_results['Year'] == year) & 
                (fastf1_results['Event'] == event_name) &
                (fastf1_results['Session'] == 'R')
            ]
            
            if len(fastf1_race) > 0:
                print(f"\n  {year} {event_name}:")
                print(f"    Kaggle: {len(kaggle_race_results)} drivers")
                print(f"    FastF1: {len(fastf1_race)} drivers")
                # Note: Full driver matching would require driver name/ID mapping


## 5. Verification Report

Generate summary report of all findings.


In [None]:
# Generate verification report
verification_report = {
    'summary': {
        'kaggle_datasets': len(kaggle_data),
        'fastf1_years': len([y for y in fastf1_years if y in fastf1_data]),
        'verification_date': pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')
    },
    'completeness': completeness_df.to_dict('records'),
    'year_coverage': {
        'min_year': kaggle_data['races']['year'].min() if 'races' in kaggle_data else None,
        'max_year': kaggle_data['races']['year'].max() if 'races' in kaggle_data else None,
        'races_1994plus': len(kaggle_data['races'][kaggle_data['races']['year'] >= 1994]) if 'races' in kaggle_data else None
    }
}

# Save report
import json
report_path = PROCESSED_ROOT / "verification_report.json"
with open(report_path, 'w') as f:
    json.dump(verification_report, f, indent=2, default=str)

print("Verification Report Summary:")
print(f"  Kaggle datasets: {verification_report['summary']['kaggle_datasets']}")
print(f"  FastF1 years: {verification_report['summary']['fastf1_years']}")
print(f"  Year range: {verification_report['year_coverage']['min_year']} - {verification_report['year_coverage']['max_year']}")
print(f"  Races (1994+): {verification_report['year_coverage']['races_1994plus']}")
print(f"\nReport saved to: {report_path}")
