# 02.2 - Telemetry Data Exploration (Memory-Efficient)

This notebook provides **resource-efficient** exploration of the TELEMETRY data, which is extremely large and RAM-intensive.

## Memory-Efficient Techniques Used:
1. **Chunked Reading**: Process data in small chunks instead of loading entire files
2. **Sampling**: Examine small positional samples (start, middle, and end of a file) instead of whole files
3. **Selective Column Reading**: Only load columns we need to inspect
4. **Memory Profiling**: Track memory usage during operations
5. **Incremental Processing**: Process one file/year at a time and clear memory between operations

## Goals:
- Understand the structure of TELEMETRY data
- Identify available columns and data types
- Check data volume and memory requirements
- Prepare for feature extraction


In [2]:
from pathlib import Path
import pandas as pd
import numpy as np
import warnings
import os
import sys
warnings.filterwarnings('ignore')

# Optional dependency: memory-profiler (install with: pip install memory-profiler).
# MEMORY_PROFILING records whether the import succeeded.
try:
    from memory_profiler import profile
except ImportError:
    MEMORY_PROFILING = False
    print("Note: memory-profiler not installed. Install with: pip install memory-profiler")
else:
    MEMORY_PROFILING = True

# Resolve the project root: the working directory itself, or its parent when
# the kernel was started from inside the notebooks/ folder.
PROJECT_ROOT = Path().resolve()
PROJECT_ROOT = PROJECT_ROOT.parent if PROJECT_ROOT.name == 'notebooks' else PROJECT_ROOT

# Directory that holds the raw FastF1 exports
FASTF1_ROOT = PROJECT_ROOT.joinpath("data", "raw", "fastf1_2018plus")

# Echo the environment so the run is self-describing
print(
    f"FastF1 data: {FASTF1_ROOT}",
    f"Python version: {sys.version}",
    f"Pandas version: {pd.__version__}",
    sep="\n",
)

FastF1 data: C:\Users\erikv\Downloads\F1\data\raw\fastf1_2018plus
Python version: 3.12.9 (tags/v3.12.9:fdb8142, Feb  4 2025, 15:27:58) [MSC v.1942 64 bit (AMD64)]
Pandas version: 2.3.3


## 1. File Size Analysis
Check the size of all TELEMETRY files to understand data volume.


In [4]:
# Report the on-disk size of every telemetry CSV, followed by the overall total
telemetry_files = sorted(FASTF1_ROOT.glob("ALL_TELEMETRY_*.csv"))

print("TELEMETRY File Sizes:", "=" * 60, sep="\n")

total_size = 0  # running total in MB
for file_path in telemetry_files:
    # st_size is in bytes; convert to MB first, then derive GB from that
    size_mb = file_path.stat().st_size / (1024 * 1024)
    size_gb = size_mb / 1024
    total_size += size_mb

    print(f"{file_path.name:30s} {size_mb:>10.2f} MB ({size_gb:>6.3f} GB)")

print(
    "=" * 60,
    f"{'Total':30s} {total_size:>10.2f} MB ({total_size/1024:>6.3f} GB)",
    f"\nNumber of files: {len(telemetry_files)}",
    sep="\n",
)


TELEMETRY File Sizes:
ALL_TELEMETRY_2018.csv            2398.98 MB ( 2.343 GB)
ALL_TELEMETRY_2019.csv            6466.11 MB ( 6.315 GB)
ALL_TELEMETRY_2020.csv            2060.52 MB ( 2.012 GB)
ALL_TELEMETRY_2021.csv            6636.65 MB ( 6.481 GB)
ALL_TELEMETRY_2022.csv             214.53 MB ( 0.210 GB)
ALL_TELEMETRY_2023.csv            5905.84 MB ( 5.767 GB)
ALL_TELEMETRY_2024.csv            6205.47 MB ( 6.060 GB)
Total                            29888.11 MB (29.188 GB)

Number of files: 7


## 2. Memory-Efficient Column Discovery
Read only the first chunk to discover columns without loading the entire file.


In [5]:
# Function to get column info from first chunk
def get_columns_info(file_path, chunk_size=1000):
    """Read only the first chunk of a CSV to discover its columns and dtypes.

    Parameters
    ----------
    file_path : Path or str
        CSV file to inspect.
    chunk_size : int
        Maximum number of rows to read.

    Returns
    -------
    (info, first_chunk) : tuple
        ``info`` is a dict with keys ``columns``, ``n_columns``, ``dtypes``,
        ``sample_row`` (first record as a dict, or None when the chunk has no
        rows) and ``memory_usage_mb``; ``first_chunk`` is the DataFrame that
        was read. ``(None, None)`` is returned when the reader yields nothing
        or when reading fails (the error is printed, not raised).
    """
    try:
        # The context manager closes the underlying file handle as soon as the
        # first chunk has been pulled, instead of leaving it to garbage collection.
        with pd.read_csv(file_path, chunksize=chunk_size, low_memory=False) as reader:
            first_chunk = next(reader)

        info = {
            'columns': list(first_chunk.columns),
            'n_columns': len(first_chunk.columns),
            'dtypes': first_chunk.dtypes.to_dict(),
            'sample_row': first_chunk.iloc[0].to_dict() if len(first_chunk) > 0 else None,
            'memory_usage_mb': first_chunk.memory_usage(deep=True).sum() / (1024 * 1024)
        }

        return info, first_chunk
    except StopIteration:
        return None, None
    except Exception as e:
        # getattr keeps the handler itself from failing when a plain string
        # path (which has no .name attribute) was passed in.
        print(f"Error reading {getattr(file_path, 'name', file_path)}: {e}")
        return None, None

# Use the first telemetry file (if any were found) to inspect the schema
test_file = telemetry_files[0] if telemetry_files else None

if test_file:
    print(f"Analyzing structure of: {test_file.name}")
    print("=" * 60)

    info, sample_df = get_columns_info(test_file, chunk_size=1000)

    if info:
        # Numbered listing of every column with its inferred dtype
        print(f"\nColumns ({info['n_columns']}):")
        for position, column_name in enumerate(info['columns'], start=1):
            column_dtype = info['dtypes'][column_name]
            print(f"  {position:2d}. {column_name:30s} {str(column_dtype)}")

        print(f"\nFirst chunk memory usage: {info['memory_usage_mb']:.2f} MB")

        print("\nSample row (first record):")
        first_record = info['sample_row']
        if first_record:
            # Show only the first 10 fields and summarise the remainder
            for key, value in list(first_record.items())[:10]:
                print(f"  {key:30s}: {value}")
            if len(first_record) > 10:
                print(f"  ... and {len(first_record) - 10} more columns")


Analyzing structure of: ALL_TELEMETRY_2018.csv

Columns (14):
   1. Date                           object
   2. RPM                            float64
   3. Speed                          float64
   4. nGear                          int64
   5. Throttle                       float64
   6. Brake                          bool
   7. DRS                            int64
   8. Source                         object
   9. Time                           object
  10. SessionTime                    object
  11. Year                           int64
  12. Event                          object
  13. Session                        object
  14. Driver                         int64

First chunk memory usage: 0.42 MB

Sample row (first record):
  Date                          : 2018-03-25 05:06:03.659
  RPM                           : 0.0
  Speed                         : 0.0
  nGear                         : 0
  Throttle                      : 0.0
  Brake                         : False
  DRS         

## 3. Efficient Row Count Estimation
Count rows without loading entire file into memory.


In [6]:
# Function to count rows efficiently using chunking
def count_rows_chunked(file_path, chunk_size=10000, max_chunks=None):
    """Count the data rows of a CSV by streaming it in chunks.

    Parameters
    ----------
    file_path : Path or str
        CSV file to count.
    chunk_size : int
        Number of rows parsed per chunk.
    max_chunks : int, optional
        Stop after this many chunks; the count is then only a partial count.
        ``None`` (or 0) reads the whole file.

    Returns
    -------
    (total_rows, chunks_processed) : tuple
        ``total_rows`` is None when reading failed (the error is printed);
        ``chunks_processed`` is the number of chunks read up to that point.
    """
    total_rows = 0
    chunks_processed = 0

    try:
        # The context manager guarantees the file handle is released, including
        # when the loop stops early at max_chunks.
        with pd.read_csv(file_path, chunksize=chunk_size, low_memory=False) as reader:
            for chunk in reader:
                total_rows += len(chunk)
                chunks_processed += 1

                # Limit chunks for very large files (optional)
                if max_chunks and chunks_processed >= max_chunks:
                    print(f"  (Limited to {max_chunks} chunks for estimation)")
                    break

    except Exception as e:
        print(f"  Error: {e}")
        return None, chunks_processed

    return total_rows, chunks_processed

# Demonstrate the chunked row count on the file inspected above
if test_file:
    print(f"Counting rows in: {test_file.name}")
    print("This may take a while for large files...")

    row_count, chunks = count_rows_chunked(test_file, chunk_size=10000)

    if row_count is not None:
        print(f"Total rows: {row_count:,}")
        print(f"Chunks processed: {chunks}")

        # Extrapolate in-memory size from the per-row footprint of the
        # schema-discovery sample (approximate)
        if sample_df is not None:
            sample_mb = sample_df.memory_usage(deep=True).sum() / (1024 * 1024)
            row_size_mb = sample_mb / len(sample_df)
            estimated_memory_gb = (row_count * row_size_mb) / 1024
            print(f"Estimated memory if fully loaded: {estimated_memory_gb:.2f} GB")


Counting rows in: ALL_TELEMETRY_2018.csv
This may take a while for large files...
Total rows: 19,140,560
Chunks processed: 1915
Estimated memory if fully loaded: 7.90 GB


## 3.5. Row Count Comparison Across Years
Compare row counts for all telemetry files across different years to understand data volume trends.


In [7]:
# Count rows for all telemetry files and compare the totals across years.
# Results are kept in `telemetry_row_counts` (a DataFrame, or None when nothing
# could be counted) for the cells that follow.
print("Row Count Comparison by Year:")
print("=" * 70)

row_counts_by_year = {}
file_sizes_by_year = {}

for file_path in telemetry_files:
    # The year label is the last "_"-separated token of the file stem
    year = file_path.stem.split('_')[-1]
    file_size_mb = file_path.stat().st_size / (1024 * 1024)
    file_sizes_by_year[year] = file_size_mb

    print(f"\n{year}: {file_path.name}")
    print(f"  File size: {file_size_mb:.2f} MB")
    print(f"  Counting rows (this may take a while)...")

    row_count, chunks = count_rows_chunked(file_path, chunk_size=10000)

    if row_count is not None:
        row_counts_by_year[year] = row_count
        print(f"  ✓ Total rows: {row_count:,}")
        print(f"  ✓ Chunks processed: {chunks}")

        # Calculate rows per MB (guarded against zero-byte files)
        rows_per_mb = row_count / file_size_mb if file_size_mb > 0 else 0
        print(f"  ✓ Rows per MB: {rows_per_mb:,.0f}")
    else:
        print(f"  ✗ Error counting rows")

print("\n" + "=" * 70)
print("SUMMARY - Row Counts by Year:")
print("=" * 70)

# Create comparison DataFrame (only years whose count succeeded are included)
if row_counts_by_year:
    comparison_df = pd.DataFrame({
        'Year': list(row_counts_by_year.keys()),
        'Row_Count': list(row_counts_by_year.values()),
        'File_Size_MB': [file_sizes_by_year.get(year, 0) for year in row_counts_by_year.keys()]
    })

    # Sort by year (labels are strings, so this is a lexicographic sort)
    comparison_df = comparison_df.sort_values('Year')

    # Calculate additional metrics
    comparison_df['Rows_Per_MB'] = comparison_df['Row_Count'] / comparison_df['File_Size_MB']
    comparison_df['Pct_of_Total'] = (comparison_df['Row_Count'] / comparison_df['Row_Count'].sum() * 100).round(2)

    # Display formatted table
    print(f"\n{'Year':<8} {'Row Count':>15} {'File Size (MB)':>18} {'Rows/MB':>12} {'% of Total':>12}")
    print("-" * 70)

    for _, row in comparison_df.iterrows():
        print(f"{row['Year']:<8} {row['Row_Count']:>15,} {row['File_Size_MB']:>18,.2f} "
              f"{row['Rows_Per_MB']:>12,.0f} {row['Pct_of_Total']:>11.2f}%")

    print("-" * 70)
    # The percentage field is 11 wide + "%" so the TOTAL row lines up with the
    # data rows above (a width of 12 pushed it one column to the right).
    print(f"{'TOTAL':<8} {comparison_df['Row_Count'].sum():>15,} {comparison_df['File_Size_MB'].sum():>18,.2f} "
          f"{comparison_df['Row_Count'].sum() / comparison_df['File_Size_MB'].sum():>12,.0f} {'100.00':>11}%")

    # Find min/max
    max_year = comparison_df.loc[comparison_df['Row_Count'].idxmax(), 'Year']
    max_rows = comparison_df['Row_Count'].max()
    min_year = comparison_df.loc[comparison_df['Row_Count'].idxmin(), 'Year']
    min_rows = comparison_df['Row_Count'].min()

    print(f"\nMaximum rows: {max_year} with {max_rows:,} rows")
    print(f"Minimum rows: {min_year} with {min_rows:,} rows")
    if min_rows > 0:
        print(f"Range: {max_rows - min_rows:,} rows ({((max_rows / min_rows - 1) * 100):.1f}% difference)")
    else:
        # A relative difference against an empty file is undefined; avoid
        # printing "inf%" / "nan%".
        print(f"Range: {max_rows - min_rows:,} rows (relative difference undefined: smallest file has 0 rows)")

    # Store for later use
    telemetry_row_counts = comparison_df
else:
    print("No row counts available")
    telemetry_row_counts = None


Row Count Comparison by Year:

2018: ALL_TELEMETRY_2018.csv
  File size: 2398.98 MB
  Counting rows (this may take a while)...
  ✓ Total rows: 19,140,560
  ✓ Chunks processed: 1915
  ✓ Rows per MB: 7,979

2019: ALL_TELEMETRY_2019.csv
  File size: 6466.11 MB
  Counting rows (this may take a while)...
  ✓ Total rows: 51,628,140
  ✓ Chunks processed: 5163
  ✓ Rows per MB: 7,984

2020: ALL_TELEMETRY_2020.csv
  File size: 2060.52 MB
  Counting rows (this may take a while)...
  ✓ Total rows: 16,355,180
  ✓ Chunks processed: 1636
  ✓ Rows per MB: 7,937

2021: ALL_TELEMETRY_2021.csv
  File size: 6636.65 MB
  Counting rows (this may take a while)...
  ✓ Total rows: 52,824,160
  ✓ Chunks processed: 5283
  ✓ Rows per MB: 7,959

2022: ALL_TELEMETRY_2022.csv
  File size: 214.53 MB
  Counting rows (this may take a while)...
  ✓ Total rows: 1,725,796
  ✓ Chunks processed: 173
  ✓ Rows per MB: 8,044

2023: ALL_TELEMETRY_2023.csv
  File size: 5905.84 MB
  Counting rows (this may take a while)...
  ✓ To

In [None]:
# Optional: Visualize row counts comparison (skipped when matplotlib is missing)
try:
    import matplotlib.pyplot as plt

    if telemetry_row_counts is None:
        print("No data available for visualization")
    else:
        # Pre-scale the plotted quantities once: rows in millions, size in GB
        year_labels = telemetry_row_counts['Year']
        rows_millions = telemetry_row_counts['Row_Count'] / 1e6
        sizes_gb = telemetry_row_counts['File_Size_MB'] / 1024

        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

        # Left panel: bar chart of row counts per year
        ax1.bar(year_labels, rows_millions, color='steelblue', alpha=0.7)
        ax1.set_xlabel('Year', fontsize=12)
        ax1.set_ylabel('Row Count (Millions)', fontsize=12)
        ax1.set_title('Telemetry Row Counts by Year', fontsize=14, fontweight='bold')
        ax1.grid(axis='y', alpha=0.3)

        # Value label on top of each bar
        for year_label, millions in zip(year_labels, rows_millions):
            ax1.text(year_label, millions, f'{millions:.1f}M',
                     ha='center', va='bottom', fontsize=9)

        # Right panel: file size against row count
        ax2.scatter(sizes_gb, rows_millions, s=100, alpha=0.6, color='coral')
        ax2.set_xlabel('File Size (GB)', fontsize=12)
        ax2.set_ylabel('Row Count (Millions)', fontsize=12)
        ax2.set_title('File Size vs Row Count', fontsize=14, fontweight='bold')
        ax2.grid(alpha=0.3)

        # Tag every scatter point with its year
        for year_label, size_gb, millions in zip(year_labels, sizes_gb, rows_millions):
            ax2.annotate(year_label, (size_gb, millions), fontsize=9, ha='center')

        plt.tight_layout()
        plt.show()

        print("Visualization created successfully!")

except ImportError:
    print("Matplotlib not available - skipping visualization")
except Exception as e:
    print(f"Error creating visualization: {e}")


## 4. Sampling Strategy - Get Representative Sample
Instead of loading entire files, create a representative sample for exploration.


In [None]:
# Function to get stratified sample (first N, middle N, last N rows)
def get_stratified_sample(file_path, n_samples=5000, chunk_size=10000):
    """Sample rows from the beginning, middle and end of a CSV with bounded memory.

    The file is streamed in chunks and at most a couple of chunks are held in
    memory at any time (the previous implementation kept every chunk in a
    list, i.e. the whole file). Because the index of the middle chunk is only
    known once the chunks have been counted, the file is read a second time,
    up to that middle chunk, when it has more than two chunks.

    Parameters
    ----------
    file_path : Path or str
        CSV file to sample.
    n_samples : int
        Target sample size; each of the three parts contributes up to
        ``n_samples // 3`` rows.
    chunk_size : int
        Number of rows parsed per chunk.

    Returns
    -------
    pandas.DataFrame or None
        Head of the first chunk, head of the middle chunk (only when there are
        more than two chunks) and tail of the last chunk (only when there is
        more than one chunk), concatenated with a fresh index and with
        duplicate rows dropped. None when nothing could be sampled or when
        reading failed (the error is printed).
    """
    rows_per_part = n_samples // 3

    try:
        first_part = None
        last_chunk = None
        total_chunks = 0

        # Pass 1: count chunks, keeping only the head of the first chunk and
        # a reference to the most recent chunk.
        with pd.read_csv(file_path, chunksize=chunk_size, low_memory=False) as reader:
            for chunk in reader:
                if total_chunks == 0 and len(chunk) > 0:
                    # copy() so the small head does not keep the whole first
                    # chunk alive
                    first_part = chunk.head(rows_per_part).copy()
                last_chunk = chunk
                total_chunks += 1

        if total_chunks == 0:
            return None

        samples = []

        # Sample from beginning
        if first_part is not None:
            samples.append(first_part)

        # Sample from middle
        if total_chunks > 2:
            mid_chunk_idx = total_chunks // 2
            # Pass 2: stream again and stop as soon as the middle chunk is reached
            with pd.read_csv(file_path, chunksize=chunk_size, low_memory=False) as reader:
                for idx, chunk in enumerate(reader):
                    if idx == mid_chunk_idx:
                        if len(chunk) > 0:
                            samples.append(chunk.head(rows_per_part))
                        break

        # Sample from end
        if total_chunks > 1 and len(last_chunk) > 0:
            samples.append(last_chunk.tail(rows_per_part))

        # Combine samples and remove duplicates if any
        if samples:
            return pd.concat(samples, ignore_index=True).drop_duplicates()

    except Exception as e:
        # getattr keeps the handler from failing on plain string paths
        print(f"Error sampling {getattr(file_path, 'name', file_path)}: {e}")
        return None

    return None

# Get sample for exploration.
# Bind the name up-front so later cells that test `sample_telemetry is not None`
# do not raise NameError when no telemetry file was found.
sample_telemetry = None

if test_file:
    print(f"Creating sample from: {test_file.name}")
    print("Sampling 5,000 rows (stratified across file)...")

    sample_telemetry = get_stratified_sample(test_file, n_samples=5000, chunk_size=10000)

    if sample_telemetry is not None:
        print(f"\nSample size: {len(sample_telemetry):,} rows, {len(sample_telemetry.columns)} columns")
        print(f"Sample memory usage: {sample_telemetry.memory_usage(deep=True).sum() / (1024 * 1024):.2f} MB")

        print("\nSample data preview:")
        print(sample_telemetry.head(10))

        print("\nData types:")
        print(sample_telemetry.dtypes)

        # Only columns that actually contain nulls are listed
        print("\nMissing values:")
        missing = sample_telemetry.isnull().sum()
        print(missing[missing > 0])


## 5. Summary Statistics by Year (Chunked Processing)
Get summary statistics for the first few years from a limited number of leading chunks, without loading full files.


In [None]:
# Function to get summary stats chunked
def get_summary_stats_chunked(file_path, numeric_cols=None, chunk_size=50000, max_chunks=10):
    """Compute describe()-style statistics for numeric columns by streaming chunks.

    Per-chunk partial results are merged instead of averaging per-chunk
    ``describe()`` tables, so ``count``, ``mean``, ``std``, ``min`` and ``max``
    are correct for all rows that were read: counts and sums are added up,
    min/max are taken across chunks, and std (ddof=1, as in ``describe``) is
    derived from the running sum and sum of squares. The quartiles cannot be
    computed exactly in one streaming pass; they are APPROXIMATE, reported as
    the count-weighted average of the per-chunk quartiles.

    Parameters
    ----------
    file_path : Path or str
        CSV file to summarise.
    numeric_cols : list, optional
        Columns to load (passed to ``usecols``). ``None`` loads every column;
        in both cases only numeric-dtype columns are summarised.
    chunk_size : int
        Number of rows parsed per chunk.
    max_chunks : int, optional
        Read at most this many chunks; ``None`` (or 0) reads the whole file.

    Returns
    -------
    (stats, total_rows) : tuple
        ``stats`` is a DataFrame indexed by ``count, mean, std, min, 25%, 50%,
        75%, max`` with one column per numeric column, or None when no numeric
        data was seen. ``total_rows`` is the number of rows read. On a read
        error the error is printed and ``(None, 0)`` is returned.
    """
    quantile_labels = {0.25: '25%', 0.5: '50%', 0.75: '75%'}
    additive_keys = ['count', 'sum', 'sumsq'] + list(quantile_labels.values())
    totals = None   # running per-column accumulators, keyed by statistic
    rows_seen = 0

    try:
        with pd.read_csv(file_path, chunksize=chunk_size, low_memory=False,
                         usecols=numeric_cols) as reader:
            for i, chunk in enumerate(reader):
                rows_seen += len(chunk)

                numeric_chunk = chunk.select_dtypes(include=[np.number])
                if len(numeric_chunk.columns) > 0:
                    # float64 avoids integer overflow when squaring
                    values = numeric_chunk.astype('float64')
                    counts = values.count()  # non-null observations per column
                    partial = {
                        'count': counts,
                        'sum': values.sum(),
                        'sumsq': (values ** 2).sum(),
                        'min': values.min(),
                        'max': values.max(),
                    }
                    for q, label in quantile_labels.items():
                        # Weight by count; an all-null column contributes 0
                        partial[label] = (values.quantile(q) * counts).fillna(0)

                    if totals is None:
                        totals = partial
                    else:
                        for key in additive_keys:
                            totals[key] = totals[key].add(partial[key], fill_value=0)
                        # Row-wise min/max over [running, new] skips NaN entries
                        totals['min'] = pd.concat([totals['min'], partial['min']], axis=1).min(axis=1)
                        totals['max'] = pd.concat([totals['max'], partial['max']], axis=1).max(axis=1)

                # Stop right after the last wanted chunk so no extra chunk is parsed
                if max_chunks and i + 1 >= max_chunks:
                    break

        if totals is not None:
            n = totals['count']
            mean = totals['sum'] / n
            # Sample variance from running sums; clip guards against tiny
            # negative values caused by floating-point rounding.
            variance = ((totals['sumsq'] - totals['sum'] ** 2 / n) / (n - 1)).clip(lower=0)

            combined_stats = pd.DataFrame({
                'count': n,
                'mean': mean,
                'std': np.sqrt(variance),
                'min': totals['min'],
                '25%': totals['25%'] / n,
                '50%': totals['50%'] / n,
                '75%': totals['75%'] / n,
                'max': totals['max'],
            }).T
            return combined_stats, rows_seen

    except Exception as e:
        print(f"  Error: {e}")
        return None, 0

    return None, rows_seen

# Summarise a few years using only the leading chunks of each file
print("Summary Statistics (Limited Processing):")
print("=" * 60)

for file_path in telemetry_files[:3]:  # first 3 files only, as an example
    year = file_path.stem.rsplit('_', 1)[-1]
    print(f"\n{year}: {file_path.name}")

    # Discover which columns are numeric from a small leading sample
    info, _ = get_columns_info(file_path, chunk_size=1000)
    if info:
        numeric_cols = [
            name
            for name, column_dtype in info['dtypes'].items()
            if pd.api.types.is_numeric_dtype(column_dtype)
        ]

        print(f"  Processing up to 10 chunks...")
        stats, row_count = get_summary_stats_chunked(
            file_path,
            numeric_cols=numeric_cols[:10],  # at most the first 10 numeric columns
            chunk_size=50000,
            max_chunks=10
        )

        print(f"  Rows processed: {row_count:,}")
        if stats is not None:
            print(f"  Numeric columns analyzed: {len(stats.columns)}")
            print(f"  Sample stats:\n{stats.head()}")

    # Drop the per-file result before moving on to the next file
    if 'stats' in locals():
        del stats


## 6. Key Insights and Data Structure
Summarize findings for feature extraction planning.


In [None]:
# Print a structural summary of the sampled telemetry data
if sample_telemetry is not None:
    print("TELEMETRY DATA STRUCTURE SUMMARY")
    print("=" * 60)

    # Per-column dtype and share of missing values
    print(f"\nColumns ({len(sample_telemetry.columns)}):")
    for col in sample_telemetry.columns:
        column = sample_telemetry[col]
        dtype = column.dtype
        null_pct = (column.isnull().sum() / len(sample_telemetry)) * 100
        print(f"  - {col:30s} {str(dtype):15s} {null_pct:5.1f}% null")

    # Low-cardinality text columns: list their distinct values
    print("\nUnique values in categorical columns:")
    for col in sample_telemetry.select_dtypes(include=['object']).columns:
        unique_count = sample_telemetry[col].nunique()
        if unique_count >= 20:  # skip columns with too many distinct values
            continue
        print(f"  - {col:30s}: {unique_count} unique values")
        print(f"    Values: {list(sample_telemetry[col].unique())[:10]}")

    # Observed min/max of every numeric column
    print("\nNumeric columns ranges:")
    for col in sample_telemetry.select_dtypes(include=[np.number]).columns:
        lowest = sample_telemetry[col].min()
        highest = sample_telemetry[col].max()
        print(f"  - {col:30s}: [{lowest:.2f}, {highest:.2f}]")

    # Columns whose name mentions time or date (case-insensitive)
    print("\nTime/Date columns:")
    time_cols = [col for col in sample_telemetry.columns
                 if any(tag in col.lower() for tag in ('time', 'date'))]
    for col in time_cols:
        print(f"  - {col}")
        print(f"    Sample values: {sample_telemetry[col].head(3).tolist()}")

    # Memory recommendations, shown only for files larger than 100 MB
    if test_file:
        file_size_mb = test_file.stat().st_size / (1024 * 1024)
        if file_size_mb > 100:
            print(
                "\n⚠️  MEMORY WARNING:",
                f"   File size: {file_size_mb:.2f} MB",
                "   Recommendation: Use chunked processing for feature extraction",
                "   Suggested chunk size: 10,000 - 50,000 rows",
                sep="\n",
            )


## 7. Memory-Efficient Feature Planning
Based on exploration, plan feature extraction strategy.


In [None]:
# Helper function for chunked feature extraction (template)
def process_telemetry_chunked(file_path, feature_func, chunk_size=50000, output_path=None):
    """
    Template for memory-efficient feature extraction from telemetry data.

    The CSV is streamed in chunks; ``feature_func`` is applied to each chunk.
    With ``output_path`` the results are flushed to disk every 10 chunks (and
    once more at the end) so they never accumulate in memory.

    Parameters:
    -----------
    file_path : Path or str
        Path to telemetry CSV file
    feature_func : callable
        Function that takes a chunk DataFrame and returns features
        (a DataFrame, since the results are combined with ``pd.concat``)
    chunk_size : int
        Number of rows to process per chunk
    output_path : Path or str, optional
        Path to save extracted features. The file is opened in append mode:
        if it already exists, new rows are added to it without a header.

    Returns:
    --------
    pandas.DataFrame or None
        The concatenated features when no ``output_path`` is given. None when
        results were written to ``output_path``, when there were no results,
        or when processing failed (the error is printed, not raised).
    """
    # Accept plain strings as well: the code below relies on Path.exists()
    output_path = Path(output_path) if output_path else None
    results = []

    try:
        # Context manager ensures the CSV handle is closed even if
        # feature_func raises part-way through the file.
        with pd.read_csv(file_path, chunksize=chunk_size, low_memory=False) as reader:
            for i, chunk in enumerate(reader):
                # Extract features from chunk
                results.append(feature_func(chunk))

                # Optional: Save incrementally to disk
                if output_path and (i + 1) % 10 == 0:  # Save every 10 chunks
                    temp_df = pd.concat(results, ignore_index=True)
                    # Header only when this write creates the file
                    temp_df.to_csv(output_path, mode='a', header=not output_path.exists(), index=False)
                    results = []  # Clear from memory
                    print(f"  Saved progress after {i + 1} chunks")

        # Final save (whatever is left since the last incremental flush)
        if results:
            final_df = pd.concat(results, ignore_index=True)
            if output_path:
                final_df.to_csv(output_path, mode='a', header=not output_path.exists(), index=False)
            else:
                return final_df

    except Exception as e:
        # getattr keeps the handler from failing on plain string paths
        print(f"Error processing {getattr(file_path, 'name', file_path)}: {e}")
        return None

    return None

# Confirm the template is defined and show how to call it
print("Feature extraction template function created.")
print("\nUsage example:")
print(
    "  def extract_features(chunk):",
    "      # Your feature extraction logic here",
    "      return features_df",
    "  ",
    "  process_telemetry_chunked(",
    "      file_path=telemetry_file,",
    "      feature_func=extract_features,",
    "      chunk_size=50000,",
    "      output_path=Path('features.csv')",
    "  )",
    sep="\n",
)
