My project is a machine learning microservice that uses a weighted Linear Regression and a weighted XGBoost model. This is the code I used to train a model and make it persistent so that it can be called later from an API.

In [None]:
"""
Weighted Linear Regression for F1 Predictions
"""

import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib
from pathlib import Path

# CSV read by load_data(); relative paths resolve against the current working directory.
DATA_PATH = Path("dataset/F1_training_data.csv")
# Destination of the joblib bundle written by save_model().
MODEL_PATH = Path("models/trained/linear_v0.1.joblib")

def load_data():
    """Read the training CSV at ``DATA_PATH`` and print a short summary.

    The summary reports the row count, the min/max of the ``year`` column
    and the list of column names.

    Returns:
        The DataFrame exactly as returned by ``pd.read_csv``.
    """
    print("üìä Loading training data...")
    frame = pd.read_csv(DATA_PATH)

    print(f"‚úÖ Loaded {len(frame):,} rows")
    first_year = int(frame['year'].min())
    last_year = int(frame['year'].max())
    print(f"   Years: {first_year} - {last_year}")
    print(f"   Columns: {list(frame.columns)}")
    return frame

def create_features(df):
    """Select the model inputs and target, dropping incomplete rows.

    Args:
        df: DataFrame containing the six feature columns listed below and
            a ``position`` column used as the regression target.

    Returns:
        A tuple ``(X, y, df_clean)``: the feature frame, the target series
        and the rows of ``df`` that survived the NaN filter (all three
        share the same index).
    """
    print("\nüîß Creating feature matrix...")

    # Numerical inputs only; the engineered columns stand in for the
    # categorical driver/track information.
    feature_cols = [
        'grid',                    # starting position
        'driver_avg_position',     # driver's rolling average finish
        'driver_avg_points',       # driver's rolling average points
        'driver_recent_form',      # driver's weighted recent form
        'track_avg_pos_change',    # track overtaking metric
        'driver_track_avg'         # driver's performance at this track
    ]
    print(f"   Using features: {feature_cols}")

    features = df[feature_cols].copy()
    target = df['position'].copy()

    # Keep a row only when every feature and the target are present.
    complete = features.notna().all(axis=1) & target.notna()
    features = features[complete]
    target = target[complete]

    print(f"‚úÖ Feature matrix ready: {len(features)} samples x {len(feature_cols)} features")
    print(f"   Target range: {target.min():.0f} - {target.max():.0f}")

    return features, target, df[complete]

def create_weights(df, decay_rate=0.22):
    """Compute exponentially decaying sample weights from the ``year`` column.

    Each row gets ``exp(-decay_rate * (max_year - year))``, so rows from the
    most recent season have weight 1.0 and older seasons progressively less.

    Args:
        df: DataFrame with a ``year`` column.
        decay_rate: Exponent applied per year of age. With the default of
            0.22 each year back multiplies the weight by
            ``exp(-0.22)`` (about 0.80, i.e. a ~20% reduction).

    Returns:
        A 1-D numpy array of weights aligned with the rows of ``df``
        (empty when ``df`` has no rows).
    """
    print("\n‚öñÔ∏è  Creating time-based weights (recent races matter more)...")
    years = df['year'].values

    # Nothing to weight: avoid calling .max() on an empty array.
    if len(years) == 0:
        return np.array([], dtype=float)

    max_year = years.max()
    weights = np.exp(-(max_year - years) * decay_rate)

    print(f"   Weight range: {weights.min():.3f} to {weights.max():.3f}")
    # Report the oldest and newest seasons actually present instead of
    # hard-coded years, which produce NaN when those years are absent.
    for year in dict.fromkeys((years.min(), max_year)):
        print(f"   {float(year):.0f} weight: {weights[years == year].mean():.3f}")

    return weights

def train_model(X, y, weights):
    """Fit a sample-weighted linear regression on standardized features.

    The data is split 85/15 with a fixed seed; the scaler is fitted on the
    training part only. The reported R², MAE and RMSE are computed without
    sample weights (the validation weights are not used).

    Args:
        X: Feature DataFrame (its column names label the printed coefficients).
        y: Target values aligned with ``X``.
        weights: Per-row sample weights aligned with ``X``.

    Returns:
        A tuple ``(model, scaler)`` of the fitted ``LinearRegression`` and
        the fitted ``StandardScaler``.
    """
    print("\nüéØ Training weighted linear regression...")

    # Hold out 15% of the rows; the weights are split alongside X and y.
    parts = train_test_split(X, y, weights, test_size=0.15, random_state=42)
    X_train, X_val, y_train, y_val, w_train, _w_val = parts

    print(f"   Training samples: {len(X_train)}")
    print(f"   Validation samples: {len(X_val)}")

    # Standardize using statistics from the training rows only.
    scaler = StandardScaler()
    train_scaled = scaler.fit_transform(X_train)
    val_scaled = scaler.transform(X_val)

    model = LinearRegression()
    model.fit(train_scaled, y_train, sample_weight=w_train)

    # Unweighted goodness-of-fit on both partitions.
    train_score = model.score(train_scaled, y_train)
    val_score = model.score(val_scaled, y_val)

    val_pred = model.predict(val_scaled)
    mae = mean_absolute_error(y_val, val_pred)
    rmse = np.sqrt(mean_squared_error(y_val, val_pred))

    print(f"\nüìä Model Performance:")
    print(f"   Training R¬≤: {train_score:.4f}")
    print(f"   Validation R¬≤: {val_score:.4f}")
    print(f"   Validation MAE: {mae:.2f} positions")
    print(f"   Validation RMSE: {rmse:.2f} positions")

    # Coefficients are per standardized feature, in the column order of X.
    print(f"\nüìà Feature Coefficients:")
    for name, weight in zip(X.columns, model.coef_):
        print(f"   {name:25s}: {weight:+.4f}")

    return model, scaler

def save_model(model, scaler):
    """Persist the model and scaler as one joblib file at ``MODEL_PATH``.

    The file holds a dict with keys ``'model'``, ``'scaler'`` and
    ``'version'``; missing parent directories are created first.
    """
    print("üíæ Saving model...")
    target_dir = MODEL_PATH.parent
    target_dir.mkdir(parents=True, exist_ok=True)

    bundle = dict(model=model, scaler=scaler, version='v0.1')
    joblib.dump(bundle, MODEL_PATH)

    print(f"‚úÖ Saved to {MODEL_PATH}")

def main():
    """Run the training pipeline: load, featurize, weight, fit and save."""
    rule = "=" * 70
    print(rule)
    print("F1 Weighted Linear Regression Training")
    print(rule)

    raw = load_data()
    X, y, clean_rows = create_features(raw)
    # Weights come from the filtered rows so they stay aligned with X and y.
    sample_weights = create_weights(clean_rows)
    model, scaler = train_model(X, y, sample_weights)
    save_model(model, scaler)

    print("\n" + rule)
    print("‚úÖ Training complete!")
    print(rule)
    print("\nüìù Next steps:")
    next_steps = (
        "   1. Model saved to: models/trained/linear_v0.1.joblib",
        "   2. Update app/main.py to use the trained model",
        "   3. Test predictions on 2025 races 19-24",
    )
    for step in next_steps:
        print(step)


if __name__ == "__main__":
    main()

We then use the persistent model to predict the remaining races using an engineered CSV. For testing purposes, we used the 2024 grid positions to predict our 2025 podiums for the remaining 6 races.

In [None]:
"""
Predict the last 6 races of F1 2025 season (races 19-24)
- For completed races: compare predictions vs actual
- For future races: generate predictions based on historical data
"""

import pandas as pd
import numpy as np
import joblib
from pathlib import Path

# Paths (relative ones resolve against the current working directory)
TRAINING_DATA = Path("dataset/F1_training_data.csv")  # read by load_training_data()
FULL_2025_DATA = Path("dataset/Formula1_2025Season_RaceResults.csv")  # read by load_2025_races()
MODEL_PATH = Path("models/trained/linear_v0.1.joblib")  # joblib bundle read by load_model()
OUTPUT_PATH = Path("predictions/2025_races_19-24_predictions.csv")  # CSV written via save_predictions()

def load_model():
    """Load the joblib bundle at ``MODEL_PATH`` and unpack it.

    Returns:
        A tuple ``(model, scaler)`` taken from the bundle's ``'model'`` and
        ``'scaler'`` entries.
    """
    print("üì¶ Loading trained model...")
    bundle = joblib.load(MODEL_PATH)
    model, scaler = bundle['model'], bundle['scaler']
    print(f"‚úÖ Model loaded: {bundle['version']}")
    return model, scaler

def load_training_data():
    """Read the training CSV at ``TRAINING_DATA`` and report its row count.

    Returns:
        The DataFrame as returned by ``pd.read_csv``.
    """
    print("\nüìä Loading training data for feature engineering...")
    training = pd.read_csv(TRAINING_DATA)
    print(f"‚úÖ Loaded {len(training)} training samples")
    return training

def load_2025_races():
    """Read the 2025 results CSV and normalize its column names.

    The five known headers are renamed to lowercase names and a constant
    ``year`` column set to 2025 is added.

    Returns:
        The renamed DataFrame.
    """
    print("\nüìä Loading 2025 season data...")

    # Source header -> name used by the rest of the script.
    column_map = {
        'Starting Grid': 'grid',
        'Position': 'position',
        'Points': 'points',
        'Driver': 'driver',
        'Track': 'track'
    }
    season = pd.read_csv(FULL_2025_DATA).rename(columns=column_map)
    season['year'] = 2025

    print(f"‚úÖ Loaded {len(season)} rows from 2025 season")

    return season

def get_last_6_races(df_2025):
    """Pick the end-of-season races and split them into completed/future.

    Tracks are taken in order of first appearance. With 19 or more distinct
    tracks, everything from the 19th onwards is selected; otherwise the last
    (up to) six tracks are used. ``position``, ``grid`` and ``points`` are
    coerced to numbers, with unparseable entries becoming NaN.

    Args:
        df_2025: Season results with ``track``, ``position``, ``grid`` and
            ``points`` columns.

    Returns:
        A tuple ``(df_last_6, last_6_tracks, completed_races, future_races)``.
        A track counts as completed when at least one of its rows has a
        numeric position.
    """
    print("\nüèÅ Identifying last 6 races...")

    unique_tracks = df_2025['track'].unique()
    race_count = len(unique_tracks)
    print(f"   Total races in 2025: {race_count}")
    print(f"   All races: {list(unique_tracks)}")

    if race_count >= 19:
        # Races 19 onwards sit at zero-based index 18 and later.
        last_6_tracks = unique_tracks[18:]
    else:
        print(f"   ‚ö†Ô∏è  Only {len(unique_tracks)} races available, need at least 19")
        # A negative slice already returns everything when fewer than 6 exist.
        last_6_tracks = unique_tracks[-6:]

    print(f"\n   Last 6 races (19-24): {list(last_6_tracks)}")

    df_last_6 = df_2025[df_2025['track'].isin(last_6_tracks)].copy()

    # Non-numeric entries such as 'NC' or 'DNF' become NaN.
    for column in ('position', 'grid', 'points'):
        df_last_6[column] = pd.to_numeric(df_last_6[column], errors='coerce')

    # A race is "completed" once any row carries a numeric finishing position.
    status = [
        (track, df_last_6.loc[df_last_6['track'] == track, 'position'].notna().any())
        for track in last_6_tracks
    ]
    completed_races = [track for track, done in status if done]
    future_races = [track for track, done in status if not done]

    print(f"\n   ‚úÖ Completed races ({len(completed_races)}): {completed_races}")
    print(f"   üîÆ Future races ({len(future_races)}): {future_races}")

    return df_last_6, last_6_tracks, completed_races, future_races

def fill_missing_races_with_historical(df_last_6, last_6_tracks, df_training,
                                       expected_tracks=None):
    """Create placeholder rows for expected races absent from the 2025 data.

    For every expected track not in ``last_6_tracks``, one row per current
    driver is generated. Its grid slot is taken from the most recent race at
    that track in ``df_training``. If the driver has no usable grid there,
    the driver's mean grid over all training rows is used, and failing that
    10.0. ``position`` and ``points`` are left as NaN.

    Args:
        df_last_6: Rows already available for the end-of-season races.
        last_6_tracks: Track names already covered by ``df_last_6``.
        df_training: Historical rows with ``year``, ``track``, ``driver``
            and ``grid`` columns.
        expected_tracks: Track names expected at the end of the season.
            Defaults to Singapore, United States, Mexico, Brazil,
            Las Vegas and Abu Dhabi.

    Returns:
        A tuple ``(df_combined, all_tracks, filled_tracks)``.
        ``filled_tracks`` lists only tracks for which historical data
        existed; tracks without history are skipped and not reported. When
        no rows were generated, ``df_last_6`` is returned unchanged with an
        empty ``filled_tracks``.
    """
    print("\nüîÑ Filling missing races with historical grid data...")

    # Current lineup: everyone with a 2025 row in the training data.
    drivers_2025 = df_training[df_training['year'] == 2025]['driver'].unique()

    if len(drivers_2025) == 0:
        # Fallback: drivers of the latest season present, rather than an
        # arbitrary slice of all drivers ever seen.
        latest_year = df_training['year'].max()
        drivers_2025 = df_training[df_training['year'] == latest_year]['driver'].unique()

    print(f"   Current driver lineup: {len(drivers_2025)} drivers")

    if expected_tracks is None:
        # Typical calendar order for the closing races of a season.
        expected_tracks = ['Singapore', 'United States', 'Mexico', 'Brazil', 'Las Vegas', 'Abu Dhabi']

    already_present = set(last_6_tracks)

    # Per-driver mean grid, computed once instead of once per generated row.
    driver_mean_grid = (
        pd.to_numeric(df_training['grid'], errors='coerce')
        .groupby(df_training['driver'])
        .mean()
    )

    missing_tracks = []
    filled_data = []

    for track in expected_tracks:
        if track in already_present:
            continue

        track_history = df_training[df_training['track'] == track]
        if track_history.empty:
            print(f"   ‚ö†Ô∏è  {track}: No historical data available, skipping")
            continue

        print(f"   üìç {track}: Using historical grid data")
        # Only tracks that actually receive rows count as filled.
        missing_tracks.append(track)

        # Grid slots from the most recent running of this race.
        recent_year = track_history['year'].max()
        recent_race = track_history[track_history['year'] == recent_year]
        grid_positions = recent_race.set_index('driver')['grid'].to_dict()

        for driver in drivers_2025:
            grid_pos = grid_positions.get(driver, np.nan)
            if pd.isna(grid_pos):
                # Driver did not race here (or had no grid slot): use their mean.
                grid_pos = driver_mean_grid.get(driver, np.nan)
            if pd.isna(grid_pos):
                grid_pos = 10.0  # default to the middle of the pack

            filled_data.append({
                'year': 2025,
                'track': track,
                'driver': driver,
                'grid': grid_pos,
                'position': np.nan,  # future race, no result yet
                'points': np.nan
            })

    if not filled_data:
        print("   ‚ÑπÔ∏è  No missing races to fill")
        return df_last_6, list(last_6_tracks), []

    df_filled = pd.DataFrame(filled_data)
    print(f"   ‚úÖ Created {len(df_filled)} entries for {len(missing_tracks)} missing races")

    df_combined = pd.concat([df_last_6, df_filled], ignore_index=True)
    all_last_6_tracks = list(last_6_tracks) + missing_tracks

    return df_combined, all_last_6_tracks, missing_tracks

def engineer_features_for_prediction(df_predict, df_training):
    """Attach the model's engineered features to the rows to be predicted.

    All statistics are computed from ``df_training``:

    * ``driver_avg_position`` / ``driver_avg_points``: per-driver means.
    * ``track_avg_pos_change``: per-track mean of ``grid - position``.
    * ``driver_track_avg``: mean finish per (driver, track) pair, falling
      back to the driver's overall average.
    * ``driver_recent_form``: mean finish over the driver's last three
      training rows, falling back to the driver's overall average.

    Anything still missing is filled with the column median of the
    prediction frame.

    Args:
        df_predict: Rows to predict, with ``driver`` and ``track`` columns.
        df_training: Historical rows with ``year``, ``driver``, ``track``,
            ``grid``, ``position`` and ``points`` columns.

    Returns:
        A new DataFrame (the input is not modified) with the five feature
        columns added; the index is reset by the merges.
    """
    print("\nüîß Engineering features for prediction...")

    engineered = ['driver_avg_position', 'driver_avg_points', 'driver_recent_form',
                  'track_avg_pos_change', 'driver_track_avg']

    # Work on a fresh frame and discard stale feature columns so the merges
    # below cannot produce ``_x``/``_y`` suffixed duplicates.
    df_predict = df_predict.drop(columns=engineered, errors='ignore')

    # Per-driver averages over the whole training set.
    driver_stats = (
        df_training.groupby('driver')
        .agg(driver_avg_position=('position', 'mean'),
             driver_avg_points=('points', 'mean'))
        .reset_index()
    )
    df_predict = df_predict.merge(driver_stats, on='driver', how='left')

    # Mean places gained (grid - finish) at each track.
    places_gained = df_training['grid'] - df_training['position']
    track_stats = places_gained.groupby(df_training['track']).mean()
    df_predict['track_avg_pos_change'] = df_predict['track'].map(track_stats)

    # Mean finish for each (driver, track) pair, joined instead of looked up
    # row by row.
    driver_track_stats = (
        df_training.groupby(['driver', 'track'])['position']
        .mean()
        .rename('driver_track_avg')
        .reset_index()
    )
    df_predict = df_predict.merge(driver_track_stats, on=['driver', 'track'], how='left')

    # Recent form: last three rows per driver. A stable sort on year keeps
    # the file's row order within a season; sorting by track name would pick
    # the alphabetically last tracks instead.
    # NOTE(review): assumes rows within a season are stored in race order;
    # confirm against the training CSV.
    chronological = df_training.sort_values('year', kind='mergesort')
    driver_recent = (
        chronological.groupby('driver').tail(3).groupby('driver')['position'].mean()
    )
    df_predict['driver_recent_form'] = df_predict['driver'].map(driver_recent)

    # First fallback: the driver's own overall average.
    df_predict['driver_recent_form'] = df_predict['driver_recent_form'].fillna(df_predict['driver_avg_position'])
    df_predict['driver_track_avg'] = df_predict['driver_track_avg'].fillna(df_predict['driver_avg_position'])

    # Last resort (e.g. a driver unseen in training): column median.
    for col in engineered:
        df_predict[col] = df_predict[col].fillna(df_predict[col].median())

    print(f"‚úÖ Features engineered for {len(df_predict)} entries")

    return df_predict

def make_predictions(df, model, scaler):
    """Predict finishing positions and store them in ``predicted_position``.

    Only rows whose six feature values are all present are passed through
    ``scaler.transform`` and ``model.predict``; their predictions are
    clipped to the range [1, 20]. Rows with a missing feature (for example
    a grid slot that could not be parsed) get NaN instead of making the
    estimator raise, and an empty frame is handled without calling the
    model at all.

    Args:
        df: Frame containing the feature columns; it is modified in place
            by adding the ``predicted_position`` column.
        model: Fitted estimator exposing ``predict``.
        scaler: Fitted transformer exposing ``transform``.

    Returns:
        The same ``df`` object with ``predicted_position`` added.
    """
    print("\nüîÆ Making predictions...")

    # Must match the column order used when the model was trained.
    feature_cols = [
        'grid',
        'driver_avg_position',
        'driver_avg_points',
        'driver_recent_form',
        'track_avg_pos_change',
        'driver_track_avg'
    ]

    X = df[feature_cols]
    complete = X.notna().all(axis=1).to_numpy()

    predictions = np.full(len(df), np.nan, dtype=float)
    if complete.any():
        X_scaled = scaler.transform(X[complete])
        raw = np.asarray(model.predict(X_scaled), dtype=float).ravel()
        # Keep predictions inside the valid finishing range.
        predictions[complete] = np.clip(raw, 1, 20)

    skipped = len(df) - int(complete.sum())
    if skipped:
        print(f"   Skipped {skipped} rows with missing feature values")

    df['predicted_position'] = predictions

    print(f"‚úÖ Predictions complete")

    return df

def evaluate_predictions(df, completed_races):
    """Report MAE and RMSE for rows that have an actual result.

    Args:
        df: Frame with ``track``, ``driver``, ``grid``, ``position`` and
            ``predicted_position`` columns.
        completed_races: Track names whose results should be scored.

    Returns:
        A copy of the scored rows, or ``None`` when ``completed_races`` is
        empty or none of its rows has a non-NaN ``position``.
    """
    if not completed_races:
        print("\n‚ö†Ô∏è  No completed races to evaluate")
        return None

    print("\nüìä Evaluating predictions on completed races...")

    in_completed = df['track'].isin(completed_races)
    has_actual = df['position'].notna()
    df_completed = df[in_completed & has_actual].copy()

    if len(df_completed) == 0:
        print("   No data with actual results")
        return None

    # Signed error per row; both metrics derive from it.
    errors = df_completed['predicted_position'] - df_completed['position']
    mae = np.mean(np.abs(errors))
    rmse = np.sqrt(np.mean(errors ** 2))

    print(f"\n   MAE: {mae:.2f} positions")
    print(f"   RMSE: {rmse:.2f} positions")

    # First ten scored rows as a quick visual check.
    print(f"\n   Sample predictions vs actual:")
    shown_cols = ['track', 'driver', 'grid', 'position', 'predicted_position']
    print(df_completed[shown_cols].head(10).to_string(index=False))

    return df_completed

def save_predictions(df, output_path):
    """Write the predictions, ordered by track and predicted finish, to CSV.

    Args:
        df: Frame with ``track``, ``driver``, ``grid``,
            ``predicted_position``, ``position`` and ``points`` columns.
        output_path: ``pathlib.Path`` of the CSV to write; missing parent
            directories are created.

    Returns:
        The sorted, column-restricted frame that was written.
    """
    print(f"\nüíæ Saving predictions to {output_path}...")
    output_path.parent.mkdir(parents=True, exist_ok=True)

    # Columns kept in the output file, in this order.
    output_cols = ['track', 'driver', 'grid', 'predicted_position', 'position', 'points']
    ordered = df.sort_values(['track', 'predicted_position'])
    df_output = ordered[output_cols].copy()

    df_output.to_csv(output_path, index=False)
    print(f"‚úÖ Saved {len(df_output)} predictions")

    return df_output

def main():
    """Run the prediction pipeline for the end-of-season races.

    Loads the model and data, selects the closing races, adds placeholder
    rows for races missing from the 2025 file, engineers features, predicts,
    scores the completed races, writes the CSV and prints a summary.
    """
    rule = "=" * 70
    print(rule)
    print("F1 2025 Last 6 Races Prediction")
    print(rule)

    # Inputs
    model, scaler = load_model()
    df_training = load_training_data()
    df_2025 = load_2025_races()

    # Races present in the 2025 file
    df_last_6, last_6_tracks, completed_races, future_races = get_last_6_races(df_2025)

    # Races absent from the 2025 file, simulated from historical grids
    df_last_6_filled, all_tracks, filled_tracks = fill_missing_races_with_historical(
        df_last_6, last_6_tracks, df_training
    )

    # Everything without a real result: unraced plus simulated races.
    all_future_races = future_races + filled_tracks

    df_with_features = engineer_features_for_prediction(df_last_6_filled, df_training)
    df_predictions = make_predictions(df_with_features, model, scaler)

    if completed_races:
        evaluate_predictions(df_predictions, completed_races)

    save_predictions(df_predictions, OUTPUT_PATH)

    # Top ten per race for everything that has no actual result yet.
    if all_future_races:
        print("\nüîÆ Predictions for future/simulated races:")
        is_future = df_predictions['track'].isin(all_future_races)
        df_future = df_predictions[is_future].sort_values(['track', 'predicted_position'])
        for track in all_future_races:
            track_data = df_future[df_future['track'] == track]
            if len(track_data) == 0:
                continue
            if track in filled_tracks:
                status = "üìä Historical grid"
            else:
                status = "üîÆ Future"
            print(f"\n   {status} - {track}:")
            top_ten = track_data[['driver', 'grid', 'predicted_position']].head(10)
            print(top_ten.to_string(index=False))

    print("\n" + rule)
    print("‚úÖ Predictions complete!")
    print(rule)
    print(f"\nüìÇ Results saved to: {OUTPUT_PATH}")
    print(f"\nüìä Summary:")
    print(f"   Total races predicted: {len(all_tracks)}")
    print(f"   Completed races: {len(completed_races)}")
    print(f"   Future/Simulated races: {len(all_future_races)}")


if __name__ == "__main__":
    main()
