In [142]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
import numpy as np
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# Load the dataset
file_path = 'C:/Users/ZIKOU/Desktop/f1-predictor-trunk/model-notebooks/cleaned_data.csv'
data = pd.read_csv(file_path)



In [143]:
# Calculate current year
current_year = pd.Timestamp.now().year

# Convert Date of Birth to Age
data['dob'] = current_year - pd.to_datetime(data['dob']).dt.year


In [144]:
# Encode categorical variables
label_encoders = {}
for col in ['GP_name', 'constructor', 'driver']:
    label_encoders[col] = LabelEncoder()
    data[col] = label_encoders[col].fit_transform(data[col])

    
# Split data into features and target
X = data.drop(columns=['position'])
y = data['position']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [145]:
data


Unnamed: 0.1,Unnamed: 0,GP_name,quali_pos,constructor,driver,position,driver_confidence,constructor_relaiblity,active_driver,active_constructor,dob,istest
0,90,0,16,7,12,16,0.965035,0.569961,1,1,35,0
1,91,0,15,7,12,10,0.965035,0.569961,1,1,35,0
2,92,0,0,7,12,6,0.965035,0.569961,1,1,35,0
3,93,0,16,7,12,11,0.965035,0.569961,1,1,35,0
4,94,0,20,7,6,19,0.923077,0.569961,1,1,26,0
...,...,...,...,...,...,...,...,...,...,...,...,...
1112,8080,6,5,2,3,5,0.932660,0.824359,1,1,43,0
1113,8081,6,8,2,3,8,0.932660,0.824359,1,1,43,0
1114,8096,6,3,5,8,3,0.940711,0.877805,1,1,39,0
1115,8114,6,20,0,11,17,0.933333,0.395276,1,1,34,0


In [146]:
print(data['dob'].dtype)
print(data['dob'].unique())

# Initialize models
models = {
    'Random Forest': RandomForestRegressor(random_state=42),
    'XGBoost': XGBRegressor(random_state=42),
    'SVM': SVR(),
    'Linear Regression': LinearRegression()
}

int32
[35 26 43 39 34 32 30 25 27 28]


In [184]:
# Train and evaluate models
results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    mse = mean_squared_error(y_test, predictions)
    accuracy = model.score(X_test, y_test)  # Accuracy for regression models
    results[name] = {'MSE': mse, 'Accuracy': accuracy}

# Display results
for name, metrics in results.items():
    print(f"{name}: MSE = {metrics['MSE']}, Accuracy = {metrics['Accuracy']}")


Random Forest: MSE = 1.417705357142857, Accuracy = 0.9551875256711024
XGBoost: MSE = 1.8755713778059053, Accuracy = 0.9407147657328964
SVM: MSE = 32.129060720737805, Accuracy = -0.015572595183874549
Linear Regression: MSE = 4.451780103064293, Accuracy = 0.8592829740105419


In [175]:
# Determine the best model based on MSE
best_model_name = min(results, key=lambda x: results[x]['MSE'])
best_model = models[best_model_name]
print("The best model based on MSE is:", best_model_name)

# Calculate accuracy of the best model
best_model_accuracy = best_model.score(X_test, y_test)

The best model based on MSE is: Random Forest


In [183]:
# Optimizing the best model if applicable
# Define parameter grid based on the best model
param_grid = {}
if best_model_name == 'Random Forest':
    param_grid = {
        'n_estimators': [100, 200, 300],
        'max_depth': [10, 20, 30],
        'min_samples_leaf': [1, 2, 3]
    }
elif best_model_name == 'XGBoost':
    param_grid = {
        'n_estimators': [100, 200, 300],
        'max_depth': [3, 5, 7],
        'learning_rate': [0.01, 0.1, 0.2]
    }
elif best_model_name == 'SVM':
    param_grid = {
        'C': [1, 10, 100],
        'gamma': [0.01, 0.1, 1]
    }

# Choose GridSearchCV or RandomizedSearchCV based on the model
if param_grid:
    grid_search = GridSearchCV(best_model, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
    grid_search.fit(X_train, y_train)
    best_model = grid_search.best_estimator_
    print("Optimized parameters:")
    print(grid_search.best_params_)

    # Evaluate the optimized model
    optimized_predictions = best_model.predict(X_test)
    optimized_mse = mean_squared_error(y_test, optimized_predictions)
    print(f"Optimized MSE for {best_model_name}: {optimized_mse}, Accuracy: {best_model_accuracy}")
else:
    print("No parameters to tune for this model.")

Optimized parameters:
{'max_depth': 20, 'min_samples_leaf': 1, 'n_estimators': 300}
Optimized MSE for Random Forest: 1.4456920138888891, Accuracy: 0.9541800677391105


In [185]:
def predict_position(gp_name, driver):
    # Encode the input using the trained LabelEncoders
    gp_name_encoded = label_encoders['GP_name'].transform([gp_name])[0]
    driver_encoded = label_encoders['driver'].transform([driver])[0]
    
    # Example feature set, using the mean of the other features
    example_features = X_train.mean().to_dict()
    example_features['GP_name'] = gp_name_encoded
    example_features['driver'] = driver_encoded
    
    # Use the model to predict
    predicted_position = model.predict([list(example_features.values())])[0]
    return predicted_position

In [186]:
# User interface to select a driver and GP
def user_input_prediction():
    print("Select a Grand Prix from the following list:")
    for gp in label_encoders['GP_name'].classes_:
        print(gp)
    selected_gp = input("Enter your selected GP name: ")

    print("Select a driver from the following list:")
    for driver in label_encoders['driver'].classes_:
        print(driver)
    selected_driver = input("Enter your selected driver name: ")

    prediction = predict_position(selected_gp, selected_driver)
    print(f"The predicted position for {selected_driver} at {selected_gp} is: {prediction:.2f}")

In [189]:
# Call this function to start the user input process
user_input_prediction()

Select a Grand Prix from the following list:
Albert Park Grand Prix Circuit
Autodromo Nazionale di Monza
Autódromo Hermanos Rodríguez
Autódromo José Carlos Pace
Bahrain International Circuit
Baku City Circuit
Buddh International Circuit
Circuit Gilles Villeneuve
Circuit Paul Ricard
Circuit de Barcelona-Catalunya
Circuit de Monaco
Circuit de Nevers Magny-Cours
Circuit de Spa-Francorchamps
Circuit of the Americas
Fuji Speedway
Hockenheimring
Hungaroring
Indianapolis Motor Speedway
Istanbul Park
Korean International Circuit
Marina Bay Street Circuit
Nürburgring
Red Bull Ring
Sepang International Circuit
Shanghai International Circuit
Silverstone Circuit
Sochi Autodrom
Suzuka Circuit
Valencia Street Circuit
Yas Marina Circuit
Enter your selected GP name: Albert Park Grand Prix Circuit
Select a driver from the following list:
Alexander Albon
Carlos Sainz
Charles Leclerc
Fernando Alonso
George Russell
Kevin Magnussen
Lance Stroll
Lando Norris
Lewis Hamilton
Max Verstappen
Pierre Gasly
Sergio



In [188]:
Valtteri Bottas

SyntaxError: invalid syntax (3262589035.py, line 1)