In [75]:
import pandas as pd
from sklearn.model_selection import GroupShuffleSplit
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.metrics import mean_absolute_error

In [76]:
# Load the full training table; the bare `df.head()` shows the first rows as the cell output.
# NOTE(review): the file's saved row index is read back as an "Unnamed: 0" column —
# confirm whether it should be loaded with index_col=0 instead of kept as data.
df = pd.read_csv("total.csv")
df.head()

Unnamed: 0,Driver,Constructor,RaceName,DriverPointsSoFar,ConstructorPointsSoFar,NextRacePosition,DriverWinRateThisSeason,DriverPodiumRate,AverageQualifyingPosition,DriverExperienceYears,DNFCountThisSeason,TeammateComparison
0,VER,Red Bull Racing,Bahrain Grand Prix,0.0,0.0,1,0.0,0.0,,9,0,0.0
1,VER,Red Bull Racing,Emilia Romagna Grand Prix,77.0,141.0,1,0.75,0.75,1.0,9,1,13.0
2,VER,Red Bull Racing,Japanese Grand Prix,51.0,97.0,1,0.666667,0.666667,1.0,9,1,5.0
3,VER,Red Bull Racing,Australian Grand Prix,51.0,87.0,19,1.0,1.0,1.0,9,0,15.0
4,VER,Red Bull Racing,Monaco Grand Prix,102.0,170.0,6,0.8,0.8,1.0,9,1,34.0


In [77]:
# Separate the features from the target.
# Pandas-generated "Unnamed: N" columns look like a row index saved by an earlier
# to_csv() call. They are removed here so that row numbers are never fed to the
# model as predictors (every non-categorical column in X is passed through to it).
index_artifact_cols = [c for c in df.columns if str(c).startswith('Unnamed:')]
X = df.drop(columns=['NextRacePosition', *index_artifact_cols])
y = df['NextRacePosition']
# The race name is the grouping key used when splitting into train and test.
groups = df['RaceName']

In [78]:
# Single grouped hold-out split: roughly 20% of the groups go to the test set,
# and all rows that share a group value end up on the same side of the split.
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, test_idx = next(gss.split(X, y, groups=groups))

# Slice features and target with the same positional indices so rows stay aligned.
X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
X_test, y_test = X.iloc[test_idx], y.iloc[test_idx]


In [79]:
# Columns that get one-hot encoded; every other feature column counts as numeric.
cat_cols = ['Driver', 'Constructor', 'RaceName']
num_cols = list(filter(lambda col: col not in cat_cols, X.columns))

In [80]:
# One-hot encode the categorical columns as a dense array; categories not seen
# during fit are ignored instead of raising. All remaining columns are forwarded
# to the model unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), cat_cols),
    ],
    remainder='passthrough',
)

In [81]:
# Gradient-boosted regressor; the seed is fixed so repeated runs are reproducible.
model = HistGradientBoostingRegressor(
    learning_rate=0.1,
    max_depth=6,
    random_state=42,
)

In [82]:

# Chain preprocessing and the regressor so both are fitted and applied together.
pipeline = Pipeline(steps=[('prep', preprocessor), ('model', model)])

In [83]:
# Fit the preprocessing step and the regressor on the training split only.
# As the last expression in the cell, the fitted pipeline is also displayed.
pipeline.fit(X_train, y_train)

0,1,2
,steps,"[('prep', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('cat', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,loss,'squared_error'
,quantile,
,learning_rate,0.1
,max_iter,100
,max_leaf_nodes,31
,max_depth,6
,min_samples_leaf,20
,l2_regularization,0.0
,max_features,1.0
,max_bins,255


In [84]:
# Score the held-out rows and report the mean absolute error of the
# predicted finishing position.
y_pred = pipeline.predict(X_test)
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
print("Mean Absolute Error:", mae)

Mean Absolute Error: 3.775362028182565


In [85]:
def predict_race_positions(race_df, pipeline):
    """Rank the drivers of one race by the model's predicted score.

    Parameters
    ----------
    race_df : pandas.DataFrame
        One row per driver. Must contain a ``Driver`` column plus whatever
        columns ``pipeline.predict`` needs. The caller's frame is not modified.
    pipeline : object
        Any object exposing ``predict(race_df)`` that returns one score per row.

    Returns
    -------
    pandas.DataFrame
        Columns ``Driver``, ``PredictedScore`` and ``PredictedPosition``,
        ordered from lowest to highest score. The lowest score gets position 1,
        and the original row index labels are kept.
    """
    scores = pipeline.predict(race_df)
    # assign() works on a copy, so the input frame is left untouched.
    ranked = race_df.assign(PredictedScore=scores).sort_values(by='PredictedScore')
    # Rows are now in ascending score order, so 1..N is the finishing order.
    ranked['PredictedPosition'] = range(1, len(ranked) + 1)
    return ranked[['Driver', 'PredictedScore', 'PredictedPosition']]


In [86]:
from scipy.stats import kendalltau, spearmanr

# Per-race rank agreement between true and predicted finishing order,
# averaged over the held-out races.
race_correlations = []
for race, grp in X_test.assign(TruePos=y_test).groupby('RaceName'):
    # Score on the feature columns only: the target must never be part of the
    # frame handed to predict(), even if the preprocessor currently ignores it.
    pred = pipeline.predict(grp.drop(columns='TruePos'))
    rho = spearmanr(grp['TruePos'], pred).correlation
    # The coefficient is NaN when it is undefined (e.g. constant input);
    # skip those races so one of them cannot turn the whole average into NaN.
    if pd.notna(rho):
        race_correlations.append(rho)

# Guard the empty case instead of dividing by zero.
avg_spearman = (
    sum(race_correlations) / len(race_correlations)
    if race_correlations
    else float('nan')
)
print("Average Spearman correlation:", avg_spearman)


Average Spearman correlation: 0.5235588972431078


In [87]:
# Load the rows to score for the upcoming race; the bare name displays the frame.
# NOTE(review): this file carries two saved-index columns ("Unnamed: 0.1", "Unnamed: 0")
# and an empty NextRacePosition column — confirm none of them should reach the model.
new_race = pd.read_csv("to_pred.csv")
new_race

Unnamed: 0.1,Unnamed: 0,Driver,Constructor,RaceName,DriverPointsSoFar,ConstructorPointsSoFar,NextRacePosition,DriverWinRateThisSeason,DriverPodiumRate,AverageQualifyingPosition,DriverExperienceYears,DNFCountThisSeason,TeammateComparison
0,0,NOR,McLaren,Singapore Grand Prix,284.0,587.0,,0.294118,0.764706,3.352941,6,2,-19.0
1,1,VER,Red Bull Racing,Singapore Grand Prix,241.0,255.0,,0.235294,0.470588,3.176471,10,1,227.0
2,2,RUS,Mercedes,Singapore Grand Prix,202.0,276.0,,0.058824,0.411765,4.588235,6,1,128.0
3,3,ANT,Mercedes,Singapore Grand Prix,74.0,276.0,,0.0,0.058824,8.882353,0,6,-128.0
4,4,ALB,Williams,Singapore Grand Prix,70.0,98.0,,0.0,0.0,11.470588,6,5,42.0
5,5,STR,Aston Martin,Singapore Grand Prix,28.0,58.0,,0.0,0.0,16.0625,8,7,-2.0
6,6,HUL,Kick Sauber,Singapore Grand Prix,37.0,55.0,,0.0,0.058824,16.0,15,8,19.0
7,7,LEC,Ferrari,Singapore Grand Prix,156.0,263.0,,0.0,0.294118,5.411765,7,2,49.0
8,8,PIA,McLaren,Singapore Grand Prix,303.0,587.0,,0.411765,0.823529,2.529412,2,1,19.0
9,9,HAM,Ferrari,Singapore Grand Prix,107.0,263.0,,0.0,0.0,8.0,18,3,-49.0


In [88]:
# Predict one raw score per row and store it on the frame as a new column.
pred_scores = pipeline.predict(new_race)
new_race['PredictedScore'] = pred_scores

In [89]:
# Order the drivers from lowest to highest predicted score.
new_race = new_race.sort_values('PredictedScore', ascending=True)

In [90]:
# Number the rows 1..N in their current order (relies on the frame having been
# sorted by PredictedScore beforehand).
new_race['PredictedPosition'] = range(1, new_race.shape[0] + 1)

print(new_race.loc[:, ['Driver', 'PredictedScore', 'PredictedPosition']])

   Driver  PredictedScore  PredictedPosition
7     LEC        0.740167                  1
2     RUS        0.926422                  2
1     VER        2.711397                  3
8     PIA        4.681949                  4
9     HAM        6.433532                  5
0     NOR        7.168698                  6
3     ANT       10.069704                  7
19    HAD       10.070885                  8
14    LAW       12.023494                  9
13    BEA       12.104816                 10
10    GAS       12.768864                 11
15    BOR       13.230425                 12
16    ALO       13.357619                 13
6     HUL       13.485118                 14
11    TSU       13.985752                 15
12    OCO       14.569917                 16
5     STR       14.666100                 17
18    DOO       15.069761                 18
20    COL       15.188542                 19
17    SAI       15.613979                 20
4     ALB       17.358473                 21
