In [1]:
import pandas as pd
import numpy as np

# Load the per-rider results; the first CSV column becomes the index.
results_file_name = '../data/pcs_worldtour_results.csv'
results = pd.read_csv(results_file_name, index_col = 0)

# Express each result's PCS points as a fraction of the highest PCS points
# recorded under the same race_url.
max_points_per_url = results.groupby('race_url')['pcs_points'].transform('max')
results['points'] = results['pcs_points'] / max_points_per_url

# Flag URLs containing exactly two slashes; the parsing further down treats
# these as one-day races.
results['one_day'] = results['race_url'].str.count('/').eq(2)

def extract_stage_number(race_url):
    """Derive a numeric stage index from a result URL.

    Returns the integer that follows ``'stage-'`` when that marker is present,
    ``0`` when the URL contains ``'prologue'`` instead, and ``-1`` otherwise
    (treated as a one-day race).
    """
    if 'stage-' in race_url:
        # Element 1 of the split is the text following the first 'stage-' marker.
        url_parts = race_url.split('stage-')
        return int(url_parts[1])
    # No stage marker: a prologue counts as stage 0, anything else as one-day.
    return 0 if 'prologue' in race_url else -1

# Record the stage number first, then strip the stage/prologue suffix so that
# race_url no longer contains the stage part.
results['stage'] = results['race_url'].map(extract_stage_number)
results['race_url'] = results['race_url'].str.replace('/stage-.*|/prologue', '', regex = True)

# Attach race and rider metadata. Both joins rely on pandas' default merge
# keys, i.e. the columns the two frames have in common.
races = pd.read_csv('../data/pcs_worldtour_races.csv')
riders = pd.read_csv('../data/pcs_worldtour_riders.csv', index_col = 0)
results = results.merge(races).merge(riders)

# One-day races are identified by race name alone; every other row gets
# "<race_url without 'race/'>/<stage>".
stage_race_id = results['race_url'].str.replace('race/', '', regex = False) + '/' + results['stage'].astype(str)
results['race_id'] = np.where(results['one_day'], results['race_name'], stage_race_id)

# Discard rows with a missing value in any column, then display the frame.
results = results.dropna()
results

Unnamed: 0,rider_url,team_url,rank,status,age,time,pcs_points,uci_points,race_url,points,one_day,stage,race_name,year,stage_race,rider_name,nationality,race_id
0,arnaud-demare,fdj-2016,1.0,DF,24,6:54:45,275,500.0,race/milano-sanremo/2016,1.000000,True,-1,milano-sanremo,2016,False,DÉMARE Arnaud,FR,milano-sanremo
1,arnaud-demare,fdj-2016,5.0,DF,24,5:55:23,80,225.0,race/gent-wevelgem/2016,0.355556,True,-1,gent-wevelgem,2016,False,DÉMARE Arnaud,FR,gent-wevelgem
3,arnaud-demare,fdj-2016,34.0,DF,24,4:54:45,5,8.0,race/cyclassics-hamburg/2016,0.022222,True,-1,cyclassics-hamburg,2016,False,DÉMARE Arnaud,FR,cyclassics-hamburg
5,arnaud-demare,fdj-2017,20.0,DF,25,4:57:50,15,20.0,race/omloop-het-nieuwsblad/2017,0.066667,True,-1,omloop-het-nieuwsblad,2017,False,DÉMARE Arnaud,FR,omloop-het-nieuwsblad
6,arnaud-demare,fdj-2017,6.0,DF,25,7:08:44,90,175.0,race/milano-sanremo/2017,0.327273,True,-1,milano-sanremo,2017,False,DÉMARE Arnaud,FR,milano-sanremo
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
170302,vojtech-repa,equipo-kern-pharma-2022,57.0,DF,22,3:52:47,0,0.0,race/vuelta-a-espana/2022,0.000000,False,17,vuelta-a-espana,2022,True,ŘEPA Vojtěch,CZ,vuelta-a-espana/2022/17
170303,vojtech-repa,equipo-kern-pharma-2022,78.0,DF,22,5:13:41,0,0.0,race/vuelta-a-espana/2022,0.000000,False,18,vuelta-a-espana,2022,True,ŘEPA Vojtěch,CZ,vuelta-a-espana/2022/18
170304,vojtech-repa,equipo-kern-pharma-2022,121.0,DF,22,3:34:39,0,0.0,race/vuelta-a-espana/2022,0.000000,False,19,vuelta-a-espana,2022,True,ŘEPA Vojtěch,CZ,vuelta-a-espana/2022/19
170305,vojtech-repa,equipo-kern-pharma-2022,110.0,DF,22,5:13:32,0,0.0,race/vuelta-a-espana/2022,0.000000,False,20,vuelta-a-espana,2022,True,ŘEPA Vojtěch,CZ,vuelta-a-espana/2022/20


In [2]:
from tensorflow.keras.layers import StringLookup, Embedding, Flatten, ReLU, Dot
from tensorflow.keras import Input, Model
from tensorflow.keras.utils import set_random_seed

set_random_seed(42)
K = 10  # length of each rider / race embedding vector

# The Input tensors are named `rider_input` / `race_input` so that they do not
# overwrite the `riders` and `races` DataFrames loaded in the first cell.

# Rider branch: rider name -> integer id -> K-dimensional embedding -> flat vector.
rider_input = Input(shape = (1,), dtype = 'string', name = 'rider')
rider_name_to_int = StringLookup(vocabulary = results['rider_name'].unique(), name = 'rider_name_to_int')
rider_ints = rider_name_to_int(rider_input)
rider_vector = Embedding(rider_name_to_int.vocabulary_size(), K, name = 'rider_encoder')(rider_ints)
rider_vector_flat = Flatten(name = 'rider_vector')(rider_vector)

# Race branch: same structure, keyed on race_id.
race_input = Input(shape = (1,), dtype = 'string', name = 'race')
race_id_to_int = StringLookup(vocabulary = results['race_id'].unique(), name = 'race_id_to_int')
race_ints = race_id_to_int(race_input)
race_vector = Embedding(race_id_to_int.vocabulary_size(), K, name = 'race_encoder')(race_ints)
race_vector_flat = Flatten(name = 'race_vector')(race_vector)

# The prediction is the dot product of the two embeddings, passed through a
# ReLU so the output is never negative.
dot_product = Dot(axes = (1, 1), name = 'dot_product')([rider_vector_flat, race_vector_flat])
outputs = ReLU()(dot_product)

# Input order here ([rider, race]) is the order the training data must follow.
model = Model([rider_input, race_input], outputs)
model.summary()

2023-04-16 22:42:14.186044: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 rider (InputLayer)             [(None, 1)]          0           []                               
                                                                                                  
 race (InputLayer)              [(None, 1)]          0           []                               
                                                                                                  
 rider_name_to_int (StringLooku  (None, 1)           0           ['rider[0][0]']                  
 p)                                                                                               
                                                                                                  
 race_id_to_int (StringLookup)  (None, 1)            0           ['race[0][0]']               

In [3]:
# Fit with Adam on the mean squared error between the model output and the
# `points` column. Inputs are passed in [rider, race] order.
model.compile(loss = "mean_squared_error", optimizer = "adam")
history = model.fit(
    x = [results['rider_name'], results['race_id']],
    y = results['points'],
    epochs = 70,
    verbose = 1,
)

Epoch 1/70
Epoch 2/70
Epoch 3/70
Epoch 4/70
Epoch 5/70
Epoch 6/70
Epoch 7/70
Epoch 8/70
Epoch 9/70
Epoch 10/70
Epoch 11/70
Epoch 12/70
Epoch 13/70
Epoch 14/70
Epoch 15/70
Epoch 16/70
Epoch 17/70
Epoch 18/70
Epoch 19/70
Epoch 20/70
Epoch 21/70
Epoch 22/70
Epoch 23/70
Epoch 24/70
Epoch 25/70
Epoch 26/70
Epoch 27/70
Epoch 28/70
Epoch 29/70
Epoch 30/70
Epoch 31/70
Epoch 32/70
Epoch 33/70
Epoch 34/70
Epoch 35/70
Epoch 36/70
Epoch 37/70
Epoch 38/70
Epoch 39/70
Epoch 40/70
Epoch 41/70
Epoch 42/70
Epoch 43/70
Epoch 44/70
Epoch 45/70
Epoch 46/70
Epoch 47/70
Epoch 48/70
Epoch 49/70
Epoch 50/70
Epoch 51/70
Epoch 52/70
Epoch 53/70
Epoch 54/70
Epoch 55/70
Epoch 56/70
Epoch 57/70
Epoch 58/70
Epoch 59/70
Epoch 60/70
Epoch 61/70
Epoch 62/70
Epoch 63/70
Epoch 64/70
Epoch 65/70
Epoch 66/70
Epoch 67/70
Epoch 68/70
Epoch 69/70
Epoch 70/70


In [4]:
# Write the model to disk under the models directory.
model_path = '../models/pcs_worldtour_direct_embeddings'
model.save(model_path)

INFO:tensorflow:Assets written to: ../models/pcs_worldtour_direct_embeddings/assets
