In [1]:
import re

import numpy as np
import pandas as pd

# Load the raw per-rider results; the first CSV column is used as the row index.
results_file_name = '../data/pcs_worldtour_results.csv'
results = pd.read_csv(results_file_name, index_col=0)

# Express each rider's PCS points as a fraction of the highest PCS score
# recorded under the same race_url (one result page per race or stage).
results['points'] = (
    results['pcs_points']
    / results.groupby('race_url')['pcs_points'].transform('max')
)

# A URL with exactly two slashes (e.g. 'race/milano-sanremo/2016') is flagged
# as a one-day race; stage URLs carry an additional path segment.
results['one_day'] = results['race_url'].str.count('/').eq(2)

# First 'stage-<digits>' occurrence in a result URL, e.g. '.../stage-12'.
_STAGE_NUMBER_RE = re.compile(r'stage-(\d+)')


def extract_stage_number(race_url):
    """Return the stage number encoded in a result URL.

    Parameters
    ----------
    race_url : str
        Result URL such as 'race/vuelta-a-espana/2022/stage-17',
        'race/tour-de-france/2016/prologue' or 'race/milano-sanremo/2016'.

    Returns
    -------
    int
        The number following 'stage-' for a stage URL, 0 for a prologue,
        and -1 when the URL has neither (treated as a one-day race).

    Raises
    ------
    ValueError
        If the URL has a '/stage-' segment that is not followed by digits.

    Notes
    -----
    Only the leading digits after 'stage-' are used, so trailing path
    segments or suffixes (e.g. '/stage-3/result') no longer make the int
    conversion fail, and a race slug that merely contains 'stage-' is
    skipped in favour of the real stage segment.
    """
    stage_match = _STAGE_NUMBER_RE.search(race_url)
    if stage_match:
        return int(stage_match.group(1))
    if '/stage-' in race_url:
        # A stage segment is present but carries no number: fail loudly rather
        # than silently classifying the row as a one-day race.
        raise ValueError(f'Cannot parse stage number from {race_url!r}')
    if 'prologue' in race_url:
        return 0
    return -1  # One-day race

# Read the stage number off the full result URL before the URL is shortened.
results['stage'] = results['race_url'].map(extract_stage_number)

# Drop the trailing '/stage-…' or '/prologue' part so that race_url is the same
# for every stage of a race.
results['race_url'] = results['race_url'].str.replace('/stage-.*|/prologue', '', regex=True)

# Lookup tables with race-level and rider-level attributes.
races = pd.read_csv('../data/pcs_worldtour_races.csv')
riders = pd.read_csv('../data/pcs_worldtour_riders.csv', index_col=0)

# Attach race and rider attributes to every result row. No join keys are given,
# so pandas inner-joins on whatever column names the two frames have in common.
# NOTE(review): consider passing `on=` explicitly once the key columns are confirmed.
results = results.merge(races).merge(riders)

# Build the identifier used as the race vocabulary:
#   * one-day races  -> the race name
#   * stage races    -> '<race_url without the leading "race/">/<stage number>'
# The prefix is removed with an anchored pattern so that only the leading
# 'race/' is stripped; an unanchored replace would also eat 'race/' at the end
# of a slug such as 'race/some-race/2022'. `regex=True` is explicit because the
# default of `Series.str.replace` differs between pandas versions.
results['race_id'] = np.where(
    results['one_day'],
    results['race_name'],
    results['race_url'].str.replace(r'^race/', '', regex=True) + '/' + results['stage'].astype(str)
)

# Discard every row that has a missing value in any column.
results = results.dropna()

# Identify riders with 25 or more PCS points (summed over the remaining rows).
# The ascending sort fixes the order of `riders_to_keep`.
points_per_rider = (
    results
    .groupby('rider_name')['pcs_points']
    .sum()
    .sort_values()
)
riders_to_keep = points_per_rider.loc[points_per_rider.ge(25)].index.values

results

Unnamed: 0,rider_url,team_url,rank,status,age,time,pcs_points,uci_points,race_url,profile_icon,profile_score,points,one_day,stage,race_name,year,stage_race,rider_name,nationality,race_id
0,arnaud-demare,fdj-2016,1.0,DF,24,6:54:45,275,500.0,race/milano-sanremo/2016,p2,64.0,1.000000,True,-1,milano-sanremo,2016,False,DÉMARE Arnaud,FR,milano-sanremo
1,arnaud-demare,fdj-2016,5.0,DF,24,5:55:23,80,225.0,race/gent-wevelgem/2016,p2,31.0,0.355556,True,-1,gent-wevelgem,2016,False,DÉMARE Arnaud,FR,gent-wevelgem
3,arnaud-demare,fdj-2016,34.0,DF,24,4:54:45,5,8.0,race/cyclassics-hamburg/2016,p0,33.0,0.022222,True,-1,cyclassics-hamburg,2016,False,DÉMARE Arnaud,FR,cyclassics-hamburg
5,arnaud-demare,fdj-2017,20.0,DF,25,4:57:50,15,20.0,race/omloop-het-nieuwsblad/2017,p2,33.0,0.066667,True,-1,omloop-het-nieuwsblad,2017,False,DÉMARE Arnaud,FR,omloop-het-nieuwsblad
6,arnaud-demare,fdj-2017,6.0,DF,25,7:08:44,90,175.0,race/milano-sanremo/2017,p2,65.0,0.327273,True,-1,milano-sanremo,2017,False,DÉMARE Arnaud,FR,milano-sanremo
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
170120,vojtech-repa,equipo-kern-pharma-2022,57.0,DF,22,3:52:47,0,0.0,race/vuelta-a-espana/2022,p1,161.0,0.000000,False,17,vuelta-a-espana,2022,True,ŘEPA Vojtěch,CZ,vuelta-a-espana/2022/17
170121,vojtech-repa,equipo-kern-pharma-2022,78.0,DF,22,5:13:41,0,0.0,race/vuelta-a-espana/2022,p5,214.0,0.000000,False,18,vuelta-a-espana,2022,True,ŘEPA Vojtěch,CZ,vuelta-a-espana/2022/18
170122,vojtech-repa,equipo-kern-pharma-2022,121.0,DF,22,3:34:39,0,0.0,race/vuelta-a-espana/2022,p4,101.0,0.000000,False,19,vuelta-a-espana,2022,True,ŘEPA Vojtěch,CZ,vuelta-a-espana/2022/19
170123,vojtech-repa,equipo-kern-pharma-2022,110.0,DF,22,5:13:32,0,0.0,race/vuelta-a-espana/2022,p4,216.0,0.000000,False,20,vuelta-a-espana,2022,True,ŘEPA Vojtěch,CZ,vuelta-a-espana/2022/20


In [2]:
from tensorflow.keras.layers import StringLookup, Embedding, Flatten, ReLU, Dot
from tensorflow.keras import Input, Model
from tensorflow.keras.utils import set_random_seed
from tensorflow.keras.activations import sigmoid

# Seed the random number generators before any layer is created.
# NOTE(review): statement order below presumably matters for repeatable weight
# initialisation — keep it stable if reproducibility is required.
set_random_seed(42)
# Size of the embedding vectors learned for both riders and races.
K = 5

# --- Rider branch: rider name (string) -> integer id -> K-dim vector ---------
# NOTE(review): `riders` rebinds the name that held the riders DataFrame loaded
# in the first cell; re-run that cell if the DataFrame is needed again.
riders = Input(shape = (1,), dtype = 'string', name = 'rider')
# Vocabulary is limited to `riders_to_keep`; with the Keras defaults, names
# outside the vocabulary presumably share one out-of-vocabulary index — confirm.
rider_name_to_int = StringLookup(vocabulary = riders_to_keep, name = 'rider_name_to_int')
rider_ints = rider_name_to_int(riders)
rider_vector = Embedding(rider_name_to_int.vocabulary_size(), K, name = 'rider_encoder')(rider_ints)
rider_vector_flat = Flatten(name = 'rider_vector')(rider_vector)

# --- Race branch: race_id (string) -> integer id -> K-dim vector -------------
# NOTE(review): `races` likewise rebinds the races DataFrame from the first cell.
races = Input(shape = (1,), dtype = 'string', name = 'race')
# Every distinct race_id present in `results` gets its own vocabulary entry.
race_id_to_int = StringLookup(vocabulary = results['race_id'].unique(), name = 'race_id_to_int')
race_ints = race_id_to_int(races)
race_vector = Embedding(race_id_to_int.vocabulary_size(), K, name = 'race_encoder')(race_ints)
race_vector_flat = Flatten(name = 'race_vector')(race_vector)

# Score a (rider, race) pair as the dot product of the two flattened vectors,
# passed through a sigmoid.
dot_product = Dot(axes = (1, 1), name = 'dot_product')([rider_vector_flat, race_vector_flat])
outputs = sigmoid(dot_product)

# Two string inputs (rider, race) -> one score per pair.
model = Model([riders, races], outputs)
model.summary()

2023-05-11 22:47:56.762144: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 rider (InputLayer)             [(None, 1)]          0           []                               
                                                                                                  
 race (InputLayer)              [(None, 1)]          0           []                               
                                                                                                  
 rider_name_to_int (StringLooku  (None, 1)           0           ['rider[0][0]']                  
 p)                                                                                               
                                                                                                  
 race_id_to_int (StringLookup)  (None, 1)            0           ['race[0][0]']               

In [3]:
# Train with Adam on binary cross-entropy between the model output and the
# per-race normalised `points` column, using rider name and race id as inputs.
model.compile(loss='bce', optimizer='adam')
history = model.fit(
    x=[results['rider_name'], results['race_id']],
    y=results['points'],
    epochs=100,
    verbose=1,
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 98/100
Epoch 99/100
Epoch 100/100


In [4]:
# Write the trained model to disk so it can be reloaded without retraining.
model.save(filepath='../models/pcs_worldtour_direct_embeddings')

INFO:tensorflow:Assets written to: ../models/pcs_worldtour_direct_embeddings/assets
