In [1]:
import pandas as pd
import numpy as np

# Tunable data-preparation constants.
MAX_ROWS = 100000        # only the first MAX_ROWS rows of the CSV are kept
MIN_RIDER_POINTS = 25    # minimum total PCS points for a rider to be kept in the vocabulary

# Load the PCS WorldTour results (first CSV column is used as the index) and truncate.
results = pd.read_csv("https://raw.githubusercontent.com/baronet2/Bike2Vec/main/data/pcs_worldtour_results.csv", index_col = 0).head(MAX_ROWS)

# Normalise each result by the largest pcs_points value in the same race, so the
# top scorer of every race gets points == 1.0.
race_max_points = results.groupby('race_url')['pcs_points'].transform('max')
# A race whose maximum is 0 (e.g. one cut short by the row truncation above) would give
# 0 / 0 = NaN, and a NaN target makes the MSE loss NaN. Divide by 1 in that case so
# those rows get 0.0; every other race is divided by its maximum as before.
results['points'] = results['pcs_points'] / race_max_points.where(race_max_points != 0, 1)

# Identify riders with MIN_RIDER_POINTS or more PCS points.
# The ascending sort fixes the order of riders_to_keep, which is used as the
# StringLookup vocabulary later in the notebook.
points_per_rider = results.groupby('rider_url')['pcs_points'].sum().sort_values()
riders_to_keep = points_per_rider[points_per_rider >= MIN_RIDER_POINTS].index.values

results

Unnamed: 0,rider_url,team_url,rank,status,age,time,pcs_points,uci_points,race_url,profile_icon,profile_score,points
0,arnaud-demare,fdj-2016,1.0,DF,24,6:54:45,275,500.0,race/milano-sanremo/2016,p2,64.0,1.000000
1,ben-swift,team-sky-2016,2.0,DF,28,6:54:45,200,400.0,race/milano-sanremo/2016,p2,64.0,0.727273
2,jurgen-roelandts,lotto-soudal-2016,3.0,DF,30,6:54:45,150,325.0,race/milano-sanremo/2016,p2,64.0,0.545455
3,nacer-bouhanni,cofidis-solutions-credits-2016,4.0,DF,25,6:54:45,120,275.0,race/milano-sanremo/2016,p2,64.0,0.436364
4,greg-van-avermaet,bmc-racing-team-2016,5.0,DF,30,6:54:45,100,225.0,race/milano-sanremo/2016,p2,64.0,0.363636
...,...,...,...,...,...,...,...,...,...,...,...,...
38,marco-haller,team-katusha-alpecin-2019,39.0,DF,28,3:51:46,0,0.0,race/giro-d-italia/2019/stage-12,p4,131.0,0.000000
39,christian-knees,team-ineos-2019,40.0,DF,38,3:51:55,0,0.0,race/giro-d-italia/2019/stage-12,p4,131.0,0.000000
40,jasha-sutterlin,movistar-team-2019,41.0,DF,26,3:51:55,0,0.0,race/giro-d-italia/2019/stage-12,p4,131.0,0.000000
41,jenthe-biermans,team-katusha-alpecin-2019,42.0,DF,23,3:52:16,0,0.0,race/giro-d-italia/2019/stage-12,p4,131.0,0.000000


In [4]:
from tensorflow.keras.layers import StringLookup, Embedding, Flatten, ReLU, Dot
from tensorflow.keras import Input, Model
from tensorflow.keras.utils import set_random_seed
from tensorflow.keras.activations import sigmoid
from tensorflow.keras import regularizers

# Seed the global random state before any layer is created.
set_random_seed(42)

# Hyperparameters.
K = 2                               # output dimension of both embedding layers
rider_regularization_lambda = 0.1   # L2 factor on the rider embedding matrix
race_regularization_lambda = 0.1    # L2 factor on the race embedding matrix


def _build_embedding_tower(input_name, vocabulary, lookup_name, encoder_name, vector_name, l2_lambda):
    """Build one string-input -> lookup -> embedding -> flatten branch.

    Args:
        input_name: name given to the string Input.
        vocabulary: strings handed to StringLookup as its vocabulary.
        lookup_name: name given to the StringLookup layer.
        encoder_name: name given to the Embedding layer.
        vector_name: name given to the Flatten layer.
        l2_lambda: factor passed to regularizers.L2 for the embedding weights.

    Returns:
        Tuple (input tensor, StringLookup layer, looked-up integer tensor,
        embedding output, flattened embedding output).
    """
    entity_input = Input(shape = (1,), dtype = 'string', name = input_name)
    lookup = StringLookup(vocabulary = vocabulary, name = lookup_name)
    entity_ints = lookup(entity_input)
    encoder = Embedding(
        lookup.vocabulary_size(),
        K,
        name = encoder_name,
        embeddings_regularizer = regularizers.L2(l2_lambda),
    )
    entity_vector = encoder(entity_ints)
    entity_vector_flat = Flatten(name = vector_name)(entity_vector)
    return entity_input, lookup, entity_ints, entity_vector, entity_vector_flat


# Rider branch: vocabulary is the set of riders selected in the data-preparation cell.
(riders, rider_name_to_int, rider_ints, rider_vector, rider_vector_flat) = _build_embedding_tower(
    'rider',
    riders_to_keep,
    'rider_name_to_int',
    'rider_encoder',
    'rider_vector',
    rider_regularization_lambda,
)

# Race branch: vocabulary is every distinct race_url present in results.
(races, race_id_to_int, race_ints, race_vector, race_vector_flat) = _build_embedding_tower(
    'race',
    results['race_url'].unique(),
    'race_id_to_int',
    'race_encoder',
    'race_vector',
    race_regularization_lambda,
)

# Prediction = sigmoid of the dot product between the rider and race vectors.
dot_product = Dot(axes = (1, 1), name = 'dot_product')([rider_vector_flat, race_vector_flat])
outputs = sigmoid(dot_product)

model = Model(inputs = [riders, races], outputs = outputs)
model.summary()

In [5]:
# Train with the "sgd" optimizer (default settings) and mean-squared-error loss.
model.compile(loss = "mse", optimizer = "sgd")

# Inputs are listed in the same order as the model's inputs: rider first, then race.
training_inputs = [results.rider_url, results.race_url]
# Target is the per-race normalised points column.
training_targets = results.points

# NOTE(review): the recorded log shows the loss unchanged (0.2371) between epochs 2
# and 3 - possibly the L2 penalties dominate the fit; worth confirming.
history = model.fit(training_inputs, training_targets, epochs = 3, verbose = 1)

Epoch 1/3
[1m3125/3125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 1ms/step - loss: 0.2997
Epoch 2/3
[1m3125/3125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 871us/step - loss: 0.2371
Epoch 3/3
[1m3125/3125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 892us/step - loss: 0.2371
