In [1]:
import pandas as pd
from tensorflow import keras, constant
from sklearn.neighbors import NearestNeighbors
import numpy as np
from sklearn.metrics import pairwise_distances
from sklearn.cluster import KMeans

2023-02-23 21:22:37.823724: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
results_file_name = 'pcs_youth_results.csv'
results = pd.read_csv(results_file_name).query('Year >= 2013')
results['points'] = results['Pnt'] / max(results['Pnt'])

results

Unnamed: 0,Rnk,Rider,Age,Team,UCI,Pnt,Time,Race,Year,points
0,1,KOOIJ Olav,20.0,Jumbo-Visma,20,30,3:27:45,zlm-tour,2022,0.200000
1,2,SALBY Alexander,24.0,Riwal Cycling Team,10,18,",,0:00",zlm-tour,2022,0.120000
2,3,WELSFORD Sam,26.0,Team DSM,5,12,",,0:00",zlm-tour,2022,0.080000
3,4,DUPONT Timothy,34.0,Bingoal Pauwels Sauces WB,0,7,",,0:00",zlm-tour,2022,0.046667
4,5,MARECZKO Jakub,28.0,Alpecin-Fenix,0,4,",,0:00",zlm-tour,2022,0.026667
...,...,...,...,...,...,...,...,...,...,...
64187,DNF,INGELAERE Nicolas,17.0,,0,0,-,bernaudeau-junior,2013,0.000000
64188,DNF,ARDOUIN Sebastien,16.0,,0,0,-,bernaudeau-junior,2013,0.000000
64189,DNF,ALLAIRE Corentin,16.0,,0,0,-,bernaudeau-junior,2013,0.000000
64190,DNF,GAREL Adrien,17.0,,0,0,-,bernaudeau-junior,2013,0.000000


In [3]:
model = keras.models.load_model('model_direct_embeddings')
results['pred'] = model.predict([results.Rider, results.Race]).round(4)
results.sort_values('pred', ascending = False).head(15)



Unnamed: 0,Rnk,Rider,Age,Team,UCI,Pnt,Time,Race,Year,points,pred
57286,1,VAN DER POEL Mathieu,18.0,,0,150,3:33:14,uci-world-championships-mj,2013,1.0,0.9415
6493,1,HAGENES Per Strand,18.0,Norway,0,150,2:43:48,uci-world-championships-mj,2021,1.0,0.8158
19221,1,EVENEPOEL Remco,18.0,,0,100,0:33:15,uci-world-championships-itt-mj,2018,0.666667,0.641
26488,4,MÄRKL Niklas,18.0,,0,80,",,0:51",uci-world-championships-mj,2017,0.533333,0.6356
34391,2,MÄRKL Niklas,17.0,,0,110,0:070:07,uci-world-championships-mj,2016,0.733333,0.6356
6495,3,MIHKELS Madis,18.0,Estonia,0,90,0:240:24,uci-world-championships-mj,2021,0.6,0.629
26662,DNF,EVENEPOEL Remco,17.0,,0,0,-,uci-world-championships-mj,2017,0.0,0.6262
19062,1,EVENEPOEL Remco,18.0,,0,150,3:03:49,uci-world-championships-mj,2018,1.0,0.6262
42096,3,PEDERSEN Rasmus Lund,17.0,,0,90,0:010:01,uci-world-championships-mj,2015,0.6,0.6005
57288,3,NIKA Iltjan,18.0,,0,90,",,0:03",uci-world-championships-mj,2013,0.6,0.5932


In [4]:
rider_dict = {i: v for i, v in enumerate(model.get_layer('rider_name_to_int').get_vocabulary())} 
all_rider_names = np.array(list(rider_dict.values())[1:])

rider_encoder = keras.Model(model.input[0], model.get_layer('rider_vector_nonneg').output)

def rider_name_to_vector(rider_name):
    if not isinstance(rider_name, list):
        rider_name = [rider_name]
    return rider_encoder(constant(rider_name))
    
rider_name_to_vector('EVENEPOEL Remco'), rider_name_to_vector(['EVENEPOEL Remco', 'VAN Aert'])

(<tf.Tensor: shape=(1, 5), dtype=float32, numpy=
 array([[0.21687263, 0.        , 0.        , 0.31212363, 0.19678833]],
       dtype=float32)>,
 <tf.Tensor: shape=(2, 5), dtype=float32, numpy=
 array([[0.21687263, 0.        , 0.        , 0.31212363, 0.19678833],
        [0.03581891, 0.01626286, 0.        , 0.04499895, 0.        ]],
       dtype=float32)>)

## Similar Riders

In [5]:
rider_nn = NearestNeighbors(n_neighbors = 2)
rider_nn.fit(rider_encoder(all_rider_names).numpy())

def rider_int_to_name(rider_int):
    return rider_dict[rider_int]
    
def get_nearest_rider(rider_name = 'EVENEPOEL Remco'):
    embedding = rider_name_to_vector(rider_name)
    distances, indices = rider_nn.kneighbors(embedding)
    return rider_int_to_name(indices[0][1] + 1)

(get_nearest_rider('EVENEPOEL Remco'),
 get_nearest_rider('POGAČAR Tadej'),
 get_nearest_rider('VAN AERT Wout'),
 get_nearest_rider('CAVENDISH Mark')
)

('VAN MECHELEN Vlad', 'ERŽEN Žak', 'CARNEIRO Rui', 'VAN DE SOMPEL Jori')

## Race Prediction

In [6]:
preds = model.predict(
    [all_rider_names,
     np.array(['tour-de-l-avenir'] * len(all_rider_names))])
pd.DataFrame({'rider': all_rider_names, 'pred': preds.flatten()}).sort_values('pred', ascending = False).head(15)



Unnamed: 0,rider,pred
57,BRAET Vito,0.004653
3736,SCOTT Jared,0.004634
104,HAGENES Per Strand,0.004609
6396,BOKELOH Jonas,0.004582
1728,BROWN Jim,0.004518
848,ERMAKOV Roman,0.004235
3053,CEPEDA Jefferson Alexander,0.004174
115,MILESI Lorenzo,0.004132
13650,ALAPHILIPPE Julian,0.003982
4113,TEUNISSEN Mike,0.003913


## Similar Races

In [7]:
race_dict = {i: v for i, v in enumerate(model.get_layer('race_name_to_int').get_vocabulary())}

race_similarities = pairwise_distances(model.get_layer('race_encoder').get_weights()[0])
race_similarities = (
    pd.DataFrame(race_similarities)
    .stack()
    .reset_index()
    .rename(columns = {'level_0': 'race_1', 'level_1': 'race_2', 0: 'similarity'})
    .query('race_1 < race_2')
    .assign(
        race_1 = lambda x: x.race_1.replace(race_dict),
        race_2 = lambda x: x.race_2.replace(race_dict)
    )
)
race_similarities.sort_values('similarity', ascending = True).head(10)

Unnamed: 0,race_1,race_2,similarity
891,tour-de-l-avenir,paris-arras-tour,0.012635
1240,ronde-de-l-isard,giro-ciclistico-della-valle-d-aosta-mont-blanc,0.023177
831,tour-de-normadie,paris-arras-tour,0.028049
1223,ronde-de-l-isard,olympias-tour,0.030226
383,trofeo-piva,olympias-tour,0.032007
806,tour-de-normadie,le-trophee-centre-morbihan,0.032184
2339,gp-capodarco,la-cote-picarde-nations-cup,0.032788
1550,liege-bastogne-liege-u23,tour-des-pays-de-savoie,0.032985
2511,giro-ciclistico-d-italia,paris-arras-tour,0.033073
1283,piccolo-giro-di-lombardia,olympias-tour,0.034686


## Clustering Riders

In [8]:
rider_embeddings = rider_encoder(all_rider_names).numpy()
kmeans = KMeans(n_clusters = 5, random_state = 0).fit(rider_embeddings)
rider_clusters = pd.DataFrame({'rider': all_rider_names, 'cluster': kmeans.labels_})
rider_clusters



Unnamed: 0,rider,cluster
0,KOOIJ Olav,2
1,SALBY Alexander,2
2,WELSFORD Sam,2
3,DUPONT Timothy,4
4,MARECZKO Jakub,2
...,...,...
15146,DE MARANS Alexandre,2
15147,VALADE Hugo,2
15148,ARDOUIN Sebastien,2
15149,ALLAIRE Corentin,2


In [9]:
rider_clusters.cluster.value_counts()

2    14848
0       98
3       79
1       71
4       55
Name: cluster, dtype: int64

In [10]:
rider_clusters.groupby('cluster').rider.unique()

cluster
0    [BRAET Vito, HAGENES Per Strand, SEGAERT Alec,...
1    [SHEFFIELD Magnus, PENHOËT Paul, DE PRETTO Dav...
2    [KOOIJ Olav, SALBY Alexander, WELSFORD Sam, MA...
3    [FEDOROV Yevgeniy, DINHAM Matthew, GRÉGOIRE Ro...
4    [DUPONT Timothy, MIHKELS Madis, SVRČEK Martin,...
Name: rider, dtype: object