In [1]:
import pandas as pd
from tensorflow import keras, constant
from sklearn.neighbors import NearestNeighbors
import numpy as np
from sklearn.metrics import pairwise_distances
from sklearn.cluster import KMeans

2023-03-01 01:00:41.973127: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Read the raw results and derive the two helper columns in one chained step.
results_file_name = 'pcs_results.csv'
results = pd.read_csv(results_file_name, index_col=0).assign(
    # Each row's PCS points as a fraction of the best score in the same race,
    # so the top scorer of every race gets 1.0.
    points=lambda frame: frame.pcs_points
    / frame.groupby('race_url').pcs_points.transform('max'),
    # Rows whose race_url has exactly two '/' (e.g. race/milano-sanremo/2016)
    # are flagged as one-day races.
    one_day=lambda frame: frame['race_url'].str.count('/') == 2,
)
# Keep only the rows flagged as one-day races.
results = results[results['one_day']]

races = pd.read_csv('world_tour_races.csv')
riders = pd.read_csv('pcs_riders.csv', index_col=0)
# Both merges use pandas' default join keys, i.e. whichever column names the
# two frames have in common.
results = results.merge(races).merge(riders)
results

Unnamed: 0,rider_url,team_url,rank,status,age,time,pcs_points,uci_points,race_url,points,one_day,race_name,year,stage_race,rider_name,nationality
0,arnaud-demare,fdj-2016,1.0,DF,24,6:54:45,275,500.0,race/milano-sanremo/2016,1.000000,True,milano-sanremo,2016,False,DÉMARE Arnaud,FR
1,arnaud-demare,fdj-2016,5.0,DF,24,5:55:23,80,225.0,race/gent-wevelgem/2016,0.355556,True,gent-wevelgem,2016,False,DÉMARE Arnaud,FR
2,arnaud-demare,fdj-2016,,DNF,24,,0,0.0,race/ronde-van-vlaanderen/2016,0.000000,True,ronde-van-vlaanderen,2016,False,DÉMARE Arnaud,FR
3,arnaud-demare,fdj-2016,34.0,DF,24,4:54:45,5,8.0,race/cyclassics-hamburg/2016,0.022222,True,cyclassics-hamburg,2016,False,DÉMARE Arnaud,FR
4,arnaud-demare,fdj-2016,,DNF,25,,0,0.0,race/bretagne-classic/2016,0.000000,True,bretagne-classic,2016,False,DÉMARE Arnaud,FR
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20153,santiago-umba,drone-hopper-androni-giocattoli-2022,88.0,DF,19,6:39:45,5,0.0,race/il-lombardia/2022,0.018182,True,il-lombardia,2022,False,UMBA Santiago,CO
20154,natnael-tesfatsion,drone-hopper-androni-giocattoli-2022,,DNF,23,,0,0.0,race/il-lombardia/2022,0.000000,True,il-lombardia,2022,False,TESFATSION Natnael,ER
20155,sean-quinn,ef-education-easypost-2022,,DNF,22,,0,0.0,race/il-lombardia/2022,0.000000,True,il-lombardia,2022,False,QUINN Sean,US
20156,sergio-martin,caja-rural-seguros-rga-2022,,DNF,25,,0,0.0,race/il-lombardia/2022,0.000000,True,il-lombardia,2022,False,MARTÍN Sergio Roman,ES


In [3]:
# Restore the Keras model saved under 'wt_oneday_direct_embeddings'.
model = keras.models.load_model('wt_oneday_direct_embeddings')

# Score every (rider, race) row of `results`; the model takes the rider names
# first and the race names second.
model_inputs = [results.rider_name, results.race_name]
results['pred'] = model.predict(model_inputs).round(4)

# Show the 15 rows with the highest prediction next to the actual outcome.
leaderboard_columns = ['rider_name', 'race_name', 'year', 'rank', 'points', 'pred']
results.sort_values('pred', ascending=False)[leaderboard_columns].head(15)



Unnamed: 0,rider_name,race_name,year,rank,points,pred
18173,EVENEPOEL Remco,san-sebastian,2022,1.0,1.0,1.0044
18166,EVENEPOEL Remco,san-sebastian,2019,1.0,1.0,1.0044
4774,ARNDT Nikias,great-ocean-race,2017,1.0,1.0,0.8349
16326,VAN AERT Wout,bretagne-classic,2022,1.0,1.0,0.8129
17976,VAN DER POEL Mathieu,ronde-van-vlaanderen,2021,2.0,0.727273,0.8094
17970,VAN DER POEL Mathieu,ronde-van-vlaanderen,2020,1.0,1.0,0.8094
17980,VAN DER POEL Mathieu,ronde-van-vlaanderen,2022,1.0,1.0,0.8094
17963,VAN DER POEL Mathieu,ronde-van-vlaanderen,2019,4.0,0.436364,0.8094
12611,GROENEWEGEN Dylan,oxyclean-classic-brugge-de-panne,2019,1.0,1.0,0.7941
12614,GROENEWEGEN Dylan,oxyclean-classic-brugge-de-panne,2022,2.0,0.666667,0.7941


In [4]:
# Integer index -> rider name, taken from the model's own vocabulary layer.
rider_dict = dict(enumerate(model.get_layer('rider_name_to_int').get_vocabulary()))
# Every vocabulary entry except index 0 (presumably the out-of-vocabulary
# token - confirm against the layer configuration).
all_rider_names = np.array([rider_dict[position] for position in range(1, len(rider_dict))])

# Sub-model that maps the rider-name input straight to the 'rider_vector' layer output.
rider_encoder = keras.Model(
    inputs=model.input[0],
    outputs=model.get_layer('rider_vector').output,
)

def rider_name_to_vector(rider_name):
    """Return the ``rider_encoder`` output for one rider name or several.

    Parameters
    ----------
    rider_name : str, bytes, or iterable of str
        A single name, or a list / tuple / array / Series of names.

    Returns
    -------
    The tensor produced by ``rider_encoder``, with one row per supplied name.
    """
    if isinstance(rider_name, (str, bytes)):
        # A bare name becomes a batch of one.
        rider_name = [rider_name]
    elif not isinstance(rider_name, list):
        # Tuples, arrays and Series used to be wrapped in an extra list, which
        # produced a wrongly nested batch; flatten them to a plain list instead.
        rider_name = list(rider_name)
    return rider_encoder(constant(rider_name))
    
# Demonstrate both call styles: a single name and a list of names.
# NOTE(review): 'VAN Aert' does not follow the 'SURNAME Firstname' spelling used
# elsewhere in this notebook (e.g. 'VAN AERT Wout'), so it presumably resolves
# to the out-of-vocabulary embedding - confirm whether that is intended.
(
    rider_name_to_vector('EVENEPOEL Remco'),
    rider_name_to_vector(['EVENEPOEL Remco', 'VAN Aert']),
)

(<tf.Tensor: shape=(1, 10), dtype=float32, numpy=
 array([[-0.62507236, -0.21986185, -0.45205355,  0.6254721 ,  0.20994172,
         -0.13009325, -0.33821023,  0.23531958,  0.24564373, -0.17589954]],
       dtype=float32)>,
 <tf.Tensor: shape=(2, 10), dtype=float32, numpy=
 array([[-0.62507236, -0.21986185, -0.45205355,  0.6254721 ,  0.20994172,
         -0.13009325, -0.33821023,  0.23531958,  0.24564373, -0.17589954],
        [ 0.03581891,  0.01626286, -0.01412892,  0.04499895, -0.04219389,
          0.01523323,  0.04704999, -0.03320879,  0.00237893,  0.01307596]],
       dtype=float32)>)

## Similar Riders

In [5]:
# Nearest-neighbour index over the embeddings of every name in `all_rider_names`.
# Two neighbours are requested because the lookup below expects the closest
# hit to be the queried rider itself.
rider_nn = NearestNeighbors(n_neighbors=2).fit(rider_encoder(all_rider_names).numpy())

def rider_int_to_name(rider_int):
    """Look up the rider name stored in ``rider_dict`` under ``rider_int``.

    Raises ``KeyError`` when ``rider_int`` is not a key of ``rider_dict``.
    """
    name = rider_dict[rider_int]
    return name
    
def get_nearest_rider(rider_name = 'EVENEPOEL Remco'):
    """Return the name of the rider whose embedding is closest to ``rider_name``.

    Parameters
    ----------
    rider_name : str or list of str
        Rider to look up. When a list is given only its first entry is used,
        as before.

    Returns
    -------
    str
        The closest rider name that is not the queried rider.
    """
    embedding = rider_name_to_vector(rider_name)
    _, indices = rider_nn.kneighbors(embedding)
    query_name = rider_name if isinstance(rider_name, str) else rider_name[0]

    # Do not assume the first neighbour is the query itself: an unknown name is
    # not part of the fitted set, and ties can put the query in second place.
    for row in indices[0]:
        # +1 because `rider_nn` was fitted on `all_rider_names`, which skips
        # vocabulary entry 0, so fitted row i is vocabulary index i + 1.
        candidate = rider_int_to_name(row + 1)
        if candidate != query_name:
            return candidate
    # Every neighbour carried the query's own name; fall back to the farthest one.
    return rider_int_to_name(indices[0][-1] + 1)

# Riders whose nearest embedding neighbour we want to spot-check.
riders_to_check = [
    'EVENEPOEL Remco', 'POGAČAR Tadej', 'VAN AERT Wout',
    'VAN DER POEL Mathieu', 'SAGAN Peter', 'ALAPHILIPPE Julian',
    'BENNETT Sam', 'NIBALI Vincenzo', 'GILBERT Philippe',
]

# Map each rider to the name returned by get_nearest_rider.
dict(zip(riders_to_check, map(get_nearest_rider, riders_to_check)))

{'EVENEPOEL Remco': 'RODRÍGUEZ Joaquim',
 'POGAČAR Tadej': 'PINOT Thibaut',
 'VAN AERT Wout': 'NAESEN Oliver',
 'VAN DER POEL Mathieu': 'CANCELLARA Fabian',
 'SAGAN Peter': 'GIRMAY Biniam',
 'ALAPHILIPPE Julian': 'VALVERDE Alejandro',
 'BENNETT Sam': 'PHILIPSEN Jasper',
 'NIBALI Vincenzo': 'URÁN Rigoberto',
 'GILBERT Philippe': 'VAN BAARLE Dylan'}

## Race Prediction

In [6]:
# Pair every rider in `all_rider_names` with the same race and score them all.
race_column = np.array(['paris-roubaix'] * len(all_rider_names))
preds = model.predict([all_rider_names, race_column])

# Rank riders by predicted score and show the top 15.
ranking = pd.DataFrame({'rider': all_rider_names, 'pred': preds.flatten()})
ranking.sort_values('pred', ascending=False).head(15)



Unnamed: 0,rider,pred
1148,VAN DER POEL Mathieu,0.379775
359,HAYMAN Mathew,0.360545
4,VAN AVERMAET Greg,0.338408
58,MATTHEWS Michael,0.331758
1242,VERMEERSCH Florian,0.327866
11,SAGAN Peter,0.326582
54,BOONEN Tom,0.323533
8,COLBRELLI Sonny,0.307826
433,GILBERT Philippe,0.288705
23,VANMARCKE Sep,0.287994


## Similar Races

In [7]:
# Integer index -> race name, taken from the model's race vocabulary layer.
race_dict = dict(enumerate(model.get_layer('race_name_to_int').get_vocabulary()))

# First weight array of the 'race_encoder' layer; its row numbers are
# translated to race names through `race_dict` below.
race_vectors = model.get_layer('race_encoder').get_weights()[0]
distance_matrix = pairwise_distances(race_vectors)

# Matrix -> long format with one row per (race_1, race_2) pair.
# Despite its name, the 'similarity' column holds distances, so smaller
# values mean the two races are closer.
pair_columns = {'level_0': 'race_1', 'level_1': 'race_2', 0: 'similarity'}
all_pairs = pd.DataFrame(distance_matrix).stack().reset_index().rename(columns=pair_columns)

# Keep each unordered pair once and drop self-pairs while the ids are still integers.
unique_pairs = all_pairs.query('race_1 < race_2')
named_pairs = unique_pairs.assign(
    race_1=unique_pairs.race_1.replace(race_dict),
    race_2=unique_pairs.race_2.replace(race_dict),
)
# Drop pairs that involve the '[UNK]' vocabulary entry.
race_similarities = named_pairs.query('race_1 != "[UNK]" and race_2 != "[UNK]"')
race_similarities.sort_values('similarity', ascending=True).head(10)

Unnamed: 0,race_1,race_2,similarity
282,ride-london-classic,eschborn-frankfurt,0.486484
53,gent-wevelgem,gp-quebec,0.61098
76,ronde-van-vlaanderen,gp-montreal,0.674315
24,milano-sanremo,gent-wevelgem,0.694923
50,gent-wevelgem,omloop-het-nieuwsblad,0.734315
80,ronde-van-vlaanderen,amstel-gold-race,0.738469
77,ronde-van-vlaanderen,e3-harelbeke,0.759207
73,ronde-van-vlaanderen,dwars-door-vlaanderen,0.766796
140,omloop-het-nieuwsblad,paris-roubaix,0.770224
327,amstel-gold-race,strade-bianche,0.793467


## Clustering Riders

In [8]:
# Embeddings for every name in `all_rider_names`, in the same order.
rider_embeddings = rider_encoder(all_rider_names).numpy()
# n_init is pinned to 10, the library default when this notebook was run.
# scikit-learn 1.4 changed the default to 'auto', which would alter the
# resulting clusters despite the fixed random_state.
kmeans = KMeans(n_clusters=5, n_init=10, random_state=0).fit(rider_embeddings)
# One row per rider with the cluster label assigned by KMeans.
rider_clusters = pd.DataFrame({'rider': all_rider_names, 'cluster': kmeans.labels_})
rider_clusters



Unnamed: 0,rider,cluster
0,DÉMARE Arnaud,3
1,SWIFT Ben,0
2,ROELANDTS Jürgen,1
3,BOUHANNI Nacer,3
4,VAN AVERMAET Greg,2
...,...,...
1557,UMBA Santiago,0
1558,TESFATSION Natnael,0
1559,QUINN Sean,0
1560,MARTÍN Sergio Roman,0


In [9]:
# Number of riders assigned to each cluster, largest first.
rider_clusters['cluster'].value_counts()

0    1301
1     180
4      36
3      29
2      16
Name: cluster, dtype: int64

In [10]:
# Distinct rider names in each cluster.
rider_clusters.groupby('cluster')['rider'].unique()

cluster
0    [SWIFT Ben, POZZATO Filippo, MONTAGUTI Matteo,...
1    [ROELANDTS Jürgen, HAUSSLER Heinrich, COLBRELL...
2    [VAN AVERMAET Greg, SAGAN Peter, VALVERDE Alej...
3    [DÉMARE Arnaud, BOUHANNI Nacer, TRENTIN Matteo...
4    [KRISTOFF Alexander, VANMARCKE Sep, NIZZOLO Gi...
Name: rider, dtype: object