In [1]:
import pandas as pd
results_file_name = 'results.csv'
results = pd.read_csv(results_file_name)

races_file_name = 'races.csv'
races = pd.read_csv(races_file_name)

results = results.join(races.set_index('race_id'), on = 'race_id')
results = results.loc[results.rider != 'no result']

results['rider_id'] = results.rider.astype('category').cat.codes
results['race'] = results.race_id.astype('category').cat.codes
rider_lookup = results[['rider', 'rider_id']].drop_duplicates().set_index('rider_id')
race_lookup = results[['name', 'race']].drop_duplicates().set_index('race')

results['cq_points'] = results['cq_points'] / max(results['cq_points'])

results

Unnamed: 0,rank,rider,team,time,cq_points,race_id,start,end,cat,name,rider_id,race
0,1.0,LONARDI Giovanni,EOK,"4h23'11""",0.053571,39288,23/01/2022,,1.2,GP Valencia,858,0
1,2.0,CAPIOT Amaury,ARK,,0.035714,39288,23/01/2022,,1.2,GP Valencia,234,0
2,3.0,LAWLESS Christopher,TEN,,0.028571,39288,23/01/2022,,1.2,GP Valencia,824,0
3,4.0,JAIME FERNANDEZ Alex,EKP,"02""",0.025000,39288,23/01/2022,,1.2,GP Valencia,686,0
4,5.0,SOTO GUIRAO Antonio Jesus,EUS,"02""",0.021429,39288,23/01/2022,,1.2,GP Valencia,1334,0
...,...,...,...,...,...,...,...,...,...,...,...,...
4922,16.0,ALMAMMARI Faisal,-,"06'35""",0.000000,40839,23/12/2022,,1.2,"Arab Road Cycling Championships (Kalba, Sharjah)",31,186
4923,17.0,ALMARHOON Hani,-,"07'19""",0.000000,40839,23/12/2022,,1.2,"Arab Road Cycling Championships (Kalba, Sharjah)",34,186
4924,18.0,ALRAHBI Said,-,"07'19""",0.000000,40839,23/12/2022,,1.2,"Arab Road Cycling Championships (Kalba, Sharjah)",38,186
4925,19.0,ALRIKABI Ahmed,-,"08'45""",0.000000,40839,23/12/2022,,1.2,"Arab Road Cycling Championships (Kalba, Sharjah)",39,186


In [8]:
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.keras.layers import Flatten, Embedding, Dot, ReLU
from tensorflow.keras import Input, Model
from tensorflow.keras.activations import relu

rider = Input(shape = (1,), dtype = 'int32', name = 'rider')
race = Input(shape = (1,), dtype = 'int32', name = 'race')

EMBEDDING_DIMENSION = 5
rider_encoder = ReLU()(Flatten()(
    Embedding(input_dim = max(results.rider_id) + 1, output_dim = EMBEDDING_DIMENSION, name = 'rider_encoder')(rider)))
race_encoder = ReLU()(Flatten()(
    Embedding(input_dim = max(results.race) + 1, output_dim = EMBEDDING_DIMENSION, name = 'race_encoder')(race)))

dot_product = Dot(axes = (1, 1), name = 'dot_product')([rider_encoder, race_encoder])
output = relu(dot_product)

model = Model([rider, race], output)

model.compile(optimizer = "adam", loss = "mean_squared_error")

history = model.fit([results.rider_id, results.race], results.cq_points, epochs=100, verbose = 0)
results['pred'] = model.predict([results.rider_id, results.race]).round(4)
results.sort_values('pred', ascending = False).head(15)

ModuleNotFoundError: No module named 'tensorflow.keras.layers.experimental.preprocessing'

In [10]:
dir(tensorflow.keras.layers)

['AbstractRNNCell',
 'Activation',
 'ActivityRegularization',
 'Add',
 'AdditiveAttention',
 'AlphaDropout',
 'Attention',
 'Average',
 'AveragePooling1D',
 'AveragePooling2D',
 'AveragePooling3D',
 'AvgPool1D',
 'AvgPool2D',
 'AvgPool3D',
 'BatchNormalization',
 'Bidirectional',
 'Concatenate',
 'Conv1D',
 'Conv2D',
 'Conv2DTranspose',
 'Conv3D',
 'Conv3DTranspose',
 'ConvLSTM2D',
 'Convolution1D',
 'Convolution2D',
 'Convolution2DTranspose',
 'Convolution3D',
 'Convolution3DTranspose',
 'Cropping1D',
 'Cropping2D',
 'Cropping3D',
 'CuDNNGRU',
 'CuDNNLSTM',
 'Dense',
 'DenseFeatures',
 'DepthwiseConv2D',
 'Dot',
 'Dropout',
 'ELU',
 'Embedding',
 'Flatten',
 'GRU',
 'GRUCell',
 'GaussianDropout',
 'GaussianNoise',
 'GlobalAveragePooling1D',
 'GlobalAveragePooling2D',
 'GlobalAveragePooling3D',
 'GlobalAvgPool1D',
 'GlobalAvgPool2D',
 'GlobalAvgPool3D',
 'GlobalMaxPool1D',
 'GlobalMaxPool2D',
 'GlobalMaxPool3D',
 'GlobalMaxPooling1D',
 'GlobalMaxPooling2D',
 'GlobalMaxPooling3D',
 'Inp

In [None]:
import matplotlib.pyplot as plt

plt.plot(history.history['loss'])

In [None]:
results.query('race == 173').sort_values('pred', ascending = False)

In [None]:
from sklearn.metrics import pairwise_distances

rider_similarities = pairwise_distances(model.get_layer('rider_encoder').get_weights()[0], metric = 'cosine')
rider_similarities = (
    pd.DataFrame(rider_similarities)
    .stack()
    .reset_index()
    .rename(columns = {'level_0': 'rider_1', 'level_1': 'rider_2', 0: 'similarity'})
    .query('rider_1 < rider_2')
    .join(rider_lookup, on = 'rider_1')
    .rename(columns = {'rider': 'rider_1'})
    .join(rider_lookup, on = 'rider_2')
    .rename(columns = {'rider': 'rider_2'})
)
rider_similarities.sort_values('similarity', ascending = False).head(10)

In [None]:
from sklearn.metrics import pairwise_distances

race_similarities = pairwise_distances(model.get_layer('race_encoder').get_weights()[0], metric = 'cosine')
race_similarities = (
    pd.DataFrame(race_similarities)
    .stack()
    .reset_index()
    .rename(columns = {'level_0': 'race_1', 'level_1': 'race_2', 0: 'similarity'})
    .query('race_1 < race_2')
    .join(race_lookup, on = 'race_1')
    .rename(columns = {'name': 'race_1'})
    .join(race_lookup, on = 'race_2')
    .rename(columns = {'name': 'race_2'})
)
race_similarities.sort_values('similarity', ascending = False).head(10)

In [None]:
model.get_layer('rider_encoder').weights[0]

Next steps:
- Stages/stage races
- Add preprocessing as layers
- Nonnegative constraint for embeddings - causes error...
- Test set evaluation (by race, window vs random)
- Unobserved rider/race (e.g. predict Il Lombardia on all riders)
- Multiple random initializations