In [14]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from keras import layers, Model, Sequential, optimizers
import keras

In [15]:
df = pd.read_csv('data.csv')

In [16]:
base_features = [
    'age',
    'bmi',
    'female',
    'typical',
    'max',
    'sprint',
    'tempo',
    'injury'
]

race_cols = {
    'k5': ('k5_ti', 'k5_di', 'k5_tr'),
    'k10': ('k10_ti', 'k10_di', 'k10_tr'),
    'm5': ('m5_ti', 'm5_di', 'm5_tr'),
    'm10': ('m10_ti', 'm10_di', 'm10_tr'),
    'mh': ('mh_ti', 'mh_di', 'mh_tr'),
    'mf': ('mf_ti', 'mf_di', 'mf_tr')
}

# Create indicator variables for each race
for prefix, (time, _, _) in race_cols.items():
    df[f'{prefix}_ind'] = df[time].notnull().astype(int)

target = 'k10_ti'
df = df[pd.notnull(df[target])]

k10_features = ['k10_di', 'k10_tr']
k5_features = ['k5_ind', 'k5_ti', 'k5_di', 'k5_tr']
mh_features = ['mh_ind', 'mh_ti', 'mh_di', 'mh_tr']
mf_features = ['mf_ind', 'mf_ti', 'mf_di', 'mf_tr']

base_features.extend(k10_features)
base_features.extend(k5_features)
base_features.extend(mh_features)
base_features.extend(mf_features)

In [17]:
X = df[base_features]
y = df[target]

# Check for inf/nan before scaling
print("Pre-scaling checks:")
print("Infinity values in X:", np.any(np.isinf(X)))
print("NaN values in X:", np.any(np.isnan(X)))
print("NaN values in y:", np.any(np.isnan(y)))

# Look at value ranges
print("\nFeature ranges:")
for column in X.columns:
    print(f"{column:15} min: {X[column].min():10.2f} max: {X[column].max():10.2f}")

print("\nTarget range:")
print(f"min: {y.min():10.2f} max: {y.max():10.2f}")

Pre-scaling checks:
Infinity values in X: False
NaN values in X: True
NaN values in y: False

Feature ranges:
age             min:      16.00 max:      74.00
bmi             min:      17.85 max:      47.18
female          min:       0.00 max:       1.00
typical         min:       0.00 max:     120.00
max             min:       0.00 max:     130.00
sprint          min:       0.00 max:       1.00
tempo           min:       0.00 max:       1.00
injury          min:       1.00 max:       3.00
k10_di          min:       1.00 max:       5.00
k10_tr          min:       1.00 max:       4.00
k5_ind          min:       0.00 max:       1.00
k5_ti           min:     864.00 max:    3679.00
k5_di           min:       1.00 max:       5.00
k5_tr           min:       1.00 max:       4.00
mh_ind          min:       0.00 max:       1.00
mh_ti           min:    3890.00 max:   11985.00
mh_di           min:       1.00 max:       5.00
mh_tr           min:       1.00 max:       4.00
mf_ind          min:      

In [24]:
X = df[base_features].copy()
y = df[target].copy()

na_cols = k5_features + mh_features + mf_features
for col in na_cols:
    X[col] = X[col].fillna(X[col].mean())
    # X[col] = X[col].fillna(-1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
X_train = X_train.astype(np.float32)
X_test = X_test.astype(np.float32)
y_train = y_train.astype(np.float32)
y_test = y_test.astype(np.float32)

In [19]:
# Check after scaling
print("\nPost-scaling checks:")
print("Infinity values in X_train:", np.any(np.isinf(X_train)))
print("NaN values in X_train:", np.any(np.isnan(X_train)))
print("Value ranges in X_train:")
print(f"min: {X_train.min():10.2f} max: {X_train.max():10.2f}")


Post-scaling checks:
Infinity values in X_train: False
NaN values in X_train: False
Value ranges in X_train:
min:      -2.26 max:       7.50


In [35]:
input_dim = len(base_features)

model = Sequential([

    layers.Input(shape=(input_dim,)),

    layers.Dense(64, activation='relu'),
    layers.BatchNormalization(),
    layers.Dropout(0.1),

    layers.Dense(48, activation='relu'),
    layers.BatchNormalization(),
    layers.Dropout(0.1),

    layers.Dense(32, activation='relu'),
    layers.BatchNormalization(),
    layers.Dropout(0.05),

    layers.Dense(1)
])

model.compile(
    optimizer=optimizers.Adam(learning_rate=0.001),
    loss='mean_squared_error',
    metrics=['mean_absolute_error']
)

In [36]:
early_stopping = keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True
)

reduce_lr = keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=5,
    verbose=1,
    min_lr=0.00001
)

history = model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=1000,
    batch_size=32,
    callbacks=[early_stopping, reduce_lr],
    verbose=1
)

Epoch 1/1000
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - loss: 9455036.0000 - mean_absolute_error: 3000.6870 - val_loss: 9070083.0000 - val_mean_absolute_error: 2947.3474 - learning_rate: 0.0010
Epoch 2/1000
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 9545968.0000 - mean_absolute_error: 3010.8665 - val_loss: 9068238.0000 - val_mean_absolute_error: 2947.1667 - learning_rate: 0.0010
Epoch 3/1000
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 9462532.0000 - mean_absolute_error: 3005.5671 - val_loss: 9066221.0000 - val_mean_absolute_error: 2947.0396 - learning_rate: 0.0010
Epoch 4/1000
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 9311337.0000 - mean_absolute_error: 2980.8831 - val_loss: 9063867.0000 - val_mean_absolute_error: 2946.9644 - learning_rate: 0.0010
Epoch 5/1000
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 95