In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import chardet
import re
import warnings
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import Sequential
import warnings
warnings.filterwarnings("ignore", message="The structure of `inputs` doesn't match the expected structure")
from tensorflow.keras.layers import Dense, LSTM, Dropout, Embedding, Concatenate, Input, RepeatVector
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.optimizers import Adam
from sklearn.preprocessing import MinMaxScaler

pd.options.mode.chained_assignment = None

In [2]:
df_exploded = pd.read_csv('data_output/df_exploded_full.csv')

In [3]:
df_exploded.shape

(1679050, 9)

# Modelo RNN: Prediccion del siguiente golpe del jugador

El objetivo del modelo es tomar como input una seuencia de golpes y un jugador y predecir el siguiente golpe que este jugador dara. 

In [4]:
#Conditional column: where the fourth item in the 'Sequence' column contains 's' new column return 's', when 'b' return 'b', when 'f' return 'f', and when other return 'o'

df_exploded_test = df_exploded[df_exploded['Sequence'].str.split().apply(len) > 3]
df_exploded_test['cuarto'] = df_exploded_test['Sequence'].str.split().str[3].apply(
    lambda x: x[0] if isinstance(x, str) and x[0] in ['s', 'b', 'f'] else 'o'
)
df_exploded_test['cuarto'].value_counts(normalize=True)

f    0.439085
b    0.344230
o    0.119495
s    0.097190
Name: cuarto, dtype: float64

In [61]:
jugador_elegido = "Roger Federer"

df_exploded = df_exploded.dropna(subset=['Sequence'])

df_exploded_filtered = df_exploded[df_exploded['Sequence'].str.split().apply(len) > 3]
# Filter rows where the fourth item in the 'Sequence' column contains 's', 'b', or 'f'
df_exploded_filtered = df_exploded_filtered[
    df_exploded_filtered['Sequence'].str.split().str[3].str.contains(r'^[sbf]')
]

print(f"Total de filas: {df_exploded_filtered.shape[0]}")

Total de filas: 261967


In [63]:
df_exploded_filtered['Sequence'].str.split().str[3].str[0].value_counts(normalize=True)

f    0.498673
b    0.390946
s    0.110380
Name: Sequence, dtype: float64

In [64]:
df_exploded_filtered['X'] = df_exploded_filtered['Sequence'].str.split().str[:3]
df_exploded_filtered['y'] = df_exploded_filtered['Sequence'].str.split().str[3].str[0]

df_exploded_filtered['y'].value_counts(normalize=True)

f    0.498673
b    0.390946
s    0.110380
Name: y, dtype: float64

# Resampling
El mejor desempeño se dio con el siguiente resampling:

In [65]:
#resample df_exploded_filtered so that f = 40% and s = 30% and b = 30%
from sklearn.utils import resample

# Separar las clases
f_class = df_exploded_filtered[df_exploded_filtered['y'] == 'f']
b_class = df_exploded_filtered[df_exploded_filtered['y'] == 'b']
s_class = df_exploded_filtered[df_exploded_filtered['y'] == 's']

# Número total de muestras objetivo
total_samples = len(df_exploded_filtered)

# Calcular el número de muestras para cada clase según las proporciones deseadas
f_target = int(total_samples * 0.33)
b_target = int(total_samples * 0.33)
s_target = int(total_samples * 0.33)

# Aplicar resampling (sobremuestreo o submuestreo)
f_resampled = resample(f_class, replace=True, n_samples=f_target, random_state=42)
b_resampled = resample(b_class, replace=True, n_samples=b_target, random_state=42)
s_resampled = resample(s_class, replace=True, n_samples=s_target, random_state=42)

# Combinar las clases resampleadas
df_resampled = pd.concat([f_resampled, b_resampled, s_resampled])

# Barajar el dataset para mezclar las clases
df_resampled = df_resampled.sample(frac=1, random_state=42).reset_index(drop=True)

df_exploded_filtered = df_resampled.copy() 

In [66]:
df_exploded_filtered[['Player', 'X', 'y']].head()

Unnamed: 0,Player,X,y
0,Andy Roddick,"[b39, s3, b3]",f
1,Mats Wilander,"[4, f3, b3]",s
2,Mats Wilander,"[6, b3, s3]",s
3,Pablo Andujar,"[4, b3, f1]",f
4,Yevgeny Kafelnikov,"[b29, f1, b2]",b


In [67]:
df_exploded_filtered['y'].value_counts(normalize = True)

f    0.333333
s    0.333333
b    0.333333
Name: y, dtype: float64

In [68]:
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

n_steps = 3

X = df_exploded_filtered['X']
players = df_exploded_filtered['Player']
y = df_exploded_filtered['y']

# Tokenizador para las secuencias
tokenizer = Tokenizer()
tokenizer.fit_on_texts([' '.join(seq) for seq in X])
X_encoded = tokenizer.texts_to_sequences([' '.join(seq) for seq in X])
X_padded = tf.keras.preprocessing.sequence.pad_sequences(X_encoded, maxlen=n_steps)

# Codificar jugadores
player_encoder = LabelEncoder()
players_encoded = player_encoder.fit_transform(players)

# Codificar etiquetas (y)
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
y_categorical = to_categorical(y_encoded)


In [69]:
vocab_size = len(tokenizer.word_index) + 1
embedding_dim = 50

# Aumentar la capacidad del modelo
seq_input = Input(shape=(n_steps,), name='sequence_input')
embedding = Embedding(input_dim=vocab_size, output_dim=100)(seq_input)  # Embeddings (vectorizaciones de las secuencias)
lstm = LSTM(128, return_sequences=True)(embedding)  # Red LSTM, unidades y capas adicionales
lstm = LSTM(128)(lstm)
dropout = Dropout(0.5)(lstm)

player_input = Input(shape=(1,), name='player_input')
player_embedding = Embedding(input_dim=len(player_encoder.classes_), output_dim=10)(player_input)  # Embeddings (vectorizacions de jugadores)
player_flattened = tf.keras.layers.Flatten()(player_embedding)

combined = Concatenate()([dropout, player_flattened])
output = Dense(y_categorical.shape[1], activation='softmax', name='output')(combined)

model = Model(inputs=[seq_input, player_input], outputs=output)
model.compile(optimizer=Adam(learning_rate=0.001), loss='categorical_crossentropy', metrics=['accuracy'])

In [70]:
from tensorflow.keras.callbacks import EarlyStopping

# Early stopping para evitar overfitting
early_stopping = EarlyStopping(
    monitor='val_loss',  # Métrica que se monitorea
    patience=10,          # Número de épocas sin mejora antes de detener
    restore_best_weights=True  # Restaurar los mejores pesos
)

# Dividir en conjuntos de entrenamiento y prueba
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test, players_train, players_test = train_test_split(
    X_padded, y_categorical, players_encoded, test_size=0.2, random_state=42
)

# Entrenar el modelo
history = model.fit(
    [X_train, players_train],
    y_train,
    validation_data=([X_test, players_test], y_test),
    epochs=50,
    batch_size=64,
   # callbacks=[early_stopping]
)


Epoch 1/50




[1m3242/3242[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 4ms/step - accuracy: 0.4558 - loss: 1.0393 - val_accuracy: 0.4814 - val_loss: 1.0148
Epoch 2/50
[1m3242/3242[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 4ms/step - accuracy: 0.4847 - loss: 1.0128 - val_accuracy: 0.4814 - val_loss: 1.0136
Epoch 3/50
[1m3242/3242[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 4ms/step - accuracy: 0.4889 - loss: 1.0079 - val_accuracy: 0.4814 - val_loss: 1.0137
Epoch 4/50
[1m3242/3242[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 4ms/step - accuracy: 0.4866 - loss: 1.0097 - val_accuracy: 0.4836 - val_loss: 1.0131
Epoch 5/50
[1m3242/3242[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 4ms/step - accuracy: 0.4902 - loss: 1.0065 - val_accuracy: 0.4838 - val_loss: 1.0129
Epoch 6/50
[1m3242/3242[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 4ms/step - accuracy: 0.4893 - loss: 1.0054 - val_accuracy: 0.4837 - val_loss: 1.0117
Epoch 7/50
[1m3242/3

In [71]:
# Predecir el próximo golpe
def predict_next_shot(sequence, player):
    # Preprocesar entrada
    seq_encoded = tokenizer.texts_to_sequences([sequence])
    seq_padded = tf.keras.preprocessing.sequence.pad_sequences(seq_encoded, maxlen=n_steps)
    player_encoded = player_encoder.transform([player])
    
    # Hacer predicción
    prediction = model.predict([seq_padded, player_encoded])[0]  # Vector de probabilidades para la fila
    
    # Mapear índices a las clases
    class_probabilities = {label_encoder.inverse_transform([i])[0]: prob for i, prob in enumerate(prediction)}
    return class_probabilities

# Evaluar en el conjunto de prueba
loss, accuracy = model.evaluate([X_test, players_test], y_test)
print(f"Test Loss: {loss}, Test Accuracy: {accuracy}")

[1m1621/1621[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - accuracy: 0.4983 - loss: 1.0186
Test Loss: 1.0187658071517944, Test Accuracy: 0.498650461435318


In [72]:
from sklearn.metrics import classification_report, f1_score, precision_score, recall_score

# Calcular y mostrar métricas
y_pred = model.predict([X_test, players_test])
y_pred_classes = y_pred.argmax(axis=1)
y_true_classes = y_test.argmax(axis=1)

precision = precision_score(y_true_classes, y_pred_classes, average='weighted')
recall = recall_score(y_true_classes, y_pred_classes, average='weighted')
f1 = f1_score(y_true_classes, y_pred_classes, average='weighted')

print(f"Test Loss: {loss:.2f}")
print(f"Test Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1-Score: {f1:.2f}")


[1m1621/1621[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step
Test Loss: 1.02
Test Accuracy: 0.50
Precision: 0.50
Recall: 0.50
F1-Score: 0.50


In [77]:
secuencia = ['b1', 'b3', 'f3']

print(predict_next_shot(secuencia, "Roger Federer"))
print(predict_next_shot(secuencia, "Novak Djokovic"))
print(predict_next_shot(secuencia, "Rafael Nadal"))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
{'b': 0.2917914, 'f': 0.19745621, 's': 0.5107524}
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
{'b': 0.4685634, 'f': 0.2544115, 's': 0.2770251}
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
{'b': 0.40689754, 'f': 0.23035334, 's': 0.36274916}


In [74]:
X.sample(3)

84456     [b28, f1, b2]
225732     [b8, f3, b2]
75590       [4, f2, s3]
Name: X, dtype: object

In [75]:
X.iloc[163614]

['b27', 'b2', 'f3']

El modelo podría tener mejor accuracy, pero tiene un buen desempeño dando predicciones coherentes con los estilos de juego de los jugadores. Por ejemplo, Federer de Reves a una mano tiene mas propension a dar un slice que otros jugadores y esto es reflejado en las predicciones del modelo. 
También una sucesión de golpes simples al medio son seguidos por altas probabilidades de un reves, lo cual tiene sentido porque la mayoría de los rivales prefieren forzar que el jugador de un reves.