In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import chardet
import re
import warnings
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout, Embedding, Concatenate, Input
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.optimizers import Adam

In [2]:
df = pd.read_csv('data_output/charting_clean.csv', low_memory=False)

In [3]:
df.head()

Unnamed: 0,match_id,Pt,Set1,Set2,Gm1,Gm2,Pts,Gm#,TbSet,TB?,...,2nd_test,rallyCountVerification,Player_svr,Player_ret,1st_player_svr,1st_player_ret,2nd_player_svr,2nd_player_ret,1st_final,2nd_final
0,20230908-M-US_Open-SF-Novak_Djokovic_-Ben_Shelton,1,0,0,0.0,0.0,0-0,1 (1),1,0,...,,1,Novak Djokovic,Ben Shelton,6,f2n#,,,6 f2n#,
1,20230908-M-US_Open-SF-Novak_Djokovic_-Ben_Shelton,2,0,0,0.0,0.0,15-0,1 (2),1,0,...,6 b19 f1 b2 s1 f3 f2 j2 *,8,Novak Djokovic,Ben Shelton,4n,,6 f1 s1 f2,b19 b2 f3 j2*,4n,6 b19 f1 b2 s1 f3 f2 j2*
2,20230908-M-US_Open-SF-Novak_Djokovic_-Ben_Shelton,3,0,0,0.0,0.0,15-15,1 (3),1,0,...,4 b28 f2 o1 *,4,Novak Djokovic,Ben Shelton,4d,,4 f2,b28 o1*,4d,4 b28 f2 o1*
3,20230908-M-US_Open-SF-Novak_Djokovic_-Ben_Shelton,4,0,0,0.0,0.0,15-30,1 (4),1,0,...,,3,Novak Djokovic,Ben Shelton,6 f3*,s28,,,6 s28 f3*,
4,20230908-M-US_Open-SF-Novak_Djokovic_-Ben_Shelton,5,0,0,0.0,0.0,30-30,1 (5),1,0,...,,3,Novak Djokovic,Ben Shelton,4 b3*,b37,,,4 b37 b3*,


In [4]:
# Select relevant columns
point_winning_data = df[['1stIn', '2ndIn', '1st_final', '2nd_final', 'Pt', 'Set1', 'Set2', 'Gm1', 'Gm2', 'TbSet', 'TB?', 'TBpt', 'Svr', 'Ret', 'isAce', 'isUnret', 'isRallyWinner', 'isForced', 'isUnforced', 'isDouble', 'PtWinner', 'isSvrWinner', 'rallyCount']]

# Replace values in '1st_final' with '2nd_final' if '1stIn' == 0
point_winning_data['1st_final'] = np.where(point_winning_data['1stIn'] == 0, point_winning_data['2nd_final'], point_winning_data['1st_final'])

# Drop rows where '2ndIn' is 0
point_winning_data = point_winning_data[point_winning_data['2ndIn'] != 0]

# Display the first few rows of the DataFrame
point_winning_data.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  point_winning_data['1st_final'] = np.where(point_winning_data['1stIn'] == 0, point_winning_data['2nd_final'], point_winning_data['1st_final'])


Unnamed: 0,1stIn,2ndIn,1st_final,2nd_final,Pt,Set1,Set2,Gm1,Gm2,TbSet,...,Ret,isAce,isUnret,isRallyWinner,isForced,isUnforced,isDouble,PtWinner,isSvrWinner,rallyCount
0,1,,6 f2n#,,1,0,0,0.0,0.0,1,...,2,False,False,False,True,False,False,1,1,1
1,0,1.0,6 b19 f1 b2 s1 f3 f2 j2*,6 b19 f1 b2 s1 f3 f2 j2*,2,0,0,0.0,0.0,1,...,2,False,False,True,False,False,False,2,0,8
2,0,1.0,4 b28 f2 o1*,4 b28 f2 o1*,3,0,0,0.0,0.0,1,...,2,False,False,True,False,False,False,2,0,4
3,1,,6 s28 f3*,,4,0,0,0.0,0.0,1,...,2,False,False,True,False,False,False,1,1,3
4,1,,4 b37 b3*,,5,0,0,0.0,0.0,1,...,2,False,False,True,False,False,False,1,1,3


In [5]:
#drop na if na in 1st_final
point_winning_data = point_winning_data.dropna(subset=['1st_final'])

In [8]:
point_winning_data.to_csv('data_output/full_sequence.csv', index=False)

In [116]:
point_winning_data = point_winning_data.sample(frac=0.5, random_state=42)

In [117]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(point_winning_data['1st_final'])
word_index = tokenizer.word_index
vocab_size = len(word_index) + 1

# Convertir las secuencias en listas de tokens
tokenized_sequences = []
for seq in point_winning_data['1st_final']:
    tokens = seq.split()
    tokenized_sequence = tokenizer.texts_to_sequences([tokens])
    tokenized_sequence = [item for sublist in tokenized_sequence for item in sublist]
    tokenized_sequences.append(tokenized_sequence)

# Rellenar secuencias para que todas tengan la misma longitud
max_sequence_length = 10  # Máximo de 5 golpes para unificar el largo de las secuencias
X_seq = pad_sequences(tokenized_sequences, maxlen=max_sequence_length, padding='post')


In [118]:
point_winning_data

Unnamed: 0,1stIn,2ndIn,1st_final,2nd_final,Pt,Set1,Set2,Gm1,Gm2,TbSet,...,Ret,isAce,isUnret,isRallyWinner,isForced,isUnforced,isDouble,PtWinner,isSvrWinner,rallyCount
592200,0,1.0,5 b38 s2n@,5 b38 s2n@,149,2,0,0.0,1.0,1,...,2,False,False,False,False,True,False,2,0,2
484980,1,,6 f2n#,,101,1,0,5.0,4.0,1,...,2,False,False,False,True,False,False,1,1,1
411659,1,,4 b38 b2 f1 f1 f3n@,,109,1,0,0.0,0.0,1,...,1,False,False,False,False,True,False,2,1,5
570119,0,1.0,5 b28 b3w@,5 b28 b3w@,136,2,0,4.0,5.0,1,...,2,False,False,False,False,True,False,2,0,2
159868,1,,6 b29 b2 b2 f2 b2 f2 b2 f19w@,,198,2,1,0.0,0.0,1,...,1,False,False,False,False,True,False,1,0,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
179267,1,,4 q28 f19w@,,67,1,0,1.0,1.0,1,...,1,False,False,False,False,True,False,1,0,2
171718,0,1.0,5 f17 f1 f1 f1 f2 f38*,5 f17 f1 f1 f1 f2 f38*,133,1,1,2.0,1.0,1,...,1,False,False,True,False,False,False,2,1,7
621610,1,,6*,,228,1,2,3.0,2.0,1,...,1,True,False,False,False,False,False,2,1,1
162217,0,1.0,5 b3d@,5 b3d@,49,1,0,0.0,0.0,1,...,2,False,False,False,False,True,False,1,1,1


In [119]:
point_winning_data.dtypes

1stIn              int64
2ndIn            float64
1st_final         object
2nd_final         object
Pt                 int64
Set1               int64
Set2               int64
Gm1              float64
Gm2              float64
TbSet             object
TB?               object
TBpt             float64
Svr                int64
Ret                int64
isAce               bool
isUnret             bool
isRallyWinner       bool
isForced            bool
isUnforced          bool
isDouble            bool
PtWinner           int64
isSvrWinner        int64
rallyCount         int64
dtype: object

In [120]:
point_winning_data['TbSet'] = point_winning_data['TbSet'].replace('T',2).astype(float)
point_winning_data['TB?'] = point_winning_data['TbSet'].replace('S',2).astype(float)
point_winning_data.dtypes

1stIn              int64
2ndIn            float64
1st_final         object
2nd_final         object
Pt                 int64
Set1               int64
Set2               int64
Gm1              float64
Gm2              float64
TbSet            float64
TB?              float64
TBpt             float64
Svr                int64
Ret                int64
isAce               bool
isUnret             bool
isRallyWinner       bool
isForced            bool
isUnforced          bool
isDouble            bool
PtWinner           int64
isSvrWinner        int64
rallyCount         int64
dtype: object

In [121]:
point_winning_data.head()

Unnamed: 0,1stIn,2ndIn,1st_final,2nd_final,Pt,Set1,Set2,Gm1,Gm2,TbSet,...,Ret,isAce,isUnret,isRallyWinner,isForced,isUnforced,isDouble,PtWinner,isSvrWinner,rallyCount
592200,0,1.0,5 b38 s2n@,5 b38 s2n@,149,2,0,0.0,1.0,1.0,...,2,False,False,False,False,True,False,2,0,2
484980,1,,6 f2n#,,101,1,0,5.0,4.0,1.0,...,2,False,False,False,True,False,False,1,1,1
411659,1,,4 b38 b2 f1 f1 f3n@,,109,1,0,0.0,0.0,1.0,...,1,False,False,False,False,True,False,2,1,5
570119,0,1.0,5 b28 b3w@,5 b28 b3w@,136,2,0,4.0,5.0,1.0,...,2,False,False,False,False,True,False,2,0,2
159868,1,,6 b29 b2 b2 f2 b2 f2 b2 f19w@,,198,2,1,0.0,0.0,1.0,...,1,False,False,False,False,True,False,1,0,8


In [122]:
from sklearn.preprocessing import StandardScaler

X_context_features = point_winning_data[['Pt', 'Set1', 'Set2', 'Gm1', 'Gm2', 'TbSet', 'TB?', 'TBpt', 'Svr', 'Ret', 'isAce', 'isUnret', 'isRallyWinner', 'isForced', 'isUnforced', 'isDouble', 'PtWinner', 'isSvrWinner', 'rallyCount']]

scaler = StandardScaler()
context_features = scaler.fit_transform(X_context_features)

print(f"X_seq shape: {X_seq.shape}, context_features shape: {X_context_features.shape}")

X_seq shape: (356652, 10), context_features shape: (356652, 19)


In [123]:
# definir las etiquetas y, 0 si fue winner (*), 1 si fue error forzado (#), 2 si fue error no forzado(@)
point_winning_data.head()
#Si ultimo caracter de 1st_final es *, entonces 0, si es # entonces 1, si es @ entonces 2
y_type = []
for i in point_winning_data['1st_final']:
    if i[-1] == '*':
        y_type.append(0)
    elif i[-1] == '#':
        y_type.append(1)
    else:
        y_type.append(2)
        
print(f'X_seq shape: {X_seq.shape}, context_features shape: {X_context_features.shape}, y_type shape: {len(y_type)}')

X_seq shape: (356652, 10), context_features shape: (356652, 19), y_type shape: 356652


In [140]:
seq_input = Input(shape=(max_sequence_length,))
context_input = Input(shape=(X_context_features.shape[1],))

# Procesamiento secuencial con Embedding y LSTM
embedding_layer = Embedding(input_dim=vocab_size, output_dim=64, input_length=max_sequence_length)(seq_input)
lstm_layer = LSTM(64)(embedding_layer)

# Procesamiento de características contextuales con capas densas
dense_context = Dense(32, activation='relu')(context_input)

# Combinar ambas entradas
merged = Concatenate()([lstm_layer, dense_context])
dropout_layer = Dropout(0.2)(merged)
output = Dense(3, activation='softmax')(dropout_layer)  # 3 clases: ganador, error forzado, error no forzado

# Definir y compilar el modelo
model = Model(inputs=[seq_input, context_input], outputs=output)
model.compile(optimizer=Adam(learning_rate= 0.0001), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Mostrar resumen del modelo
model.summary()

# Etiquetas de entrenamiento (por simplicidad, generamos algunas etiquetas para el ejemplo)
y_type = np.array(y_type)  # Ejemplo: 0 - ganador, 1 - error forzado, 2 - error no forzado

# Entrenar el modelo
history = model.fit(
    [X_seq, X_context_features], y_type,
    epochs=10,  # Reducido para una prueba rápida
    batch_size=64,
    validation_split=0.2
)




Epoch 1/10




[1m1905/4459[0m [32m━━━━━━━━[0m[37m━━━━━━━━━━━━[0m [1m7s[0m 3ms/step - accuracy: 0.4459 - loss: 1.0198

KeyboardInterrupt: 

In [139]:
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout, BatchNormalization
from keras.optimizers import Adam
from keras.regularizers import l2

model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=128, input_length=max_sequence_length))
model.add(LSTM(128, return_sequences=True))
model.add(Dropout(0.3))
model.add(LSTM(64, return_sequences=False))
model.add(BatchNormalization())
model.add(Dense(32, activation='relu', kernel_regularizer=l2(0.01)))
model.add(Dropout(0.3))
model.add(Dense(3, activation='softmax'))

# Compilar el modelo con una tasa de aprendizaje más baja
model.compile(optimizer=Adam(learning_rate=0.0001), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Entrenar el modelo
history = model.fit(
    [X_seq, X_context_features], y_type,
    epochs=10,  # Puedes ajustar según los resultados observados
    batch_size=32,
    validation_split=0.2
)


Epoch 1/10




AttributeError: Exception encountered when calling Embedding.call().

[1m'tuple' object has no attribute 'dtype'[0m

Arguments received by Embedding.call():
  • inputs=('tf.Tensor(shape=(None, 10), dtype=int32)', 'tf.Tensor(shape=(None, 19), dtype=float32)')

# Prediccion

In [125]:
# Secuencia y características contextuales de ejemplo
new_sequence = "6 f2 f1 f3 o1 s2"  # Ejemplo de secuencia de golpes
new_sequence_tokenized = tokenizer.texts_to_sequences([new_sequence.split()])
new_sequence_padded = pad_sequences(new_sequence_tokenized, maxlen=max_sequence_length, padding='post')

# Nuevas características contextuales (tomando la primera muestra del contexto de prueba)
test_context_features = context_features[0]  # X_context contiene las características normalizadas del conjunto de datos original

# Transformar la nueva característica a una matriz con una fila
test_context_features = test_context_features.reshape(1, -1)

# Hacer la predicción
predicted_probabilities = model.predict([new_sequence_padded, test_context_features])
predicted_class = np.argmax(predicted_probabilities)

# Interpretar la predicción
ending_types = {0: "Winner (*)", 1: "Forced Error (#)", 2: "Unforced Error (@)"}
print(f"Predicción: {ending_types[predicted_class]}")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 126ms/step
Predicción: Winner (*)


In [128]:
# Secuencia y características contextuales de ejemplo
new_sequence = ("s2")  # Ejemplo de secuencia de golpes
new_sequence_tokenized = tokenizer.texts_to_sequences([new_sequence.split()])
new_sequence_padded = pad_sequences(new_sequence_tokenized, maxlen=max_sequence_length, padding='post')

# Nuevas características contextuales (tomando la primera muestra del contexto de prueba)
test_context_features = context_features[42]  # X_context contiene las características normalizadas del conjunto de datos original

# Transformar la nueva característica a una matriz con una fila
test_context_features = test_context_features.reshape(1, -1)

# Hacer la predicción
predicted_probabilities = model.predict([new_sequence_padded, test_context_features])[0]

# Interpretar la predicción y mostrar las probabilidades
ending_types = {0: "Winner (*)", 1: "Forced Error (#)", 2: "Unforced Error (@)"}
print("Probabilidades de cada clase:")
for i, probability in enumerate(predicted_probabilities):
    print(f"{ending_types[i]}: {probability:.4f}")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
Probabilidades de cada clase:
Winner (*): 0.7506
Forced Error (#): 0.0078
Unforced Error (@): 0.2416


In [146]:
point_winning_data.sample(10)

Unnamed: 0,1stIn,2ndIn,1st_final,2nd_final,Pt,Set1,Set2,Gm1,Gm2,TbSet,...,Ret,isAce,isUnret,isRallyWinner,isForced,isUnforced,isDouble,PtWinner,isSvrWinner,rallyCount
347564,1,,4 b28 b2 b2 f1 f1 f2 b2 b2 f3 b3w@,,41,0,0,4.0,2.0,1.0,...,2,False,False,False,False,True,False,2,0,10
411510,1,,4 b37 b1*,,120,0,1,4.0,5.0,1.0,...,2,False,False,True,False,False,False,1,1,3
599782,0,1.0,4 b38 b3 b3 f2d@,4 b38 b3 b3 f2d@,94,1,0,0.0,1.0,1.0,...,2,False,False,False,False,True,False,2,0,4
305525,0,1.0,6 b27 f1*,6 b27 f1*,8,0,0,1.0,0.0,1.0,...,1,False,False,True,False,False,False,2,1,3
510677,0,1.0,5 b29 f1 f1 f1 f3w@,5 b29 f1 f1 f1 f3w@,7,0,0,1.0,0.0,1.0,...,1,False,False,False,False,True,False,2,1,5
55514,0,1.0,6 b28 f3 b2 f1 f2 f3 b3 f1 f1 f1 f3 s3w#,6 b28 f3 b2 f1 f2 f3 b3 f1 f1 f1 f3 s3w#,196,1,2,0.0,0.0,1.0,...,1,False,False,False,True,False,False,1,0,12
691941,0,1.0,6 b28 v2 z1n@,6 b28 v2 z1n@,183,1,1,1.0,5.0,1.0,...,2,False,False,False,False,True,False,1,1,3
615807,0,1.0,4 b37 b3 f1 f1d@,4 b37 b3 f1 f1d@,22,0,0,1.0,2.0,1.0,...,1,False,False,False,False,True,False,1,0,4
79689,0,1.0,5 b28 f3 b2 f1n@,5 b28 f3 b2 f1n@,121,1,0,5.0,5.0,1.0,...,2,False,False,False,False,True,False,2,0,4
663762,0,1.0,5 f27 z2n@,5 f27 z2n@,318,2,2,1.0,2.0,0.0,...,1,False,False,False,False,True,False,1,0,2
