In [20]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import chardet
import re
import warnings
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [12]:
df = pd.read_csv('data_output/charting_clean.csv', low_memory=False)

In [14]:
point_winning_data = df[['1stIn', '2ndIn', '1st_final', '2nd_final']]

# Replace values in '1st_final' with '2nd_final' if '1stIn' == 0
point_winning_data['1st_final'] = np.where(point_winning_data['1stIn'] == 0, point_winning_data['2nd_final'], point_winning_data['1st_final'])

# Drop rows where '2ndIn' is 0
point_winning_data = point_winning_data[point_winning_data['2ndIn'] != 0]

# Display the first few rows of the DataFrame
point_winning_data.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  point_winning_data['1st_final'] = np.where(point_winning_data['1stIn'] == 0, point_winning_data['2nd_final'], point_winning_data['1st_final'])


Unnamed: 0,1stIn,2ndIn,1st_final,2nd_final
0,1,,6 f2n#,
1,0,1.0,6 b19 f1 b2 s1 f3 f2 j2*,6 b19 f1 b2 s1 f3 f2 j2*
2,0,1.0,4 b28 f2 o1*,4 b28 f2 o1*
3,1,,6 s28 f3*,
4,1,,4 b37 b3*,


In [17]:
#drop na if na in 1st_final
point_winning_data = point_winning_data.dropna(subset=['1st_final'])

(713303, 4)

In [27]:
point_winning_data = point_winning_data.sample(frac=0.3, random_state=42)

In [24]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(point_winning_data['1st_final'])
word_index = tokenizer.word_index
vocab_size = len(word_index) + 1

# Step 2: Tokenize sequences and create labels for when the point ends
tokenized_sequences = []
ending_labels = []

for seq in point_winning_data['1st_final']:
    tokens = seq.split()
    
    # Generate partial sequences and labels
    for i in range(1, len(tokens)):
        partial_sequence = tokens[:i]  # Create partial sequence up to shot i
        tokenized_sequence = tokenizer.texts_to_sequences([partial_sequence])
        tokenized_sequence = [item for sublist in tokenized_sequence for item in sublist]  # Flatten list of lists
        tokenized_sequences.append(tokenized_sequence)

        # Label for whether the next shot is the end of the rally
        if tokens[i][-1] in ['#', '@', '*']:
            ending_labels.append(1)  # Next shot ends the rally
        else:
            ending_labels.append(0)  # Next shot does not end the rally

# Step 3: Pad sequences
max_sequence_length = max(len(seq) for seq in tokenized_sequences)
X = pad_sequences(tokenized_sequences, maxlen=max_sequence_length, padding='post')

# Convert labels to numpy array
y_ending = np.array(ending_labels)  # Labels for whether the next shot ends the rally

# Display data shape for verification
print(f"X shape: {X.shape}, y_ending shape: {y_ending.shape}")

X shape: (282673, 82), y_ending shape: (282673,)


In [25]:
# Define the RNN model
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=64))
model.add(LSTM(64, return_sequences=False))  # Only want the final output
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))  # Predict the probability that the next shot ends the rally

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])



In [26]:
# Train the model
history = model.fit(
    X, y_ending,
    epochs=10,
    batch_size=32,
    validation_split=0.2
)

Epoch 1/10
[1m 418/7067[0m [32m━[0m[37m━━━━━━━━━━━━━━━━━━━[0m [1m1:41[0m 15ms/step - accuracy: 0.7620 - loss: 0.5559

KeyboardInterrupt: 