In [43]:
import pickle
import numpy as np
import pandas as pd
from typing import List, Tuple, Dict
from supervision.keypoint.core import KeyPoints

# Sliding-window geometry. FRAME_WINDOW is the number of frame entries that
# make up one sample. The window size and the centre index are derived from
# the before/after counts so the four values cannot drift out of sync when
# one of them is tuned.
FRAMES_BEFORE = 3
FRAMES_AFTER = 3
FRAME_WINDOW = FRAMES_BEFORE + 1 + FRAMES_AFTER
MIDDLE_FRAME_INDEX = FRAMES_BEFORE

# Load the precomputed detections from disk. The rest of this file indexes the
# result as all_data['pose'][folder][frame] and all_data['ball'][folder][frame].
# SECURITY: pickle.load can execute arbitrary code while unpickling, so only
# load 'all_data.pkl' when it comes from a trusted source.
with open('all_data.pkl', 'rb') as f:
    all_data = pickle.load(f)

def read_shots_file(folder: str) -> List[int]:
    """Read the shot frame numbers stored in ``assets/<folder>/shots.txt``.

    The file is expected to contain one integer per line. Blank or
    whitespace-only lines (for example an extra empty line at the end of the
    file) are skipped rather than aborting the whole read with ``ValueError``.

    Args:
        folder: Name of the sub-directory of ``assets`` holding ``shots.txt``.

    Returns:
        The integers found in the file, in file order.

    Raises:
        FileNotFoundError: If the shots file does not exist.
        ValueError: If a non-blank line cannot be parsed as an integer.
    """
    with open(f'assets/{folder}/shots.txt', 'r') as f:
        return [int(entry) for entry in map(str.strip, f) if entry]

# Shot frame numbers for every folder that has pose data, keyed by folder name.
shots_data = {
    folder_name: read_shots_file(folder_name)
    for folder_name in all_data['pose']
}

def get_bottom_left_player(keypoints: KeyPoints) -> np.ndarray:
    """Select the keypoints of one detected player from ``keypoints.xy``.

    Each detection is summarised by the mean of its keypoints. Detections are
    ranked by ascending mean x; ties are broken by descending mean y. The
    first detection in that ranking is returned.

    Args:
        keypoints: Object exposing an ``xy`` array indexed as
            (detection, keypoint, coordinate).

    Returns:
        The ``xy`` entry of the selected detection, or a ``(17, 2)`` array of
        zeros when there are no detections.
    """
    skeletons = keypoints.xy
    if skeletons.shape[0] == 0:
        return np.zeros((17, 2))

    centroids = np.mean(skeletons, axis=1)
    # np.lexsort treats the LAST key as the primary one: x ascending first,
    # then -y ascending (i.e. y descending) among equal x values.
    ranking = np.lexsort((-centroids[:, 1], centroids[:, 0]))
    return skeletons[ranking[0]]

def get_bottom_left_ball(ball_data):
    """Select one ball detection and return its first four values.

    Detections are ranked by ascending x (column 0); ties are broken by
    descending y (column 1). The first detection in that ranking wins.

    Accepted layouts:
      * 2-D ``(n, m)``: one row per detection.
      * 3-D ``(n, k, m)``: the first entry along the second axis is taken as
        the row for each detection.

    Args:
        ball_data: Nested list or numpy array of detections, or ``None``.

    Returns:
        A 1-D numpy array with the first (at most) four values of the chosen
        detection -- presumably x, y, width, height; the column meaning is not
        verifiable from this function alone. ``None`` is returned when there
        is no usable detection (missing, empty or malformed input).
    """
    if ball_data is None:
        return None
    try:
        detections = np.array(ball_data)
        if detections.ndim == 3:
            # Collapse to one row per detection so that the final [:4] slices
            # that detection's values rather than the second axis.
            detections = detections[:, 0, :]
        elif detections.ndim != 2:
            return None
        if detections.shape[0] == 0:
            return None
        # np.lexsort uses the LAST key as primary: x ascending, then -y
        # ascending (y descending) among equal x values.
        ranking = np.lexsort((-detections[:, 1], detections[:, 0]))
        return detections[ranking[0], :4]
    except (TypeError, ValueError, IndexError):
        # Ragged, non-numeric or too-narrow input: treat as "no ball found".
        return None

def extract_features(folder: str, frame_numbers: List[int]) -> List[float]:
    """Build one flat feature vector covering the given frames of a folder.

    For every frame, in the order given, the vector gains:
      * the flattened keypoints chosen by ``get_bottom_left_player`` (or
        17 * 2 zeros when the frame has no pose entry), followed by
      * the values chosen by ``get_bottom_left_ball`` (or 4 zeros when the
        frame has no ball entry, the entry is empty, or no ball is selected).

    Args:
        folder: Key into ``all_data['pose']`` and ``all_data['ball']``.
        frame_numbers: Frame keys to look up.

    Returns:
        The concatenated per-frame features as a flat list.
    """
    empty_pose = [0] * (17 * 2)  # 17 keypoints * 2 coordinates
    empty_ball = [0] * 4
    row = []

    for frame in frame_numbers:
        pose_by_frame = all_data['pose'][folder]
        if frame in pose_by_frame:
            pose_part = get_bottom_left_player(pose_by_frame[frame]).flatten().tolist()
        else:
            pose_part = empty_pose

        # Start from the zero placeholder and replace it only when a ball is
        # actually selected for this frame.
        ball_part = empty_ball
        detections = all_data['ball'][folder].get(frame)
        if detections:
            chosen_ball = get_bottom_left_ball(detections)
            if chosen_ball is not None:
                ball_part = chosen_ball.tolist()

        row.extend(pose_part + ball_part)

    return row

def generate_training_data(folder: str) -> List[Tuple[List[float], int]]:
    """Create sliding-window training samples for one folder.

    The frame keys of the folder's pose and ball data are merged and sorted.
    A window of ``FRAME_WINDOW`` consecutive entries of that sorted list is
    slid across it one entry at a time. Because the window runs over the
    sorted keys, its frames are only consecutive frame numbers if the keys
    themselves have no gaps.

    Args:
        folder: Key into ``all_data['pose']``, ``all_data['ball']`` and
            ``shots_data``.

    Returns:
        One ``(features, label)`` tuple per window, where ``features`` comes
        from ``extract_features`` and ``label`` is 1 when any frame of the
        window is listed in ``shots_data[folder]``, else 0. The list is empty
        when fewer than ``FRAME_WINDOW`` frames are available.
    """
    frame_numbers = sorted(set(all_data['pose'][folder]) | set(all_data['ball'][folder]))
    # Build the lookup set once instead of scanning the shot list for every
    # frame of every window.
    shot_frames = set(shots_data[folder])

    data = []
    for start in range(len(frame_numbers) - FRAME_WINDOW + 1):
        current_frames = frame_numbers[start:start + FRAME_WINDOW]
        features = extract_features(folder, current_frames)
        label = 0 if shot_frames.isdisjoint(current_frames) else 1
        data.append((features, label))
    return data

# Collect the (features, label) samples of every folder that has pose data.
all_training_data = []
for i, folder in enumerate(all_data['pose'], start=1):
    print(f"Processing folder {i}/{len(all_data['pose'])}")
    all_training_data += generate_training_data(folder)

# Split the samples into parallel feature / label sequences and tabulate them:
# one row per window, one column per feature value, plus a 'label' column.
features, labels = zip(*all_training_data)
df = pd.DataFrame(features)
df['label'] = labels

df.to_csv('padel_shots_dataset.csv', index=False)

# Summary of the window configuration and the class balance.
positive_total = df['label'].sum()
print(
    f"Frame window size: {FRAME_WINDOW}",
    f"Frames before shot: {FRAMES_BEFORE}",
    f"Frames after shot: {FRAMES_AFTER}",
    f"Dataset shape: {df.shape}",
    f"Number of positive samples: {positive_total}",
    f"Number of negative samples: {len(df) - positive_total}",
    sep="\n",
)

Processing folder 1/3
Processing folder 2/3
Processing folder 3/3
Frame window size: 7
Frames before shot: 3
Frames after shot: 3
Dataset shape: (3885, 267)
Number of positive samples: 399
Number of negative samples: 3486


In [44]:
from sklearn.utils import resample
import os

# Work on a copy so the original dataframe is left untouched
df_copy = df.copy()

# Separate positive and negative samples
positive_samples = df_copy[df_copy['label'] == 1]
negative_samples = df_copy[df_copy['label'] == 0]

# Number of negatives to keep per positive (6 gives a 1:6 positive:negative ratio)
desired_ratio = 6
# resample(replace=False) raises ValueError when asked for more rows than
# exist, so never request more negatives than are available.
n_negative_samples = min(len(positive_samples) * desired_ratio, len(negative_samples))

# Undersample negative samples
negative_samples_undersampled = resample(negative_samples,
                                         replace=False,    # sample without replacement
                                         n_samples=n_negative_samples,
                                         random_state=42)  # reproducible results

# Combine positive samples with undersampled negative samples
df_balanced = pd.concat([positive_samples, negative_samples_undersampled])

# Shuffle the dataset
df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

print(f"Original dataset shape: {df.shape}")
print(f"Balanced dataset shape: {df_balanced.shape}")
print(f"Number of positive samples in balanced set: {df_balanced['label'].sum()}")
print(f"Number of negative samples in balanced set: {len(df_balanced) - df_balanced['label'].sum()}")

# Save the balanced dataset to a new CSV file
output_dir = 'processed_data'
# exist_ok avoids the check-then-create race of a separate os.path.exists test
os.makedirs(output_dir, exist_ok=True)

balanced_csv_path = os.path.join(output_dir, 'padel_shots_dataset_balanced.csv')
df_balanced.to_csv(balanced_csv_path, index=False)

print(f"Balanced dataset saved to: {balanced_csv_path}")

# To read the balanced dataset back in, you can use:
# df_balanced = pd.read_csv('processed_data/padel_shots_dataset_balanced.csv')

Original dataset shape: (3885, 267)
Balanced dataset shape: (2793, 267)
Number of positive samples in balanced set: 399
Number of negative samples in balanced set: 2394
Balanced dataset saved to: processed_data/padel_shots_dataset_balanced.csv


In [45]:
import pandas as pd
from sklearn.utils import resample
import os

# Work on a copy so the original dataframe is left untouched
df_copy = df.copy()

# Separate positive and negative samples
positive_samples = df_copy[df_copy['label'] == 1]
negative_samples = df_copy[df_copy['label'] == 0]

# Number of negatives to keep per positive (5 gives a 1:5 positive:negative ratio)
desired_ratio = 5
# Never request more negatives than exist: sample() without replacement
# raises ValueError otherwise. This also keeps the cell working when there
# are no positive samples at all.
n_negative_samples = min(len(positive_samples) * desired_ratio, len(negative_samples))

# Draw the negatives reproducibly. sample() returns rows in random order, so
# sort by the original row index afterwards to restore their temporal order.
negative_samples_undersampled = (
    negative_samples
    .sample(n=n_negative_samples, random_state=42)
    .sort_index()
)

# Combine positives and negatives, then interleave them by original row index
# so the whole balanced set follows the order of the source dataframe.
df_balanced = pd.concat([positive_samples, negative_samples_undersampled]).sort_index()

# Note: no shuffling is done, so the temporal order is preserved

print(f"Original dataset shape: {df.shape}")
print(f"Balanced dataset shape: {df_balanced.shape}")
print(f"Number of positive samples in balanced set: {df_balanced['label'].sum()}")
print(f"Number of negative samples in balanced set: {len(df_balanced) - df_balanced['label'].sum()}")

# Save the balanced dataset to a new CSV file
output_dir = 'processed_data'
# exist_ok avoids the check-then-create race of a separate os.path.exists test
os.makedirs(output_dir, exist_ok=True)

balanced_csv_path = os.path.join(output_dir, 'padel_shots_dataset_balanced_sequential.csv')
df_balanced.to_csv(balanced_csv_path, index=False)

print(f"Balanced dataset saved to: {balanced_csv_path}")

# To read the balanced dataset back in, you can use:
# df_balanced = pd.read_csv('processed_data/padel_shots_dataset_balanced_sequential.csv')

Original dataset shape: (3885, 267)
Balanced dataset shape: (2394, 267)
Number of positive samples in balanced set: 399
Number of negative samples in balanced set: 1995
Balanced dataset saved to: processed_data/padel_shots_dataset_balanced_sequential.csv
