In [None]:
# Importing the necessary packages as well as my own functions that will be used in this project.

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import os
from preprocessing import *

In [None]:
# Read in the event data, concatenating the DataFrames when more than one season file is used, and
# rename the 'event_id' column to 'eventId' to be compatible with the functions being used.
# NOTE: the merge with a csv of player names mentioned in an earlier version of this comment is not
# performed in this cell.
# cumulative_match_mins accounts for the stoppage time in games and insert_ball_carries inserts carries
# into the data.
# We then select a trimmed-down df; note that the possession chains are built from the full carry_df.

# One match-events file per season, listed oldest first so the concatenated
# frame keeps the seasons in order.
events_csv = "../whoscored data/pl_all_match_events.{}.csv"
df1, df2, df3, df4, df5, df6, df7 = (
    pd.read_csv(events_csv.format(season))
    for season in ("17.18", "18.19", "19.20", "20.21", "21.22", "22.23", "23.24")
)

# Stack every season into a single frame and expose the event id under the
# column name ('eventId') that the helper functions below are called with.
df = pd.concat(
    [df1, df2, df3, df4, df5, df6, df7], ignore_index=True
).rename(columns={'event_id': 'eventId'})

# Helpers come from the star-import of `preprocessing`; each stage keeps its
# own name so intermediate results stay inspectable.
merged_df = cumulative_match_mins(df)
carry_df = insert_ball_carries(merged_df)

# Trimmed-down view of the carry-augmented events.
trimmed_columns = [
    'match_id', 'eventId', 'expandedMinute', 'second', 'outcomeType',
    'x', 'y', 'endX', 'endY', 'teamId', 'eventType', 'isShot',
]
df = carry_df.copy()[trimmed_columns]

# Possession chains are computed from the full carry frame, not the trimmed one.
df_with_possessions = get_possession_chains(carry_df, chain_check=3, suc_evts_in_chain=2)

In [None]:
# We create a new column for total seconds elapsed. We're looking for shots within 15 seconds, so this
# column will help us. We then define the pitch dimensions and bin totals and calculate the bin size.
# Then we filter for passes. I initially included carries as well, but it seemed to add too much noise.
# End locations of passes seem to give a good amount of information. 

# Start from a private copy so re-running this cell never mutates the
# possession-chain frame produced by the previous cell.
df = df_with_possessions.copy()

# Total seconds elapsed in the match; used later for the shot look-ahead window.
df['seconds'] = df['expandedMinute'] * 60 + df['second']

# Pitch extent and grid resolution.
# NOTE(review): assumes x/y coordinates are on a 0-100 scale on both axes — confirm against the data.
pitch_length = 100
pitch_width = 100
bins_x = 24
bins_y = 19

bin_size_x = pitch_length / bins_x
bin_size_y = pitch_width / bins_y

df = df.reset_index(drop=True)

# Keep only passes and shots. The explicit .copy() makes the filtered frame
# independent of its parent, so the column assignments below (and in later
# steps) neither raise SettingWithCopyWarning nor depend on pandas' implicit
# copy behaviour.
df = df[(df['eventType'] == 'Pass') | (df['isShot'] == True)].copy()

# For shots, the end location is set to the location the shot was taken from,
# so shots can be binned through endX/endY like passes.
shot_mask = df['isShot'] == True
df.loc[shot_mask, 'endX'] = df.loc[shot_mask, 'x']
df.loc[shot_mask, 'endY'] = df.loc[shot_mask, 'y']

# If the pass is the first pass in a possession, we include the starting bin as well as the ball
# is located in this bin at a point in time. If there are multiple consecutive passes in the same bin, 
# we use the latest as the ball is still within the bin and hasn't left it yet. 

# Flag the first event of every possession (the possession id changes
# relative to the previous row).
df['is_first_pass'] = (df['possession_id'] != df['possession_id'].shift(1))

# Position of each event in the current row order of `df`. It is used to
# interleave the extra "start location" rows with the regular "end location"
# rows, so that every row's successor is the event that actually follows it.
event_order = np.arange(len(df))

# Extra rows: the starting bin of the first event of each possession.
start_passes = df[df['is_first_pass']].copy()
start_passes['bin_x'] = (start_passes['x'] // bin_size_x).astype(int)
start_passes['bin_y'] = (start_passes['y'] // bin_size_y).astype(int)
start_passes['_event_order'] = event_order[df['is_first_pass'].to_numpy()]
start_passes['_is_end'] = 0

# Regular rows: the end bin of every pass (and the shot location for shots).
end_passes = df.copy()
end_passes['bin_x'] = (end_passes['endX'] // bin_size_x).astype(int)
end_passes['bin_y'] = (end_passes['endY'] // bin_size_y).astype(int)
end_passes['_event_order'] = event_order
end_passes['_is_end'] = 1

# Previously the start rows were simply stacked on top of the end rows, so the
# shift(-1) comparisons below matched each start row against the *next
# possession's* start instead of the event that follows it. Sorting by
# (event position, start-before-end) puts each start row directly in front of
# its own end row and leaves the end rows in their original relative order.
expanded_df = (
    pd.concat([start_passes, end_passes], ignore_index=True)
    .sort_values(by=['_event_order', '_is_end'], kind='stable')
    .reset_index(drop=True)
)

# Coordinates on the far edge of the pitch fall one past the last bin; pull
# them back into the grid.
expanded_df['bin_x'] = expanded_df['bin_x'].clip(0, bins_x - 1)
expanded_df['bin_y'] = expanded_df['bin_y'].clip(0, bins_y - 1)

# Attributes of the following row, for the "did the ball leave the bin?" test.
expanded_df['shifted_bin_x'] = expanded_df['bin_x'].shift(-1)
expanded_df['shifted_bin_y'] = expanded_df['bin_y'].shift(-1)
expanded_df['shifted_teamId'] = expanded_df['teamId'].shift(-1)
expanded_df['shifted_match_id'] = expanded_df['match_id'].shift(-1)
expanded_df['shifted_isShot'] = expanded_df['isShot'].shift(-1)

# Shots are always kept. A pass row is kept only when the next row is in a
# different bin, belongs to a different team or match, or is a shot; i.e. of
# several consecutive rows in the same bin only the latest survives.
expanded_df['keep'] = (expanded_df['isShot'] == True) | (
    (expanded_df['eventType'] == 'Pass') & (
        (expanded_df['bin_x'] != expanded_df['shifted_bin_x']) |
        (expanded_df['bin_y'] != expanded_df['shifted_bin_y']) |
        (expanded_df['teamId'] != expanded_df['shifted_teamId']) |
        (expanded_df['match_id'] != expanded_df['shifted_match_id']) |
        (expanded_df['shifted_isShot'] == True)
    )
)

filtered_df = expanded_df[expanded_df['keep']].copy()

# Drop the helper columns so the output has the same columns as before.
filtered_df = filtered_df.drop(columns=[
    'shifted_bin_x', 'shifted_bin_y', 'shifted_teamId', 'shifted_match_id',
    'shifted_isShot', 'keep', 'is_first_pass', '_event_order', '_is_end',
])

filtered_df = filtered_df.sort_values(by=['match_id', 'seconds'])

# Look-ahead window: a pass is credited with a shot when the passing team
# shoots more than 0 and at most this many seconds after the pass.
SHOT_WINDOW_SECONDS = 15


def label_shots_after_passes(events, window_seconds=SHOT_WINDOW_SECONDS):
    """Flag every pass that is followed by a shot from the same team.

    For each row with eventType == 'Pass', checks whether the same match
    contains a row with isShot == True and the same teamId whose 'seconds'
    value lies in the half-open interval (pass seconds, pass seconds +
    window_seconds]. Only shots by the passing team count, because the aim is
    to measure how threatening a team is when it has the ball in a given bin.
    Locations are the bins already assigned to each row (pass end locations
    plus possession start locations); bins the ball merely travels through
    during a pass are not considered.

    Parameters
    ----------
    events : pandas.DataFrame
        Must contain 'match_id', 'teamId', 'seconds', 'eventType', 'isShot',
        'bin_x' and 'bin_y'.
    window_seconds : float
        Length of the look-ahead window in seconds.

    Returns
    -------
    pandas.DataFrame
        One row per pass, in the row order of `events`, with columns 'bin_x',
        'bin_y' and the boolean 'shot_occurred'.
    """
    has_match = events['match_id'].notna()

    # Rows without a match id are skipped, as they were by the per-match
    # groupby this replaces.
    passes = events.loc[
        (events['eventType'] == 'Pass') & has_match,
        ['match_id', 'teamId', 'seconds', 'bin_x', 'bin_y'],
    ].reset_index(drop=True)
    shots = events.loc[
        (events['isShot'] == True) & has_match,
        ['match_id', 'teamId', 'seconds'],
    ].dropna(subset=['seconds'])

    # Sorted shot times per (match, team) so each window check is a binary
    # search rather than a scan over the whole match for every pass.
    shot_times = {
        key: np.sort(team_shots['seconds'].to_numpy(dtype=float))
        for key, team_shots in shots.groupby(['match_id', 'teamId'])
    }

    shot_occurred = np.zeros(len(passes), dtype=bool)
    for key, team_passes in passes.groupby(['match_id', 'teamId']):
        times = shot_times.get(key)
        if times is None:
            # This team took no shots in this match.
            continue
        pass_times = team_passes['seconds'].to_numpy(dtype=float)
        # Number of shots at or before the pass vs. at or before the window
        # end; a difference means at least one shot falls inside (t, t + w].
        n_before = np.searchsorted(times, pass_times, side='right')
        n_through = np.searchsorted(times, pass_times + window_seconds, side='right')
        # `passes` has a fresh RangeIndex, so its labels are row positions.
        shot_occurred[team_passes.index.to_numpy()] = n_through > n_before

    return pd.DataFrame({
        'bin_x': passes['bin_x'].to_numpy(),
        'bin_y': passes['bin_y'].to_numpy(),
        'shot_occurred': shot_occurred,
    })


results_df = label_shots_after_passes(filtered_df)

# We group by bins and sum the shots occurred. We then divide this by the total amount of times the ball has been in that
# bin. 

# Per bin: how many recorded ball locations were followed by a shot ('shots')
# and how many ball locations were recorded in total ('total').
grouped = (
    results_df
    .groupby(['bin_x', 'bin_y'])['shot_occurred']
    .agg(['sum', 'count'])
    .reset_index()
)
grouped.columns = ['bin_x', 'bin_y', 'shots', 'total']

# Full bins_x * bins_y grid, one row per cell, ordered by bin_x then bin_y, so
# that bins the ball never visited still appear in the result.
cell_ids = np.arange(bins_x * bins_y)
bin_stats = pd.DataFrame({
    'bin_x': cell_ids // bins_y,
    'bin_y': cell_ids % bins_y
})

# Left-join the observed counts onto the grid; unvisited bins get zero counts.
bin_stats = bin_stats.merge(grouped, on=['bin_x', 'bin_y'], how='left')
bin_stats = bin_stats.fillna(0)

# Share of visits followed by a shot. Unvisited bins give 0 / 0 = NaN, which
# is reported as probability 0.
bin_stats['probability'] = (bin_stats['shots'] / bin_stats['total']).fillna(0)

# Then, we create a pivot table for the heatmap visualization. 
# X is the length of the pitch, from one goal to the other (the columns).
# Y is from sideline to the opposite sideline (rows).

# Reshape to a grid: one row per bin_y, one column per bin_x, cell value is
# the shot probability of that bin.
heatmap_data = bin_stats.pivot(index='bin_y', columns='bin_x', values='probability')

# Draw on an explicit Axes so the title and labels are attached to this figure.
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(heatmap_data, cmap='coolwarm', annot=True, fmt=".2f", cbar=True, ax=ax)
ax.set_title('Shot Probability Heatmap')
ax.set_xlabel('Bin X')
ax.set_ylabel('Bin Y')

plt.show()
