In [17]:
# Standard library
import ast
import gc

# Data handling and manipulation
import numpy as np
import pandas as pd

# Visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

# Statistical tools
from scipy.stats import linregress, uniform, randint
import scipy.stats as st

# Scikit-learn libraries
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, recall_score, f1_score, make_scorer

# XGBoost
import xgboost as xgb


#### tags2name

In [5]:
# Tag lookup table; its 'Tag' and 'Label' columns are what process_event_tags merges on.
tags2name = pd.read_csv('data/tags2name.csv')

In [6]:

def process_event_tags(events, tags2name):
    """
    Attach a 'Label' column holding, for every event, the list of label names of its tags.

    The 'tagsList' column is parsed from its string form, exploded to one row per tag,
    joined against the tag lookup table and regrouped into one list per event.

    Parameters:
    events (pd.DataFrame): DataFrame with event data, containing a 'tagsList' column whose
        values are lists of tag IDs or the string representation of such lists.
    tags2name (pd.DataFrame): DataFrame mapping tag IDs to label names, with 'Tag' and 'Label' columns.

    Returns:
    pd.DataFrame: A copy of the events DataFrame (same rows, same index) with 'tagsList'
        parsed into lists and an added 'Label' column. Tags without a match in
        tags2name, and events with an empty tag list, contribute a NaN entry.
        The input DataFrame is not modified.
    """

    # Work on a copy so the caller's DataFrame is left untouched
    events = events.copy()

    # Getting the list values out of the string values in the tagsList column;
    # values that are not strings are passed through unchanged
    events['tagsList'] = events['tagsList'].apply(
        lambda x: ast.literal_eval(x) if isinstance(x, str) else x
    )

    # Exploding to one row per (event, tag). 'rowPos' is the positional row number of the
    # source event, so regrouping works whatever the index looks like
    # (non-default, unsorted or duplicated labels).
    exploded_events = (
        events[['tagsList']]
        .assign(rowPos=list(range(len(events))))
        .explode('tagsList')
    )

    # Casting to nullable int because the column in the tags table is int and
    # exploded empty lists produce missing values
    exploded_events['tagsList'] = exploded_events['tagsList'].astype('Int64')

    # Merging with the tags2name DataFrame to get the corresponding label for each tag
    exploded_merged_events = exploded_events.merge(
        tags2name[['Tag', 'Label']], left_on='tagsList', right_on='Tag', how='left'
    )

    # One list of labels per source row; groupby sorts by rowPos, i.e. original row order
    labels_per_row = exploded_merged_events.groupby('rowPos')['Label'].agg(list)

    # Assign positionally (not by index label) so every event gets exactly its own labels
    events['Label'] = labels_per_row.to_numpy()

    return events


In [7]:
# English league events, with tag IDs resolved to label lists
events_England = process_event_tags(pd.read_csv('data/events_England.csv'), tags2name)

In [10]:
# Spanish league events, with tag IDs resolved to label lists
events_Spain = process_event_tags(pd.read_csv('data/events_Spain.csv'), tags2name)

In [11]:
# French league events, with tag IDs resolved to label lists
events_France = process_event_tags(pd.read_csv('data/events_France.csv'), tags2name)

In [12]:
# Italian league events, with tag IDs resolved to label lists
events_Italy = process_event_tags(pd.read_csv('data/events_Italy.csv'), tags2name)

In [13]:
# German league events, with tag IDs resolved to label lists
events_Germany = process_event_tags(pd.read_csv('data/events_Germany.csv'), tags2name)

### Events table

In [14]:
# Stack the five league frames into a single events table.
# ignore_index=True renumbers the rows 0..n-1; without it each league's own row labels
# are carried over, so the combined frame would contain repeated index labels.
events = pd.concat(
    (events_England, events_Spain, events_France, events_Italy, events_Germany),
    axis=0,
    ignore_index=True,
)

In [15]:
# The per-league frames are no longer needed once they are combined into `events`
del events_England, events_Spain, events_France, events_Italy, events_Germany

# Calling garbage collection to release the memory right away
gc.collect()

1419

## About this file

This dataset describes all the events that occur during each match. Each event refers to a ball touch and contains the following information:

- **eventId**: the identifier of the event's type. Each eventId is associated with an event name.
- **eventName**: the name of the event's type (pass, foul, shot, duel, free kick, offside, or touch).
- **subEventId**: the identifier of the subevent's type.
- **subEventName**: the name of the subevent's type, associated with different event types.
- **tags**: a list of event tags providing additional information about the event (e.g., accurate).
- **eventSec**: the time the event occurs (in seconds since the beginning of the current half).
- **id**: a unique identifier of the event.
- **matchId**: the identifier of the match, linked to "wyId" in the match dataset.
- **matchPeriod**: the period of the match ("1H", "2H", "E1", "E2", or "P").
- **playerId**: the identifier of the player who generated the event, linked to "wyId" in a player dataset.
- **positions**: the origin and destination positions associated with the event. Each position is a pair of coordinates (x, y). The x and y coordinates are always in the range [0, 100] and indicate the percentage of the field from the perspective of the attacking team. In particular, the value of the x coordinate indicates the event's nearness (in percentage) to the opponent's goal, while the value of the y coordinate indicates the event's nearness (in percentage) to the right side of the field.
- **teamId**: the identifier of the player's team, linked to "wyId" in the team dataset.
- **tagsList**: same information as the tags column, except in list format.
- **pos_orig_x**: from the positions column, origin in x-axis. (higher x means closer to opposition goal)
- **pos_orig_y**: from the positions column, origin in y-axis.
- **pos_dest_x**: from the positions column, destination in x-axis.
- **pos_dest_y**: from the positions column, destination in y-axis.


In [16]:
# Remove the raw tag column and the numeric event/sub-event identifiers
events = events.drop(columns=['tags', 'eventId', 'subEventId'])

In [18]:
# Positional data isn't that reliable, so none of the position columns are kept
position_columns = ['positions', 'pos_orig_x', 'pos_orig_y', 'pos_dest_x', 'pos_dest_y']
events = events.drop(columns=position_columns)

In [19]:
# The tag IDs are not needed any more now that the 'Label' column exists
events = events.drop(columns=['tagsList'])

Adding team, player and match mappings so I can spot-check things against the actual events as I go along.

In [21]:
# Build a {wyId: team name} lookup from the teams table
teams = pd.read_csv('data/teams.csv')
team_names_by_id = teams.set_index('wyId')['name']
team_mapping = team_names_by_id.to_dict()

In [22]:
# Replace the team IDs with team names, then rename the column to match its new content
events['teamId'] = events['teamId'].map(team_mapping)
events = events.rename(columns={'teamId': 'teamName'})

In [23]:
# Preview the first rows of the trimmed events table
events.head()

Unnamed: 0,subEventName,playerId,matchId,eventName,teamName,matchPeriod,eventSec,id,Label
0,Simple pass,25413,2499719,Pass,Arsenal,1H,2.758649,177959171,[accurate]
1,High pass,370224,2499719,Pass,Arsenal,1H,4.94685,177959172,[accurate]
2,Head pass,3319,2499719,Pass,Arsenal,1H,6.542188,177959173,[accurate]
3,Head pass,120339,2499719,Pass,Arsenal,1H,8.143395,177959174,[accurate]
4,Simple pass,167145,2499719,Pass,Arsenal,1H,10.302366,177959175,[accurate]


Some features I can generate:

**Passes**
- Number of passes
- Successful pass rate
- Number of Launches
- Maybe I will do Pass type (Simple pass -v- High pass -v- Smart pass) 
  
**Shots**
- Number of Shots
- Shots on target rate (accurate v not accurate)
  
**Saves**
- Save attempts (accurate v not accurate) ... although this will be captured in the goalkeepers' player rating. So i might omit this.

**Goals**
- I think it could be interesting to get the number of late goals (say 75+ minutes) a team scores or concedes. Because that is often when a good team breaks through, or a lesser team cracks.


So I will ignore Saves for now. I will filter events only on passes and shots.

But actually, from earlier work (my feature_planning worksheet), I discovered that the "Save attempt" events were the most reliable way to track the goals. There was one discrepancy with the actual scores: for the West Ham v Watford match (2499988) the data gives 3-0 instead of 2-0. But that is the only discrepancy.

I will filter events only on passes and shots and save attempts.

In [24]:
# Keep only the event types used for feature generation
kept_event_types = ['Pass', 'Shot', 'Save attempt']
events = events.loc[events['eventName'].isin(kept_event_types)]

So I will create 3 new columns from the Label column: `accurate`, `inaccurate` and `goal`.

In [25]:
# Take an explicit copy of the filtered frame before new columns are added to it
# (presumably to avoid pandas' SettingWithCopyWarning on the assignments that follow)
events=events.copy()

In [26]:
# 1 when the event's label list contains 'accurate', otherwise 0
events.loc[:, 'accurate'] = events['Label'].map(lambda labels: int('accurate' in labels))

In [27]:
# 1 when the event's label list contains 'not accurate', otherwise 0
events.loc[:, 'inaccurate'] = events['Label'].map(lambda labels: int('not accurate' in labels))

In [28]:
# 1 when the event's label list contains 'Goal', otherwise 0
events.loc[:, 'goal'] = events['Label'].map(lambda labels: int('Goal' in labels))

In [29]:
# Count every (accurate, inaccurate) combination to check the two flags are complementary
events.value_counts(subset=['accurate', 'inaccurate'])

accurate  inaccurate
1         0             1322432
0         1              299779
Name: count, dtype: int64

Perfect, everything is either accurate or inaccurate. So I can just drop inaccurate.

In [30]:
# 'inaccurate' is just the complement of 'accurate', so it carries no extra information
events = events.drop(columns='inaccurate')

Finding and removing the errant goal row.

In [31]:
# Goal-tagged save attempts recorded for match 2499988
is_goal = events['goal'] == 1
is_save_attempt = events['eventName'] == 'Save attempt'
in_match = events['matchId'] == 2499988
events[is_goal & is_save_attempt & in_match]
  

Unnamed: 0,subEventName,playerId,matchId,eventName,teamName,matchPeriod,eventSec,id,Label,accurate,goal
455612,Reflexes,92864,2499988,Save attempt,Watford,1H,2219.859224,227348124,"[Goal, not accurate]",0,1
455621,Reflexes,92864,2499988,Save attempt,Watford,1H,2256.129085,227348650,"[Goal, gbr, not accurate]",0,1
456297,Reflexes,92864,2499988,Save attempt,Watford,2H,1922.369339,227348506,"[Goal, glb, not accurate]",0,1


I can see from researching the game that the event associated with 227348124 didn't end in a recorded goal. The one less than a minute later (227348650) did. So I will just delete this row.

In [32]:
# Drop the single event whose goal tag does not match the real scoreline
events = events.loc[events['id'].ne(227348124)]

In [33]:
# The event id was only needed to single out that row
events = events.drop(columns=['id'])

#### Launches

In [34]:
# 1 for events whose sub-event is a 'Launch', otherwise 0
events['Launch'] = events['subEventName'].eq('Launch').astype(int)

Checking that my stats look good

#### Passes

In [35]:
# Number of passes per team; this nicely illustrates possession-based teams
events.loc[events['eventName'] == 'Pass', 'teamName'].value_counts().reset_index()

Unnamed: 0,teamName,count
0,Manchester City,27523
1,Napoli,26874
2,PSG,23908
3,Barcelona,23260
4,Arsenal,22783
...,...,...
93,Augsburg,12100
94,Stoke City,11851
95,Mainz 05,11832
96,Deportivo Alav\u00e9s,11428


#### Pass Completion Rate

In [36]:
# Share of accurate passes per team, highest first; this nicely illustrates possession-based teams
(
    events.loc[events['eventName'] == 'Pass']
    .groupby('teamName')['accurate']
    .agg(lambda flags: flags.eq(1).sum() / flags.size)
    .reset_index(name='accurate_pass_ratio')
    .sort_values(by='accurate_pass_ratio', ascending=False)
)

Unnamed: 0,teamName,accurate_pass_ratio
56,Manchester City,0.899866
68,PSG,0.892463
65,Nice,0.886754
8,Barcelona,0.885168
45,Juventus,0.884102
...,...,...
49,Legan\u00e9s,0.775907
82,Stoke City,0.772087
7,Augsburg,0.771405
23,Crotone,0.758714


In [37]:
# All pass events, grouped once per team and reused for both statistics
passes_by_team = events.loc[events['eventName'] == 'Pass'].groupby('teamName')

# Number of passes per team
pass_count = passes_by_team.size().reset_index(name='pass_count')

# Share of accurate passes per team
accurate_pass_ratio = (
    passes_by_team['accurate']
    .agg(lambda flags: flags.eq(1).sum() / flags.size)
    .reset_index(name='accurate_pass_ratio')
)

# Combine both statistics into one row per team
team_pass_data = accurate_pass_ratio.merge(pass_count, on='teamName')

# How strongly does pass volume track pass accuracy at team level?
correlation = team_pass_data[['pass_count', 'accurate_pass_ratio']].corr()
print(correlation)


                     pass_count  accurate_pass_ratio
pass_count              1.00000              0.90982
accurate_pass_ratio     0.90982              1.00000


In [38]:
#They seem very highly related, so I might not include both.

In [39]:
# All pass events, grouped once per (team, match) and reused for both statistics
passes_by_team_match = events.loc[events['eventName'] == 'Pass'].groupby(['teamName', 'matchId'])

# Number of passes per team and match
pass_count = passes_by_team_match.size().reset_index(name='pass_count')

# Share of accurate passes per team and match
accurate_pass_ratio = (
    passes_by_team_match['accurate']
    .agg(lambda flags: flags.eq(1).sum() / flags.size)
    .reset_index(name='accurate_pass_ratio')
)

# Combine both statistics on the (teamName, matchId) key
team_pass_data = accurate_pass_ratio.merge(pass_count, on=['teamName', 'matchId'])

# Same correlation as before, now at team-match level
correlation = team_pass_data[['pass_count', 'accurate_pass_ratio']].corr()
print(correlation)


                     pass_count  accurate_pass_ratio
pass_count              1.00000              0.79015
accurate_pass_ratio     0.79015              1.00000


They are less correlated when I group by teamName and match though. 

#### Launches

In [40]:
# Number of launches per team; this nicely illustrates non-possession or defensive teams
events.loc[events['subEventName'] == 'Launch', 'teamName'].value_counts().reset_index()

Unnamed: 0,teamName,count
0,Burnley,804
1,Eibar,803
2,Getafe,652
3,Everton,649
4,Newcastle United,640
...,...,...
93,Barcelona,187
94,Nice,183
95,Manchester City,182
96,Real Madrid,143


#### Shots

In [41]:
# Number of shots per team; this nicely illustrates attacking teams
events.loc[events['eventName'] == 'Shot', 'teamName'].value_counts().reset_index()

Unnamed: 0,teamName,count
0,Real Madrid,629
1,Napoli,605
2,Manchester City,603
3,Liverpool,600
4,Roma,597
...,...,...
93,Wolfsburg,315
94,Hellas Verona,313
95,Swansea City,304
96,Bologna,298


#### Shots on target ratio

In [42]:
# Share of shots flagged accurate (on target) per team, highest first
(
    events.loc[events['eventName'] == 'Shot']
    .groupby('teamName')['accurate']
    .agg(lambda flags: flags.eq(1).sum() / flags.size)
    .reset_index(name='shots_on_target')
    .sort_values(by='shots_on_target', ascending=False)
)

Unnamed: 0,teamName,shots_on_target
8,Barcelona,0.476096
41,Hertha BSC,0.437931
68,PSG,0.417563
70,Real Betis,0.415042
56,Manchester City,0.414594
...,...,...
78,Sassuolo,0.296729
1,Amiens SC,0.292308
62,Nantes,0.281755
40,Hellas Verona,0.281150


Total shots and the shots-on-target ratio are not as highly correlated as I'd expect.

In [43]:
# Every shot event, filtered once and reused below
shot_events = events.loc[events['eventName'] == 'Shot']

# Total shots per team (ordered by count, as value_counts returns them)
total_shots = (
    shot_events['teamName']
    .value_counts()
    .rename_axis('teamName')
    .reset_index(name='total_shots')
)

# Share of each team's shots that were flagged accurate
shots_on_target_ratio = (
    shot_events
    .groupby('teamName')['accurate']
    .agg(lambda flags: flags.eq(1).sum() / flags.size)
    .reset_index(name='shots_on_target_ratio')
)

# One row per team with both metrics
team_shot_data = total_shots.merge(shots_on_target_ratio, on='teamName')

# Correlation between shot volume and shot accuracy
correlation = team_shot_data[['total_shots', 'shots_on_target_ratio']].corr()
print(correlation)


                       total_shots  shots_on_target_ratio
total_shots               1.000000               0.360176
shots_on_target_ratio     0.360176               1.000000


#### Now I want to get all the goals conceded per team after the 75th minute

In [44]:
# A late concession is a goal-tagged save attempt more than 1800 seconds into the second half.
# eventSec restarts at each half, so 1800 s into '2H' corresponds to the 75th minute.
is_goal = events['goal'].eq(1)
in_second_half = events['matchPeriod'].eq('2H')
is_save_attempt = events['eventName'].eq('Save attempt')
after_75th_minute = events['eventSec'].gt(1800)

events['concedeLate'] = (
    is_goal & in_second_half & is_save_attempt & after_75th_minute
).astype(int)

In [45]:
# I think this nicely shows the strength of teams too. I will just need to do some manipulation
# after joining to the matches table to get the teams that are scoring these late goals.
events.loc[events['concedeLate'] == 1, 'teamName'].value_counts().reset_index()

Unnamed: 0,teamName,count
0,Deportivo La Coru\u00f1a,20
1,Metz,20
2,Strasbourg,19
3,Watford,19
4,Villarreal,18
...,...,...
93,Manchester United,4
94,Manchester City,4
95,Bayern M\u00fcnchen,3
96,Juventus,3


### Gathering all these stats on a game level before joining to matches

In [46]:


# Build one row of features per (team, match) from the filtered events table.
match_keys = ['teamName', 'matchId']

# Group the pass and shot events once each; the count and the ratio reuse the same grouping
passes_by_team_match = events[events['eventName'] == 'Pass'].groupby(match_keys)
shots_by_team_match = events[events['eventName'] == 'Shot'].groupby(match_keys)

# 1. Calculate Pass Count per team and match
pass_count = passes_by_team_match.size().reset_index(name='pass_count')

# 2. Calculate Accurate Pass Ratio per team and match.
#    'accurate' is a 0/1 flag, so its mean is the share of accurate passes; the vectorized
#    mean avoids calling a Python lambda once per group.
accurate_pass_ratio = (
    passes_by_team_match['accurate']
    .mean()
    .reset_index(name='accurate_pass_ratio')
)

# 3. Calculate Launch Count per team and match
launch_count = (
    events[events['subEventName'] == 'Launch']
    .groupby(match_keys)
    .size()
    .reset_index(name='launch_count')
)

# 4. Calculate Shot Count per team and match
shot_count = shots_by_team_match.size().reset_index(name='shot_count')

# 5. Calculate Shots on Target Ratio per team and match (mean of the 0/1 'accurate' flag)
shots_on_target_ratio = (
    shots_by_team_match['accurate']
    .mean()
    .reset_index(name='shots_on_target_ratio')
)

# 6. Calculate Late Concede Count per team and match
concede_late_count = (
    events[events['concedeLate'] == 1]
    .groupby(match_keys)
    .size()
    .reset_index(name='concede_late_count')
)

# Merging all DataFrames into one. Outer joins keep a (team, match) pair even when it is
# missing from one of the statistics (e.g. a match without any shot for that team).
team_match_stats = pass_count
for stat in (
    accurate_pass_ratio,
    launch_count,
    shot_count,
    shots_on_target_ratio,
    concede_late_count,
):
    team_match_stats = team_match_stats.merge(stat, on=match_keys, how='outer')

# Filling any NaN values with 0 for cases where a team may have no entries for a stat.
# NOTE: this also sets shots_on_target_ratio to 0 for matches in which a team had no shots.
team_match_stats = team_match_stats.fillna(0)


#three teams had no shots in an entire game that season.

In [47]:
# Team-match rows in which the team did not register a single shot
team_match_stats.loc[team_match_stats['shot_count'].eq(0)]

Unnamed: 0,teamName,matchId,pass_count,accurate_pass_ratio,launch_count,shot_count,shots_on_target_ratio,concede_late_count
734,Cagliari,2576281,225,0.822222,15.0,0.0,0.0,1.0
1599,Huddersfield Town,2499815,236,0.686441,11.0,0.0,0.0,0.0
3195,Swansea City,2500013,145,0.655172,15.0,0.0,0.0,0.0


10 teams collapsed and conceded 3 late goals in a match

In [49]:
# Team-match rows that share the highest number of late goals conceded
most_late_goals_conceded = team_match_stats['concede_late_count'].max()
team_match_stats[team_match_stats['concede_late_count'] == most_late_goals_conceded]

Unnamed: 0,teamName,matchId,pass_count,accurate_pass_ratio,launch_count,shot_count,shots_on_target_ratio,concede_late_count
803,Chelsea,2499977,480,0.86875,7.0,6.0,0.5,3.0
871,Crotone,2576133,356,0.758427,9.0,9.0,0.333333,3.0
987,Deportivo La Coru\u00f1a,2565738,334,0.847305,6.0,6.0,0.5,3.0
1113,Eintracht Frankfurt,2517018,379,0.831135,12.0,10.0,0.3,3.0
1158,Everton,2499764,469,0.850746,12.0,6.0,0.5,3.0
1887,Leicester City,2500070,381,0.824147,6.0,6.0,0.166667,3.0
3167,Swansea City,2499736,398,0.826633,13.0,7.0,0.285714,3.0
3282,Toulouse,2500712,212,0.75,3.0,6.0,0.166667,3.0
3401,Valencia,2565620,359,0.844011,9.0,12.0,0.75,3.0
3614,West Ham United,2500060,280,0.757143,15.0,10.0,0.4,3.0


In [50]:
# Preview the first rows of the per-team, per-match feature table
team_match_stats.head()

Unnamed: 0,teamName,matchId,pass_count,accurate_pass_ratio,launch_count,shot_count,shots_on_target_ratio,concede_late_count
0,AFC Bournemouth,2499728,596,0.895973,11.0,8.0,0.25,0.0
1,AFC Bournemouth,2499729,418,0.794258,6.0,7.0,0.285714,1.0
2,AFC Bournemouth,2499739,223,0.726457,15.0,9.0,0.333333,1.0
3,AFC Bournemouth,2499749,386,0.76943,18.0,5.0,0.4,0.0
4,AFC Bournemouth,2499759,621,0.89372,8.0,13.0,0.230769,0.0


In [51]:
# Write the feature table to CSV; index=False leaves the row index out of the file
team_match_stats.to_csv('data/team_match_stats.csv', index=False)