In [258]:
# Data handling and manipulation
import numpy as np
import pandas as pd

# Visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

# Statistical tools
from scipy.stats import linregress, uniform, randint
import scipy.stats as st

# Scikit-learn libraries
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, recall_score, f1_score, make_scorer

# XGBoost
import xgboost as xgb

import ast


In [259]:
# Show every column when a wide DataFrame is displayed (no "..." truncation).
pd.set_option('display.max_columns', None)
# To restore the pandas default column limit, run the line below instead:
#pd.reset_option('display.max_columns')

In [260]:

# Load the raw match table; its columns are trimmed and renamed in the cells below.
matches = pd.read_csv('data/matches_England.csv')

In [261]:
# Lookup dict: match wyId -> match label. Used later to turn a match id into a
# readable label, e.g. events['matchId'].map(match_mapping).
match_mapping=matches.set_index('wyId')['label'].to_dict()

In [262]:
# Trim the raw matches table down to the columns used later on.
# All unused columns are listed once and removed in a single drop.
unused_match_columns = [
    # status / duration and the per-team scoreET, scoreP, hasFormation columns
    'status', 'duration',
    'team1.scoreET', 'team1.scoreP', 'team1.hasFormation',
    'team2.scoreET', 'team2.scoreP', 'team2.hasFormation',
    # round / season identifiers
    'roundId', 'seasonId',
    # venue, date, referee, coach and formation columns
    'teamsData', 'venue', 'date', 'referees',
    'team1.coachId', 'team1.formation', 'team1.formation.bench',
    'team1.formation.lineup', 'team1.formation.substitutions',
    'team2.coachId', 'team2.formation', 'team2.formation.bench',
    'team2.formation.lineup', 'team2.formation.substitutions',
]
matches.drop(columns=unused_match_columns, inplace=True)



In [263]:
# Order the matches by their 'dateutc' value, breaking ties with the match wyId.
# The original row labels are kept (the index is not reset).
matches=matches.sort_values(by=['dateutc','wyId'])

In [264]:
# Team lookup: team wyId -> team name, used to replace numeric team ids with
# readable names in the tables below.

teams=pd.read_csv('data/teams.csv')
team_mapping=teams.set_index('wyId')['name'].to_dict()

In [265]:
# Replace numeric team ids with readable team names in every column that holds
# one. Ids that are not present in team_mapping become NaN.
for id_column in ('team1.teamId', 'team2.teamId', 'winner'):
    matches[id_column] = matches[id_column].map(team_mapping)

# The two team columns now hold names rather than ids, so rename them to match.
matches.rename(
    columns={
        'team1.teamId': 'team1.teamName',
        'team2.teamId': 'team2.teamName',
    },
    inplace=True,
)

In [266]:
# Inspect which columns are left after the drops and renames.
matches.columns

Index(['gameweek', 'dateutc', 'winner', 'wyId', 'label', 'competitionId',
       'team1.side', 'team1.teamName', 'team1.score', 'team1.scoreHT',
       'team2.side', 'team2.teamName', 'team2.score', 'team2.scoreHT'],
      dtype='object')

#### Restructuring the matches table so that it is easier to identify the home and away teams

In [267]:
# Collect the home side's name and scores for every match, whether the home
# side is stored in the team1.* columns or in the team2.* columns.
home_column_names = ['home.teamName', 'home.score', 'home.scoreHT']

# Matches where team1 is the home side
home_teams1 = matches.loc[
    matches['team1.side'] == 'home',
    ['team1.teamName', 'team1.score', 'team1.scoreHT'],
].copy()
home_teams1.columns = home_column_names

# Matches where team2 is the home side
home_teams2 = matches.loc[
    matches['team2.side'] == 'home',
    ['team2.teamName', 'team2.score', 'team2.scoreHT'],
].copy()
home_teams2.columns = home_column_names

# Stack the two pieces and order the rows by their original index labels.
home_teams = pd.concat([home_teams1, home_teams2], axis=0).sort_index()



In [268]:
# Collect the away side's name and scores for every match, whether the away
# side is stored in the team1.* columns or in the team2.* columns.
away_column_names = ['away.teamName', 'away.score', 'away.scoreHT']

# Matches where team1 is the away side
away_teams1 = matches.loc[
    matches['team1.side'] == 'away',
    ['team1.teamName', 'team1.score', 'team1.scoreHT'],
].copy()
away_teams1.columns = away_column_names

# Matches where team2 is the away side
away_teams2 = matches.loc[
    matches['team2.side'] == 'away',
    ['team2.teamName', 'team2.score', 'team2.scoreHT'],
].copy()
away_teams2.columns = away_column_names

# Stack the two pieces and order the rows by their original index labels.
away_teams = pd.concat([away_teams1, away_teams2], axis=0).sort_index()

In [269]:
# Match-level columns (not tied to either team) to keep next to the home/away columns.
match_cols = matches[['competitionId','gameweek','wyId', 'dateutc', 'winner', 'label']]

In [270]:
# Put the three pieces side by side. With axis=1 the rows are aligned on the
# index labels, which all three frames inherit from the matches table.
matches = pd.concat([match_cols , home_teams, away_teams], axis = 1)

In [271]:
# Remove the competition id and the half-time scores.
matches.drop(['competitionId', 'home.scoreHT', 'away.scoreHT',],
             axis=1, inplace=True)


In [272]:
# Remove the home/away team names and full-time scores as well.
matches.drop(['home.teamName', 'away.teamName', 'home.score', 'away.score'],
             axis=1, inplace=True)


In [273]:
# Preview the slimmed-down matches table.
matches.head()

Unnamed: 0,gameweek,wyId,dateutc,winner,label
379,1,2499719,2017-08-11 18:45:00,Arsenal,"Arsenal - Leicester City, 4 - 3"
378,1,2499727,2017-08-12 11:30:00,,"Watford - Liverpool, 3 - 3"
373,1,2499721,2017-08-12 14:00:00,Burnley,"Chelsea - Burnley, 2 - 3"
374,1,2499722,2017-08-12 14:00:00,Huddersfield Town,"Crystal Palace - Huddersfield Town, 0 - 3"
375,1,2499723,2017-08-12 14:00:00,Everton,"Everton - Stoke City, 1 - 0"


### Events table

In [274]:
# Load the event table (one row per event; the columns are described in the notes below).
events = pd.read_csv('data/events_England.csv')

## About this file

This dataset describes all the events that occur during each match. Each event refers to a ball touch and contains the following information:

- **eventId**: the identifier of the event's type. Each eventId is associated with an event name.
- **eventName**: the name of the event's type (pass, foul, shot, duel, free kick, offside, or touch).
- **subEventId**: the identifier of the subevent's type.
- **subEventName**: the name of the subevent's type, associated with different event types.
- **tags**: a list of event tags providing additional information about the event (e.g., accurate).
- **eventSec**: the time the event occurs (in seconds since the beginning of the current half).
- **id**: a unique identifier of the event.
- **matchId**: the identifier of the match, linked to "wyId" in the match dataset.
- **matchPeriod**: the period of the match ("1H", "2H", "E1", "E2", or "P").
- **playerId**: the identifier of the player who generated the event, linked to "wyId" in a player dataset.
- **positions**: the origin and destination positions associated with the event. Each position is a pair of coordinates (x, y). The x and y coordinates are always in the range [0, 100] and indicate the percentage of the field from the perspective of the attacking team. In particular, the value of the x coordinate indicates the event's nearness (in percentage) to the opponent's goal, while the value of the y coordinate indicates the event's nearness (in percentage) to the right side of the field.
- **teamId**: the identifier of the player's team, linked to "wyId" in the team dataset.
- **tagsList**: same information as the tags column, except in list format.
- **pos_orig_x**: from the positions column, origin in x-axis. (higher x means closer to opposition goal)
- **pos_orig_y**: from the positions column, origin in y-axis.
- **pos_dest_x**: from the positions column, destination in x-axis.
- **pos_dest_y**: from the positions column, destination in y-axis.


In [275]:
# Drop the raw 'tags' column (the 'tagsList' column is the one parsed later) and
# the numeric event / sub-event ids (the eventName / subEventName columns are kept).
events.drop(['tags', 'eventId', 'subEventId'], axis=1, inplace=True)

In [276]:
# The positional data isn't considered reliable enough to use, so drop the raw
# positions column and the four coordinate columns derived from it.
events.drop(['positions', 'pos_orig_x', 'pos_orig_y', 'pos_dest_x', 'pos_dest_y'], axis=1, inplace=True)

#### tags2name

In [277]:
# Tag lookup table; its 'Tag' and 'Label' columns are used to translate the tag
# ids stored on each event into readable labels.
tags2name = pd.read_csv('data/tags2name.csv')

In [278]:
%%time
##takes about 42 seconds to run

# Translate each event's tag ids into readable tag labels, producing a 'Label'
# column that holds one list of labels per event.

# tagsList is read from the CSV as text that looks like a Python list literal;
# parse it into a real list. Non-string values are passed through unchanged,
# so running this step again on already-parsed lists is harmless.
events['tagsList'] = events['tagsList'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

# One row per (event, tag). Only the tag column is exploded because the other
# event columns are not needed for the lookup; explode() repeats the event's
# index label on every row it produces.
exploded_events = events[['tagsList']].explode('tagsList')

# Cast to a nullable integer so the key matches the integer 'Tag' column of the
# tags table while still allowing missing values.
exploded_events['tagsList'] = exploded_events['tagsList'].astype('Int64')

# merge() does not keep the left index, so carry the event's index label along
# as an ordinary column.
exploded_events['origIndex'] = exploded_events.index

# Look up the label for every tag id (left join: rows without a match get NaN).
exploded_merged_events = exploded_events.merge(
    tags2name[['Tag', 'Label']], left_on='tagsList', right_on='Tag', how='left'
)

# Collapse back to one row per event, gathering that event's labels into a list.
exploded_merged_events = (
    exploded_merged_events
    .groupby('origIndex')
    .agg({'Label': list})
    .reset_index()
)

# Attach the labels by the event's index label rather than by row position.
# A column-wise concat against the freshly reset 0..n-1 index is only correct
# while events itself has a default 0..n-1 index, and it would add a second
# 'Label' column if this cell were run twice. Assigning an index-aligned Series
# is correct for any unique index and simply overwrites 'Label' on a re-run.
events['Label'] = exploded_merged_events.set_index('origIndex')['Label']

CPU times: user 25.1 s, sys: 205 ms, total: 25.3 s
Wall time: 25.3 s


In [279]:
# The raw tag ids are no longer needed once the 'Label' column holds the tag labels.
events.drop(['tagsList'], axis=1, inplace=True)

Adding team, player and match mappings so I can spot-check things against the actual event as I go along.

In [280]:
# Preview the id -> name translation before overwriting the column
# (ids missing from team_mapping would show up as NaN).
events['teamId'].map(team_mapping)

0         Arsenal
1         Arsenal
2         Arsenal
3         Arsenal
4         Arsenal
           ...   
643085    Everton
643086    Everton
643087    Everton
643088    Everton
643089    Everton
Name: teamId, Length: 643090, dtype: object

In [281]:
# Replace the numeric team id with the team name, then rename the column to
# reflect that it now holds names.
events['teamId']=events['teamId'].map(team_mapping)
events.rename(columns={'teamId': 'teamName'}, inplace=True)

In [282]:
# Player lookup: player_id -> player_name.
# NOTE(review): if player_games has several rows per player, to_dict() keeps the
# name from the last row for that id - presumably the names agree; confirm.
player_games=pd.read_csv('data/player_games.csv')
player_mapping=player_games.set_index('player_id')['player_name'].to_dict()

In [283]:
# Add the player's name next to the id (the playerId column itself is kept).
events['playerName']=events['playerId'].map(player_mapping)


In [284]:
# Add the readable match label next to the match id.
events['match']=events['matchId'].map(match_mapping)

In [285]:
# Preview the events table with the readable team / player / match columns added.
events.head()

Unnamed: 0,subEventName,playerId,matchId,eventName,teamName,matchPeriod,eventSec,id,Label,playerName,match
0,Simple pass,25413,2499719,Pass,Arsenal,1H,2.758649,177959171,[accurate],Alexandre Lacazette,"Arsenal - Leicester City, 4 - 3"
1,High pass,370224,2499719,Pass,Arsenal,1H,4.94685,177959172,[accurate],Rob Holding,"Arsenal - Leicester City, 4 - 3"
2,Head pass,3319,2499719,Pass,Arsenal,1H,6.542188,177959173,[accurate],Mesut Özil,"Arsenal - Leicester City, 4 - 3"
3,Head pass,120339,2499719,Pass,Arsenal,1H,8.143395,177959174,[accurate],Mohamed Naser Elsayed Elneny,"Arsenal - Leicester City, 4 - 3"
4,Simple pass,167145,2499719,Pass,Arsenal,1H,10.302366,177959175,[accurate],Héctor Bellerín Moruno,"Arsenal - Leicester City, 4 - 3"


Some features I can generate:

**Passes**
- Number of passes
- Successful pass rate
- Number of Launches
- Maybe I will do Pass type (Simple pass -v- High pass -v- Smart pass) 
  
**Shots**
- Number of Shots
- Shots on target rate (accurate v not accurate)
  
**Saves**
- Save attempts (accurate v not accurate) ... although this will be captured in the goalkeepers' player rating. So I might omit this.

**Goals**
- I think it could be interesting to get the number of late goals (say 75+ minutes) a team scores or concedes. Because that is often when a good team breaks through, or a lesser team cracks.


So I will ignore Saves for now. I will filter events only on passes and shots.

But actually, from earlier work (my feature_planning worksheet) I discovered that the "Save attempt" events were the most reliable way to track the goals. I found one discrepancy with the actual scores: for the West Ham v Watford match (2499988) the events imply 3-0 instead of 2-0. But that is the only discrepancy.

I will filter events only on passes and shots and save attempts.

In [286]:
# Keep only the three event types needed for the features: passes, shots and
# save attempts.
events=events[events['eventName'].isin(['Pass','Shot', 'Save attempt'])]

So I will create 3 new columns from the Label column: `accurate`, `inaccurate` and `goal`.

In [287]:
# Take an explicit copy of the filtered frame so that the columns added below
# are written to an independent DataFrame rather than to a slice of the
# original (this avoids pandas' SettingWithCopyWarning).
events=events.copy()

In [288]:
# 1 when the event's label list contains the exact label 'accurate', else 0.
events['accurate'] = events['Label'].map(lambda labels: int('accurate' in labels))

In [289]:
# 1 when the event's label list contains the exact label 'not accurate', else 0.
# This is list membership, not substring matching, so 'accurate' and
# 'not accurate' are never confused with each other.
events['inaccurate'] = events['Label'].map(lambda labels: int('not accurate' in labels))

In [290]:
# 1 when the event's label list contains the label 'Goal', else 0.
events['goal'] = events['Label'].map(lambda labels: int('Goal' in labels))

In [291]:
# Sanity check: count every (accurate, inaccurate) combination that occurs.
# Combinations (1, 1) or (0, 0) would mean the two flags are not complementary.
events[['accurate','inaccurate']].value_counts()

accurate  inaccurate
1         0             276565
0         1              63858
Name: count, dtype: int64

Perfect, everything is either accurate or inaccurate. So I can just drop inaccurate.

In [292]:
# 'inaccurate' is the exact complement of 'accurate', so it carries no extra information.
events.drop('inaccurate', axis=1, inplace=True)

Finding and removing the errant goal row.

In [297]:
# List the goal-tagged save attempts recorded for match 2499988, the one match
# whose event-derived score did not agree with the real result.
events[(events['goal']==1) &
 (events['eventName']=='Save attempt') &
 (events['matchId']==2499988)
  ]
  

Unnamed: 0,subEventName,playerId,matchId,eventName,teamName,matchPeriod,eventSec,id,Label,playerName,match,accurate,goal
455621,Reflexes,92864,2499988,Save attempt,Watford,1H,2256.129085,227348650,"[Goal, gbr, not accurate]",Orestis Karnezis,"West Ham United - Watford, 2 - 0",0,1
456297,Reflexes,92864,2499988,Save attempt,Watford,2H,1922.369339,227348506,"[Goal, glb, not accurate]",Orestis Karnezis,"West Ham United - Watford, 2 - 0",0,1


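I can see from researching the game that the event with id 227348124 didn't end in a recorded goal, whereas the one less than a minute later (227348650) did. So I will just delete the 227348124 row. (The output above was captured after that deletion had already been run, which is why only the two genuine goals are listed.)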

In [296]:
# Remove the single event (id 227348124) that is tagged as a goal but did not
# correspond to a goal in the real match.
events=events[events['id']!=227348124]

In [298]:
# The event id was only needed to single out the errant row above.
events.drop(columns=['id'],inplace=True)

In [299]:
# Preview the events table with the new accurate / goal flag columns.
events.head()

Unnamed: 0,subEventName,playerId,matchId,eventName,teamName,matchPeriod,eventSec,Label,playerName,match,accurate,goal
0,Simple pass,25413,2499719,Pass,Arsenal,1H,2.758649,[accurate],Alexandre Lacazette,"Arsenal - Leicester City, 4 - 3",1,0
1,High pass,370224,2499719,Pass,Arsenal,1H,4.94685,[accurate],Rob Holding,"Arsenal - Leicester City, 4 - 3",1,0
2,Head pass,3319,2499719,Pass,Arsenal,1H,6.542188,[accurate],Mesut Özil,"Arsenal - Leicester City, 4 - 3",1,0
3,Head pass,120339,2499719,Pass,Arsenal,1H,8.143395,[accurate],Mohamed Naser Elsayed Elneny,"Arsenal - Leicester City, 4 - 3",1,0
4,Simple pass,167145,2499719,Pass,Arsenal,1H,10.302366,[accurate],Héctor Bellerín Moruno,"Arsenal - Leicester City, 4 - 3",1,0


#### Launches

In [322]:
# 1 for events whose sub-event type is 'Launch', 0 for everything else.
events['Launch'] = events['subEventName'].eq('Launch').astype(int)

Checking that my stats look good

#### Passes

In [324]:
# Passes attempted per team across all matches in the events table, most
# passes first. Possession-based teams rise to the top of this table.
is_pass = events['eventName'] == 'Pass'
events.loc[is_pass, 'teamName'].value_counts().reset_index()

Unnamed: 0,teamName,count
0,Manchester City,27523
1,Arsenal,22783
2,Liverpool,22268
3,Tottenham Hotspur,20988
4,Chelsea,20467
5,Manchester United,19372
6,Southampton,16536
7,AFC Bournemouth,15359
8,Watford,14854
9,Swansea City,14842


#### Pass Completion Rate

In [325]:
# Share of each team's passes that were tagged accurate, best first.
# 'accurate' is a 0/1 flag, so its mean is exactly the fraction of accurate passes.
pass_accuracy_by_team = (
    events.loc[events['eventName'] == 'Pass']
    .groupby('teamName')['accurate']
    .mean()
)
(
    pass_accuracy_by_team
    .reset_index(name='accurate_pass_ratio')
    .sort_values(by='accurate_pass_ratio', ascending=False)
)

Unnamed: 0,teamName,accurate_pass_ratio
10,Manchester City,0.899866
1,Arsenal,0.865909
4,Chelsea,0.859921
9,Liverpool,0.859395
11,Manchester United,0.856855
16,Tottenham Hotspur,0.853964
15,Swansea City,0.82435
13,Southampton,0.814163
0,AFC Bournemouth,0.811446
17,Watford,0.806584


In [346]:
# Team-level check: do teams that pass more also pass more accurately?
passes = events[events['eventName'] == 'Pass']
passes_by_team = passes.groupby('teamName')

# Number of passes attempted by each team
pass_count = passes_by_team.size().reset_index(name='pass_count')

# Fraction of each team's passes that were accurate
# ('accurate' is a 0/1 flag, so the mean is that fraction)
accurate_pass_ratio = (
    passes_by_team['accurate']
    .mean()
    .reset_index(name='accurate_pass_ratio')
)

# One row per team holding both metrics
team_pass_data = accurate_pass_ratio.merge(pass_count, on='teamName')

# Correlation between the two metrics
correlation = team_pass_data[['pass_count', 'accurate_pass_ratio']].corr()
print(correlation)


                     pass_count  accurate_pass_ratio
pass_count              1.00000              0.97328
accurate_pass_ratio     0.97328              1.00000


In [347]:
#They seem very highly related, so I might not include both.

In [349]:
# Same check as at team level, but with one observation per team per match.
team_match = ['teamName', 'matchId']
passes = events[events['eventName'] == 'Pass']
passes_by_team_match = passes.groupby(team_match)

# Number of passes attempted by each team in each match
pass_count = passes_by_team_match.size().reset_index(name='pass_count')

# Fraction of those passes that were accurate
# ('accurate' is a 0/1 flag, so the mean is that fraction)
accurate_pass_ratio = (
    passes_by_team_match['accurate']
    .mean()
    .reset_index(name='accurate_pass_ratio')
)

# One row per (team, match) holding both metrics; joined on both keys
team_pass_data = accurate_pass_ratio.merge(pass_count, on=team_match)

# Correlation between the two metrics
correlation = team_pass_data[['pass_count', 'accurate_pass_ratio']].corr()
print(correlation)


                     pass_count  accurate_pass_ratio
pass_count             1.000000             0.834621
accurate_pass_ratio    0.834621             1.000000


They are less correlated when I group by teamName and match though. 

#### Launches

In [326]:
# Launches per team, most first. Non-possession / defensive teams rise to the
# top of this table.
is_launch = events['subEventName'].eq('Launch')
events.loc[is_launch, 'teamName'].value_counts().reset_index()

Unnamed: 0,teamName,count
0,Burnley,804
1,Everton,649
2,Newcastle United,640
3,Stoke City,623
4,West Ham United,608
5,Crystal Palace,601
6,West Bromwich Albion,594
7,Watford,588
8,Brighton & Hove Albion,563
9,Huddersfield Town,561


#### Shots

In [327]:
# Shots per team, most first. Attacking teams rise to the top of this table.
is_shot = events['eventName'].eq('Shot')
events.loc[is_shot, 'teamName'].value_counts().reset_index()

Unnamed: 0,teamName,count
0,Manchester City,603
1,Liverpool,600
2,Tottenham Hotspur,568
3,Chelsea,550
4,Arsenal,538
5,Manchester United,466
6,Crystal Palace,424
7,Southampton,415
8,AFC Bournemouth,413
9,Newcastle United,410


#### Shots on target ratio

In [329]:
# Share of each team's shots that were tagged accurate (i.e. on target), best first.
# 'accurate' is a 0/1 flag, so its mean is exactly that share.
shot_accuracy_by_team = (
    events.loc[events['eventName'] == 'Shot']
    .groupby('teamName')['accurate']
    .mean()
)
(
    shot_accuracy_by_team
    .reset_index(name='shots_on_target')
    .sort_values(by='shots_on_target', ascending=False)
)

Unnamed: 0,teamName,shots_on_target
10,Manchester City,0.414594
1,Arsenal,0.403346
9,Liverpool,0.373333
11,Manchester United,0.371245
19,West Ham United,0.370031
8,Leicester City,0.365333
4,Chelsea,0.361818
0,AFC Bournemouth,0.358354
3,Burnley,0.357558
14,Stoke City,0.349296


Total shots and the shots-on-target ratio are not as highly correlated as I'd expect.

In [350]:
# Team-level check: do teams that shoot more also hit the target more often?
shots = events[events['eventName'] == 'Shot']

# Total shots per team. The rename only has an effect on pandas versions where
# reset_index() labels the team column 'index'; where the column is already
# called 'teamName' it is a no-op.
total_shots = (
    shots['teamName']
    .value_counts()
    .reset_index(name='total_shots')
    .rename(columns={'index': 'teamName'})
)

# Fraction of each team's shots that were accurate
# ('accurate' is a 0/1 flag, so the mean is that fraction)
shots_on_target_ratio = (
    shots
    .groupby('teamName')['accurate']
    .mean()
    .reset_index(name='shots_on_target_ratio')
)

# One row per team holding both metrics
team_shot_data = total_shots.merge(shots_on_target_ratio, on='teamName')

# Correlation between the two metrics
correlation = team_shot_data[['total_shots', 'shots_on_target_ratio']].corr()
print(correlation)


                       total_shots  shots_on_target_ratio
total_shots               1.000000               0.593449
shots_on_target_ratio     0.593449               1.000000


#### Now I want to get all the goals conceded per team after the 75th minute

In [343]:
# eventSec counts seconds from the start of the current half, so the 75th
# minute of the match is 30 minutes (1800 seconds) into the second half.
late_goal_cutoff_sec = 1800

# A goal against: a save attempt that carries the 'Goal' tag. The row belongs
# to the team that conceded.
is_goal_against = (events['goal'] == 1) & (events['eventName'] == 'Save attempt')

# The event happened in the second half, after the cutoff.
is_late_second_half = (events['matchPeriod'] == '2H') & (events['eventSec'] > late_goal_cutoff_sec)

events['concedeLate'] = (is_goal_against & is_late_second_half).astype(int)

In [345]:
# Late goals conceded per team, most first. This also reflects team strength.
# Turning it into late goals *scored* needs some manipulation after joining to
# the matches table, because each row belongs to the conceding team.
conceded_late = events['concedeLate'].eq(1)
events.loc[conceded_late, 'teamName'].value_counts().reset_index()

Unnamed: 0,teamName,count
0,Watford,19
1,West Ham United,17
2,Everton,16
3,West Bromwich Albion,16
4,Stoke City,15
5,AFC Bournemouth,13
6,Brighton & Hove Albion,13
7,Leicester City,12
8,Swansea City,12
9,Crystal Palace,12


### Gathering all these stats on a game level before joining to matches

In [357]:


# Build one row of summary statistics per (team, match).
team_match_keys = ['teamName', 'matchId']


def _count_per_team_match(rows, name):
    """Count the rows of `rows` for each (teamName, matchId) pair.

    Returns a DataFrame with the two key columns plus a count column called `name`.
    """
    return rows.groupby(team_match_keys).size().reset_index(name=name)


def _accuracy_per_team_match(rows, name):
    """Fraction of `rows` flagged accurate for each (teamName, matchId) pair.

    'accurate' is a 0/1 flag, so its mean is the fraction of accurate rows.
    Returns a DataFrame with the two key columns plus a ratio column called `name`.
    """
    return rows.groupby(team_match_keys)['accurate'].mean().reset_index(name=name)


# Filter each event type once and reuse it for both the count and the ratio.
pass_events = events[events['eventName'] == 'Pass']
shot_events = events[events['eventName'] == 'Shot']

# 1. Pass count per team and match
pass_count = _count_per_team_match(pass_events, 'pass_count')

# 2. Accurate pass ratio per team and match
accurate_pass_ratio = _accuracy_per_team_match(pass_events, 'accurate_pass_ratio')

# 3. Launch count per team and match
launch_count = _count_per_team_match(
    events[events['subEventName'] == 'Launch'], 'launch_count'
)

# 4. Shot count per team and match
shot_count = _count_per_team_match(shot_events, 'shot_count')

# 5. Shots-on-target ratio per team and match
shots_on_target_ratio = _accuracy_per_team_match(shot_events, 'shots_on_target_ratio')

# 6. Late goals conceded per team and match
concede_late_count = _count_per_team_match(
    events[events['concedeLate'] == 1], 'concede_late_count'
)

# Combine everything into one table. Outer joins keep a (team, match) pair even
# when it is missing from one of the individual stats.
team_match_stats = pass_count
for stat in (
    accurate_pass_ratio,
    launch_count,
    shot_count,
    shots_on_target_ratio,
    concede_late_count,
):
    team_match_stats = team_match_stats.merge(stat, on=team_match_keys, how='outer')

# A stat with no entries for a (team, match) pair comes out of the joins as NaN;
# record it as 0. This also sets a ratio to 0 when the team had no events of
# that type (e.g. no shots in the match), and the count columns that picked up
# NaN in the joins are stored as floats.
team_match_stats = team_match_stats.fillna(0)


#two teams had no shots in an entire game that season.

In [368]:
# Team-match rows in which the team did not record a single shot.
team_match_stats[team_match_stats['shot_count']==0]

Unnamed: 0,teamName,matchId,pass_count,accurate_pass_ratio,launch_count,shot_count,shots_on_target_ratio,concede_late_count
275,Huddersfield Town,2499815,236,0.686441,11.0,0.0,0.0,0.0
599,Swansea City,2500013,145,0.655172,15.0,0.0,0.0,0.0


In [370]:
# Readable match labels for those shot-less games.
team_match_stats[team_match_stats['shot_count']==0]['matchId'].map(match_mapping)

275       Liverpool - Huddersfield Town, 3 - 0
599    Huddersfield Town - Swansea City, 0 - 0
Name: matchId, dtype: object

5 teams collapsed and conceded 3 late goals in a match

In [372]:
# Team-match rows with the highest number of late goals conceded in a single match.
team_match_stats[team_match_stats['concede_late_count'] == team_match_stats['concede_late_count'].max()]

Unnamed: 0,teamName,matchId,pass_count,accurate_pass_ratio,launch_count,shot_count,shots_on_target_ratio,concede_late_count
177,Chelsea,2499977,480,0.86875,7.0,6.0,0.5,3.0
232,Everton,2499764,469,0.850746,12.0,6.0,0.5,3.0
339,Leicester City,2500070,381,0.824147,6.0,6.0,0.166667,3.0
571,Swansea City,2499736,398,0.826633,13.0,7.0,0.285714,3.0
756,West Ham United,2500060,280,0.757143,15.0,10.0,0.4,3.0


In [373]:
# Readable match labels for those worst late collapses.
team_match_stats[team_match_stats['concede_late_count'] == team_match_stats['concede_late_count'].max()]['matchId'].map(match_mapping)

177                   Watford - Chelsea, 4 - 1
232         Manchester United - Everton, 4 - 0
339     Crystal Palace - Leicester City, 5 - 0
571    Swansea City - Manchester United, 0 - 4
756           Arsenal - West Ham United, 4 - 1
Name: matchId, dtype: object

In [374]:
# Preview the final per-team, per-match feature table.
team_match_stats.head()

Unnamed: 0,teamName,matchId,pass_count,accurate_pass_ratio,launch_count,shot_count,shots_on_target_ratio,concede_late_count
0,AFC Bournemouth,2499728,596,0.895973,11.0,8.0,0.25,0.0
1,AFC Bournemouth,2499729,418,0.794258,6.0,7.0,0.285714,1.0
2,AFC Bournemouth,2499739,223,0.726457,15.0,9.0,0.333333,1.0
3,AFC Bournemouth,2499749,386,0.76943,18.0,5.0,0.4,0.0
4,AFC Bournemouth,2499759,621,0.89372,8.0,13.0,0.230769,0.0


In [363]:
# Save the per-team, per-match stats to CSV; index=False leaves the row index
# out of the file.
team_match_stats.to_csv('data/team_match_stats_england.csv', index=False)