In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from scipy.stats import linregress
import ast

I am going to flesh out the features for the 'matches' table. I will start off by selecting just a couple of obvious ones, and then I will go back over things and get more features. 

I will start by looking at the "events" table and aggregating by the matchId identifier and joining these juicy stats onto the match table.

In [6]:
# Load the event-level records (one row per event) for the English matches.
events = pd.read_csv("data/events_England.csv")

## About this file

This dataset describes all the events that occur during each match. Each event refers to a ball touch and contains the following information:

- **eventId**: the identifier of the event's type. Each eventId is associated with an event name.
- **eventName**: the name of the event's type (pass, foul, shot, duel, free kick, offside, or touch).
- **subEventId**: the identifier of the subevent's type.
- **subEventName**: the name of the subevent's type, associated with different event types.
- **tags**: a list of event tags providing additional information about the event (e.g., accurate).
- **eventSec**: the time the event occurs (in seconds since the beginning of the current half).
- **id**: a unique identifier of the event.
- **matchId**: the identifier of the match, linked to "wyId" in the match dataset.
- **matchPeriod**: the period of the match ("1H", "2H", "E1", "E2", or "P").
- **playerId**: the identifier of the player who generated the event, linked to "wyId" in a player dataset.
- **positions**: the origin and destination positions associated with the event as (x, y) coordinates.
- **teamId**: the identifier of the player's team, linked to "wyId" in the team dataset.
- **tagsList**: same information as the tags column, except in list format.
- **pos_orig_x**: from the positions column, origin in x-axis.
- **pos_orig_y**: from the positions column, origin in y-axis.
- **pos_dest_x**: from the positions column, destination in x-axis.
- **pos_dest_y**: from the positions column, destination in y-axis.


In [3]:
# Preview the first five events.
events.head(n=5)

Unnamed: 0,eventId,subEventName,tags,playerId,positions,matchId,eventName,teamId,matchPeriod,eventSec,subEventId,id,tagsList,pos_orig_y,pos_orig_x,pos_dest_y,pos_dest_x
0,8,Simple pass,[{'id': 1801}],25413,"[{'y': 49, 'x': 49}, {'y': 78, 'x': 31}]",2499719,Pass,1609,1H,2.758649,85.0,177959171,[1801],49,49,78,31
1,8,High pass,[{'id': 1801}],370224,"[{'y': 78, 'x': 31}, {'y': 75, 'x': 51}]",2499719,Pass,1609,1H,4.94685,83.0,177959172,[1801],78,31,75,51
2,8,Head pass,[{'id': 1801}],3319,"[{'y': 75, 'x': 51}, {'y': 71, 'x': 35}]",2499719,Pass,1609,1H,6.542188,82.0,177959173,[1801],75,51,71,35
3,8,Head pass,[{'id': 1801}],120339,"[{'y': 71, 'x': 35}, {'y': 95, 'x': 41}]",2499719,Pass,1609,1H,8.143395,82.0,177959174,[1801],71,35,95,41
4,8,Simple pass,[{'id': 1801}],167145,"[{'y': 95, 'x': 41}, {'y': 88, 'x': 72}]",2499719,Pass,1609,1H,10.302366,85.0,177959175,[1801],95,41,88,72


In [5]:
# Load the lookup table that maps tag ids to human-readable labels.
tags2name = pd.read_csv("data/tags2name.csv")

##### Joining the tag labels onto the nested tagsList column

In [7]:
%%time
##takes about 42 seconds to run

# Parse the stringified lists in tagsList (e.g. "[1801]") into real Python lists.
# The isinstance guard leaves already-parsed values untouched, so this step is
# safe to re-run.
events['tagsList'] = events['tagsList'].apply(
    lambda x: ast.literal_eval(x) if isinstance(x, str) else x
)

# Explode only the tag column (one row per event/tag pair). The remaining event
# columns are not needed to look up labels, so they are not copied along.
exploded_events = events[['tagsList']].explode('tagsList')

# Cast to nullable Int64 so the key matches the integer Tag column in tags2name
# while still allowing missing values.
exploded_events['tagsList'] = exploded_events['tagsList'].astype('Int64')

# Carry the original event index as a column, because merge() discards the index.
exploded_events['origIndex'] = exploded_events.index

# Look up the label for every tag id (left join keeps rows without a match).
exploded_merged_events = exploded_events.merge(
    tags2name[['Tag', 'Label']],
    left_on='tagsList',
    right_on='Tag',
    how='left',
)

# Collapse back to one list of labels per original event. The result is a Series
# indexed by the original event index.
# NOTE(review): events with no tags presumably end up with a single missing
# label (e.g. [nan]) rather than an empty list -- confirm if that matters downstream.
label_lists = exploded_merged_events.groupby('origIndex')['Label'].agg(list)

# Attach the labels by index label rather than by position, so the result is
# correct even if `events` does not have a default 0..n-1 index, and so that
# re-running the cell overwrites `Label` instead of appending a duplicate column.
events = events.assign(Label=label_lists)

CPU times: user 41.9 s, sys: 157 ms, total: 42.1 s
Wall time: 42.1 s


##### Removing some columns that are no longer needed

In [10]:
# Drop the id / raw-tag / raw-position columns from the events table.
events.drop(
    columns=['eventId', 'subEventId', 'tagsList', 'positions', 'tags', 'id'],
    inplace=True,
)

In [27]:
# Preview the first eight events, including the new Label column.
events.head(n=8)

Unnamed: 0,subEventName,playerId,matchId,eventName,teamId,matchPeriod,eventSec,pos_orig_y,pos_orig_x,pos_dest_y,pos_dest_x,Label
0,Simple pass,25413,2499719,Pass,1609,1H,2.758649,49,49,78,31,[accurate]
1,High pass,370224,2499719,Pass,1609,1H,4.94685,78,31,75,51,[accurate]
2,Head pass,3319,2499719,Pass,1609,1H,6.542188,75,51,71,35,[accurate]
3,Head pass,120339,2499719,Pass,1609,1H,8.143395,71,35,95,41,[accurate]
4,Simple pass,167145,2499719,Pass,1609,1H,10.302366,95,41,88,72,[accurate]
5,Simple pass,3319,2499719,Pass,1609,1H,12.548934,88,72,75,77,[not accurate]
6,Head pass,8653,2499719,Pass,1631,1H,13.961228,25,23,15,39,[accurate]
7,Air duel,8013,2499719,Duel,1631,1H,14.765321,15,39,20,33,"[lost, not accurate]"


Number of shots by each team in each game

In [67]:
# TODO: work out an efficient way to join this onto the matches table.

# Count 'Shot' sub-events per (match, team). Flagging shots as a boolean column
# first lets the built-in 'sum' aggregation do the counting, instead of calling
# a Python lambda once per group.
(
    events
    .assign(isShot=events['subEventName'].eq('Shot'))
    .groupby(['matchId', 'teamId'])
    .agg(numShots=('isShot', 'sum'))
)

Unnamed: 0_level_0,Unnamed: 1_level_0,numShots
matchId,teamId,Unnamed: 2_level_1
2499719,1609,27
2499719,1631,7
2499720,1625,12
2499720,1651,6
2499721,1610,15
...,...,...
2500096,10531,24
2500097,1624,13
2500097,1631,17
2500098,1623,13


## Matches

In [125]:
# Load the match-level table for the English matches.
matches = pd.read_csv("data/matches_England.csv")

In [126]:
##Dropping unneeded columns: these columns are all either identical or they are captured within other columns

In [127]:
# Remove match-level columns that the analysis does not use.
matches.drop(
    columns=[
        'status', 'roundId', 'seasonId', 'duration', 'competitionId',
        'teamsData',
        'team1.formation', 'team2.formation',
        'team1.scoreET', 'team1.scoreP', 'team1.hasFormation',
        'team2.scoreET', 'team2.scoreP', 'team2.hasFormation',
        'date',
    ],
    inplace=True,
)

In [128]:
##And then dropping the subs bench info

In [129]:
# Remove the substitutes-bench columns for both teams.
matches.drop(
    columns=['team1.formation.bench', 'team2.formation.bench'],
    inplace=True,
)

In [130]:
#dropping venue too, but may use it at some stage on dashboards

In [131]:
# Remove the venue column.
matches.drop(columns=['venue'], inplace=True)

In [132]:
#dropping referees too, but may use it at some stage for features or for dashboards

In [133]:
# Remove the referees column.
matches.drop(columns=['referees'], inplace=True)

I want to rejig my table so that team1 columns always refer to the home team, and team2 columns refer to the away team.

First I'll replace the teamIds with the actual team names so that I can check this is working as expected.

### Teams

In [134]:
# Load the teams lookup table and preview its first five rows.
teams = pd.read_csv("data/teams.csv")
teams.head(n=5)

Unnamed: 0,city,name,wyId,officialName,area,type
0,Newcastle upon Tyne,Newcastle United,1613,Newcastle United FC,"{'name': 'England', 'id': '0', 'alpha3code': '...",club
1,Vigo,Celta de Vigo,692,Real Club Celta de Vigo,"{'name': 'Spain', 'id': '724', 'alpha3code': '...",club
2,Barcelona,Espanyol,691,Reial Club Deportiu Espanyol,"{'name': 'Spain', 'id': '724', 'alpha3code': '...",club
3,Vitoria-Gasteiz,Deportivo Alav\u00e9s,696,Deportivo Alav\u00e9s,"{'name': 'Spain', 'id': '724', 'alpha3code': '...",club
4,Valencia,Levante,695,Levante UD,"{'name': 'Spain', 'id': '724', 'alpha3code': '...",club


In [135]:
# Lookup dictionary: team wyId -> team name.
team_mapping = dict(zip(teams['wyId'], teams['name']))

In [136]:
# Replace the numeric team ids with team names in both team columns.
for id_col in ('team1.teamId', 'team2.teamId'):
    matches[id_col] = matches[id_col].map(team_mapping)

In [138]:
# Rename the former id columns so their names reflect that they hold team names.
matches.rename(
    columns={
        'team1.teamId': 'team1.teamName',
        'team2.teamId': 'team2.teamName',
    },
    inplace=True,
)

In [139]:
# Display the matches table to check that the team columns now show names.
matches

Unnamed: 0,gameweek,dateutc,winner,wyId,label,team1.coachId,team1.side,team1.teamName,team1.score,team1.scoreHT,team1.formation.lineup,team1.formation.substitutions,team2.coachId,team2.side,team2.teamName,team2.score,team2.scoreHT,team2.formation.lineup,team2.formation.substitutions
0,38,2018-05-13 14:00:00,1659,2500089,"Burnley - AFC Bournemouth, 1 - 2",8880,home,Burnley,1,1,"[{'playerId': 9206, 'ownGoals': '0', 'redCards...","[{'playerIn': 9127, 'playerOut': 9206, 'minute...",8934,away,AFC Bournemouth,2,0,"[{'playerId': 259531, 'ownGoals': '0', 'redCar...","[{'playerIn': 7989, 'playerOut': 259531, 'minu..."
1,38,2018-05-13 14:00:00,1628,2500090,"Crystal Palace - West Bromwich Albion, 2 - 0",8357,home,Crystal Palace,2,0,"[{'playerId': 8623, 'ownGoals': '0', 'redCards...","[{'playerIn': 38031, 'playerOut': 8623, 'minut...",0,away,West Bromwich Albion,0,0,"[{'playerId': 25537, 'ownGoals': '0', 'redCard...","[{'playerIn': 261, 'playerOut': 25537, 'minute..."
2,38,2018-05-13 14:00:00,1609,2500091,"Huddersfield Town - Arsenal, 0 - 1",7845,away,Arsenal,1,1,"[{'playerId': 25867, 'ownGoals': '0', 'redCard...","[{'playerIn': 7945, 'playerOut': 25867, 'minut...",18572,home,Huddersfield Town,0,0,"[{'playerId': 9419, 'ownGoals': '0', 'redCards...","[{'playerIn': 38377, 'playerOut': 9419, 'minut..."
3,38,2018-05-13 14:00:00,1612,2500092,"Liverpool - Brighton & Hove Albion, 4 - 0",8093,away,Brighton & Hove Albion,0,0,"[{'playerId': 120, 'ownGoals': '2', 'redCards'...","[{'playerIn': 15526, 'playerOut': 120, 'minute...",14791,home,Liverpool,4,2,"[{'playerId': 25747, 'ownGoals': '2', 'redCard...","[{'playerIn': 8140, 'playerOut': 25747, 'minut..."
4,38,2018-05-13 14:00:00,1611,2500093,"Manchester United - Watford, 1 - 0",93112,away,Watford,0,0,"[{'playerId': 68085, 'ownGoals': '0', 'redCard...","[{'playerIn': 8889, 'playerOut': 68085, 'minut...",3295,home,Manchester United,1,1,"[{'playerId': 7939, 'ownGoals': '0', 'redCards...","[{'playerIn': 8135, 'playerOut': 7939, 'minute..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
375,1,2017-08-12 14:00:00,1623,2499723,"Everton - Stoke City, 1 - 0",9054,away,Stoke City,0,0,"[{'playerId': 20450, 'ownGoals': '0', 'redCard...","[{'playerIn': 15198, 'playerOut': 20450, 'minu...",268779,home,Everton,1,1,"[{'playerId': 77546, 'ownGoals': '0', 'redCard...","[{'playerIn': 340, 'playerOut': 77546, 'minute..."
376,1,2017-08-12 14:00:00,0,2499726,"Southampton - Swansea City, 0 - 0",254174,home,Southampton,0,0,"[{'playerId': 20857, 'ownGoals': '0', 'redCard...","[{'playerIn': 8953, 'playerOut': 20857, 'minut...",381291,away,Swansea City,0,0,"[{'playerId': 77550, 'ownGoals': '0', 'redCard...","[{'playerIn': 62344, 'playerOut': 77550, 'minu..."
377,1,2017-08-12 14:00:00,1627,2499728,"West Bromwich Albion - AFC Bournemouth, 1 - 0",8934,away,AFC Bournemouth,0,0,"[{'playerId': 9293, 'ownGoals': '0', 'redCards...","[{'playerIn': 9739, 'playerOut': 9293, 'minute...",149026,home,West Bromwich Albion,1,1,"[{'playerId': 434159, 'ownGoals': '0', 'redCar...","[{'playerIn': 447254, 'playerOut': 434159, 'mi..."
378,1,2017-08-12 11:30:00,0,2499727,"Watford - Liverpool, 3 - 3",14791,away,Liverpool,3,1,"[{'playerId': 15808, 'ownGoals': '0', 'redCard...","[{'playerIn': 134708, 'playerOut': 15808, 'min...",71037,home,Watford,3,2,"[{'playerId': 160, 'ownGoals': '0', 'redCards'...","[{'playerIn': 3351, 'playerOut': 160, 'minute'..."


In [151]:
# Rows where team1 is the home side (copied so later edits do not touch `matches`).
home_teams1 = matches.loc[matches['team1.side'].eq('home')].copy()

In [152]:
# Keep only the team1 columns for those rows.
home_teams1 = home_teams1.loc[:, [
    'team1.coachId',
    'team1.side',
    'team1.teamName',
    'team1.score',
    'team1.scoreHT',
    'team1.formation.lineup',
    'team1.formation.substitutions',
]]

In [158]:
# Relabel the team1 columns with a `home.` prefix (same order as selected above).
home_teams1.columns = [
    'home.coachId',
    'home.side',
    'home.teamName',
    'home.score',
    'home.scoreHT',
    'home.formation.lineup',
    'home.formation.substitutions',
]

In [153]:
# Rows where team2 is the home side (copied so later edits do not touch `matches`).
home_teams2 = matches.loc[matches['team2.side'].eq('home')].copy()

In [154]:
# Keep only the team2 columns for those rows.
home_teams2 = home_teams2.loc[:, [
    'team2.coachId',
    'team2.side',
    'team2.teamName',
    'team2.score',
    'team2.scoreHT',
    'team2.formation.lineup',
    'team2.formation.substitutions',
]]

In [157]:
# Relabel the team2 columns with a `home.` prefix (same order as selected above).
home_teams2.columns = [
    'home.coachId',
    'home.side',
    'home.teamName',
    'home.score',
    'home.scoreHT',
    'home.formation.lineup',
    'home.formation.substitutions',
]

In [175]:

# Stack both home-side subsets and sort by the original row index.
home_teams = pd.concat([home_teams1, home_teams2]).sort_index()

Doing the same for the away teams

In [176]:
def _select_side(frame, team_prefix, side):
    """Return the `team_prefix` columns for rows where that team played on `side`.

    Parameters
    ----------
    frame : DataFrame
        Table with `<team_prefix>.<field>` columns (e.g. 'team1.side').
    team_prefix : str
        Column prefix to read from: 'team1' or 'team2'.
    side : str
        Value of `<team_prefix>.side` to keep; also used as the new column prefix.

    Returns
    -------
    DataFrame
        A copy of the matching rows (original row index preserved) with the
        columns renamed from `<team_prefix>.<field>` to `<side>.<field>`.
    """
    fields = ['coachId', 'side', 'teamName', 'score', 'scoreHT',
              'formation.lineup', 'formation.substitutions']
    source_cols = [team_prefix + '.' + field for field in fields]
    subset = frame.loc[frame[team_prefix + '.side'] == side, source_cols].copy()
    subset.columns = [side + '.' + field for field in fields]
    return subset


# Away-side rows taken from the team1 columns and from the team2 columns.
away_teams1 = _select_side(matches, 'team1', 'away')
away_teams2 = _select_side(matches, 'team2', 'away')

# Stack both subsets and sort by the original row index.
away_teams = pd.concat([away_teams1, away_teams2], axis=0).sort_index()

joining all this back onto the matches table

In [178]:
# Preview the match-level columns that are shared by both teams.
matches.loc[:, ['gameweek', 'dateutc', 'winner', 'wyId', 'label']]

Unnamed: 0,gameweek,dateutc,winner,wyId,label
0,38,2018-05-13 14:00:00,1659,2500089,"Burnley - AFC Bournemouth, 1 - 2"
1,38,2018-05-13 14:00:00,1628,2500090,"Crystal Palace - West Bromwich Albion, 2 - 0"
2,38,2018-05-13 14:00:00,1609,2500091,"Huddersfield Town - Arsenal, 0 - 1"
3,38,2018-05-13 14:00:00,1612,2500092,"Liverpool - Brighton & Hove Albion, 4 - 0"
4,38,2018-05-13 14:00:00,1611,2500093,"Manchester United - Watford, 1 - 0"
...,...,...,...,...,...
375,1,2017-08-12 14:00:00,1623,2499723,"Everton - Stoke City, 1 - 0"
376,1,2017-08-12 14:00:00,0,2499726,"Southampton - Swansea City, 0 - 0"
377,1,2017-08-12 14:00:00,1627,2499728,"West Bromwich Albion - AFC Bournemouth, 1 - 0"
378,1,2017-08-12 11:30:00,0,2499727,"Watford - Liverpool, 3 - 3"


In [180]:
# Independent copy of the match-level columns shared by both teams.
match_cols = matches.loc[:, ['gameweek', 'dateutc', 'winner', 'wyId', 'label']].copy()

In [182]:
# Rebuild `matches` side by side (aligned on the row index):
# shared match columns, then home.* columns, then away.* columns.
matches = pd.concat(
    objs=[match_cols, home_teams, away_teams],
    axis='columns',
)

In [183]:
# Display the rebuilt table: shared match columns followed by home.* and away.* columns.
matches

Unnamed: 0,gameweek,dateutc,winner,wyId,label,home.coachId,home.side,home.teamName,home.score,home.scoreHT,home.formation.lineup,home.formation.substitutions,away.coachId,away.side,away.teamName,away.score,away.scoreHT,away.formation.lineup,away.formation.substitutions
0,38,2018-05-13 14:00:00,1659,2500089,"Burnley - AFC Bournemouth, 1 - 2",8880,home,Burnley,1,1,"[{'playerId': 9206, 'ownGoals': '0', 'redCards...","[{'playerIn': 9127, 'playerOut': 9206, 'minute...",8934,away,AFC Bournemouth,2,0,"[{'playerId': 259531, 'ownGoals': '0', 'redCar...","[{'playerIn': 7989, 'playerOut': 259531, 'minu..."
1,38,2018-05-13 14:00:00,1628,2500090,"Crystal Palace - West Bromwich Albion, 2 - 0",8357,home,Crystal Palace,2,0,"[{'playerId': 8623, 'ownGoals': '0', 'redCards...","[{'playerIn': 38031, 'playerOut': 8623, 'minut...",0,away,West Bromwich Albion,0,0,"[{'playerId': 25537, 'ownGoals': '0', 'redCard...","[{'playerIn': 261, 'playerOut': 25537, 'minute..."
2,38,2018-05-13 14:00:00,1609,2500091,"Huddersfield Town - Arsenal, 0 - 1",18572,home,Huddersfield Town,0,0,"[{'playerId': 9419, 'ownGoals': '0', 'redCards...","[{'playerIn': 38377, 'playerOut': 9419, 'minut...",7845,away,Arsenal,1,1,"[{'playerId': 25867, 'ownGoals': '0', 'redCard...","[{'playerIn': 7945, 'playerOut': 25867, 'minut..."
3,38,2018-05-13 14:00:00,1612,2500092,"Liverpool - Brighton & Hove Albion, 4 - 0",14791,home,Liverpool,4,2,"[{'playerId': 25747, 'ownGoals': '2', 'redCard...","[{'playerIn': 8140, 'playerOut': 25747, 'minut...",8093,away,Brighton & Hove Albion,0,0,"[{'playerId': 120, 'ownGoals': '2', 'redCards'...","[{'playerIn': 15526, 'playerOut': 120, 'minute..."
4,38,2018-05-13 14:00:00,1611,2500093,"Manchester United - Watford, 1 - 0",3295,home,Manchester United,1,1,"[{'playerId': 7939, 'ownGoals': '0', 'redCards...","[{'playerIn': 8135, 'playerOut': 7939, 'minute...",93112,away,Watford,0,0,"[{'playerId': 68085, 'ownGoals': '0', 'redCard...","[{'playerIn': 8889, 'playerOut': 68085, 'minut..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
375,1,2017-08-12 14:00:00,1623,2499723,"Everton - Stoke City, 1 - 0",268779,home,Everton,1,1,"[{'playerId': 77546, 'ownGoals': '0', 'redCard...","[{'playerIn': 340, 'playerOut': 77546, 'minute...",9054,away,Stoke City,0,0,"[{'playerId': 20450, 'ownGoals': '0', 'redCard...","[{'playerIn': 15198, 'playerOut': 20450, 'minu..."
376,1,2017-08-12 14:00:00,0,2499726,"Southampton - Swansea City, 0 - 0",254174,home,Southampton,0,0,"[{'playerId': 20857, 'ownGoals': '0', 'redCard...","[{'playerIn': 8953, 'playerOut': 20857, 'minut...",381291,away,Swansea City,0,0,"[{'playerId': 77550, 'ownGoals': '0', 'redCard...","[{'playerIn': 62344, 'playerOut': 77550, 'minu..."
377,1,2017-08-12 14:00:00,1627,2499728,"West Bromwich Albion - AFC Bournemouth, 1 - 0",149026,home,West Bromwich Albion,1,1,"[{'playerId': 434159, 'ownGoals': '0', 'redCar...","[{'playerIn': 447254, 'playerOut': 434159, 'mi...",8934,away,AFC Bournemouth,0,0,"[{'playerId': 9293, 'ownGoals': '0', 'redCards...","[{'playerIn': 9739, 'playerOut': 9293, 'minute..."
378,1,2017-08-12 11:30:00,0,2499727,"Watford - Liverpool, 3 - 3",71037,home,Watford,3,2,"[{'playerId': 160, 'ownGoals': '0', 'redCards'...","[{'playerIn': 3351, 'playerOut': 160, 'minute'...",14791,away,Liverpool,3,1,"[{'playerId': 15808, 'ownGoals': '0', 'redCard...","[{'playerIn': 134708, 'playerOut': 15808, 'min..."
