## Preparing a dataset for centre back model
Aim is to collect a dataset on match lineups (defenders specifically) and match stats, specifically xG, goals and also touches in box / final 3rd.

### 1. Set up / dependencies

In [1]:
from statsbombpy import sb
import numpy as np
import pandas as pd

### 2. Read in data

In [2]:
# get the competition / season_ids
comps = sb.competitions()
comps = comps[comps.competition_id == 37]
comps.season_id

credentials were not supplied. open data access only


15    90
16    42
17     4
Name: season_id, dtype: int64

In [3]:
# get all the match ids
matches1 = sb.matches(competition_id=37, season_id=4)
matches2 = sb.matches(competition_id=37, season_id=42)
matches3 = sb.matches(competition_id=37, season_id=90)
matches = pd.concat([matches1, matches2, matches3])
match_ids = matches.match_id.values

credentials were not supplied. open data access only
credentials were not supplied. open data access only
credentials were not supplied. open data access only


In [4]:
# get the lineup data
lineups = pd.DataFrame()
for m in match_ids:
    lineup = sb.lineups(match_id=m)
    teams = list(lineup.keys())
    team1 = teams[0]
    team2 = teams[1]
    lineup1 = pd.DataFrame(lineup[team1])
    lineup2 = pd.DataFrame(lineup[team2])
    lineup1["team"] = team1
    lineup2["team"] = team2  
    lineup1["match_id"] = m
    lineup2["match_id"] = m
    lineups = pd.concat([lineups, lineup1, lineup2])

credentials were not supplied. open data access only
credentials were not supplied. open data access only
credentials were not supplied. open data access only
credentials were not supplied. open data access only
credentials were not supplied. open data access only
credentials were not supplied. open data access only
credentials were not supplied. open data access only
credentials were not supplied. open data access only
credentials were not supplied. open data access only
credentials were not supplied. open data access only
credentials were not supplied. open data access only
credentials were not supplied. open data access only
credentials were not supplied. open data access only
credentials were not supplied. open data access only
credentials were not supplied. open data access only
credentials were not supplied. open data access only
credentials were not supplied. open data access only
credentials were not supplied. open data access only
credentials were not supplied. open data acces

In [5]:
# extract key info to new df columns
lineups["position"] = lineups["positions"].apply(lambda x: x[0]["position"] if len(x) > 0 else None)
lineups["start_time"] = lineups["positions"].apply(lambda x: x[0]["from"] if len(x) > 0 else None)
lineups["end_time"] = lineups["positions"].apply(lambda x: x[0]["to"] if len(x) > 0 else None)

In [6]:
# only really interested in centre backs
print(lineups.position.unique())
cbs = ["Left Center Back", "Right Center Back", "Center Back"]
cb_lineups = lineups[lineups.position.isin(cbs)]

['Left Center Back' 'Left Center Forward' 'Left Center Midfield'
 'Goalkeeper' 'Right Center Forward' 'Right Center Back' 'Center Midfield'
 'Center Back' 'Right Center Midfield' 'Left Midfield' 'Center Forward'
 'Right Midfield' 'Left Wing' 'Right Back' 'Left Back' 'Right Wing'
 'Center Attacking Midfield' None 'Left Defensive Midfield'
 'Right Defensive Midfield' 'Center Defensive Midfield'
 'Left Attacking Midfield' 'Right Attacking Midfield' 'Secondary Striker'
 'Right Wing Back' 'Left Wing Back']


In [7]:
cb_lineups.shape

(1476, 12)

In [8]:
# get event data related to matches
events = sb.events(match_id=match_ids[0])

credentials were not supplied. open data access only


In [9]:
events.shape

(3793, 35)

In [10]:
events.sample(5)

Unnamed: 0,bad_behaviour,ball_receipt,ball_recovery,block,carry,counterpress,dribble,duel,duration,foul_committed,...,possession_team,related_events,second,shot,substitution,tactics,team,timestamp,type,under_pressure
1199,,,,,,,,,,,...,Manchester City WFC,[6c13009a-c442-44cf-b1e1-599d84b57d69],36,,,,Manchester City WFC,00:19:36.793,Ball Receipt*,
2217,,,,,"{'end_location': [35.0, 75.0]}",,,,1.16,,...,Chelsea FCW,"[52dbc400-a9c1-42ce-b1ef-d86f937288ab, dc407f3...",11,,,,Chelsea FCW,00:31:11.820,Carry,
3,,,,,,,,,8.16,,...,Manchester City WFC,[040940a1-5972-431e-b6ac-e723edd8e7c2],0,,,,Chelsea FCW,00:00:00.000,Half Start,
954,,,,,,,,,0.8,,...,Chelsea FCW,[46168d11-c4e2-424d-a720-960797bde79c],45,,,,Chelsea FCW,00:42:45.278,Pass,
1193,,,,,,,,,,,...,Chelsea FCW,[2c6280ac-e01c-4087-b287-07a899175937],32,,,,Chelsea FCW,00:18:32.300,Ball Receipt*,


In [11]:
# get shot data
shots = events[events.shot.notnull()]
shots = shots[["match_id", "id", "player", "minute", "possession_team", "shot"]]
shots["xG"] = shots["shot"].apply(lambda x: x["statsbomb_xg"])

In [12]:
# get possession data
possession = events[["match_id", "possession_team", "duration", "type", "location", "minute"]]

### 3. Clean data
The clean dataset needs to contain each centre back combination for each game for each team, with which minutes they played and the possession / xG stats for those minutes

In [13]:
cb_lineups["start_minute"] = cb_lineups["start_time"].apply(lambda x: int(str(x).split(":")[0]))
cb_lineups["end_minute"] = cb_lineups["end_time"].apply(lambda x: int(str(x).split(":")[0]) if x is not None else None)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [14]:
cb_lineups.head(2)

Unnamed: 0,player_id,player_name,player_nickname,jersey_number,country,cards,positions,team,match_id,position,start_time,end_time,start_minute,end_minute
0,4633,Magdalena Lilly Eriksson,,16,Sweden,[],"[{'position_id': 5, 'position': 'Left Center B...",Chelsea FCW,7298,Left Center Back,00:00,,0,
5,4642,Millie Bright,,4,England,[],"[{'position_id': 3, 'position': 'Right Center ...",Chelsea FCW,7298,Right Center Back,00:00,31:37,0,31.0


In [97]:
# Get the lineups for each match first
player_ids = []
starts = []
ends = []
matches = []
teams = []

# get unique match / team combos
mts = set(list(zip(cb_lineups.match_id.values, cb_lineups.team.values))[:30])

# loop through match-teams and compute different lineup combinations
for m, t in mts:
    # get basic info / num lineup changes
    match_team_df = cb_lineups[(cb_lineups.match_id == m) & (cb_lineups.team == t)]
    num_lineups = match_team_df[match_team_df.end_minute.notnull()].shape[0] + 1
    matches.append(m)
    teams.append(t)
    
    # get first lineup
    starts.append(0)
    ends.append(min(match_team_df["end_minute"]))
    current_lineup = list(match_team_df[match_team_df.start_minute == 0].player_id.values)
    add_lineup = current_lineup.copy()
    player_ids.append(add_lineup)
    
    # get subsequent lineups
    if num_lineups != 1:
        # get minutes the lineups changed
        changes = [x for x in match_team_df.start_minute.values if x > 0]
        change_ends = [x for x in match_team_df.end_minute.values if x > 0]
        changes.extend(change_ends)
        changes = list(set(changes))
        changes.sort()
        
        # loop through time points
        count = 1
        for c in changes:
            starts.append(c)
            player_on = list(match_team_df[match_team_df.start_minute == c].player_id)
            player_off = list(match_team_df[match_team_df.end_minute == c].player_id)
            if len(player_on) > 0:
                current_lineup.extend(player_on[0])
                add_lineup = current_lineup.copy()
            if len(player_off) > 0:
                current_lineup.remove(player_off[0])
                add_lineup = current_lineup.copy()
            player_ids.append(add_lineup)
            if count < len(changes):
                ends.append(changes[count])
            else:
                ends.append(None)
            matches.append(m)
            teams.append(t)
            count += 1

In [98]:
# create dataframe
lineups = pd.DataFrame({"matches": matches,
                        "teams": teams,
                        "start_time": starts,
                        "end_time": ends,
                        "player_ids":player_ids})

In [99]:
lineups

Unnamed: 0,matches,teams,start_time,end_time,player_ids
0,7298,Manchester City WFC,0.0,33.0,"[4648, 17524]"
1,7298,Manchester City WFC,33.0,,[17524]
2,19778,Manchester City WFC,0.0,,"[10185, 17524]"
3,19772,Reading WFC,0.0,,"[10198, 18152]"
4,19771,Yeovil Town LFC,0.0,4.0,"[15715, 15717]"
5,19771,Yeovil Town LFC,4.0,,[15717]
6,19778,Birmingham City WFC,0.0,,"[19502, 19503]"
7,19745,Brighton & Hove Albion WFC,0.0,,"[16394, 16395]"
8,19730,Chelsea FCW,0.0,,"[4633, 4642, 10395]"
9,19736,Chelsea FCW,0.0,,"[4633, 4642]"


In [None]:
possession.sample(2)

Unnamed: 0,match_id,possession_team,duration,type,location,minute
2608,7298,Manchester City WFC,3.24,Carry,"[71.0, 9.0]",72
1752,7298,Manchester City WFC,,Ball Receipt*,"[76.0, 9.0]",72


In [None]:
cb_lineups[cb_lineups.match_id == match_ids[132]]

Unnamed: 0,player_id,player_name,player_nickname,jersey_number,country,cards,positions,team,match_id,position,start_time,end_time
8,10185,Stephanie Houghton,,6,England,[],"[{'position_id': 3, 'position': 'Right Center ...",Manchester City WFC,2275092,Right Center Back,00:00,
10,15554,Gemma Bonner,,4,England,[],"[{'position_id': 5, 'position': 'Left Center B...",Manchester City WFC,2275092,Left Center Back,00:00,
2,15569,Kerys Harrop,,6,England,[],"[{'position_id': 5, 'position': 'Left Center B...",Birmingham City WFC,2275092,Left Center Back,74:11,
7,19592,Harriet Scott,,3,Ireland,[],"[{'position_id': 3, 'position': 'Right Center ...",Birmingham City WFC,2275092,Right Center Back,00:00,
13,31568,Rebecca Holloway,,25,Northern Ireland,[],"[{'position_id': 5, 'position': 'Left Center B...",Birmingham City WFC,2275092,Left Center Back,00:00,74:11


In [None]:
shots.sample(2)

Unnamed: 0,match_id,id,player,minute,possession_team,shot,xG
3535,7298,23d74ba8-6e5d-4855-8b3a-81b310e9d35a,Jill Scott,26,Manchester City WFC,"{'statsbomb_xg': 0.08011432, 'end_location': [...",0.080114
3537,7298,ea3833bf-1f02-4fba-9b27-8a1ebc28d732,Ramona Bachmann,39,Chelsea FCW,"{'statsbomb_xg': 0.024449918, 'end_location': ...",0.02445
