## Preparing a dataset for centre back model
Aim is to collect a dataset on match lineups (defenders specifically) and match stats, specifically xG, goals and also touches in box / final 3rd.

### 1. Set up / dependencies

In [1]:
from statsbombpy import sb
import numpy as np
import pandas as pd

### 2. Read in data

In [2]:
# get the competition / season_ids
comps = sb.competitions()
comps = comps[comps.competition_id == 37]
comps.season_id

credentials were not supplied. open data access only


15    90
16    42
17     4
Name: season_id, dtype: int64

In [3]:
# get all the match ids
matches1 = sb.matches(competition_id=37, season_id=4)
matches2 = sb.matches(competition_id=37, season_id=42)
matches3 = sb.matches(competition_id=37, season_id=90)
matches = pd.concat([matches1, matches2, matches3])
match_ids = matches.match_id.values

credentials were not supplied. open data access only
credentials were not supplied. open data access only
credentials were not supplied. open data access only


In [4]:
# get the lineup data
lineups = pd.DataFrame()
for m in match_ids:
    lineup = sb.lineups(match_id=m)
    teams = list(lineup.keys())
    team1 = teams[0]
    team2 = teams[1]
    lineup1 = pd.DataFrame(lineup[team1])
    lineup2 = pd.DataFrame(lineup[team2])
    lineup1["team"] = team1
    lineup2["team"] = team2  
    lineup1["match_id"] = m
    lineup2["match_id"] = m
    lineups = pd.concat([lineups, lineup1, lineup2])

credentials were not supplied. open data access only
credentials were not supplied. open data access only
credentials were not supplied. open data access only
credentials were not supplied. open data access only
credentials were not supplied. open data access only
credentials were not supplied. open data access only
credentials were not supplied. open data access only
credentials were not supplied. open data access only
credentials were not supplied. open data access only
credentials were not supplied. open data access only
credentials were not supplied. open data access only
credentials were not supplied. open data access only
credentials were not supplied. open data access only
credentials were not supplied. open data access only
credentials were not supplied. open data access only
credentials were not supplied. open data access only
credentials were not supplied. open data access only
credentials were not supplied. open data access only
credentials were not supplied. open data acces

In [5]:
# extract key info to new df columns
lineups["position"] = lineups["positions"].apply(lambda x: x[0]["position"] if len(x) > 0 else None)
lineups["start_time"] = lineups["positions"].apply(lambda x: x[0]["from"] if len(x) > 0 else None)
lineups["end_time"] = lineups["positions"].apply(lambda x: x[0]["to"] if len(x) > 0 else None)

In [6]:
# only really interested in centre backs
print(lineups.position.unique())
cbs = ["Left Center Back", "Right Center Back", "Center Back"]
cb_lineups = lineups[lineups.position.isin(cbs)]

['Left Center Back' 'Left Center Forward' 'Left Center Midfield'
 'Goalkeeper' 'Right Center Forward' 'Right Center Back' 'Center Midfield'
 'Center Back' 'Right Center Midfield' 'Left Midfield' 'Center Forward'
 'Right Midfield' 'Left Wing' 'Right Back' 'Left Back' 'Right Wing'
 'Center Attacking Midfield' None 'Left Defensive Midfield'
 'Right Defensive Midfield' 'Center Defensive Midfield'
 'Left Attacking Midfield' 'Right Attacking Midfield' 'Secondary Striker'
 'Right Wing Back' 'Left Wing Back']


In [7]:
cb_lineups.shape

(1476, 12)

In [8]:
# get event data related to matches
events = sb.events(match_id=match_ids[0])

credentials were not supplied. open data access only


In [11]:
# get shot data
shots = events[events.shot.notnull()]
shots = shots[["match_id", "id", "player", "minute", "possession_team", "shot"]]
shots["xG"] = shots["shot"].apply(lambda x: x["statsbomb_xg"])

In [12]:
# get possession data
possession = events[["match_id", "possession_team", "duration", "type", "location", "minute"]]

In [115]:
# will be helpful to have functions to return relevant possession / shots data
def get_shot_data(match_id):
    # get event data related to matches
    events = sb.events(match_id=match_id)
    # get possession data
    possession = events[["match_id", "possession_team", "duration", "type", "location", "minute"]]
    # return output df
    return possession

In [None]:
def get_possession_data(match_id):
    # get event data related to matches
    events = sb.events(match_id=match_id)
    # get shot data
    shots = events[events.shot.notnull()]
    shots = shots[["match_id", "id", "player", "minute", "possession_team", "shot"]]
    shots["xG"] = shots["shot"].apply(lambda x: x["statsbomb_xg"])
    # return output df
    return shots

### 3. Clean data
The clean dataset needs to contain each centre back combination for each game for each team, with which minutes they played and the possession / xG stats for those minutes

#### a) Tidy up lineup data

In [13]:
cb_lineups["start_minute"] = cb_lineups["start_time"].apply(lambda x: int(str(x).split(":")[0]))
cb_lineups["end_minute"] = cb_lineups["end_time"].apply(lambda x: int(str(x).split(":")[0]) if x is not None else None)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [119]:
# Get the lineups for each match first
player_ids = []
starts = []
ends = []
matches = []
teams = []

# get unique match / team combos
mts = set(list(zip(cb_lineups.match_id.values, cb_lineups.team.values)))

# loop through match-teams and compute different lineup combinations
for m, t in mts:
    # get basic info / num lineup changes
    match_team_df = cb_lineups[(cb_lineups.match_id == m) & (cb_lineups.team == t)]
    num_lineups = match_team_df[match_team_df.end_minute.notnull()].shape[0] + 1
    matches.append(m)
    teams.append(t)
    
    # get first lineup
    starts.append(0)
    ends.append(min(match_team_df["end_minute"]))
    current_lineup = list(match_team_df[match_team_df.start_minute == 0].player_id.values)
    add_lineup = current_lineup.copy()
    player_ids.append(add_lineup)
    
    # get subsequent lineups
    if num_lineups != 1:
        # get minutes the lineups changed
        changes = [x for x in match_team_df.start_minute.values if x > 0]
        change_ends = [x for x in match_team_df.end_minute.values if x > 0]
        changes.extend(change_ends)
        changes = list(set(changes))
        changes.sort()
        
        # loop through time points
        count = 1
        for c in changes:
            starts.append(c)
            player_on = list(match_team_df[match_team_df.start_minute == c].player_id)
            player_off = list(match_team_df[match_team_df.end_minute == c].player_id)
            if len(player_on) > 0:
                current_lineup.append(player_on[0])
                add_lineup = current_lineup.copy()
            if len(player_off) > 0:
                current_lineup.remove(player_off[0])
                add_lineup = current_lineup.copy()
            player_ids.append(add_lineup)
            if count < len(changes):
                ends.append(changes[count])
            else:
                ends.append(None)
            matches.append(m)
            teams.append(t)
            count += 1

[16381, 24922, 46741]
[16381, 24922, 46741]
[15578]


TypeError: 'int' object is not iterable

In [103]:
# create dataframe
lineups = pd.DataFrame({"match": matches,
                        "team": teams,
                        "start_time": starts,
                        "end_time": ends,
                        "player_ids":player_ids})

In [113]:
lineups.head(3)

Unnamed: 0,match,team,start_time,end_time,player_ids,xG_conceded
0,7298,Manchester City WFC,0.0,33.0,"[4648, 17524]",1.013509
1,7298,Manchester City WFC,33.0,,[17524],0.0
2,19778,Manchester City WFC,0.0,,"[10185, 17524]",0.0


#### b) Add possession data

In [None]:
# loop through matches and sum the opposition team possession durations for the relevant spell
# initialise vector for output
opp_possession = []





In [100]:
possession.sample(2)

Unnamed: 0,match_id,possession_team,duration,type,location,minute
1114,7298,Chelsea FCW,,Ball Receipt*,"[30.0, 68.0]",10
49,7298,Manchester City WFC,1.12,Pass,"[63.0, 70.0]",4


#### c) Add xG data

In [107]:
# get xG against for each lineup iteration in the dataset during the relevant time period
xG_conceded = []

# loop through lineups
for i, row in lineups.iterrows():
    # get relevant sub-df 
    match = row["match"]
    team = row["team"]
    start = row["start_time"]
    end = row["end_time"]
    xG_against_df = shots[(shots.match_id == match) & (shots.possession_team != team)]
    xG_against_df = xG_against_df[(xG_against_df.minute >= start) & (xG_against_df.minute < end)]

    # sum xG and append
    xG = xG_against_df.xG.sum()
    xG_conceded.append(xG)

lineups["xG_conceded"] = xG_conceded

## Save extract

In [112]:
lineups.to_csv("sample_extract.csv")