In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from statsbombpy import sb

pd.set_option('display.max_columns', None)
pd.options.mode.chained_assignment = None  # default='warn'

import warnings
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)
warnings.filterwarnings(action="ignore", message="credentials were not supplied. open data access only")

In [2]:
competitions = sb.competitions()
# selected_competitions is where match_available_360 is not None
selected_competitions = competitions[competitions['match_available_360'].notnull()]

In [23]:
def get_fouls(match_ids_list, columns, subset):
  """
  This method gets all the fouls from a list of match IDs.

  Args:
    match_ids_list: A list of match IDs.
    columns: A list of the columns that are interesting.
    subset: A list of the columns that should not be NaN.

  Returns:
    A DataFrame of all the fouls.
  """

  fouls = pd.DataFrame()

  for match_id in tqdm(match_ids_list):
    match_events_df = sb.events(match_id=match_id)
    match_events_df_fouls = match_events_df.reindex(columns=columns)

    match_events_df_fouls.dropna(subset=subset, how='all', inplace=True)

    fouls = pd.concat([fouls, match_events_df_fouls], ignore_index=True)

  return fouls

In [5]:
# cmps = sb.competitions()

# # save competion_id and season_id in cmps where match_available_360 is not None
# cmps = cmps[cmps.match_available_360.notnull()]
# cmps = cmps[['competition_id', 'season_id']]

# # add column 'match_ids' to cmps. you can fill using sb.matches(row.competition_id, row.season_id). this should be a list
# cmps['match_ids'] = cmps.apply(lambda row: sb.matches(row.competition_id, row.season_id).match_id.tolist(), axis=1)

# # add column 'match_count' to cmps
# cmps['match_count'] = cmps.apply(lambda row: len(row.match_ids), axis=1)

In [6]:
# interesting_columns = ['foul_committed_advantage','foul_committed_offensive', 'foul_committed_penalty','foul_committed_card','foul_committed_counterpress','foul_committed_type', 'foul_won_advantage', 'foul_won_defensive', 'foul_won_penalty', 'id', 'index', 'location', 'match_id', 'minute', 'second', 'period', 'player', 'player_id', 'position', 'possession', 'possession_team', 'possession_team_id', 'team', 'team_id', 'timestamp', 'type']
# interesting_subset = ['foul_committed_advantage','foul_committed_card','foul_committed_offensive','foul_committed_penalty','foul_committed_counterpress','foul_committed_type']

# for index, cmp in cmps.iterrows():
#     # Call get_fouls() for each match in cmps. get_fouls() takes the match_ids list as the first argument.
#     all_fouls = get_fouls(cmp["match_ids"], interesting_columns, interesting_subset)
    
#     # Get count of each value in column 'foul_committed_card' and add this to cmps DataFrame. Save count of yellow cards in column 'yellow_card_count' and count of red cards in column 'red_card_count' and count of second yellow cards in column 'second_yellow_card_count'. Create these columns in cmps DataFrame if they don't exist.
#     yellow_card_count = all_fouls['foul_committed_card'].value_counts().get('Yellow Card', 0)
#     red_card_count = all_fouls['foul_committed_card'].value_counts().get('Red Card', 0)
#     second_yellow_card_count = all_fouls['foul_committed_card'].value_counts().get('Second Yellow', 0)
    
#     cmps.loc[index, 'yellow_card_count'] = yellow_card_count
#     cmps.loc[index, 'red_card_count'] = red_card_count
#     cmps.loc[index, 'second_yellow_card_count'] = second_yellow_card_count

100%|██████████| 64/64 [00:33<00:00,  1.89it/s]
100%|██████████| 35/35 [00:18<00:00,  1.87it/s]
100%|██████████| 51/51 [00:27<00:00,  1.85it/s]
100%|██████████| 31/31 [00:16<00:00,  1.93it/s]
100%|██████████| 64/64 [00:33<00:00,  1.91it/s]


In [5]:
# cmps.head(10)
# # save cmps to pickle file
# cmps.to_pickle('cmps.pkl')

# load cmps from pickle file
cmps = pd.read_pickle('cmps.pkl')
cmps.head()

Unnamed: 0,competition_id,season_id,match_ids,match_count,yellow_card_count,red_card_count,second_yellow_card_count
26,43,106,"[3857256, 3869151, 3857257, 3857258, 3857288, ...",64,182.0,1.0,1.0
35,11,90,"[3773386, 3773565, 3773457, 3773631, 3773665, ...",35,112.0,4.0,4.0
62,55,43,"[3795108, 3788769, 3788766, 3795220, 3788761, ...",51,124.0,5.0,1.0
64,53,106,"[3835331, 3835324, 3844384, 3847567, 3845506, ...",31,59.0,0.0,2.0
65,72,107,"[3904629, 3906390, 3906389, 3904628, 3893806, ...",64,97.0,3.0,2.0


In [6]:
def get_match_ids_info(competitions):
  """
  Get the match IDs and information for each match in the selected competitions.

  Args:
    competitions: A pandas DataFrame of selected competitions.

  Returns:
    A tuple of two lists: `match_ids_list` and `match_ids_info`.
  """

  match_ids_list = []
  match_ids_info = {}

  for index, row in competitions.iterrows():
    matches = sb.matches(row.competition_id, row.season_id)

    for match in matches.itertuples():
      match_ids_list.append(match.match_id)
      match_ids_info[match.match_id] = [match.home_team, match.away_team]

  return match_ids_list, match_ids_info

In [7]:
# interesting_columns = ['foul_committed_advantage','foul_committed_offensive', 'foul_committed_penalty','foul_committed_card','foul_committed_counterpress','foul_committed_type', 'foul_won_advantage', 'foul_won_defensive', 'foul_won_penalty', 'id', 'index', 'location', 'match_id', 'minute', 'second', 'period', 'player', 'player_id', 'position', 'possession', 'possession_team', 'possession_team_id', 'team', 'team_id', 'timestamp', 'type']
# interesting_subset = ['foul_committed_advantage','foul_committed_card','foul_committed_offensive','foul_committed_penalty','foul_committed_counterpress','foul_committed_type']

# match_ids_list, match_ids_info = get_match_ids_info(selected_competitions)
# all_fouls = get_fouls(match_ids_list, interesting_columns, interesting_subset)

100%|██████████| 245/245 [02:07<00:00,  1.92it/s]


In [8]:
# all_fouls.to_pickle('all_fouls.pkl')

# load all_fouls from pickle file
all_fouls = pd.read_pickle('all_fouls.pkl')

In [9]:
all_fouls.head(2)

Unnamed: 0,foul_committed_advantage,foul_committed_offensive,foul_committed_penalty,foul_committed_card,foul_committed_counterpress,foul_committed_type,foul_won_advantage,foul_won_defensive,foul_won_penalty,id,index,location,match_id,minute,second,period,player,player_id,position,possession,possession_team,possession_team_id,team,team_id,timestamp,type
0,,,,Yellow Card,,,,,,9221da02-9c87-4f98-a8ea-de9994ad562b,527,"[10.8, 76.4]",3857256,14,27,1,Silvan Widmer,7796.0,Right Back,37,Serbia,786,Switzerland,773,00:14:27.221,Foul Committed
1,True,,,,,,,,,f2bd21f6-87c3-44c0-bb37-04e996db413e,798,"[55.1, 36.2]",3857256,23,9,1,Saša Lukić,6687.0,Right Defensive Midfield,61,Serbia,786,Serbia,786,00:23:09.628,Foul Committed


In [10]:
def get_goals(match_ids):
  """
  Get all goals from a list of match IDs.

  Args:
    match_ids: A list of match IDs.

  Returns:
    A DataFrame of goals and match events
  """

  goals = pd.DataFrame()
  match_events = pd.DataFrame()

  for match_id in tqdm(match_ids):
    match_events_df = sb.events(match_id=match_id)
    match_events = pd.concat([match_events, match_events_df], ignore_index=True)

    match_events_df_shots = match_events_df[match_events_df.type == 'Shot']
    match_events_df_goals = match_events_df_shots[match_events_df_shots.shot_outcome == 'Goal']

    match_events_df_goals['seconds'] = match_events_df_goals['minute'] * 60 + match_events_df_goals['second']
    goals = pd.concat([goals, match_events_df_goals], ignore_index=True)

  return goals, match_events

In [11]:
# all_goals, match_events = get_goals(match_ids_list)

# all_goals.to_pickle('all_goals.pkl')
# match_events.to_pickle('match_events.pkl')

# load all_goals and match_events from pickle file
all_goals = pd.read_pickle('all_goals.pkl')
match_events = pd.read_pickle('match_events.pkl')

  goals = pd.concat([goals, match_events_df_goals], ignore_index=True)
100%|██████████| 245/245 [02:57<00:00,  1.38it/s]


In [12]:
def get_scoreline(goals, match_events, match_id, seconds):
    match_goals = goals[goals['match_id'] == match_id]
    
    # find unique team_id
    teams = match_events['team'].unique()
    match_goals = match_goals[match_goals['seconds'] <= seconds]

    # find goal scored by each_team
    team_goal_count = match_goals.groupby('team')['team'].count()
    team_goal_dict = {}
    
    for team in teams:
        # if team_goal_count['team'] is NaN, then add to team_goal_dict with key as team_name and value as 0
        if team not in team_goal_count:
            team_goal_dict[team] = 0
        else:
            team_goal_dict[team] = team_goal_count[team]
            
    return team_goal_dict


def distance_to_goal(isHomeTeam, location):
    if isHomeTeam:
        # calculate distance from location to away goal
        return np.sqrt((120 - location[0])**2 + (40 - location[1])**2)
    else:
        # calculate distance from location to home goal
        return np.sqrt((0 - location[0])**2 + (40 - location[1])**2)


def angle_to_goal(isHomeTeam, location):
    if isHomeTeam:
        # calculate angle from location to away goal
        return np.arctan((40 - location[1]) / (120 - location[0]))
    else:
        # calculate angle from location to home goal
        return np.arctan((40 - location[1]) / (0 - location[0]))

In [13]:
# Write method to calculate previous foul count of the player making the foul. Inputs will be match_id, player_id, timestamp
def previous_foul_count(fouls, match_id, player_id, seconds):
    # get all fouls in the match
    match_fouls_df = fouls[fouls['match_id'] == match_id]
    
    # get all fouls by player_id
    player_fouls_df = match_fouls_df[match_fouls_df['player_id'] == player_id]
    
    player_fouls_df['seconds'] = player_fouls_df['minute'] * 60 + player_fouls_df['second']

    # get all fouls before timestamp
    previous_fouls_df = player_fouls_df[player_fouls_df['seconds'] <= seconds]
    
    # return count of fouls
    return len(previous_fouls_df)


# Write method to calculate previous foul count of the team making the foul. Inputs will be match_id, team_id, timestamp
def previous_foul_count_team(fouls, match_id, team_id, seconds):
    # get all fouls in the match
    match_fouls_df = fouls[fouls['match_id'] == match_id]
    
    # get all fouls by team_id
    team_fouls_df = match_fouls_df[match_fouls_df['team_id'] == team_id]
    
    team_fouls_df['seconds'] = team_fouls_df['minute'] * 60 + team_fouls_df['second']

    # get all fouls before timestamp
    team_fouls_df = team_fouls_df[team_fouls_df['seconds'] <= seconds]
    
    # return count of fouls
    return len(team_fouls_df)

In [14]:
def make_advanced_fouls(fouls, goals, match_events, match_ids_info):
  """
  This function takes a DataFrame of fouls and a `match_ids_info` DataFrame and returns a new DataFrame with advanced foul metrics.

  Args:
    all_fouls: A DataFrame of fouls.
    match_ids_info: A DataFrame that contains information about the matches, such as the home team and the away team.

  Returns:
    A DataFrame of advanced fouls.
  """

  # Copy the original DataFrame.
  fouls_advanced = fouls.copy()

  fouls_advanced['seconds_till_now'] = fouls_advanced['minute'] * 60 + fouls_advanced['second']
  fouls_advanced['scoreline_till_now'] = fouls_advanced.apply(lambda x: get_scoreline(goals, match_events, x['match_id'], x['seconds_till_now']), axis=1)
  fouls_advanced['distance_to_goal'] = fouls_advanced.apply(lambda x: distance_to_goal(x['team'] == match_ids_info[x['match_id']][0], x['location']), axis=1)
  fouls_advanced['angle_to_goal'] = fouls_advanced.apply(lambda x: angle_to_goal(x['team'] == match_ids_info[x['match_id']][0], x['location']), axis=1)

  # Calculate the number of fouls committed by the player up to the time of the foul.
  # all_fouls_advanced['foul_count_player_till_now'] = all_fouls_advanced.apply(lambda x: previous_foul_count(all_fouls, x['match_id'], x['player_id'], x['seconds_till_now']), axis=1)

  # Calculate the number of fouls committed by the team up to the time of the foul.
  # all_fouls_advanced['foul_count_team_till_now'] = all_fouls_advanced.apply(lambda x: previous_foul_count_team(all_fouls, x['match_id'], x['team_id'], x['seconds_till_now']), axis=1)

  return fouls_advanced

In [24]:
# all_fouls_advanced = make_advanced_fouls(all_fouls, all_goals, match_events, match_ids_info)
# all_fouls_advanced['foul_count_player_till_now'] = all_fouls_advanced.apply(lambda x: previous_foul_count(all_fouls, x['match_id'], x['player_id'], x['seconds_till_now']), axis=1)
# all_fouls_advanced['foul_count_team_till_now'] = all_fouls_advanced.apply(lambda x: previous_foul_count_team(all_fouls, x['match_id'], x['team_id'], x['seconds_till_now']), axis=1)

# all_fouls_advanced.to_pickle('all_fouls_advanced.pkl')


# load all_fouls_advanced from pickle file
all_fouls_advanced = pd.read_pickle('all_fouls_advanced.pkl')

In [17]:
len(all_fouls_advanced)

1684

In [18]:
all_fouls_advanced['foul_committed_type'].value_counts()

Handball          286
Dangerous Play     38
Foul Out           20
Dive                7
Name: foul_committed_type, dtype: int64

In [19]:
import numpy as np

type_dict = {'Handball': 0, 'Dangerous Play': 1, 'Foul Out': 2, 'Dive': 3, '6 Seconds': 4, 'Backpass Pick': 5}

card_dict = {'Yellow Card': 0, 'Second Yellow': 0, 'Red Card': 1}

def create_features(team_fouls_df):
    all_features = []
    all_labels = []

    for _, row in team_fouls_df.iterrows():
        # select features only where foul_committed_type is not in type_dict keys
        if row['foul_committed_type'] in type_dict.keys():
            continue
        
        team_making_foul = row['team']

        scoreline = row['scoreline_till_now']
        team_not_making_foul = [team for team in scoreline.keys() if team != team_making_foul][0]
        
        score_difference_till_now = scoreline[team_not_making_foul] - scoreline[team_making_foul]

        features = [row['minute'], score_difference_till_now, row['distance_to_goal'], row['angle_to_goal'], row['foul_count_player_till_now'], row['foul_count_team_till_now'], row['id']]

        all_features.append(features)
        all_labels.append([card_dict.get(row['foul_committed_card'], 2)])
    
    return all_features, all_labels

In [20]:
import pickle

all_the_features = []
all_the_labels = []

# use tqdm to iterate match_ids_list
for match_id in tqdm(match_ids_list):
    # get all fouls in the match
    match_fouls_df = all_fouls_advanced[all_fouls_advanced['match_id'] == match_id]

    feature, label = create_features(match_fouls_df)

    # append features and labels to all_the_features and all_the_labels
    all_the_features.extend(feature)
    all_the_labels.extend(label)

# save all_the_features and all_the_labels to files using pickle
with open('all_the_features.pkl', 'wb') as f:
    pickle.dump(all_the_features, f)

with open('all_the_labels.pkl', 'wb') as f:
    pickle.dump(all_the_labels, f)

100%|██████████| 245/245 [00:00<00:00, 1411.56it/s]


In [21]:
print (len(all_the_features))

# first 10 items in all_the_features and all_the_labels
print (all_the_features[:100])
print (all_the_labels[:100])

# print count of unique labels
print (np.unique(all_the_labels, return_counts=True))

1333
[[14, 0, 37.96840792026972, 1.2823667682765016, 1, 1, '9221da02-9c87-4f98-a8ea-de9994ad562b'], [23, 1, 65.01115288933123, 0.058484844490939804, 1, 1, 'f2bd21f6-87c3-44c0-bb37-04e996db413e'], [33, 0, 69.64165420206503, -0.3063112726462804, 1, 2, '8e51baec-6948-4285-86a1-73197126e265'], [46, 0, 52.10614167255142, -0.26406386883592514, 1, 2, 'b711f529-182e-4ae3-8c9a-189b7697b2ea'], [55, 1, 42.569942447694245, 0.5541172736434499, 1, 3, 'ec46c71c-af4c-4d99-a854-e4ffc30460d6'], [80, 1, 98.5211652387445, 0.09249778257668083, 1, 5, '142138e3-20c9-46b2-80cd-db92a583bc81'], [86, 1, 64.82352967865913, 0.061745215024700977, 3, 6, '3d99f9db-1f6c-45e1-b604-73f118e0bb16'], [98, -1, 33.64342432036311, 0.9415629774702305, 1, 3, '4626dd9d-ced6-4b46-9fb6-01a9b74ba80a'], [99, 1, 31.49857139617605, -0.7044940642422177, 4, 7, 'dabe7e24-5529-4444-9da9-7099b4743e21'], [10, 0, 84.25942083826591, 0.237206798883644, 1, 1, '663f1d0d-552a-4125-8aad-a47d30270b0c'], [14, 0, 86.55963262398934, 0.4238547686137660

In [22]:
# save match_ids_list to file using pickle
with open('match_ids_list.pkl', 'wb') as f:
    pickle.dump(match_ids_list, f)

In [25]:
all_fouls_advanced.head(2)

Unnamed: 0,foul_committed_advantage,foul_committed_offensive,foul_committed_penalty,foul_committed_card,foul_committed_counterpress,foul_committed_type,foul_won_advantage,foul_won_defensive,foul_won_penalty,id,index,location,match_id,minute,second,period,player,player_id,position,possession,possession_team,possession_team_id,team,team_id,timestamp,type,seconds_till_now,scoreline_till_now,distance_to_goal,angle_to_goal,foul_count_player_till_now,foul_count_team_till_now
0,,,,Yellow Card,,,,,,9221da02-9c87-4f98-a8ea-de9994ad562b,527,"[10.8, 76.4]",3857256,14,27,1,Silvan Widmer,7796.0,Right Back,37,Serbia,786,Switzerland,773,00:14:27.221,Foul Committed,867,"{'Serbia': 0, 'Switzerland': 0, 'Argentina': 0...",37.968408,1.282367,1,1
1,True,,,,,,,,,f2bd21f6-87c3-44c0-bb37-04e996db413e,798,"[55.1, 36.2]",3857256,23,9,1,Saša Lukić,6687.0,Right Defensive Midfield,61,Serbia,786,Serbia,786,00:23:09.628,Foul Committed,1389,"{'Serbia': 0, 'Switzerland': 1, 'Argentina': 0...",65.011153,0.058485,1,1
