In [11]:
import pandas as pd
import numpy as np

#Plotting libs
import matplotlib.pyplot as plt
import seaborn as sns 
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots


#Football libs
import socceraction
from socceraction.data.statsbomb import StatsBombLoader
from mplsoccer import Pitch, Sbopen, VerticalPitch
import socceraction.spadl as spadl
import matplotsoccer as mps
import socceraction.xthreat as xthreat
import socceraction.spadl as spadl
from socceraction.vaep import VAEP

# utils
import warnings
warnings.filterwarnings('ignore')
from tqdm import tqdm
# fuzz is used to compare TWO strings
from fuzzywuzzy import fuzz
# process is used to compare a string to MULTIPLE other strings
from fuzzywuzzy import process

# ML libs
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import SelectKBest, chi2, f_classif, f_regression
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler,  MinMaxScaler, RobustScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.cluster import DBSCAN
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn import svm
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
from sklearn.neighbors import KNeighborsRegressor
from sklearn.decomposition import PCA



In [12]:
# Set up the StatsBomb data loader
SBL = StatsBombLoader()


In [14]:
# Create a dataframe with all games from FA Women's Super League 2020/2021 season as our model test dataset

# Create a dataframe with all the games in the previous 2 seasons as our train and validation set

df_games_season_4 = SBL.games(competition_id=37, season_id=4).set_index("game_id")
df_games_season_42 = SBL.games(competition_id=37, season_id=42).set_index("game_id")
df_games = pd.concat([df_games_season_4, df_games_season_42], axis=0)
df_games_test = SBL.games(competition_id=37, season_id=90).set_index("game_id")


## Convert training dataset to SPADL Schema and compute VAEP Score for all actions

In [30]:
VAEP_model = VAEP(nb_prev_actions=5)

# compute features and labels for each game
all_features, all_labels, all_actions = [], [], []
for game_id, game in tqdm(list(df_games.iterrows())):
    
    # load the game's events
    game_events = SBL.events(game_id)
    
    # convert the events to actions
    game_home_team_id = df_games.at[game_id, "home_team_id"]
    game_actions = spadl.statsbomb.convert_to_actions(game_events, game_home_team_id)
    game_actions = socceraction.spadl.add_names(game_actions)
    
    # compute features and labels
    all_actions.append(game_actions)
    all_features.append(VAEP_model.compute_features(game, game_actions))
    all_labels.append(VAEP_model.compute_labels(game, game_actions))

# combine all features and labels in a single dataframe
all_actions = pd.concat(all_actions)
all_features = pd.concat(all_features)
all_labels = pd.concat(all_labels)

# fit the model
VAEP_model.fit(all_features, all_labels)

100%|██████████| 195/195 [14:59<00:00,  4.61s/it]  


[0]	validation_0-auc:0.76706
[1]	validation_0-auc:0.77429
[2]	validation_0-auc:0.77774
[3]	validation_0-auc:0.77774
[4]	validation_0-auc:0.79932
[5]	validation_0-auc:0.80871
[6]	validation_0-auc:0.81038
[7]	validation_0-auc:0.81044
[8]	validation_0-auc:0.81274
[9]	validation_0-auc:0.81507
[10]	validation_0-auc:0.81654
[11]	validation_0-auc:0.81702
[12]	validation_0-auc:0.81728
[13]	validation_0-auc:0.81773
[14]	validation_0-auc:0.81835
[15]	validation_0-auc:0.82076
[16]	validation_0-auc:0.82212
[17]	validation_0-auc:0.82323
[18]	validation_0-auc:0.82406
[19]	validation_0-auc:0.82557
[20]	validation_0-auc:0.82654
[21]	validation_0-auc:0.82780
[22]	validation_0-auc:0.82928
[23]	validation_0-auc:0.83039
[24]	validation_0-auc:0.83064
[25]	validation_0-auc:0.83183
[26]	validation_0-auc:0.83253
[27]	validation_0-auc:0.83383
[28]	validation_0-auc:0.83418
[29]	validation_0-auc:0.83417
[30]	validation_0-auc:0.83451
[31]	validation_0-auc:0.83539
[32]	validation_0-auc:0.83576
[33]	validation_0-au

<socceraction.vaep.base.VAEP at 0x7fd70c62a970>

In [31]:
VAEP_ratings = []
for game_id, game in tqdm(list(df_games.iterrows())):
    df_events = SBL.events(game_id)
    home_team_id = df_games.at[game_id, "home_team_id"]
    df_actions = spadl.statsbomb.convert_to_actions(df_events, home_team_id)
    ratings = VAEP_model.rate(df_games.loc[game_id], df_actions)
    VAEP_ratings.append(ratings)

100%|██████████| 195/195 [14:05<00:00,  4.34s/it]


In [32]:
VAEP_ratings = pd.concat(VAEP_ratings)
rated_df = pd.concat([all_actions, VAEP_ratings], axis=1)

## Compute xT for each action

In [33]:
# 1. Convert direction of play
actions_ltr = pd.concat([
  spadl.play_left_to_right(all_actions[all_actions['game_id'] == game_id], game.home_team_id)
  for game_id, game in df_games.iterrows()
])

actions_ltr = socceraction.spadl.add_names(actions_ltr)
actions_ltr['original_index']=actions_ltr.index

# 3. Train xT model
xTModel = xthreat.ExpectedThreat(l=16, w=12)
xTModel.fit(actions_ltr)

# 4. Rate ball-progressing actions
# xT should only be used to value actions that move the ball
# and that keep the current team in possession of the ball
mov_actions = xthreat.get_successful_move_actions(actions_ltr)
mov_actions["xT_value"] = xTModel.predict(mov_actions)

# iterations:  38


In [34]:
print(f'rated_df shape: {rated_df.shape}')
print(f'move_actions shape: {mov_actions.shape}')

rated_df shape: (375518, 20)
move_actions shape: (267218, 19)


## Creating new Features

### Features to add in

**Notebooks:**

- Single match exploration and variables/data discovery
- Data prep across seasons - build main dataset and split into test/validate/test
- Statistical testing inputs against targets
- Clustering player targets, predicting next actions, validating and testing 
- Predicting transfers
- Evaluating Models - Confusion Matrix, ROC Curve
- Final findings 


**Input Features:**

- 'game_id' - maybe (might need it to generate features, but drop before feeding model)
- 'original_event_id' - maybe
- 'period_id' 
- 'time_seconds' - maybe
- 'team_id',   
- 'player_id', 
- 'start_x', 
- 'start_y', 


These would not feed into the model, as they wouldnt be known yet for the line I am running prediction on - they should be represented in the n-x columns only as part of the previous moves. 

```
- 'end_x', 
- 'end_y',
- 'type_id',
- 'result_id', 
- 'bodypart_id', 
- 'action_id', 
- 'type_name', 
- 'result_name',
- 'bodypart_name', 
```


**To add:**
- Distance X
- Distance Y 
- 'start_pitch_zone'
- end_pitch_zone
- opponent
- home (how would this work when getting predictions? and when training)
- away

**For actions n-5:**
(if using a dataset with multiple matches, make sure they are from the same game)
- Prev_action_n-x_x_distance
- Prev_action_n-x_y_distance
- n-x_same_team
- n-x_same_player
- n-x_x_direction 
- n-x_y_direction
- n-x_action
- n-x_pitch_zone

**Target output:**

- Type of action 
- Next Zone

```some of these will need to be dummied```

**Football Scoring Metrics**
- xT per action 
- VAEP 
- aggregates at player level

Then swap the player ID for the player that we'd want to swap in to create a prediction dataset. 
(see if there is a transfer within the datast we could use as validation of the model)

- use the first 2 seasons for train and validate and the last season for test 
- predict the actions of players that had a transfer between season 2 and 3 - to test how the prediction does 

Each line of data should be standalone, rather than sequential - transforming sequential - time series type data - into non-sequential data

Then xT and VEAP output

In [35]:
rated_df.reset_index(drop=True, inplace=True)
rated_df.head()

Unnamed: 0,game_id,original_event_id,period_id,time_seconds,team_id,player_id,start_x,start_y,end_x,end_y,type_id,result_id,bodypart_id,action_id,type_name,result_name,bodypart_name,offensive_value,defensive_value,vaep_value
0,7298,2a456ec2-352c-499b-b5cc-e68bf84c7e9a,1,0.0,971,4647.0,52.058824,33.56962,50.294118,30.987342,0,1,0,0,pass,success,foot,0.0,-0.0,0.0
1,7298,7e908bd8-8e2f-44f8-9cc6-0435cd9ed3ed,1,0.0,971,4659.0,50.294118,30.987342,45.0,27.544304,21,1,0,1,dribble,success,foot,0.000491,4.2e-05,0.000532
2,7298,38023613-6b26-44e2-a0f8-9aab9960d2ff,1,0.0,971,4659.0,45.0,27.544304,31.764706,3.443038,0,1,0,2,pass,success,foot,0.000245,3.5e-05,0.00028
3,7298,0d016edd-87cf-4e77-80fa-4504f367db00,1,2.0,971,5088.0,31.764706,3.443038,10.588235,7.746835,21,1,0,3,dribble,success,foot,0.005318,1.6e-05,0.005334
4,7298,ccb57323-17d3-43db-8ae4-0d170c59c9cb,1,6.0,971,5088.0,10.588235,7.746835,10.588235,7.746835,19,0,0,4,bad_touch,fail,foot,-0.005726,-0.000571,-0.006296


In [36]:
df_actions_zones = rated_df.copy()

matches = []
for game_id, game in df_games.iterrows():
    # # load the game's events
    # game_events = SBL.events(game_id)
    # convert play left to right
    game_home_team_id = df_games.at[game_id, "home_team_id"]
    matches.append(spadl.play_left_to_right(df_actions_zones[df_actions_zones['game_id'] == game_id], home_team_id=game_home_team_id))

df_actions_zones = pd.concat(matches)

In [37]:
players = []
for game_id, game in tqdm(list(df_games.iterrows())):
    
    # load the games players
    players.append(SBL.players(game_id))


players = pd.concat(players, ignore_index=True)

100%|██████████| 195/195 [02:57<00:00,  1.10it/s]


In [38]:
goal_keepers = players[players['starting_position_id'] == 1].drop_duplicates(subset='player_name')

In [39]:
goal_keepers = goal_keepers[['player_id', 'player_name']]

In [40]:
zones = []
end_zones = []

for i, row in df_actions_zones.iterrows():
    if ((row['start_x'] >=0) & (row['start_x'] < 35)) & ((row['start_y']>=0) & (row['start_y'] < 23)):
        zones.append('zone_1')
    elif ((row['start_x'] >=0) & (row['start_x'] < 35)) & ((row['start_y']>=23) & (row['start_y'] < 46)):
        zones.append('zone_2')
    elif ((row['start_x'] >=0) & (row['start_x'] < 35)) & ((row['start_y']>=46) & (row['start_y'] <= 69)):
        zones.append('zone_3')
    elif ((row['start_x'] >=35) & (row['start_x'] < 70)) & ((row['start_y']>=0) & (row['start_y'] < 23)):
        zones.append('zone_4')
    elif ((row['start_x'] >=35) & (row['start_x'] < 70)) & ((row['start_y']>=23) & (row['start_y'] < 46)):
        zones.append('zone_5')
    elif ((row['start_x'] >=35) & (row['start_x'] < 70)) & ((row['start_y']>=46) & (row['start_y'] <= 69)):
        zones.append('zone_6')
    elif ((row['start_x'] >=70) & (row['start_x'] <= 105)) & ((row['start_y']>=0) & (row['start_y'] < 23)):
        zones.append('zone_7')
    elif ((row['start_x'] >=70) & (row['start_x'] <= 105)) & ((row['start_y']>=23) & (row['start_y'] < 46)):
        zones.append('zone_8')
    elif ((row['start_x'] >=70) & (row['start_x'] <= 105)) & ((row['start_y']>=46) & (row['start_y'] <= 69)):
        zones.append('zone_9')
    else:
        zones.append('no_zone')

for i, row in df_actions_zones.iterrows():
    if ((row['end_x'] >=0) & (row['end_x'] < 35)) & ((row['end_y']>=0) & (row['end_y'] < 23)):
        end_zones.append('zone_1')
    elif ((row['end_x'] >=0) & (row['end_x'] < 35)) & ((row['end_y']>=23) & (row['end_y'] < 46)):
        end_zones.append('zone_2')
    elif ((row['end_x'] >=0) & (row['end_x'] < 35)) & ((row['end_y']>=46) & (row['end_y'] <= 69)):
        end_zones.append('zone_3')
    elif ((row['end_x'] >=35) & (row['end_x'] < 70)) & ((row['end_y']>=0) & (row['end_y'] < 23)):
        end_zones.append('zone_4')
    elif ((row['end_x'] >=35) & (row['end_x'] < 70)) & ((row['end_y']>=23) & (row['end_y'] < 46)):
        end_zones.append('zone_5')
    elif ((row['end_x'] >=35) & (row['end_x'] < 70)) & ((row['end_y']>=46) & (row['end_y'] <= 69)):
        end_zones.append('zone_6')
    elif ((row['end_x'] >=70) & (row['end_x'] <= 105)) & ((row['end_y']>=0) & (row['end_y'] < 23)):
        end_zones.append('zone_7')
    elif ((row['end_x'] >=70) & (row['end_x'] <= 105)) & ((row['end_y']>=23) & (row['end_y'] < 46)):
        end_zones.append('zone_8')
    elif ((row['end_x'] >=70) & (row['end_x'] <= 105)) & ((row['end_y']>=46) & (row['end_y'] <= 69)):
        end_zones.append('zone_9')
    else:
        end_zones.append('no_zone')

df_actions_zones['start_pitch_zone'] = zones
df_actions_zones['end_pitch_zone'] = end_zones


In [41]:
mov_actions_with_zones = mov_actions.copy()

In [42]:
zones = []
end_zones = []

for i, row in mov_actions_with_zones.iterrows():
    if ((row['start_x'] >=0) & (row['start_x'] < 35)) & ((row['start_y']>=0) & (row['start_y'] < 23)):
        zones.append('zone_1')
    elif ((row['start_x'] >=0) & (row['start_x'] < 35)) & ((row['start_y']>=23) & (row['start_y'] < 46)):
        zones.append('zone_2')
    elif ((row['start_x'] >=0) & (row['start_x'] < 35)) & ((row['start_y']>=46) & (row['start_y'] <= 69)):
        zones.append('zone_3')
    elif ((row['start_x'] >=35) & (row['start_x'] < 70)) & ((row['start_y']>=0) & (row['start_y'] < 23)):
        zones.append('zone_4')
    elif ((row['start_x'] >=35) & (row['start_x'] < 70)) & ((row['start_y']>=23) & (row['start_y'] < 46)):
        zones.append('zone_5')
    elif ((row['start_x'] >=35) & (row['start_x'] < 70)) & ((row['start_y']>=46) & (row['start_y'] <= 69)):
        zones.append('zone_6')
    elif ((row['start_x'] >=70) & (row['start_x'] <= 105)) & ((row['start_y']>=0) & (row['start_y'] < 23)):
        zones.append('zone_7')
    elif ((row['start_x'] >=70) & (row['start_x'] <= 105)) & ((row['start_y']>=23) & (row['start_y'] < 46)):
        zones.append('zone_8')
    elif ((row['start_x'] >=70) & (row['start_x'] <= 105)) & ((row['start_y']>=46) & (row['start_y'] <= 69)):
        zones.append('zone_9')
    else:
        zones.append('no_zone')

for i, row in mov_actions_with_zones.iterrows():
    if ((row['end_x'] >=0) & (row['end_x'] < 35)) & ((row['end_y']>=0) & (row['end_y'] < 23)):
        end_zones.append('zone_1')
    elif ((row['end_x'] >=0) & (row['end_x'] < 35)) & ((row['end_y']>=23) & (row['end_y'] < 46)):
        end_zones.append('zone_2')
    elif ((row['end_x'] >=0) & (row['end_x'] < 35)) & ((row['end_y']>=46) & (row['end_y'] <= 69)):
        end_zones.append('zone_3')
    elif ((row['end_x'] >=35) & (row['end_x'] < 70)) & ((row['end_y']>=0) & (row['end_y'] < 23)):
        end_zones.append('zone_4')
    elif ((row['end_x'] >=35) & (row['end_x'] < 70)) & ((row['end_y']>=23) & (row['end_y'] < 46)):
        end_zones.append('zone_5')
    elif ((row['end_x'] >=35) & (row['end_x'] < 70)) & ((row['end_y']>=46) & (row['end_y'] <= 69)):
        end_zones.append('zone_6')
    elif ((row['end_x'] >=70) & (row['end_x'] <= 105)) & ((row['end_y']>=0) & (row['end_y'] < 23)):
        end_zones.append('zone_7')
    elif ((row['end_x'] >=70) & (row['end_x'] <= 105)) & ((row['end_y']>=23) & (row['end_y'] < 46)):
        end_zones.append('zone_8')
    elif ((row['end_x'] >=70) & (row['end_x'] <= 105)) & ((row['end_y']>=46) & (row['end_y'] <= 69)):
        end_zones.append('zone_9')
    else:
        end_zones.append('no_zone')

mov_actions_with_zones['start_pitch_zone'] = zones
mov_actions_with_zones['end_pitch_zone'] = end_zones

In [43]:
# opposition 
matches = list(df_actions_zones['game_id'].unique())
match_teams = {match: list(df_actions_zones[df_actions_zones['game_id'] == match]['team_id'].unique()) for match in matches}

df_actions_zones['opponent_id'] = [match_teams[k][1] if match_teams[k][0] == row['team_id'] else match_teams[k][0] \
     for i, row in df_actions_zones.iterrows() \
        for k, v in match_teams.items() \
            if row['game_id'] == k]

# update action type to pass/dribble/other to deal with class imbalance
df_actions_zones['type_name_encoded'] = ['pass' if x =='pass' else\
     ('dribble' if x =='dribble' else 'other')\
         for x in df_actions_zones['type_name']]

# Home or Away
df_actions_zones['home'] = [1 if np.array(df_games[df_games.index == row['game_id']]['home_team_id'] == row['team_id'])[0] \
    else 0 for i, row in df_actions_zones.iterrows() \
        for match in matches if row['game_id'] == match]

# x and y distance traveled 
df_actions_zones['x_dif'] = (df_actions_zones['end_x'] - df_actions_zones['start_x'])
df_actions_zones['y_dif'] = (df_actions_zones['end_y'] - df_actions_zones['start_y'])

#prior moves
print('Starting prior moves block')
for n in range(1,6):
    df_actions_zones[f'n-{n}_x_distance'] = df_actions_zones.groupby('game_id').shift(n)[['x_dif']]
    df_actions_zones[f'n-{n}_y_distance'] = df_actions_zones.groupby('game_id').shift(n)[['y_dif']]
    df_actions_zones[f'n-{n}_same_team'] = df_actions_zones.team_id.eq(df_actions_zones.team_id.shift(n))
    df_actions_zones[f'n-{n}_same_player'] = df_actions_zones.player_id.eq(df_actions_zones.player_id.shift(n))
    df_actions_zones[f'n-{n}_x_fwd_direction'] = [1 if row['x_dif'] > 0 else 0 for i, row in df_actions_zones.groupby('game_id').shift(n).iterrows()]
    df_actions_zones[f'n-{n}_y_lft_right_direction'] = [1 if row['y_dif'] > 0 else 0 for i, row in df_actions_zones.groupby('game_id').shift(n).iterrows()]
    df_actions_zones[f'n-{n}_start_pitch_zone'] = df_actions_zones.groupby('game_id').shift(n)[['start_pitch_zone']]
    df_actions_zones[f'n-{n}_end_pitch_zone'] = df_actions_zones.groupby('game_id').shift(n)[['end_pitch_zone']]
    df_actions_zones[f'n-{n}_start_x'] = df_actions_zones.groupby('game_id').shift(n)[['start_x']]
    df_actions_zones[f'n-{n}_start_y'] = df_actions_zones.groupby('game_id').shift(n)[['start_y']]
    df_actions_zones[f'n-{n}_end_x'] = df_actions_zones.groupby('game_id').shift(n)[['end_x']]
    df_actions_zones[f'n-{n}_end_y'] = df_actions_zones.groupby('game_id').shift(n)[['end_y']]
    df_actions_zones[f'n-{n}_type_name'] = df_actions_zones.groupby('game_id').shift(n)[['type_name']]
    df_actions_zones[f'n-{n}_type_name_encoded'] = df_actions_zones.groupby('game_id').shift(n)[['type_name_encoded']]
    df_actions_zones[f'n-{n}_result_name'] = df_actions_zones.groupby('game_id').shift(n)[['result_name']]
    df_actions_zones[f'n-{n}_bodypart_name'] = df_actions_zones.groupby('game_id').shift(n)[['bodypart_name']]
    df_actions_zones[f'n-{n}_offensive_value'] = df_actions_zones.groupby('game_id').shift(n)[['offensive_value']]
    df_actions_zones[f'n-{n}_defensive_value'] = df_actions_zones.groupby('game_id').shift(n)[['defensive_value']]
    df_actions_zones[f'n-{n}_vaep_value'] = df_actions_zones.groupby('game_id').shift(n)[['vaep_value']]
    # df_actions_zones[f'n-{n}_type_id'] = df_actions_zones.groupby('game_id').shift(n)[['type_id']]
    # df_actions_zones[f'n-{n}_result_id'] = df_actions_zones.groupby('game_id').shift(n)[['result_id']]
    # df_actions_zones[f'n-{n}_bodypart_id'] = df_actions_zones.groupby('game_id').shift(n)[['bodypart_id']]

Starting prior moves block


TO DO:

0. add a notebook 1 with a single match exploration - this becomes notebook 2 
1. group by zones and add in xT & VAEP Scores 
2. Visualise those 
3. Find top players for each team by xT for their end locations 
4. Calculate the xT difference between the start and end location 
5. EDA into notebook 2 - add in 5 EDA sections
    - zone analysis
    - top players
    - heatmaps for passes/dribbles/crosses and end_zones 
    - distance analysis 

other:
- modify code to add in seasons 1,2 - create season 3 as a separate test dataset 
- find player transfers


## Creating Test Set

In [75]:
## Creating test set

VAEP_model = VAEP(nb_prev_actions=5)

# compute features and labels for each game
all_features_test, all_labels_test, all_actions_test = [], [], []
for game_id, game in tqdm(list(df_games_test.iterrows())):
    # load the game's events
    game_events = SBL.events(game_id)
    
    # convert the events to actions
    game_home_team_id = df_games_test.at[game_id, "home_team_id"]
    game_actions = spadl.statsbomb.convert_to_actions(game_events, game_home_team_id)
    game_actions = socceraction.spadl.add_names(game_actions)
    
    # compute features and labels
    all_actions_test.append(game_actions)
    all_features_test.append(VAEP_model.compute_features(game, game_actions))
    all_labels_test.append(VAEP_model.compute_labels(game, game_actions))

# combine all features and labels in a single dataframe
all_actions_test = pd.concat(all_actions_test)
all_features_test = pd.concat(all_features_test)
all_labels_test = pd.concat(all_labels_test)

# fit the model
VAEP_model.fit(all_features_test, all_labels_test)

VAEP_ratings_test = []
for game_id, game in tqdm(list(df_games_test.iterrows())):
    df_events = SBL.events(game_id)
    home_team_id = df_games_test.at[game_id, "home_team_id"]
    df_actions = spadl.statsbomb.convert_to_actions(df_events, home_team_id)
    ratings = VAEP_model.rate(df_games_test.loc[game_id],df_actions)
    VAEP_ratings_test.append(ratings)
VAEP_ratings_test = pd.concat(VAEP_ratings_test)
rated_df_test = pd.concat([all_actions_test, VAEP_ratings_test], axis=1)

# Converting to left to right play
matches = []
for game_id, game in df_games_test.iterrows():
  
    # convert play left to right
    game_home_team_id = df_games_test.at[game_id, "home_team_id"]
    matches.append(spadl.play_left_to_right(rated_df_test[rated_df_test['game_id'] == game_id], home_team_id=game_home_team_id))

df_actions_zones_test = pd.concat(matches)

# Get players in test set
players_test = []
for game_id, game in tqdm(list(df_games_test.iterrows())):
    
    # load the games players
    players_test.append(SBL.players(game_id))


players_test = pd.concat(players_test, ignore_index=True)


#Create start and end zones
zones = []
end_zones = []

for i, row in df_actions_zones_test.iterrows():
    if ((row['start_x'] >=0) & (row['start_x'] < 35)) & ((row['start_y']>=0) & (row['start_y'] < 23)):
        zones.append('zone_1')
    elif ((row['start_x'] >=0) & (row['start_x'] < 35)) & ((row['start_y']>=23) & (row['start_y'] < 46)):
        zones.append('zone_2')
    elif ((row['start_x'] >=0) & (row['start_x'] < 35)) & ((row['start_y']>=46) & (row['start_y'] <= 69)):
        zones.append('zone_3')
    elif ((row['start_x'] >=35) & (row['start_x'] < 70)) & ((row['start_y']>=0) & (row['start_y'] < 23)):
        zones.append('zone_4')
    elif ((row['start_x'] >=35) & (row['start_x'] < 70)) & ((row['start_y']>=23) & (row['start_y'] < 46)):
        zones.append('zone_5')
    elif ((row['start_x'] >=35) & (row['start_x'] < 70)) & ((row['start_y']>=46) & (row['start_y'] <= 69)):
        zones.append('zone_6')
    elif ((row['start_x'] >=70) & (row['start_x'] <= 105)) & ((row['start_y']>=0) & (row['start_y'] < 23)):
        zones.append('zone_7')
    elif ((row['start_x'] >=70) & (row['start_x'] <= 105)) & ((row['start_y']>=23) & (row['start_y'] < 46)):
        zones.append('zone_8')
    elif ((row['start_x'] >=70) & (row['start_x'] <= 105)) & ((row['start_y']>=46) & (row['start_y'] <= 69)):
        zones.append('zone_9')
    else:
        zones.append('no_zone')

for i, row in df_actions_zones_test.iterrows():
    if ((row['end_x'] >=0) & (row['end_x'] < 35)) & ((row['end_y']>=0) & (row['end_y'] < 23)):
        end_zones.append('zone_1')
    elif ((row['end_x'] >=0) & (row['end_x'] < 35)) & ((row['end_y']>=23) & (row['end_y'] < 46)):
        end_zones.append('zone_2')
    elif ((row['end_x'] >=0) & (row['end_x'] < 35)) & ((row['end_y']>=46) & (row['end_y'] <= 69)):
        end_zones.append('zone_3')
    elif ((row['end_x'] >=35) & (row['end_x'] < 70)) & ((row['end_y']>=0) & (row['end_y'] < 23)):
        end_zones.append('zone_4')
    elif ((row['end_x'] >=35) & (row['end_x'] < 70)) & ((row['end_y']>=23) & (row['end_y'] < 46)):
        end_zones.append('zone_5')
    elif ((row['end_x'] >=35) & (row['end_x'] < 70)) & ((row['end_y']>=46) & (row['end_y'] <= 69)):
        end_zones.append('zone_6')
    elif ((row['end_x'] >=70) & (row['end_x'] <= 105)) & ((row['end_y']>=0) & (row['end_y'] < 23)):
        end_zones.append('zone_7')
    elif ((row['end_x'] >=70) & (row['end_x'] <= 105)) & ((row['end_y']>=23) & (row['end_y'] < 46)):
        end_zones.append('zone_8')
    elif ((row['end_x'] >=70) & (row['end_x'] <= 105)) & ((row['end_y']>=46) & (row['end_y'] <= 69)):
        end_zones.append('zone_9')
    else:
        end_zones.append('no_zone')

df_actions_zones_test['end_pitch_zone'] = end_zones
df_actions_zones_test['start_pitch_zone'] = zones

# opposition 
matches = list(df_actions_zones_test['game_id'].unique())
match_teams = {match: list(df_actions_zones_test[df_actions_zones_test['game_id'] == match]['team_id'].unique()) for match in matches}

df_actions_zones_test['opponent_id'] = [match_teams[k][1] if match_teams[k][0] == row['team_id'] else match_teams[k][0] \
     for i, row in df_actions_zones_test.iterrows() \
        for k, v in match_teams.items() \
            if row['game_id'] == k]

# update action type to pass/dribble/other to deal with class imbalance
df_actions_zones_test['type_name_encoded'] = ['pass' if x =='pass' else\
     ('dribble' if x =='dribble' else 'other')\
         for x in df_actions_zones_test['type_name']]

# Home or Away
df_actions_zones_test['home'] = [1 if np.array(df_games_test[df_games_test.index == row['game_id']]['home_team_id'] == row['team_id'])[0] \
    else 0 for i, row in df_actions_zones_test.iterrows() \
        for match in matches if row['game_id'] == match]

# x and y distance traveled 
df_actions_zones_test['x_dif'] = (df_actions_zones_test['end_x'] - df_actions_zones_test['start_x'])
df_actions_zones_test['y_dif'] = (df_actions_zones_test['end_y'] - df_actions_zones_test['start_y'])

#prior moves
print('Starting prior moves block')
for n in range(1,6):
    df_actions_zones_test[f'n-{n}_x_distance'] = df_actions_zones_test.groupby('game_id').shift(n)[['x_dif']]
    df_actions_zones_test[f'n-{n}_y_distance'] = df_actions_zones_test.groupby('game_id').shift(n)[['y_dif']]
    df_actions_zones_test[f'n-{n}_same_team'] = df_actions_zones_test.team_id.eq(df_actions_zones_test.team_id.shift(n))
    df_actions_zones_test[f'n-{n}_same_player'] = df_actions_zones_test.player_id.eq(df_actions_zones_test.player_id.shift(n))
    df_actions_zones_test[f'n-{n}_x_fwd_direction'] = [1 if row['x_dif'] > 0 else 0 for i, row in df_actions_zones_test.groupby('game_id').shift(n).iterrows()]
    df_actions_zones_test[f'n-{n}_y_lft_right_direction'] = [1 if row['y_dif'] > 0 else 0 for i, row in df_actions_zones_test.groupby('game_id').shift(n).iterrows()]
    df_actions_zones_test[f'n-{n}_start_pitch_zone'] = df_actions_zones_test.groupby('game_id').shift(n)[['start_pitch_zone']]
    df_actions_zones_test[f'n-{n}_end_pitch_zone'] = df_actions_zones_test.groupby('game_id').shift(n)[['end_pitch_zone']]
    df_actions_zones_test[f'n-{n}_start_x'] = df_actions_zones_test.groupby('game_id').shift(n)[['start_x']]
    df_actions_zones_test[f'n-{n}_start_y'] = df_actions_zones_test.groupby('game_id').shift(n)[['start_y']]
    df_actions_zones_test[f'n-{n}_end_x'] = df_actions_zones_test.groupby('game_id').shift(n)[['end_x']]
    df_actions_zones_test[f'n-{n}_end_y'] = df_actions_zones_test.groupby('game_id').shift(n)[['end_y']]
    df_actions_zones_test[f'n-{n}_type_name'] = df_actions_zones_test.groupby('game_id').shift(n)[['type_name']]
    df_actions_zones_test[f'n-{n}_type_name_encoded'] = df_actions_zones_test.groupby('game_id').shift(n)[['type_name_encoded']]
    df_actions_zones_test[f'n-{n}_result_name'] = df_actions_zones_test.groupby('game_id').shift(n)[['result_name']]
    df_actions_zones_test[f'n-{n}_bodypart_name'] = df_actions_zones_test.groupby('game_id').shift(n)[['bodypart_name']]
    df_actions_zones_test[f'n-{n}_offensive_value'] = df_actions_zones_test.groupby('game_id').shift(n)[['offensive_value']]
    df_actions_zones_test[f'n-{n}_defensive_value'] = df_actions_zones_test.groupby('game_id').shift(n)[['defensive_value']]
    df_actions_zones_test[f'n-{n}_vaep_value'] = df_actions_zones_test.groupby('game_id').shift(n)[['vaep_value']]

100%|██████████| 131/131 [05:37<00:00,  2.57s/it]


[0]	validation_0-auc:0.78892
[1]	validation_0-auc:0.79847
[2]	validation_0-auc:0.80114
[3]	validation_0-auc:0.80128
[4]	validation_0-auc:0.81684
[5]	validation_0-auc:0.82116
[6]	validation_0-auc:0.82170
[7]	validation_0-auc:0.82428
[8]	validation_0-auc:0.82713
[9]	validation_0-auc:0.82660
[10]	validation_0-auc:0.82980
[11]	validation_0-auc:0.83026
[12]	validation_0-auc:0.83075
[13]	validation_0-auc:0.83394
[14]	validation_0-auc:0.83664
[15]	validation_0-auc:0.83606
[16]	validation_0-auc:0.83762
[17]	validation_0-auc:0.84002
[18]	validation_0-auc:0.84094
[19]	validation_0-auc:0.84215
[20]	validation_0-auc:0.84440
[21]	validation_0-auc:0.84651
[22]	validation_0-auc:0.84891
[23]	validation_0-auc:0.84990
[24]	validation_0-auc:0.85026
[25]	validation_0-auc:0.85179
[26]	validation_0-auc:0.85246
[27]	validation_0-auc:0.85258
[28]	validation_0-auc:0.85281
[29]	validation_0-auc:0.85387
[30]	validation_0-auc:0.85336
[31]	validation_0-auc:0.85460
[32]	validation_0-auc:0.85477
[33]	validation_0-au

100%|██████████| 131/131 [05:38<00:00,  2.59s/it]
100%|██████████| 131/131 [03:02<00:00,  1.39s/it]


Starting prior moves block
