# Notebook 2 of 6 - Creating main train and test sets
**Author:** Alexandru Mihalache 

**Date:** November 2022


In [1]:
import pandas as pd
import numpy as np

#Plotting libs
import matplotlib.pyplot as plt
import seaborn as sns 
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots


#Football libs
import socceraction
from socceraction.data.statsbomb import StatsBombLoader
from mplsoccer import Pitch, Sbopen, VerticalPitch
import socceraction.spadl as spadl
import matplotsoccer as mps
import socceraction.xthreat as xthreat
import socceraction.spadl as spadl
from socceraction.vaep import VAEP

# utils
import warnings
warnings.filterwarnings('ignore')
from tqdm import tqdm

# fuzz is used to compare TWO strings
from fuzzywuzzy import fuzz
# process is used to compare a string to MULTIPLE other strings
from fuzzywuzzy import process

  from pandas import MultiIndex, Int64Index


### Introduction

In this notebook we will create the main test and train datasets. 

The final output will include: 
- All actions for all matches 
- VAEP and xT Datasets (xT needs to be separate as it only takes actions with successful outcomes, whereas VAEP will include all actions)
- New columns describing the previous 5 actions

We will follow a similar structure as in the intro notebook, but will loop through all matches.

In [3]:
# Set up the StatsBomb data loader.
# No arguments are passed, so the loader runs with its default settings.
# SBL is reused by the cells below to fetch games, events and players.
SBL = StatsBombLoader()

In [4]:
# Train / validation fixtures: the two seasons that precede the test season,
# each indexed by game_id and then stacked into a single frame.
df_games_season_4, df_games_season_42 = (
    SBL.games(competition_id=37, season_id=season).set_index("game_id")
    for season in (4, 42)
)
df_games = pd.concat([df_games_season_4, df_games_season_42], axis=0)

# Test fixtures: all games from the FA Women's Super League 2020/2021 season.
df_games_test = SBL.games(competition_id=37, season_id=90).set_index("game_id")


## Compute VAEP for all matches

In [6]:
VAEP_model = VAEP(nb_prev_actions=5)

# Visit every train/validation game once and keep, per game, its SPADL actions
# together with the VAEP features and labels computed from them.
all_actions, all_features, all_labels = [], [], []
for game_id, game in tqdm(list(df_games.iterrows())):

    # raw events of this game
    game_events = SBL.events(game_id)

    # events -> SPADL actions, then attach the *_name columns
    game_home_team_id = df_games.at[game_id, "home_team_id"]
    game_actions = spadl.add_names(
        spadl.statsbomb.convert_to_actions(game_events, game_home_team_id)
    )

    # collect this game's actions, features and labels
    all_actions.append(game_actions)
    all_features.append(VAEP_model.compute_features(game, game_actions))
    all_labels.append(VAEP_model.compute_labels(game, game_actions))

# stack the per-game pieces into one dataframe each
all_actions = pd.concat(all_actions)
all_features = pd.concat(all_features)
all_labels = pd.concat(all_labels)

# fit the model on the stacked features and labels
VAEP_model.fit(all_features, all_labels)

100%|██████████| 195/195 [07:36<00:00,  2.34s/it]


[0]	validation_0-auc:0.76862
[1]	validation_0-auc:0.77564
[2]	validation_0-auc:0.77559
[3]	validation_0-auc:0.77559
[4]	validation_0-auc:0.79918
[5]	validation_0-auc:0.80302
[6]	validation_0-auc:0.80532
[7]	validation_0-auc:0.80533
[8]	validation_0-auc:0.80701
[9]	validation_0-auc:0.80871
[10]	validation_0-auc:0.80957
[11]	validation_0-auc:0.81050
[12]	validation_0-auc:0.81059
[13]	validation_0-auc:0.81107
[14]	validation_0-auc:0.81171
[15]	validation_0-auc:0.81253
[16]	validation_0-auc:0.81364
[17]	validation_0-auc:0.81355
[18]	validation_0-auc:0.81460
[19]	validation_0-auc:0.81554
[20]	validation_0-auc:0.81747
[21]	validation_0-auc:0.81881
[22]	validation_0-auc:0.81962
[23]	validation_0-auc:0.82136
[24]	validation_0-auc:0.82250
[25]	validation_0-auc:0.82321
[26]	validation_0-auc:0.82408
[27]	validation_0-auc:0.82456
[28]	validation_0-auc:0.82720
[29]	validation_0-auc:0.82780
[30]	validation_0-auc:0.82836
[31]	validation_0-auc:0.82815
[32]	validation_0-auc:0.82846
[33]	validation_0-au

<socceraction.vaep.base.VAEP at 0x7fc121c4ff40>

In [8]:
# Rate every train/validation game with the fitted VAEP model; one ratings
# frame per game is collected in VAEP_ratings.
VAEP_ratings = []
for game_id, game in tqdm(list(df_games.iterrows())):
    home_team_id = df_games.at[game_id, "home_team_id"]
    df_actions = spadl.statsbomb.convert_to_actions(SBL.events(game_id), home_team_id)
    VAEP_ratings.append(VAEP_model.rate(df_games.loc[game_id], df_actions))

100%|██████████| 195/195 [10:03<00:00,  3.10s/it]


In [47]:
# Stack the per-game ratings into one frame.
# The guard keeps this cell re-runnable: after the first run VAEP_ratings is already
# a DataFrame, and passing a DataFrame (instead of a list of frames) to pd.concat
# raises a TypeError.
if isinstance(VAEP_ratings, list):
    VAEP_ratings = pd.concat(VAEP_ratings)

# Put every action side by side with its VAEP rating (column-wise concatenation).
rated_df = pd.concat([all_actions, VAEP_ratings], axis=1)

## Compute xT for each match

In [48]:
# 1. Convert direction of play: every game is flipped so play runs left to right
actions_ltr = pd.concat(
    spadl.play_left_to_right(all_actions[all_actions['game_id'] == game_id], game.home_team_id)
    for game_id, game in df_games.iterrows()
)

# 2. Attach the *_name columns
actions_ltr = spadl.add_names(actions_ltr)

# 3. Train xT model on a 16 x 12 grid
xTModel = xthreat.ExpectedThreat(l=16, w=12)
xTModel.fit(actions_ltr)

# 4. Rate ball-progressing actions
# xT should only be used to value actions that move the ball
# and that keep the current team in possession of the ball
mov_actions = xthreat.get_successful_move_actions(actions_ltr).assign(
    xT_value=lambda moves: xTModel.predict(moves)
)

# iterations:  38


In [49]:
# Report the size of the VAEP-rated frame and of the xT-rated move actions.
print(
    f'rated_df shape: {rated_df.shape}',
    f'move_actions shape: {mov_actions.shape}',
    sep='\n',
)

rated_df shape: (375518, 20)
move_actions shape: (267218, 18)


## Creating new Features

The SPADL schema is necessary to compute xT and VAEP using the Soccer Action library. 
However for this project we want to capture the play style of a team and then predict the output of a new transfer using this model. 

To capture the play style, I will create features to capture the description of the previous 5 actions. This means that each action will be described by the 5 actions preceding it. This helps remove the sequencing nature from the data and will allow us to make the model simpler in this version of this project. 

Both VAEP and xT look at sequences of moves. For our prediction we would not be able to predict whole sequences and feed them to the built-in VAEP and xT methods, so we need to derive a new approach - hence the creation of the 5-prior-move history. 

This does however raise a limitation to this project and the models, which is that they will be an approximation of the predicted values, rather than a typical prediction, meaning we should expect less accuracy than if we were able to predict full sequences and then compute the VAEP and xT scores using the Soccer Action library.

### Features to add in

**Input Features:**

- 'game_id' - maybe (might need it to generate features, but drop before feeding model)
- 'original_event_id' - maybe
- 'period_id' 
- 'time_seconds' - maybe
- 'team_id',   
- 'player_id', 
- 'start_x', 
- 'start_y', 
- 'end_x', 
- 'end_y',
- 'type_id',
- 'result_id', 
- 'bodypart_id', 
- 'action_id', 
- 'type_name', 
- 'result_name',
- 'bodypart_name', 

**To add:**
- Distance X - how far is the ball moving horizontally and in which direction
- Distance Y  - how far is the ball moving vertically and in which direction
- start_pitch_zone - will need to split the pitch in 9 zones based on the x, y coordinates
- end_pitch_zone - will need to split the pitch in 9 zones based on the x, y coordinates
- opponent - capture the opponent id
- home - home or away action

**For actions n-5:**
For each of the previous 5 actions, lag the dataset to store their values in a new column

- Prev_action_n-x_x_distance
- Prev_action_n-x_y_distance
- n-x_same_team
- n-x_same_player
- n-x_x_direction 
- n-x_y_direction
- n-x_action
- n-x_pitch_zone

**Target output:**

- Type of action 
- Next Zone
- Predicted xT 
- Predicted VAEP


**Test/Train Approach:**

Using a typical train/test approach would not be ideal for this project as we have seasons to consider, where players transfer, and we need the transfer information to assess the performance of the models. 

An alternative approach will be used, sticking to the principles of train/test splits

- use the first 2 seasons for train and validate and the last season for test - this represents 2/3 of the data, therefore ideal for training models, ensuring we supply enough data to train the model, with the final third being our test set
- As part of the modeling approach I will predict the actions of players that had a transfer between season 2 and 3 - to test how the prediction does against how they actually performed in season 3 based on the test dataset

**Note:** Each line of data should be standalone rather than sequential - i.e. the sequential data is transformed into non-sequential data

In [50]:
# Discard the existing index in favour of a fresh 0..n-1 range, then preview.
rated_df = rated_df.reset_index(drop=True)
rated_df.head()

Unnamed: 0,game_id,original_event_id,period_id,time_seconds,team_id,player_id,start_x,start_y,end_x,end_y,type_id,result_id,bodypart_id,action_id,type_name,result_name,bodypart_name,offensive_value,defensive_value,vaep_value
0,7298,2a456ec2-352c-499b-b5cc-e68bf84c7e9a,1,0.0,971,4647.0,52.058824,33.56962,50.294118,30.987342,0,1,0,0,pass,success,foot,0.0,-0.0,0.0
1,7298,7e908bd8-8e2f-44f8-9cc6-0435cd9ed3ed,1,0.0,971,4659.0,50.294118,30.987342,45.0,27.544304,21,1,0,1,dribble,success,foot,0.001244,-6.1e-05,0.001183
2,7298,38023613-6b26-44e2-a0f8-9aab9960d2ff,1,0.0,971,4659.0,45.0,27.544304,31.764706,3.443038,0,1,0,2,pass,success,foot,0.000295,0.000537,0.000833
3,7298,0d016edd-87cf-4e77-80fa-4504f367db00,1,2.0,971,5088.0,31.764706,3.443038,10.588235,7.746835,21,1,0,3,dribble,success,foot,0.011547,-6.9e-05,0.011478
4,7298,ccb57323-17d3-43db-8ae4-0d170c59c9cb,1,6.0,971,5088.0,10.588235,7.746835,10.588235,7.746835,19,0,0,4,bad_touch,fail,foot,-0.012453,-0.000211,-0.012663


Create a utility dataset of all the matches to help in filtering and looping through when creating the main datasets.

In [7]:
df_actions_zones = rated_df.copy()

# One left-to-right converted frame per game, in df_games order; the home team id
# tells play_left_to_right which side's actions to flip.
matches = [
    spadl.play_left_to_right(
        df_actions_zones[df_actions_zones['game_id'] == game_id],
        home_team_id=df_games.at[game_id, "home_team_id"],
    )
    for game_id in df_games.index
]

df_actions_zones = pd.concat(matches)

NameError: name 'rated_df' is not defined

### Split the pitch into 9 zones

Use the pitch zones, to bin the actions in a match, into different pitch area bins, covering both start zones and end zones. 

In [52]:
# Bin every action's start and end location into a 3 x 3 grid of pitch zones.
#
# Grid definition (the same for start and end coordinates):
#   x in [0, 35)   -> zone_1 / zone_2 / zone_3
#   x in [35, 70)  -> zone_4 / zone_5 / zone_6
#   x in [70, 105] -> zone_7 / zone_8 / zone_9
# and inside each x band the zone is chosen by y: [0, 23), [23, 46), [46, 69].
# Coordinates outside the grid (or missing) are labelled 'no_zone'.
#
# The binning works on whole columns with numpy rather than walking the frame
# row by row with iterrows(); the resulting labels are the same.
zone_labels = np.array(['no_zone'] + [f'zone_{number}' for number in range(1, 10)])

pitch_zones = {}
for location in ('start', 'end'):
    x_coord = df_actions_zones[f'{location}_x'].to_numpy(dtype=float)
    y_coord = df_actions_zones[f'{location}_y'].to_numpy(dtype=float)

    # Band index along each axis; -1 marks a coordinate outside the grid.
    # Comparisons against NaN are False, so missing coordinates also get -1.
    x_band = np.select(
        [
            (x_coord >= 0) & (x_coord < 35),
            (x_coord >= 35) & (x_coord < 70),
            (x_coord >= 70) & (x_coord <= 105),
        ],
        [0, 1, 2],
        default=-1,
    )
    y_band = np.select(
        [
            (y_coord >= 0) & (y_coord < 23),
            (y_coord >= 23) & (y_coord < 46),
            (y_coord >= 46) & (y_coord <= 69),
        ],
        [1, 2, 3],
        default=-1,
    )

    # Zone number is 3 * x_band + y_band (1..9); slot 0 of zone_labels is 'no_zone'.
    zone_number = np.where((x_band >= 0) & (y_band >= 0), 3 * x_band + y_band, 0)
    pitch_zones[location] = zone_labels[zone_number].tolist()

# Same names and types (lists of labels) as before.
zones = pitch_zones['start']
end_zones = pitch_zones['end']

df_actions_zones['start_pitch_zone'] = zones
df_actions_zones['end_pitch_zone'] = end_zones


### Create new features

In [53]:
# opposition
# For every match, list its teams in order of first appearance; the opponent of an
# action is whichever of those teams did not perform it. Mapping by game_id replaces
# the earlier rows-times-matches nested comprehension.
matches = list(df_actions_zones['game_id'].unique())
match_teams = {
    match: list(team_ids)
    for match, team_ids in df_actions_zones.groupby('game_id', sort=False)['team_id'].unique().items()
}

acting_team = df_actions_zones['team_id'].to_numpy()
first_team = df_actions_zones['game_id'].map(
    {match: teams[0] for match, teams in match_teams.items()}
).to_numpy()
# A match with a single recorded team has no opponent to report -> NaN.
second_team = df_actions_zones['game_id'].map(
    {match: teams[1] if len(teams) > 1 else np.nan for match, teams in match_teams.items()}
).to_numpy()
df_actions_zones['opponent_id'] = np.where(acting_team == first_team, second_team, first_team)

# update action type to pass/dribble/other to deal with class imbalance
action_type = df_actions_zones['type_name']
df_actions_zones['type_name_encoded'] = np.where(
    action_type.isin(['pass', 'dribble']), action_type, 'other'
)

# Home or Away: 1 when the acting team is the home team of the game, else 0
home_team = df_actions_zones['game_id'].map(df_games['home_team_id']).to_numpy()
df_actions_zones['home'] = (acting_team == home_team).astype('int64')

# x and y distance traveled
df_actions_zones['x_dif'] = (df_actions_zones['end_x'] - df_actions_zones['start_x'])
df_actions_zones['y_dif'] = (df_actions_zones['end_y'] - df_actions_zones['start_y'])

#prior moves
print('Starting prior moves block')

# Columns whose lagged value is stored under the same name with an 'n-{n}_' prefix.
lagged_as_is = [
    'start_pitch_zone', 'end_pitch_zone',
    'start_x', 'start_y', 'end_x', 'end_y',
    'type_name', 'type_name_encoded', 'result_name', 'bodypart_name',
    'offensive_value', 'defensive_value', 'vaep_value',
]
# Snapshot of just the columns that get lagged, so the shift is computed once per n
# and does not grow with the columns added inside the loop.
lag_source = df_actions_zones[['game_id', 'team_id', 'player_id', 'x_dif', 'y_dif'] + lagged_as_is]

for n in range(1, 6):
    # Shift within each game: the first n actions of a game have no history (NaN)
    # instead of inheriting rows from the previous game.
    lagged = lag_source.groupby('game_id').shift(n)

    df_actions_zones[f'n-{n}_x_distance'] = lagged['x_dif'].to_numpy()
    df_actions_zones[f'n-{n}_y_distance'] = lagged['y_dif'].to_numpy()

    # Fix: these two comparisons used an ungrouped shift, so the opening actions of a
    # game were compared against the closing actions of the previous game. They now
    # use the same per-game lag as every other feature (no history -> False).
    df_actions_zones[f'n-{n}_same_team'] = (
        df_actions_zones['team_id'].to_numpy() == lagged['team_id'].to_numpy()
    )
    df_actions_zones[f'n-{n}_same_player'] = (
        df_actions_zones['player_id'].to_numpy() == lagged['player_id'].to_numpy()
    )

    # 1 when the lagged action moved in the positive x / y direction, else 0
    # (missing history compares False and therefore yields 0).
    df_actions_zones[f'n-{n}_x_fwd_direction'] = (lagged['x_dif'].to_numpy() > 0).astype('int64')
    df_actions_zones[f'n-{n}_y_lft_right_direction'] = (lagged['y_dif'].to_numpy() > 0).astype('int64')

    for column in lagged_as_is:
        df_actions_zones[f'n-{n}_{column}'] = lagged[column].to_numpy()

Starting prior moves block


In [54]:
# Work on a copy so the feature columns added in the following cells are written to
# mov_actions_with_zones rather than to mov_actions.
mov_actions_with_zones = mov_actions.copy()

In [55]:
# Bin the start and end location of every move action into a 3 x 3 grid of zones.
#
# Grid definition (the same for start and end coordinates):
#   x in [0, 35)   -> zone_1 / zone_2 / zone_3
#   x in [35, 70)  -> zone_4 / zone_5 / zone_6
#   x in [70, 105] -> zone_7 / zone_8 / zone_9
# and inside each x band the zone is chosen by y: [0, 23), [23, 46), [46, 69].
# Coordinates outside the grid (or missing) are labelled 'no_zone'.
#
# The binning works on whole columns with numpy rather than walking the frame
# row by row with iterrows(); the resulting labels are the same.
zone_labels = np.array(['no_zone'] + [f'zone_{number}' for number in range(1, 10)])

pitch_zones = {}
for location in ('start', 'end'):
    x_coord = mov_actions_with_zones[f'{location}_x'].to_numpy(dtype=float)
    y_coord = mov_actions_with_zones[f'{location}_y'].to_numpy(dtype=float)

    # Band index along each axis; -1 marks a coordinate outside the grid.
    # Comparisons against NaN are False, so missing coordinates also get -1.
    x_band = np.select(
        [
            (x_coord >= 0) & (x_coord < 35),
            (x_coord >= 35) & (x_coord < 70),
            (x_coord >= 70) & (x_coord <= 105),
        ],
        [0, 1, 2],
        default=-1,
    )
    y_band = np.select(
        [
            (y_coord >= 0) & (y_coord < 23),
            (y_coord >= 23) & (y_coord < 46),
            (y_coord >= 46) & (y_coord <= 69),
        ],
        [1, 2, 3],
        default=-1,
    )

    # Zone number is 3 * x_band + y_band (1..9); slot 0 of zone_labels is 'no_zone'.
    zone_number = np.where((x_band >= 0) & (y_band >= 0), 3 * x_band + y_band, 0)
    pitch_zones[location] = zone_labels[zone_number].tolist()

# Same names and types (lists of labels) as before.
zones = pitch_zones['start']
end_zones = pitch_zones['end']

mov_actions_with_zones['start_pitch_zone'] = zones
mov_actions_with_zones['end_pitch_zone'] = end_zones

In [56]:
# Preview the first rows to check the start/end pitch zone columns.
mov_actions_with_zones.head()

Unnamed: 0,game_id,original_event_id,period_id,time_seconds,team_id,player_id,start_x,start_y,end_x,end_y,type_id,result_id,bodypart_id,action_id,type_name,result_name,bodypart_name,xT_value,start_pitch_zone,end_pitch_zone
0,7298,2a456ec2-352c-499b-b5cc-e68bf84c7e9a,1,0.0,971,4647.0,52.941176,34.43038,54.705882,37.012658,0,1,0,0,pass,success,foot,0.0,zone_5,zone_5
1,7298,7e908bd8-8e2f-44f8-9cc6-0435cd9ed3ed,1,0.0,971,4659.0,54.705882,37.012658,60.0,40.455696,21,1,0,1,dribble,success,foot,0.003058,zone_5,zone_5
2,7298,38023613-6b26-44e2-a0f8-9aab9960d2ff,1,0.0,971,4659.0,60.0,40.455696,73.235294,64.556962,0,1,0,2,pass,success,foot,0.001818,zone_5,zone_9
3,7298,0d016edd-87cf-4e77-80fa-4504f367db00,1,2.0,971,5088.0,73.235294,64.556962,94.411765,60.253165,21,1,0,3,dribble,success,foot,0.009766,zone_9,zone_9
8,7298,,1,30.0,746,10172.0,37.058824,6.025316,37.058824,12.050633,21,1,0,8,dribble,success,foot,0.000138,zone_4,zone_4


In [57]:
# opposition 
matches = list(mov_actions_with_zones['game_id'].unique())
match_teams = {match: list(mov_actions_with_zones[mov_actions_with_zones['game_id'] == match]['team_id'].unique()) for match in matches}

mov_actions_with_zones['opponent_id'] = [match_teams[k][1] if match_teams[k][0] == row['team_id'] else match_teams[k][0] \
     for i, row in mov_actions_with_zones.iterrows() \
        for k, v in match_teams.items() \
            if row['game_id'] == k]

# update action type to pass/dribble/other to deal with class imbalance
mov_actions_with_zones['type_name_encoded'] = ['pass' if x =='pass' else\
     ('dribble' if x =='dribble' else 'other')\
         for x in mov_actions_with_zones['type_name']]

# Home or Away
mov_actions_with_zones['home'] = [1 if np.array(df_games[df_games.index == row['game_id']]['home_team_id'] == row['team_id'])[0] \
    else 0 for i, row in mov_actions_with_zones.iterrows() \
        for match in matches if row['game_id'] == match]

# x and y distance traveled 
mov_actions_with_zones['x_dif'] = (mov_actions_with_zones['end_x'] - mov_actions_with_zones['start_x'])
mov_actions_with_zones['y_dif'] = (mov_actions_with_zones['end_y'] - mov_actions_with_zones['start_y'])

#prior moves
print('Starting prior moves block')
for n in range(1,6):
    mov_actions_with_zones[f'n-{n}_x_distance'] = mov_actions_with_zones.groupby('game_id').shift(n)[['x_dif']]
    mov_actions_with_zones[f'n-{n}_y_distance'] = mov_actions_with_zones.groupby('game_id').shift(n)[['y_dif']]
    mov_actions_with_zones[f'n-{n}_same_team'] = mov_actions_with_zones.team_id.eq(mov_actions_with_zones.team_id.shift(n))
    mov_actions_with_zones[f'n-{n}_same_player'] = mov_actions_with_zones.player_id.eq(mov_actions_with_zones.player_id.shift(n))
    mov_actions_with_zones[f'n-{n}_x_fwd_direction'] = [1 if row['x_dif'] > 0 else 0 for i, row in mov_actions_with_zones.groupby('game_id').shift(n).iterrows()]
    mov_actions_with_zones[f'n-{n}_y_lft_right_direction'] = [1 if row['y_dif'] > 0 else 0 for i, row in mov_actions_with_zones.groupby('game_id').shift(n).iterrows()]
    mov_actions_with_zones[f'n-{n}_start_pitch_zone'] = mov_actions_with_zones.groupby('game_id').shift(n)[['start_pitch_zone']]
    mov_actions_with_zones[f'n-{n}_end_pitch_zone'] = mov_actions_with_zones.groupby('game_id').shift(n)[['end_pitch_zone']]
    mov_actions_with_zones[f'n-{n}_start_x'] = mov_actions_with_zones.groupby('game_id').shift(n)[['start_x']]
    mov_actions_with_zones[f'n-{n}_start_y'] = mov_actions_with_zones.groupby('game_id').shift(n)[['start_y']]
    mov_actions_with_zones[f'n-{n}_end_x'] = mov_actions_with_zones.groupby('game_id').shift(n)[['end_x']]
    mov_actions_with_zones[f'n-{n}_end_y'] = mov_actions_with_zones.groupby('game_id').shift(n)[['end_y']]
    mov_actions_with_zones[f'n-{n}_type_name'] = mov_actions_with_zones.groupby('game_id').shift(n)[['type_name']]
    mov_actions_with_zones[f'n-{n}_type_name_encoded'] = mov_actions_with_zones.groupby('game_id').shift(n)[['type_name_encoded']]
    mov_actions_with_zones[f'n-{n}_result_name'] = mov_actions_with_zones.groupby('game_id').shift(n)[['result_name']]
    mov_actions_with_zones[f'n-{n}_bodypart_name'] = mov_actions_with_zones.groupby('game_id').shift(n)[['bodypart_name']]
    mov_actions_with_zones[f'n-{n}_xT_value'] = mov_actions_with_zones.groupby('game_id').shift(n)[['xT_value']]
    


Starting prior moves block


In [58]:
# Preview the first rows again, now with the lagged 'n-{n}_*' columns added.
mov_actions_with_zones.head()

Unnamed: 0,game_id,original_event_id,period_id,time_seconds,team_id,player_id,start_x,start_y,end_x,end_y,...,n-5_end_pitch_zone,n-5_start_x,n-5_start_y,n-5_end_x,n-5_end_y,n-5_type_name,n-5_type_name_encoded,n-5_result_name,n-5_bodypart_name,n-5_xT_value
0,7298,2a456ec2-352c-499b-b5cc-e68bf84c7e9a,1,0.0,971,4647.0,52.941176,34.43038,54.705882,37.012658,...,,,,,,,,,,
1,7298,7e908bd8-8e2f-44f8-9cc6-0435cd9ed3ed,1,0.0,971,4659.0,54.705882,37.012658,60.0,40.455696,...,,,,,,,,,,
2,7298,38023613-6b26-44e2-a0f8-9aab9960d2ff,1,0.0,971,4659.0,60.0,40.455696,73.235294,64.556962,...,,,,,,,,,,
3,7298,0d016edd-87cf-4e77-80fa-4504f367db00,1,2.0,971,5088.0,73.235294,64.556962,94.411765,60.253165,...,,,,,,,,,,
8,7298,,1,30.0,746,10172.0,37.058824,6.025316,37.058824,12.050633,...,,,,,,,,,,


## Creating Test Set

In [59]:
## Creating test set

# NOTE(review): this rebinds VAEP_model to a brand-new estimator, so the model fitted
# on the train/validation seasons is no longer reachable under this name afterwards.
VAEP_model = VAEP(nb_prev_actions=5)

# compute features and labels for each game
all_features_test, all_labels_test, all_actions_test = [], [], []
for game_id, game in tqdm(list(df_games_test.iterrows())):
    # load the game's events
    game_events = SBL.events(game_id)
    
    # convert the events to actions
    game_home_team_id = df_games_test.at[game_id, "home_team_id"]
    game_actions = spadl.statsbomb.convert_to_actions(game_events, game_home_team_id)
    game_actions = socceraction.spadl.add_names(game_actions)
    
    # compute features and labels
    all_actions_test.append(game_actions)
    all_features_test.append(VAEP_model.compute_features(game, game_actions))
    all_labels_test.append(VAEP_model.compute_labels(game, game_actions))

# combine all features and labels in a single dataframe
all_actions_test = pd.concat(all_actions_test)
all_features_test = pd.concat(all_features_test)
all_labels_test = pd.concat(all_labels_test)

# fit the model
# NOTE(review): the fit uses the test-season features and labels, so the test-season
# ratings below come from a different model than the train-season ratings. Confirm
# this is intended rather than rating the test games with the model trained earlier.
VAEP_model.fit(all_features_test, all_labels_test)

# rate every test game with the model fitted just above
# (events are loaded and converted to actions a second time inside this loop)
VAEP_ratings_test = []
for game_id, game in tqdm(list(df_games_test.iterrows())):
    df_events = SBL.events(game_id)
    home_team_id = df_games_test.at[game_id, "home_team_id"]
    df_actions = spadl.statsbomb.convert_to_actions(df_events, home_team_id)
    ratings = VAEP_model.rate(df_games_test.loc[game_id],df_actions)
    VAEP_ratings_test.append(ratings)
# stack the per-game ratings and place them next to the actions, column-wise
VAEP_ratings_test = pd.concat(VAEP_ratings_test)
rated_df_test = pd.concat([all_actions_test, VAEP_ratings_test], axis=1)

# Converting to left to right play: one converted frame per test game, in
# df_games_test order, stacked into a single dataframe.
matches = [
    spadl.play_left_to_right(
        rated_df_test[rated_df_test['game_id'] == game_id],
        home_team_id=df_games_test.at[game_id, "home_team_id"],
    )
    for game_id in df_games_test.index
]

df_actions_zones_test = pd.concat(matches)

# Get players in test set: load each game's players and stack them with a fresh index.
players_test = pd.concat(
    [SBL.players(game_id) for game_id in tqdm(list(df_games_test.index))],
    ignore_index=True,
)


#Create start and end zones
# Bin every test action's start and end location into a 3 x 3 grid of pitch zones.
#
# Grid definition (the same for start and end coordinates):
#   x in [0, 35)   -> zone_1 / zone_2 / zone_3
#   x in [35, 70)  -> zone_4 / zone_5 / zone_6
#   x in [70, 105] -> zone_7 / zone_8 / zone_9
# and inside each x band the zone is chosen by y: [0, 23), [23, 46), [46, 69].
# Coordinates outside the grid (or missing) are labelled 'no_zone'.
#
# The binning works on whole columns with numpy rather than walking the frame
# row by row with iterrows(); the resulting labels are the same.
zone_labels = np.array(['no_zone'] + [f'zone_{number}' for number in range(1, 10)])

pitch_zones = {}
for location in ('start', 'end'):
    x_coord = df_actions_zones_test[f'{location}_x'].to_numpy(dtype=float)
    y_coord = df_actions_zones_test[f'{location}_y'].to_numpy(dtype=float)

    # Band index along each axis; -1 marks a coordinate outside the grid.
    # Comparisons against NaN are False, so missing coordinates also get -1.
    x_band = np.select(
        [
            (x_coord >= 0) & (x_coord < 35),
            (x_coord >= 35) & (x_coord < 70),
            (x_coord >= 70) & (x_coord <= 105),
        ],
        [0, 1, 2],
        default=-1,
    )
    y_band = np.select(
        [
            (y_coord >= 0) & (y_coord < 23),
            (y_coord >= 23) & (y_coord < 46),
            (y_coord >= 46) & (y_coord <= 69),
        ],
        [1, 2, 3],
        default=-1,
    )

    # Zone number is 3 * x_band + y_band (1..9); slot 0 of zone_labels is 'no_zone'.
    zone_number = np.where((x_band >= 0) & (y_band >= 0), 3 * x_band + y_band, 0)
    pitch_zones[location] = zone_labels[zone_number].tolist()

# Same names and types (lists of labels) as before.
zones = pitch_zones['start']
end_zones = pitch_zones['end']

# Column order kept as in the original cell: end zone first, then start zone.
df_actions_zones_test['end_pitch_zone'] = end_zones
df_actions_zones_test['start_pitch_zone'] = zones

# opposition
# For every match, list its teams in order of first appearance; the opponent of an
# action is whichever of those teams did not perform it. Mapping by game_id replaces
# the earlier rows-times-matches nested comprehension.
matches = list(df_actions_zones_test['game_id'].unique())
match_teams = {
    match: list(team_ids)
    for match, team_ids in df_actions_zones_test.groupby('game_id', sort=False)['team_id'].unique().items()
}

acting_team = df_actions_zones_test['team_id'].to_numpy()
first_team = df_actions_zones_test['game_id'].map(
    {match: teams[0] for match, teams in match_teams.items()}
).to_numpy()
# A match with a single recorded team has no opponent to report -> NaN.
second_team = df_actions_zones_test['game_id'].map(
    {match: teams[1] if len(teams) > 1 else np.nan for match, teams in match_teams.items()}
).to_numpy()
df_actions_zones_test['opponent_id'] = np.where(acting_team == first_team, second_team, first_team)

# update action type to pass/dribble/other to deal with class imbalance
action_type = df_actions_zones_test['type_name']
df_actions_zones_test['type_name_encoded'] = np.where(
    action_type.isin(['pass', 'dribble']), action_type, 'other'
)

# Home or Away: 1 when the acting team is the home team of the game, else 0
home_team = df_actions_zones_test['game_id'].map(df_games_test['home_team_id']).to_numpy()
df_actions_zones_test['home'] = (acting_team == home_team).astype('int64')

# x and y distance traveled
df_actions_zones_test['x_dif'] = (df_actions_zones_test['end_x'] - df_actions_zones_test['start_x'])
df_actions_zones_test['y_dif'] = (df_actions_zones_test['end_y'] - df_actions_zones_test['start_y'])

#prior moves
print('Starting prior moves block')
for n in range(1,6):
    df_actions_zones_test[f'n-{n}_x_distance'] = df_actions_zones_test.groupby('game_id').shift(n)[['x_dif']]
    df_actions_zones_test[f'n-{n}_y_distance'] = df_actions_zones_test.groupby('game_id').shift(n)[['y_dif']]
    df_actions_zones_test[f'n-{n}_same_team'] = df_actions_zones_test.team_id.eq(df_actions_zones_test.team_id.shift(n))
    df_actions_zones_test[f'n-{n}_same_player'] = df_actions_zones_test.player_id.eq(df_actions_zones_test.player_id.shift(n))
    df_actions_zones_test[f'n-{n}_x_fwd_direction'] = [1 if row['x_dif'] > 0 else 0 for i, row in df_actions_zones_test.groupby('game_id').shift(n).iterrows()]
    df_actions_zones_test[f'n-{n}_y_lft_right_direction'] = [1 if row['y_dif'] > 0 else 0 for i, row in df_actions_zones_test.groupby('game_id').shift(n).iterrows()]
    df_actions_zones_test[f'n-{n}_start_pitch_zone'] = df_actions_zones_test.groupby('game_id').shift(n)[['start_pitch_zone']]
    df_actions_zones_test[f'n-{n}_end_pitch_zone'] = df_actions_zones_test.groupby('game_id').shift(n)[['end_pitch_zone']]
    df_actions_zones_test[f'n-{n}_start_x'] = df_actions_zones_test.groupby('game_id').shift(n)[['start_x']]
    df_actions_zones_test[f'n-{n}_start_y'] = df_actions_zones_test.groupby('game_id').shift(n)[['start_y']]
    df_actions_zones_test[f'n-{n}_end_x'] = df_actions_zones_test.groupby('game_id').shift(n)[['end_x']]
    df_actions_zones_test[f'n-{n}_end_y'] = df_actions_zones_test.groupby('game_id').shift(n)[['end_y']]
    df_actions_zones_test[f'n-{n}_type_name'] = df_actions_zones_test.groupby('game_id').shift(n)[['type_name']]
    df_actions_zones_test[f'n-{n}_type_name_encoded'] = df_actions_zones_test.groupby('game_id').shift(n)[['type_name_encoded']]
    df_actions_zones_test[f'n-{n}_result_name'] = df_actions_zones_test.groupby('game_id').shift(n)[['result_name']]
    df_actions_zones_test[f'n-{n}_bodypart_name'] = df_actions_zones_test.groupby('game_id').shift(n)[['bodypart_name']]
    df_actions_zones_test[f'n-{n}_offensive_value'] = df_actions_zones_test.groupby('game_id').shift(n)[['offensive_value']]
    df_actions_zones_test[f'n-{n}_defensive_value'] = df_actions_zones_test.groupby('game_id').shift(n)[['defensive_value']]
    df_actions_zones_test[f'n-{n}_vaep_value'] = df_actions_zones_test.groupby('game_id').shift(n)[['vaep_value']]

100%|██████████| 131/131 [04:35<00:00,  2.10s/it]


[0]	validation_0-auc:0.76503
[1]	validation_0-auc:0.77144
[2]	validation_0-auc:0.77724
[3]	validation_0-auc:0.77752
[4]	validation_0-auc:0.78558
[5]	validation_0-auc:0.79599
[6]	validation_0-auc:0.79794
[7]	validation_0-auc:0.79922
[8]	validation_0-auc:0.79908
[9]	validation_0-auc:0.79891
[10]	validation_0-auc:0.80006
[11]	validation_0-auc:0.81091
[12]	validation_0-auc:0.81388
[13]	validation_0-auc:0.81576
[14]	validation_0-auc:0.81659
[15]	validation_0-auc:0.81738
[16]	validation_0-auc:0.81919
[17]	validation_0-auc:0.82214
[18]	validation_0-auc:0.82434
[19]	validation_0-auc:0.82485
[20]	validation_0-auc:0.82829
[21]	validation_0-auc:0.83163
[22]	validation_0-auc:0.83224
[23]	validation_0-auc:0.83308
[24]	validation_0-auc:0.83444
[25]	validation_0-auc:0.83436
[26]	validation_0-auc:0.83532
[27]	validation_0-auc:0.83506
[28]	validation_0-auc:0.83504
[29]	validation_0-auc:0.83562
[30]	validation_0-auc:0.83634
[31]	validation_0-auc:0.83704
[32]	validation_0-auc:0.83733
[33]	validation_0-au

100%|██████████| 131/131 [05:32<00:00,  2.54s/it]
100%|██████████| 131/131 [01:55<00:00,  1.14it/s]


Starting prior moves block


#### Save dataframes to CSV

In [60]:
# Write the VAEP train and test feature tables, without the row index.
for export_frame, export_path in (
    (df_actions_zones, 'data/vaep.csv'),
    (df_actions_zones_test, 'data/vaep_test.csv'),
):
    export_frame.to_csv(export_path, index=False)

## Create xT based test set

In [61]:
# 1. Convert direction of play
actions_ltr_test = pd.concat([
    spadl.play_left_to_right(all_actions_test[all_actions_test['game_id'] == game_id], game.home_team_id)
    for game_id, game in df_games_test.iterrows()
])

# 2. Add the *_name columns (type_name, result_name and bodypart_name are read below)
actions_ltr_test = socceraction.spadl.add_names(actions_ltr_test)

# 3. Train xT model
# NOTE(review): the xT grid is fitted on the test-season actions themselves - confirm this is intended.
xTModel = xthreat.ExpectedThreat(l=16, w=12)
xTModel.fit(actions_ltr_test)

# 4. Rate ball-progressing actions
# xT should only be used to value actions that move the ball
# and that keep the current team in possession of the ball.
# The test set must be built from actions_ltr_test; `actions_ltr` (presumably the
# training-season frame from an earlier cell) was used here by mistake.
mov_actions_test = xthreat.get_successful_move_actions(actions_ltr_test).copy()
mov_actions_test["xT_value"] = xTModel.predict(mov_actions_test)


def _pitch_zone_label(x, y):
    """Return the 3x3 pitch-zone label for one coordinate pair.

    x is split at 35 and 70 (valid range 0..105), y at 23 and 46 (valid range 0..69).
    Zones are numbered 1-9, y band first within each x band, e.g. x in [0, 35) and
    y in [23, 46) is 'zone_2'. Anything outside the valid ranges (NaN included) is 'no_zone'.
    """
    if not (0 <= x <= 105 and 0 <= y <= 69):
        return 'no_zone'
    x_band = 0 if x < 35 else (1 if x < 70 else 2)
    y_band = 0 if y < 23 else (1 if y < 46 else 2)
    return f'zone_{3 * x_band + y_band + 1}'


zones = [
    _pitch_zone_label(x, y)
    for x, y in zip(mov_actions_test['start_x'], mov_actions_test['start_y'])
]
end_zones = [
    _pitch_zone_label(x, y)
    for x, y in zip(mov_actions_test['end_x'], mov_actions_test['end_y'])
]

mov_actions_test['start_pitch_zone'] = zones
mov_actions_test['end_pitch_zone'] = end_zones

# opposition: the other team that has actions in the same match
matches = list(mov_actions_test['game_id'].unique())
match_teams = {match: list(mov_actions_test[mov_actions_test['game_id'] == match]['team_id'].unique()) for match in matches}

# One dictionary lookup per action instead of scanning every match for every row.
mov_actions_test['opponent_id'] = [
    match_teams[game_id][1] if match_teams[game_id][0] == team_id else match_teams[game_id][0]
    for game_id, team_id in zip(mov_actions_test['game_id'], mov_actions_test['team_id'])
]

# update action type to pass/dribble/other to deal with class imbalance
mov_actions_test['type_name_encoded'] = [
    'pass' if type_name == 'pass' else ('dribble' if type_name == 'dribble' else 'other')
    for type_name in mov_actions_test['type_name']
]

# Home or Away: 1 when the acting team is the home team of the game, else 0.
# The lookup uses df_games_test (indexed by game_id) because these are test-season games;
# an unknown game_id raises KeyError rather than silently producing a wrong flag.
home_team_by_game = df_games_test['home_team_id'].to_dict()
mov_actions_test['home'] = [
    1 if home_team_by_game[game_id] == team_id else 0
    for game_id, team_id in zip(mov_actions_test['game_id'], mov_actions_test['team_id'])
]

# x and y distance traveled
mov_actions_test['x_dif'] = mov_actions_test['end_x'] - mov_actions_test['start_x']
mov_actions_test['y_dif'] = mov_actions_test['end_y'] - mov_actions_test['start_y']

# prior moves: describe each of the 5 previous rows (successful moves) of the same game
print('Starting prior moves block')

# Columns copied verbatim from the n-th previous row (order = output column order).
lag_copied_columns = [
    'start_pitch_zone', 'end_pitch_zone',
    'start_x', 'start_y', 'end_x', 'end_y',
    'type_name', 'type_name_encoded', 'result_name', 'bodypart_name',
    'xT_value',
]
lag_source_columns = ['x_dif', 'y_dif', 'team_id', 'player_id'] + lag_copied_columns

for n in range(1, 6):
    # Shift once per lag and only the columns that are needed. Shifting inside each game
    # means the first n rows of a game get NaN instead of values from another game.
    previous = mov_actions_test.groupby('game_id')[lag_source_columns].shift(n)

    mov_actions_test[f'n-{n}_x_distance'] = previous['x_dif']
    mov_actions_test[f'n-{n}_y_distance'] = previous['y_dif']
    # Compared against the per-game shift so the flags never look across a game boundary
    # (a missing previous row compares as False).
    mov_actions_test[f'n-{n}_same_team'] = mov_actions_test['team_id'].eq(previous['team_id'])
    mov_actions_test[f'n-{n}_same_player'] = mov_actions_test['player_id'].eq(previous['player_id'])
    # NaN > 0 is False, so a missing previous row yields 0.
    mov_actions_test[f'n-{n}_x_fwd_direction'] = (previous['x_dif'] > 0).astype(int).to_numpy()
    mov_actions_test[f'n-{n}_y_lft_right_direction'] = (previous['y_dif'] > 0).astype(int).to_numpy()
    for column in lag_copied_columns:
        mov_actions_test[f'n-{n}_{column}'] = previous[column]

# iterations:  39
Starting prior moves block


In [62]:
# Write the xT train and test feature tables, without the row index.
for export_frame, export_path in (
    (mov_actions_with_zones, 'data/xt.csv'),
    (mov_actions_test, 'data/xt_test.csv'),
):
    export_frame.to_csv(export_path, index=False)

In [65]:
# Player tables are written without their row index.
for export_frame, export_path in (
    (players, 'data/players.csv'),
    (players_test, 'data/players_test.csv'),
):
    export_frame.to_csv(export_path, index=False)

# Game tables keep their index in the CSV (df_games / df_games_test are indexed by game_id).
for export_frame, export_path in (
    (df_games, 'data/games.csv'),
    (df_games_test, 'data/games_test.csv'),
):
    export_frame.to_csv(export_path)

### Transfer data - utility dataset

Create a utility dataset to help us filter and find targets for our analysis.

In [None]:
## Identifying transfers
# External research into players transferred into teams for the 20/21 season (our test set)
transfer_players = ["Anna Patten","Carlotte Wubben-Moy","Lydia Williams","Noelle Maritz","Stephanie Catley","Fran Stenson","Mana Iwabuchi","Ramona Petzelberger","Anita Asante","Diana Silva","Freya Gregory","Stine Larsen","Caroline Siems","Lisa Weiß","Ruby Mace","Sophie Whitehouse","Veatriki Sarri","Ruesha Littlejohn","Emily Murphy","Jamie-Lee Napier","Rachel Corsie","Mollie Green","Chloe McCarron","Christie Murray","Emma Koivisto","Katie Startup","Rebekah Stott","Inessa Kaagman","Nora Heroum","Katie Robinson","Kiera Skeels","Molly Pike","Benedicte Håland","Emma Bissell","Aimee Palmer","Jemma Purfield","Laura Rafferty","Ella Rutherford","Zećira Mušović","Pernille Harder","Niamh Charles","Jessie Fleming","Melanie Leupolz","Alisha Lehmann","Jill Scott","Claire Emslie","Valérie Gauvin","Nicoline Sørensen","Damaris Egurrola","Poppy Pattinson","Rikke Sevecke","Ingrid Wold","Abby Dahlkemper","Alex Greenwood","Lucy Bronze","Samantha Mewis","Chloe Kelly","Esme Morgan","Emily Ramsey","Maria Thorisdottir","Fran Bentley","Alessia Russo","Ivana Fuso","Lucy Staniforth","Mollie Green","Carrie Jones","Ona Batlle","Danielle Carter","Deanna Cooper","Jessica Fishlock","Silvana Flores","Ga-Eul Jeon","Emma Mukandi","Erin Nayler","Lily Woodham","So-hyun Cho","Abbie McManus","Alanna Kennedy","Sophie Whitehouse","Alex Morgan","Shelina Zadorsky","Kerys Harrop","Aurora Mikalsen","Rachel Williams","Dagný Brynjarsdóttir","Anouk Denton","Jacynta Galabadaarachchi","Emily Ramsey","Lois Joel","Rachel Daly","Emily van Egmond","Hawa Cissoko","Kateřina Svitková","Mackenzie Arnold","Mia Cruickshank","Ruby Grant","Maz Pacheco"]

# We likely have some name mismatches between the source of the transfer data and our dataset,
# so every researched name is fuzzy-matched against the player names in the test set.
transfer_players_df = pd.DataFrame(transfer_players, columns=['player_name'])

# Minimum fuzzywuzzy score a candidate needs before it is accepted as the same player.
threshold = 70

# Score each distinct name only once; players_test can hold several rows per player
# (minutes are summed per player further down). Missing names cannot be scored, so drop them.
known_player_names = players_test['player_name'].dropna().unique().tolist()

# For every researched name keep its single best-scoring dataset name, or '' when even the
# best candidate scores below the threshold. The previous version reset its result list
# inside the candidate loop, so only the last (second-best) candidate could ever be kept.
best_matches = []
for researched_name in transfer_players:
    # With a list of choices, process.extract yields (choice, score) tuples.
    top_candidates = process.extract(researched_name, known_player_names, limit=1)
    if top_candidates and top_candidates[0][1] >= threshold:
        best_matches.append(top_candidates[0][0])
    else:
        best_matches.append("")

# storing the resultant matches back to the dataframe
transfer_players_df['matches'] = best_matches

# Unmatched names ('') are left out of the target list.
player_target_list = [name for name in best_matches if name]

# Outfield transfer targets, ranked by total minutes played in the test season.
player_target_df = (
    players_test[
        (players_test['player_name'].isin(player_target_list))
        & (players_test['starting_position_name'] != 'Goalkeeper')
    ]
    .groupby(['player_id', 'player_name'])['minutes_played']
    .sum()
    .reset_index()
    .sort_values(by='minutes_played', ascending=False)
)

player_target_df.to_csv('data/target_players.csv', index=False)