In [11]:
import pandas as pd
import numpy as np

# Plotting libs
import matplotlib.pyplot as plt
import seaborn as sns 
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots


#Football libs
import socceraction
from socceraction.data.statsbomb import StatsBombLoader
from mplsoccer import Pitch, Sbopen, VerticalPitch
import socceraction.spadl as spadl
import matplotsoccer as mps
import socceraction.xthreat as xthreat
import socceraction.spadl as spadl
from socceraction.vaep import VAEP

# utils
import warnings
warnings.filterwarnings('ignore')
from tqdm import tqdm
# fuzz is used to compare TWO strings
from fuzzywuzzy import fuzz
# process is used to compare a string to MULTIPLE other strings
from fuzzywuzzy import process

# ML libs
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import SelectKBest, chi2, f_classif, f_regression
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler,  MinMaxScaler, RobustScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.cluster import DBSCAN
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn import svm
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
from sklearn.neighbors import KNeighborsRegressor
from sklearn.decomposition import PCA



In [102]:
# Feature groups for the column transformer: numerics are scaled, categoricals
# are one-hot encoded, and everything in `drop_features` is discarded.
# Each of the five preceding actions (n-1 ... n-5) contributes the same set of
# columns, so the lagged names are generated instead of being spelled out.
_lag_prefixes = [f'n-{lag}' for lag in range(1, 6)]

_numeric_lag_suffixes = (
    'x_distance', 'y_distance',
    'start_x', 'start_y',
    'end_x', 'end_y',
    'offensive_value', 'defensive_value', 'vaep_value',
)
numeric_features = ['start_x', 'start_y', 'time_seconds'] + [
    f'{prefix}_{suffix}'
    for prefix in _lag_prefixes
    for suffix in _numeric_lag_suffixes
]

_categorical_lag_suffixes = (
    'same_team',
    'x_fwd_direction', 'y_lft_right_direction',
    'start_pitch_zone', 'end_pitch_zone',
    'type_name_encoded', 'result_name', 'bodypart_name',
)
categorical_features = ['period_id', 'start_pitch_zone', 'opponent_id', 'home'] + [
    f'{prefix}_{suffix}'
    for prefix in _lag_prefixes
    for suffix in _categorical_lag_suffixes
]

# A separate `passthrough_features` group (game_id, player_id) was considered
# for filtering the dataset but is not used; both columns are dropped below.
drop_features = [
    'original_event_id', 'game_id', 'player_id', 'team_id',
    'end_x', 'end_y',
    'type_id', 'result_id', 'bodypart_id', 'action_id',
    'type_name', 'result_name', 'bodypart_name',
    'offensive_value', 'defensive_value', 'vaep_value',
    'x_dif', 'y_dif',
] + [f'{prefix}_same_player' for prefix in _lag_prefixes]

In [103]:
# Column transformer for the feature groups defined above: standardise the
# numeric columns, one-hot encode the categorical ones (handle_unknown='ignore'
# so unseen categories do not raise at transform time), and drop the rest.
_ct_steps = [
    (StandardScaler(), numeric_features),
    (OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ('drop', drop_features),
]
ct = make_column_transformer(*_ct_steps)

## Identifying transfers

In [117]:
# Names of players on the transfer list, in their original order.
# A few names occur more than once (e.g. "Mollie Green"); they are kept as-is.
transfer_players = [
    "Anna Patten", "Carlotte Wubben-Moy", "Lydia Williams", "Noelle Maritz",
    "Stephanie Catley", "Fran Stenson", "Mana Iwabuchi", "Ramona Petzelberger",
    "Anita Asante", "Diana Silva", "Freya Gregory", "Stine Larsen",
    "Caroline Siems", "Lisa Weiß", "Ruby Mace", "Sophie Whitehouse",
    "Veatriki Sarri", "Ruesha Littlejohn", "Emily Murphy", "Jamie-Lee Napier",
    "Rachel Corsie", "Mollie Green", "Chloe McCarron", "Christie Murray",
    "Emma Koivisto", "Katie Startup", "Rebekah Stott", "Inessa Kaagman",
    "Nora Heroum", "Katie Robinson", "Kiera Skeels", "Molly Pike",
    "Benedicte Håland", "Emma Bissell", "Aimee Palmer", "Jemma Purfield",
    "Laura Rafferty", "Ella Rutherford", "Zećira Mušović", "Pernille Harder",
    "Niamh Charles", "Jessie Fleming", "Melanie Leupolz", "Alisha Lehmann",
    "Jill Scott", "Claire Emslie", "Valérie Gauvin", "Nicoline Sørensen",
    "Damaris Egurrola", "Poppy Pattinson", "Rikke Sevecke", "Ingrid Wold",
    "Abby Dahlkemper", "Alex Greenwood", "Lucy Bronze", "Samantha Mewis",
    "Chloe Kelly", "Esme Morgan", "Emily Ramsey", "Maria Thorisdottir",
    "Fran Bentley", "Alessia Russo", "Ivana Fuso", "Lucy Staniforth",
    "Mollie Green", "Carrie Jones", "Ona Batlle", "Danielle Carter",
    "Deanna Cooper", "Jessica Fishlock", "Silvana Flores", "Ga-Eul Jeon",
    "Emma Mukandi", "Erin Nayler", "Lily Woodham", "So-hyun Cho",
    "Abbie McManus", "Alanna Kennedy", "Sophie Whitehouse", "Alex Morgan",
    "Shelina Zadorsky", "Kerys Harrop", "Aurora Mikalsen", "Rachel Williams",
    "Dagný Brynjarsdóttir", "Anouk Denton", "Jacynta Galabadaarachchi", "Emily Ramsey",
    "Lois Joel", "Rachel Daly", "Emily van Egmond", "Hawa Cissoko",
    "Kateřina Svitková", "Mackenzie Arnold", "Mia Cruickshank", "Ruby Grant",
    "Maz Pacheco",
]


In [118]:
# Total minutes played per transfer-list player (exact name match against
# `players`), showing the ten largest totals.
(
    players[players['player_name'].isin(transfer_players)]
    .groupby(['player_id', 'player_name'])['minutes_played']
    .sum()
    .reset_index()
    .sort_values(by='minutes_played', ascending=False)
    .head(10)
)

Unnamed: 0,player_id,player_name,minutes_played
3,10172,Jill Scott,3125
14,18153,Alisha Lehmann,2852
4,15562,Lucy Staniforth,2584
7,15579,Inessa Kaagman,2516
13,17274,Poppy Pattinson,2350
5,15569,Kerys Harrop,2244
2,4648,Abbie McManus,2139
6,15570,Chloe Kelly,2113
9,15631,Niamh Charles,1642
11,16388,Ella Rutherford,1519


In [119]:
# One row per transfer-list name, in a single `player_name` column.
transfer_players_df = pd.DataFrame({'player_name': transfer_players})

In [120]:
# Fuzzy-match every transfer-list name against the player names found in
# `players_test`, since the two sources do not spell names identically
# (e.g. "Jemma Purfield" vs "Jemma Elizabeth Purfield").
list1 = transfer_players

# Candidate names are de-duplicated (and missing names skipped) so that a name
# occurring on several rows cannot occupy both of the top-2 candidate slots.
list2 = players_test['player_name'].dropna().unique().tolist()

# Minimum fuzzywuzzy score (0-100) a candidate needs to be accepted.
threshold = 70

# mat1: for each transfer name, its two highest-scoring candidates as
# (candidate, score) tuples, best first.
mat1 = [process.extract(name, list2, limit=2) for name in list1]

# mat2: the best candidate for each transfer name, or "" when even the best one
# scores below the threshold.
# Fix: the accumulator used to be reset inside the per-candidate loop, so only
# the LAST (i.e. second-best) candidate could ever be kept and the best match
# was thrown away.
mat2 = []
for candidates in mat1:
    accepted = [candidate[0] for candidate in candidates if candidate[1] >= threshold]
    mat2.append(accepted[0] if accepted else "")

# Store the chosen match next to the original name.
transfer_players_df['matches'] = mat2

# Names to look up in `players_test`; transfer names without an acceptable
# match (empty string) are left out.
player_target_list = [match for match in mat2 if match]

In [121]:
# Total minutes per matched player, excluding rows whose starting position is
# 'Goalkeeper', sorted from most to fewest minutes.
player_target_df = (
    players_test[
        players_test['player_name'].isin(player_target_list)
        & (players_test['starting_position_name'] != 'Goalkeeper')
    ]
    .groupby(['player_id', 'player_name'])['minutes_played']
    .sum()
    .reset_index()
    .sort_values(by='minutes_played', ascending=False)
)


In [122]:
# Preview the five matched players with the most minutes (head() defaults to 5 rows).
player_target_df.head()

Unnamed: 0,player_id,player_name,minutes_played
43,24239,Jemma Elizabeth Purfield,2016
30,15579,Inessa Kaagman,2015
11,5076,Emily Louise van Egmond,1940
10,5074,Shelina Laura Zadorsky,1930
50,31534,Ella Toone,1887


In [123]:
# Show one row for player 15579 to inspect the team/position recorded in `players`.
players.loc[players['player_id'] == 15579].head(1)

Unnamed: 0,game_id,team_id,player_id,player_name,nickname,jersey_number,is_starter,starting_position_id,starting_position_name,minutes_played
276,19734,967,15579,Inessa Kaagman,,8,True,21,Left Wing,94


## VAEP Approach

Create a new column transformer for this use case.

In [179]:
# Feature groups for the VAEP model. Unlike the earlier feature lists, the
# current action's end location, type, result and body part are used as inputs,
# while all offensive/defensive/VAEP value columns of the current and previous
# actions are dropped.
_vaep_lags = [f'n-{lag}' for lag in range(1, 6)]          # n-1 ... n-5
_vaep_lags_desc = [f'n-{lag}' for lag in range(5, 0, -1)]  # n-5 ... n-1

numeric_features_vaep = [
    'start_x', 'start_y', 'end_x', 'end_y', 'x_dif', 'y_dif', 'time_seconds',
] + [
    f'{prefix}_{suffix}'
    for prefix in _vaep_lags
    for suffix in ('x_distance', 'y_distance', 'start_x', 'start_y', 'end_x', 'end_y')
]

categorical_features_vaep = (
    [
        'period_id', 'start_pitch_zone', 'end_pitch_zone', 'opponent_id', 'home',
        'type_name', 'result_name', 'bodypart_name',
    ]
    + [
        f'{prefix}_{suffix}'
        for prefix in _vaep_lags
        for suffix in (
            'same_team',
            'x_fwd_direction', 'y_lft_right_direction',
            'start_pitch_zone', 'end_pitch_zone',
            'result_name', 'bodypart_name',
        )
    ]
    + [f'{prefix}_same_player' for prefix in _vaep_lags]
)

# A `passthrough_features` group holding 'vaep_value' was considered but is
# not used.
drop_features_vaep = (
    [
        'original_event_id', 'game_id', 'player_id', 'team_id',
        'type_id', 'result_id', 'bodypart_id', 'action_id',
        'type_name_encoded',
    ]
    + [f'{prefix}_type_name_encoded' for prefix in _vaep_lags_desc]
    + ['offensive_value', 'defensive_value']
    + [
        f'{prefix}_{suffix}'
        for prefix in _vaep_lags_desc
        for suffix in ('offensive_value', 'defensive_value', 'vaep_value')
    ]
)

In [136]:
# Column transformer for the VAEP feature groups: standardise numerics,
# one-hot encode categoricals, drop the remaining listed columns.
vaep_scaling = (StandardScaler(), numeric_features_vaep)
vaep_encoding = (OneHotEncoder(handle_unknown='ignore'), categorical_features_vaep)
vaep_discard = ('drop', drop_features_vaep)
ct_vaep = make_column_transformer(vaep_scaling, vaep_encoding, vaep_discard)

## Modeling 

**Approach to modeling:**

1. Pick a player to replace in a team
2. Get the top players in their cluster as potential replacements 
3. Fit a model for each of the scouted players 
    - use only that player's data to fit the model
    - the models will predict their next location and the action they take 
    - the parameters in the model should cover both player and team characteristics. For example, the 5 previous moves capture what the team does and how the player then reacts to it. The player's action is the prediction target.

**Model validation and testing approach:**

- Of the scouted players, find those that have undergone a transfer in season 3 in our data set - use season 3 data to test our predictions. 
- Score the model based on the real data from their new team


**Models to train and test:**

- Baseline model for logistic regression for next action and end pitch location
- Random Forest classifier for end pitch location and next action 
- XGBoost classifier for end pitch location and next action 
- Random Forest regressor for end_x and end_y location 
- XGBoost regressor for end_x and end_y location

## Grid Search with Cross-Validation

## Scoring output

**Create a class that:**

1. Takes in a player id 
2. Slices the dataset for this player
3. Clusters most similar players 
4. Take the top 3 players 
5. Slices the datasets for each player
6. Fits a model for each 
7. Run a prediction based on the original player dataset
8. Scores the results 

add in:

kNN
SVM
DT
Random Forest
xGBoost

with 5-fold cross-validation (cv=5)

Run it for End-zone 
Run it for Next Action
Run it for VAEP Regression 
Run it for xT regression (build the dataset)

add in kbest
add in PCA

In [156]:
def create_team_data(team_col: str, team_id: int, player_col: str, player_id: int, train_df: pd.DataFrame, test_df: pd.DataFrame, target: str):
    """Build train/test splits: train on a team's rows, test on one player's rows.

    Training data is every row of ``train_df`` whose ``team_col`` equals
    ``team_id``. Test data is every row of ``test_df`` whose ``team_col`` equals
    ``team_id`` *and* whose ``player_col`` equals ``player_id``. Rows containing
    any missing value are dropped from both sets.

    Args:
        team_col: Name of the column holding the team identifier.
        team_id: Team whose rows are selected.
        player_col: Name of the column holding the player identifier.
        player_id: Player whose rows (within the team) form the test set.
        train_df: Frame the training rows are taken from.
        test_df: Frame the test rows are taken from.
        target: Name of the target column; removed from the feature frames.

    Returns:
        ``(X_train, y_train, X_test, y_test)`` where the ``X_*`` frames contain
        every column except ``target`` and the ``y_*`` series contain ``target``.
    """
    train_mask = train_df[team_col] == team_id
    # Fix: the player filter must look at `player_col`. It previously compared
    # `team_col` with `player_id`, which left `player_col` unused and made the
    # test set empty whenever team_id != player_id.
    test_mask = (test_df[team_col] == team_id) & (test_df[player_col] == player_id)

    team_train_set = train_df[train_mask].dropna()
    team_test_set = test_df[test_mask].dropna()

    # Split features from the target.
    X_train = team_train_set.drop(columns=[target])
    y_train = team_train_set[target]
    X_test = team_test_set.drop(columns=[target])
    y_test = team_test_set[target]

    return X_train, y_train, X_test, y_test

In [157]:
# Build the train/test splits for team 965 and player 15579, with
# `vaep_value` as the target.
X_train, y_train, X_test, y_test = create_team_data(
    team_col='team_id',
    team_id=965,
    player_col='player_id',
    player_id=15579,
    train_df=modeling_train_df,
    test_df=modeling_test_df,
    target='vaep_value',
)

# team_train_set = modeling_train_df[modeling_train_df['team_id']==965].dropna()
# team_test_set = modeling_test_df[(modeling_test_df['team_id']==965)&(modeling_test_df['player_id']==15579)].dropna()
# # team_test_set = modeling_test_df[modeling_test_df['team_id']==973].dropna()

# X_train = team_train_set.drop(columns=['vaep_value'])
# y_train_vaep = team_train_set['vaep_value']


# X_test = team_test_set.drop(columns=['vaep_value'])
# y_test_vaep = team_test_set['vaep_value']

In [171]:
# Redefines `ct_vaep` with an extra passthrough step.
# NOTE(review): `numeric_features_vaep` is handed to both StandardScaler and
# 'passthrough', so each numeric column would appear twice in the output
# (scaled and raw) — confirm this duplication is intended.
_ct_vaep_steps = [
    (StandardScaler(), numeric_features_vaep),
    (OneHotEncoder(handle_unknown='ignore'), categorical_features_vaep),
    ('drop', drop_features_vaep),
    ('passthrough', numeric_features_vaep),
]
ct_vaep = make_column_transformer(*_ct_vaep_steps)

In [172]:
from tempfile import mkdtemp
# Fresh temporary directory, passed as `memory=` to the modelling Pipeline
# defined below so it can cache fitted pipeline steps.
# NOTE(review): nothing in the visible cells removes this directory afterwards.
cachedir = mkdtemp()

In [186]:
# Base estimator for the grid search: preprocessing -> dimensionality
# reduction -> regression model. The step names ('preprocessor', 'num',
# 'scaler', 'dim_reducer', 'model') are the keys addressed by the parameter grid.
cat_transformer = Pipeline([('onehot', OneHotEncoder(handle_unknown='ignore'))])
num_transformer = Pipeline([('scaler', MinMaxScaler())])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, numeric_features_vaep),
        ('cat', cat_transformer, categorical_features_vaep),
        ('drop', 'drop', drop_features_vaep),
    ]
)

# `memory=cachedir` lets the pipeline cache its fitted transformer steps.
estimator = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('dim_reducer', PCA()),
        ('model', LinearRegression()),
    ],
    memory=cachedir,
)

In [197]:
# Parameter grid for the model search. Every dict is one family of candidates;
# keys address pipeline steps by name ('preprocessor__num__scaler',
# 'dim_reducer', 'model').
#
# Fix: the base pipeline always contains a PCA() 'dim_reducer' step, so the
# grids labelled "without PCA" previously still ran PCA with default settings.
# They now switch the step off explicitly with 'passthrough'. The number of
# candidates is unchanged.
param_grid = [
    # Decision tree regressor.
    # NOTE(review): this grid does not touch 'dim_reducer', so it runs on the
    # output of the pipeline's default PCA() — confirm that is intended.
    {
        'model': [DecisionTreeRegressor(random_state=1)],
        'model__max_depth': list(range(2, 11, 1)),
    },

    # Linear Regression with PCA (PCA is applied up front to avoid
    # multicollinearity issues in a linear model).
    {
        'preprocessor__num__scaler': [None, StandardScaler()],
        'dim_reducer': [PCA()],
        'dim_reducer__n_components': [0.95, 0.9, 0.85, 0.8],
        'model': [LinearRegression()],
    },

    # Linear Regression without PCA.
    {
        'preprocessor__num__scaler': [None, StandardScaler()],
        'dim_reducer': ['passthrough'],
        'model': [LinearRegression()],
    },

    # Ridge Regression with PCA.
    {
        'preprocessor__num__scaler': [None, StandardScaler()],
        'dim_reducer': [PCA()],
        'dim_reducer__n_components': [0.95, 0.9, 0.85, 0.8],
        'model': [Ridge(solver='auto', random_state=1)],
        'model__alpha': list(range(1, 10, 1)),
    },

    # Ridge Regression without PCA.
    {
        'preprocessor__num__scaler': [None, StandardScaler()],
        'dim_reducer': ['passthrough'],
        'model': [Ridge(solver='auto', random_state=1)],
        'model__alpha': list(range(1, 10, 1)),
    },

    # Disabled for now (parameter name corrected to `n_neighbors`):
    #
    # KNN Regressor with PCA
    # {
    #     'preprocessor__num__scaler': [StandardScaler()],
    #     'dim_reducer': [PCA()],
    #     'dim_reducer__n_components': [0.95, 0.9, 0.85, 0.8],
    #     'model': [KNeighborsRegressor(n_jobs=-1)],
    #     'model__n_neighbors': list(range(2, 27, 2))},
    #
    # KNN Regressor with KBest
    # {
    #     'preprocessor__num__scaler': [StandardScaler()],
    #     'dim_reducer': [SelectKBest()],
    #     'dim_reducer__score_func': [f_regression],
    #     'dim_reducer__k': [5, 10, 15, 20, 25],
    #     'model': [KNeighborsRegressor(n_jobs=-1)],
    #     'model__n_neighbors': list(range(2, 27, 2))},
    #
    # TODO: add grids for RandomForestRegressor and an XGBoost regressor.
]

# Exhaustive search over all candidates with 5-fold cross-validation.
grid = GridSearchCV(estimator, param_grid, cv = 5, verbose = 2)
fitted_grid = grid.fit(X_train, y_train)

# Only the final expression of a cell is rendered as its output.
fitted_grid.best_estimator_
fitted_grid.score(X_test, y_test)

Fitting 5 folds for each of 109 candidates, totalling 545 fits
[CV] END model=DecisionTreeRegressor(random_state=1), model__max_depth=2; total time=   2.6s
[CV] END model=DecisionTreeRegressor(random_state=1), model__max_depth=2; total time=   2.5s
[CV] END model=DecisionTreeRegressor(random_state=1), model__max_depth=2; total time=   2.5s
[CV] END model=DecisionTreeRegressor(random_state=1), model__max_depth=2; total time=   2.5s
[CV] END model=DecisionTreeRegressor(random_state=1), model__max_depth=2; total time=   2.5s
[CV] END model=DecisionTreeRegressor(random_state=1), model__max_depth=3; total time=   2.9s
[CV] END model=DecisionTreeRegressor(random_state=1), model__max_depth=3; total time=   2.9s
[CV] END model=DecisionTreeRegressor(random_state=1), model__max_depth=3; total time=   2.9s
[CV] END model=DecisionTreeRegressor(random_state=1), model__max_depth=3; total time=   3.1s
[CV] END model=DecisionTreeRegressor(random_state=1), model__max_depth=3; total time=   3.0s
[CV] EN

KeyboardInterrupt: 