In [45]:
import pandas as pd
import numpy as np

#Plotting libs
import matplotlib.pyplot as plt
import seaborn as sns 
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots


#Football libs
import socceraction
from socceraction.data.statsbomb import StatsBombLoader
from mplsoccer import Pitch, Sbopen, VerticalPitch
import socceraction.spadl as spadl
import matplotsoccer as mps
import socceraction.xthreat as xthreat
import socceraction.spadl as spadl
from socceraction.vaep import VAEP

# utils
import warnings
warnings.filterwarnings('ignore')
from tqdm import tqdm
# fuzz is used to compare TWO strings
from fuzzywuzzy import fuzz
# process is used to compare a string to MULTIPLE other strings
from fuzzywuzzy import process
import load_data
import pre_processing_utils as ppu

# ML libs
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import SelectKBest, chi2, f_classif, f_regression
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler,  MinMaxScaler, RobustScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.cluster import DBSCAN
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn import svm
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
from sklearn.neighbors import KNeighborsRegressor
from sklearn.decomposition import PCA

In [46]:
# Re-import the local helper module so that edits made to
# pre_processing_utils.py are picked up without restarting the kernel.
import importlib
importlib.reload(ppu)

<module 'pre_processing_utils' from '/Users/alexmihalache/Library/CloudStorage/OneDrive-Personal/BrainStation/Capstone/Capstone Project - FAWSL Analysis/pre_processing_utils.py'>

In [48]:
# Pull every frame this notebook works with out of the project loader in one call.
(
    xt, xt_test,
    vaep, vaep_test,
    games, games_test,
    players, players_test,
    target_players,
) = load_data.load_data()

In [None]:
# Work on copies so the frames returned by the loader are left untouched.
modeling_train_df, modeling_test_df = vaep.copy(), vaep_test.copy()
modeling_xt_train_df, modeling_xt_test_df = xt.copy(), xt_test.copy()


## Classification Features

In [None]:
# Column groups come from the project helper module (imported as `ppu`):
# columns to scale, columns to one-hot encode, and columns to discard.
numeric_features, categorical_features, drop_features = ppu.set_ct_mode('team-vaep')

# Scale numeric columns, one-hot encode categoricals (categories unseen at fit
# time are encoded as all zeros rather than raising), and drop the rest.
ct = make_column_transformer(
    (StandardScaler(), numeric_features),
    (OneHotEncoder(handle_unknown='ignore'), categorical_features),
    # ('passthrough', passthrough_features),
    ('drop', drop_features))

## Identifying transfers

In [None]:
# Preview the first rows of the target-player table returned by load_data.
target_players.head()

Unnamed: 0,player_id,player_name,minutes_played
0,24239,Jemma Elizabeth Purfield,2016
1,15579,Inessa Kaagman,2015
2,5076,Emily Louise van Egmond,1940
3,5074,Shelina Laura Zadorsky,1930
4,31534,Ella Toone,1887


In [None]:
# First row recorded for player_id 24239 in `players`; compare its team_id with
# the same lookup against `players_test` to spot a transfer.
players.loc[players['player_id'].eq(24239)].head(1)

Unnamed: 0,game_id,team_id,player_id,player_name,nickname,jersey_number,is_starter,starting_position_id,starting_position_name,minutes_played
360,19810,966,24239,Jemma Elizabeth Purfield,,23,True,6,Left Back,96


In [None]:
# First row recorded for player_id 24239 in `players_test`; a team_id different
# from the one found in `players` indicates the player changed clubs.
players_test.loc[players_test['player_id'].eq(24239)].head(1)

Unnamed: 0,game_id,team_id,player_id,player_name,nickname,jersey_number,is_starter,starting_position_id,starting_position_name,minutes_played
371,3775636,973,24239,Jemma Elizabeth Purfield,,23,True,6,Left Back,103


## VAEP Approach

In [None]:
# Column groups for the VAEP transformer, obtained from the project helper.
# NOTE(review): 'team-vaep' is the only set_ct_mode mode used elsewhere in this
# notebook — confirm it is the intended mode for this section.
numeric_features_vaep, categorical_features_vaep, drop_features_vaep = ppu.set_ct_mode('team-vaep')

# Scale numeric columns, one-hot encode categoricals (categories unseen at fit
# time are encoded as all zeros rather than raising), and drop the rest.
ct_vaep = make_column_transformer(
    (StandardScaler(), numeric_features_vaep),
    (OneHotEncoder(handle_unknown='ignore'), categorical_features_vaep),
    ('drop', drop_features_vaep))

## Modeling 

**Approach to modeling:**

1. Pick a player to replace in a team
2. Get the top players in their cluster as potential replacements 
3. Fit a model for each of the scouted players 
    - use only that player's data to fit the model
    - the models will predict their next location and the action they take 
    - the parameters in the model should cover both the player characteristics and the team characteristics. For example, the 5 previous moves capture what the team does and how the player then reacts to it. The player's action is the predicted target 

**Model validation and testing approach:**

- Of the scouted players, find those that have undergone a transfer in season 3 in our data set - use season 3 data to test our predictions. 
- Score the model based on the real data from their new team


**Models to train and test:**

- Baseline model for logistic regression for next action and end pitch location
- Random Forest classifier for end pitch location and next action 
- XGBoost classifier for end pitch location and next action 
- Random Forest regressor for end_x and end_y location 
- XGBoost regressor for end_x and end_y location

## Grid Search with Cross-Validation

## Scoring output

**Create a class that:**

1. Takes in a player id 
2. Slices the dataset for this player
3. Clusters most similar players 
4. Take the top 3 players 
5. Slices the datasets for each player
6. Fits a model for each 
7. Run a prediction based on the original player dataset
8. Scores the results 

add in:

kNN
SVM
DT
Random Forest
XGBoost

with 5-fold cross-validation (cv=5)

Run it for End-zone 
Run it for Next Action
Run it for VAEP Regression 
Run it for xT regression (build the dataset)

add in kbest
add in PCA

In [None]:
 # Regression-mode train/test split from the helper, with `vaep_value` as the
 # target. 15579 is presumably the player_id (listed as Inessa Kaagman in the
 # target_players preview) — confirm against ppu.create_player_data.
 X_train, y_train, X_test, y_test = ppu.create_player_data(
     'regression', modeling_train_df, modeling_test_df, 15579,
     reg_target='vaep_value',
 )

# Team - VAEP

In [None]:
# Train/test split built by the project helper; the arguments presumably select
# rows where `team_id` == 965 and use `vaep_value` as the regression target.
# NOTE(review): an earlier hand-written version of this split dropped NaN rows
# and restricted the test rows to player 15579 on team 965 — confirm that
# ppu.create_team_data produces the intended test set.
X_train, y_train, X_test, y_test = ppu.create_team_data(
    'team_id', 965, modeling_train_df, modeling_test_df, 'vaep_value'
)

In [None]:
import atexit
import shutil
from tempfile import mkdtemp

# Scratch directory handed to the Pipeline's `memory` argument so fitted
# transformers can be cached between grid-search candidates.
# The prefix makes stray directories easy to identify in the temp folder.
cachedir = mkdtemp(prefix='sklearn_pipeline_cache_')

# mkdtemp never deletes what it creates; remove the cache when the kernel exits
# so repeated runs do not accumulate directories on disk.
atexit.register(shutil.rmtree, cachedir, ignore_errors=True)

In [None]:
# Column groups for the 'team-vaep' mode: columns to scale, columns to one-hot
# encode, and columns to drop (consumed by the preprocessor defined below).
numeric_features, categorical_features, drop_features = ppu.set_ct_mode('team-vaep')

In [None]:
# define column transformer
cat_transformer = Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))])
num_transformer = Pipeline(steps=[('scaler', MinMaxScaler())])

# sparse_threshold=0 forces a dense output array. The one-hot encoder emits a
# sparse matrix, and with the default threshold the stacked result can stay
# sparse, which the PCA step that follows does not accept on every
# scikit-learn version.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, numeric_features),
        ('cat', cat_transformer, categorical_features),
        ('drop', 'drop', drop_features)],
    sparse_threshold=0)

# Base pipeline searched over by the grid search. The 'model' step is a
# placeholder that the parameter grid replaces with each candidate estimator.
# PCA() with no n_components keeps every component, so as written it rotates
# the feature space without reducing it.
# `memory` caches fitted transformers in `cachedir` to avoid refitting them.
estimator = Pipeline(steps=[('preprocessor', preprocessor),
                            ('dim_reducer', PCA()),
                            ('model', LinearRegression())], memory=cachedir)

In [None]:
# Define a parameter grid.
# Each dict swaps a different estimator into the pipeline's 'model' step and
# lists the hyper-parameters to try for it:
#   random forest: 12 depths x 3 max_features x 9 n_estimators = 324 candidates
#   XGBoost:       9 depths                                    =   9 candidates
# which gives 333 candidates, i.e. 1665 fits with 5-fold cross-validation.
param_grid = [
    {
        'model': [RandomForestRegressor(random_state=1, oob_score=True, n_jobs=-1)],
        'model__max_depth': list(range(2, 25, 2)),
        # 1.0 (use all features) is what 'auto' meant for a forest regressor;
        # the 'auto' string itself is rejected by scikit-learn >= 1.3.
        'model__max_features': ['sqrt', 1.0, 'log2'],
        'model__n_estimators': list(range(10, 100, 10))},

    {
        'model': [xgb.XGBRegressor(random_state=1)],
        'model__max_depth': list(range(2, 11, 1))},
]

# Instantiate a gridsearch: 5-fold CV, one progress line per fit (verbose=2).
grid = GridSearchCV(estimator, param_grid, cv = 5, verbose = 2)
fitted_grid = grid.fit(X_train, y_train)

# fitted_grid.best_estimator_
# fitted_grid.score(X_test, y_test)

Fitting 5 folds for each of 298 candidates, totalling 1490 fits
[CV] END model=DecisionTreeRegressor(random_state=1), model__criterion=squared_error, model__max_depth=2, model__splitter=best; total time=   2.6s
[CV] END model=DecisionTreeRegressor(random_state=1), model__criterion=squared_error, model__max_depth=2, model__splitter=best; total time=   2.5s
[CV] END model=DecisionTreeRegressor(random_state=1), model__criterion=squared_error, model__max_depth=2, model__splitter=best; total time=   2.5s
[CV] END model=DecisionTreeRegressor(random_state=1), model__criterion=squared_error, model__max_depth=2, model__splitter=best; total time=   2.5s
[CV] END model=DecisionTreeRegressor(random_state=1), model__criterion=squared_error, model__max_depth=2, model__splitter=best; total time=   2.5s
[CV] END model=DecisionTreeRegressor(random_state=1), model__criterion=squared_error, model__max_depth=2, model__splitter=random; total time=   1.7s
[CV] END model=DecisionTreeRegressor(random_state=1)

KeyboardInterrupt: 

In [32]:
# Show the refitted best pipeline found by the grid search, then evaluate it on
# the held-out test split. No `scoring` was passed to GridSearchCV, so this
# should be the estimator's default score (R^2 for regressors) — confirm.
print(fitted_grid.best_estimator_)
fitted_grid.score(X_test, y_test)

Pipeline(memory='/var/folders/t5/kc4tlr5n2yn5jkkh1t9rk9500000gn/T/tmpzw0z4314',
         steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('scaler',
                                                                   StandardScaler())]),
                                                  ['start_x', 'start_y',
                                                   'end_x', 'end_y', 'x_dif',
                                                   'y_dif', 'time_seconds',
                                                   'n-1_x_distance',
                                                   'n-1_y_distance',
                                                   'n-1_start_x', 'n-1_start_y',
                                                   'n-1_end_x', 'n-1_end_y',
                                                   'n-2_x_di...
                                                   'n-5_defensive_value',
    

0.06629592197157597