# Calculating SHAP values

**ATTENTION:**

Notebook language: **Python**

## Loading model and data

In [None]:
import pickle5 as pickle

In [None]:
# Load the model object stored in ./model/model.pickle.
# NOTE(review): unpickling can execute arbitrary code, so this file must come
# from a trusted source.
with open('./model/model.pickle', 'rb') as fp:
    model = pickle.load(fp) 

In [None]:
import pandas as pd
import numpy as np

In [None]:
df_preprocessed = pd.read_csv('./data/data_preprocessed.csv', index_col=0)
df_raw = pd.read_csv('./data/raw_data.csv', index_col=0)

In [None]:
X_preprocessed = df_preprocessed.drop('status', axis=1)

In [None]:
y_preprocessed = df_preprocessed.status

## Calculating predictions (background prediction)

In [None]:
# Keep only column 1 of the predict_proba output (presumably the positive-class
# probability -- confirm against the model's class order) and wrap it in a
# one-column DataFrame.
y_hat = pd.DataFrame(model.predict_proba(X_preprocessed)[:, 1])
# Persist the background predictions next to the input data.
y_hat.to_csv('./data/y_hat.csv')

## Creating explainer

In [None]:
import dalex as dx

In [None]:
explainer = dx.Explainer(model, X_preprocessed, y_preprocessed)

## Functions to calculate explanations

In [None]:
import os

path = './results'

# Create the output directory on the first run; otherwise only report that it
# is already there.
if os.path.exists(path):
    print("The folder already exists")
else:
    os.makedirs(path)

In [None]:
from scripts.calculate_SHAP import extract_preprocessed__calculate__save

## Calculations

### Robert Lewandowski

#### Season 2021

In [None]:
subset = df_raw[np.logical_and(df_raw['player'] == 'Robert Lewandowski', df_raw['season'] == 2021)]

In [None]:
np.random.seed(42)

In [None]:
extract_preprocessed__calculate__save(
    main_dir = './results', 
    task_hierarchy = ['lewandowski', 'season2021'],
    explainer = explainer, 
    subset = subset, 
    df_preprocessed = df_preprocessed,
    target = 'status'
)

#### Season 2020

In [None]:
subset = df_raw[np.logical_and(df_raw['player'] == 'Robert Lewandowski', df_raw['season'] == 2020)]

In [None]:
np.random.seed(42)

In [None]:
extract_preprocessed__calculate__save(
    main_dir = './results', 
    task_hierarchy = ['lewandowski', 'season2020'],
    explainer = explainer, 
    subset = subset, 
    df_preprocessed = df_preprocessed,
    target = 'status'
)

#### Season 2019

In [None]:
subset = df_raw[np.logical_and(df_raw['player'] == 'Robert Lewandowski', df_raw['season'] == 2019)]

In [None]:
np.random.seed(42)

In [None]:
extract_preprocessed__calculate__save(
    main_dir = './results', 
    task_hierarchy = ['lewandowski', 'season2019'],
    explainer = explainer, 
    subset = subset, 
    df_preprocessed = df_preprocessed,
    target = 'status'
)

### Cristiano Ronaldo

#### Season 2021

In [None]:
subset = df_raw[np.logical_and(df_raw['player'] == 'Cristiano Ronaldo', df_raw['season'] == 2021)]

In [None]:
np.random.seed(42)

In [None]:
extract_preprocessed__calculate__save(
    main_dir = './results', 
    task_hierarchy = ['ronaldo', 'season2021'],
    explainer = explainer, 
    subset = subset, 
    df_preprocessed = df_preprocessed,
    target = 'status'
)

#### Season 2020

In [None]:
subset = df_raw[np.logical_and(df_raw['player'] == 'Cristiano Ronaldo', df_raw['season'] == 2020)]

In [None]:
np.random.seed(42)

In [None]:
extract_preprocessed__calculate__save(
    main_dir = './results', 
    task_hierarchy = ['ronaldo', 'season2020'],
    explainer = explainer, 
    subset = subset, 
    df_preprocessed = df_preprocessed,
    target = 'status'
)

### Bundesliga

`season_start_year = 2021` corresponds to the 2021/2022 season.

The calculations for the whole Bundesliga were performed on the Eden cluster; the whole subset was divided into 80 smaller tasks.

The cluster runs the Slurm Workload Manager, which is why the code below relies on Slurm commands.

In [None]:
import subprocess

# Submit the Slurm batch script that computes the SHAP values for the whole
# Bundesliga subset. `sbatch` is the Slurm command that submits a job script;
# `squeue` only lists jobs that are already queued and does not accept a
# script path.
# check=True raises CalledProcessError if the submission fails instead of
# letting the notebook continue silently.
# NOTE: sbatch returns as soon as the job is queued, so the cells that read the
# per-task outputs should be run only after all tasks have finished.
subprocess.run(["sbatch", "./scripts/aSHAP-calculate-job.job"], check=True)

Concatenate outputs

In [None]:
import os

path = os.path.join('results', 'bundesliga', 'all_teams', 'season2021')

In [None]:
files_to_concatenate = [
    'full_shaps.csv',
    'shaps.csv',
    'X_subset_original.csv',
    'X_subset_preprocessed.csv',
    'y_hat.csv',
    'y.csv'
]

In [None]:
output_dict = {}

In [None]:
# For every output file, read the per-task copies from task directories 1..80
# (first CSV column used as the index) and stack them into one DataFrame.
for file_name in files_to_concatenate:
    task_frames = []
    for task_id in range(1, 81):
        task_file = os.path.join(path, str(task_id), file_name)
        task_frames.append(pd.read_csv(task_file, index_col=0))
    output_dict[file_name] = pd.concat(task_frames)

In [None]:
# Replace the concatenated per-task indexes of these three tables with a single
# 0..n-1 index, in place.
for table_name in ('shaps.csv', 'y_hat.csv', 'full_shaps.csv'):
    output_dict[table_name].reset_index(drop=True, inplace=True)

In [None]:
# Write every concatenated table into the season directory under its original
# file name.
for file_name, frame in output_dict.items():
    target_file = os.path.join(path, file_name)
    frame.to_csv(target_file)

In [None]:
import shutil

In [None]:
# Remove the 80 per-task directories together with everything inside them.
# NOTE(review): this is irreversible -- presumably meant to run only after the
# concatenated files were written successfully; confirm before running.
for task_id in range(1, 81):
    shutil.rmtree(os.path.join(path, str(task_id)))

#### Extracting team data

##### Bayern Munich

In [None]:
def exctract_indexes_both_formats(df, func):
    """Return the index selected by ``func`` in two index formats.

    ``func`` is applied twice: once to ``df`` as given and once to a copy of
    ``df`` whose index was replaced by the default 0..n-1 integer index.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to select rows from.
    func : callable
        Takes a DataFrame and returns an object with an ``.index`` attribute
        (e.g. the filtered DataFrame).

    Returns
    -------
    tuple
        ``(indexes_main, indexes_new)``: the index of ``func(df)`` and the
        index of ``func`` applied to the re-indexed copy.
    """
    original_labels = func(df).index

    # Work on a copy so that ``df`` itself keeps its original index.
    renumbered = df.copy().reset_index(drop=True)
    positional_labels = func(renumbered).index

    return original_labels, positional_labels

In [None]:
def extract_and_save(path, dictionary, indexes_main, indexes_new, B=15):
    """Save the rows of every result table that belong to one subset.

    Parameters
    ----------
    path : str
        Output directory; created (including parents) if it does not exist.
    dictionary : dict
        Maps file names to DataFrames. Must contain 'X_subset_original.csv',
        'X_subset_preprocessed.csv', 'y.csv', 'y_hat.csv', 'shaps.csv' and
        'full_shaps.csv'.
    indexes_main : index-like
        Labels used to select rows from 'X_subset_original.csv',
        'X_subset_preprocessed.csv' and 'y.csv'.
    indexes_new : index-like of int
        Labels used to select rows from 'y_hat.csv' and 'shaps.csv', and to
        derive the rows taken from 'full_shaps.csv'.
    B : int, default 15
        Number of consecutive 'full_shaps.csv' rows taken per entry of
        ``indexes_new`` (the ``B`` parameter used in the SHAP calculations).

    Raises
    ------
    KeyError
        If a required table is missing from ``dictionary`` or a requested
        label is not present in a table.
    """
    # exist_ok avoids the check-then-create race of a separate existence test.
    os.makedirs(path, exist_ok=True)

    # Tables selected with the labels in ``indexes_main``.
    for name in ('X_subset_original.csv', 'X_subset_preprocessed.csv', 'y.csv'):
        dictionary[name].loc[indexes_main].to_csv(os.path.join(path, name))

    # Tables selected with the labels in ``indexes_new``.
    for name in ('y_hat.csv', 'shaps.csv'):
        dictionary[name].loc[indexes_new].to_csv(os.path.join(path, name))

    # For every label i in ``indexes_new`` take the block of B rows
    # i*B .. (i+1)*B - 1 from the full SHAP table.
    index_full_shap = [
        row
        for i in indexes_new
        for row in range(i * B, (i + 1) * B)
    ]
    dictionary['full_shaps.csv'].loc[index_full_shap].to_csv(
        os.path.join(path, 'full_shaps.csv')
    )

In [None]:
X = output_dict['X_subset_original.csv']

In [None]:
def team_func(team_name):
    """Build a row filter for one team.

    Returns a function that takes a DataFrame with ``home_team``,
    ``away_team`` and ``h_a`` columns and keeps the rows where ``team_name``
    is the home team and ``h_a == 'h'``, or the away team and ``h_a == 'a'``.
    """
    def select_team_rows(df):
        as_home = np.logical_and(df.home_team == team_name, df.h_a == 'h')
        as_away = np.logical_and(df.away_team == team_name, df.h_a == 'a')
        return df[np.logical_or(as_home, as_away)]

    return select_team_rows

In [None]:
bayern_func = team_func('Bayern Munich')

In [None]:
indexes_main, indexes_new = exctract_indexes_both_formats(X, bayern_func)

In [None]:
extract_and_save(
    path = os.path.join('results', 'bundesliga', 'bayern_munich', 'season2021'),
    dictionary = output_dict,
    indexes_main = indexes_main,
    indexes_new = indexes_new
)

##### Borussia Dortmund

According to https://www.flashscore.com/football/germany/bundesliga-2021-2022/, Borussia Dortmund was the second-best team in the 2021/2022 season.

In [None]:
borussia_func = team_func('Borussia Dortmund')

In [None]:
indexes_main, indexes_new = exctract_indexes_both_formats(X, borussia_func)

In [None]:
extract_and_save(
    path = os.path.join('results', 'bundesliga', 'borussia_dortmund', 'season2021'),
    dictionary = output_dict,
    indexes_main = indexes_main,
    indexes_new = indexes_new
)

##### VfB Stuttgart

According to https://www.flashscore.com/football/germany/bundesliga-2021-2022/, VfB Stuttgart finished 15th in the 2021/2022 season.

In [None]:
stuttgart_func = team_func('VfB Stuttgart')

In [None]:
indexes_main, indexes_new = exctract_indexes_both_formats(X, stuttgart_func)

In [None]:
extract_and_save(
    path = os.path.join('results', 'bundesliga', 'vfb_stuttgart', 'season2021'),
    dictionary = output_dict,
    indexes_main = indexes_main,
    indexes_new = indexes_new
)

#### Extracting player data

##### Robert Lewandowski

According to https://www.flashscore.com/football/germany/bundesliga-2021-2022/, Robert Lewandowski was the top scorer of the 2021/2022 season.

In [None]:
def player_func(player_name):
    """Build a row filter for one player.

    Returns a function that takes a DataFrame with a ``player`` column and
    keeps the rows whose ``player`` value equals ``player_name``.
    """
    def select_player_rows(df):
        return df[df.player == player_name]

    return select_player_rows

In [None]:
lewandowski_func = player_func('Robert Lewandowski')

In [None]:
indexes_main, indexes_new = exctract_indexes_both_formats(X, lewandowski_func)

In [None]:
extract_and_save(
    path = os.path.join('results', 'bundesliga', 'lewandowski', 'season2021'),
    dictionary = output_dict,
    indexes_main = indexes_main,
    indexes_new = indexes_new
)

#### Extracting player data

##### Patrik Schick

According to https://www.flashscore.com/football/germany/bundesliga-2021-2022/, Patrik Schick was the second-highest scorer of the 2021/2022 season.

In [None]:
schick_func = player_func('Patrik Schick')

In [None]:
indexes_main, indexes_new = exctract_indexes_both_formats(X, schick_func)

In [None]:
extract_and_save(
    path = os.path.join('results', 'bundesliga', 'schick', 'season2021'),
    dictionary = output_dict,
    indexes_main = indexes_main,
    indexes_new = indexes_new
)

##### Max Kruse

According to https://www.flashscore.com/football/germany/bundesliga-2021-2022/, Max Kruse was the tenth-highest scorer of the 2021/2022 season.

In [None]:
kruse_func = player_func('Max Kruse')

In [None]:
indexes_main, indexes_new = exctract_indexes_both_formats(X, kruse_func)

In [None]:
extract_and_save(
    path = os.path.join('results', 'bundesliga', 'kruse', 'season2021'),
    dictionary = output_dict,
    indexes_main = indexes_main,
    indexes_new = indexes_new
)