# Calculating SHAP values

**ATTENTION:**

Notebook language: **Python**

## Loading model and data

In [1]:
import pickle5 as pickle

In [2]:
# Load the model object stored in ./model/model.pickle.
# NOTE(review): unpickling can execute arbitrary code - only load a file from a trusted source.
with open('./model/model.pickle', 'rb') as model_file:
    model = pickle.load(model_file)

In [3]:
import pandas as pd
import numpy as np

In [4]:
# Read the preprocessed and the raw table; both use their first CSV column as the index.
df_preprocessed, df_raw = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('./data/data_preprocessed.csv', './data/raw_data.csv')
)

In [5]:
# Feature matrix: every preprocessed column except the 'status' column.
X_preprocessed = df_preprocessed.drop(columns='status')

## Calculating predictions (background prediction)

In [6]:
# Keep column 1 of predict_proba for every row of X_preprocessed
# (presumably the probability of the second / positive class - confirm with the model).
proba_column_1 = model.predict_proba(X_preprocessed)[:, 1]
y_hat = pd.DataFrame(proba_column_1)
y_hat.to_csv('./data/y_hat.csv')

## Creating explainer

In [11]:
import shap
from skranger.utils.shap import shap_patch

In [8]:
# Build the SHAP tree explainer for the loaded model inside skranger's shap_patch context.
# NOTE(review): shap_patch presumably adapts shap so TreeExplainer accepts the skranger
# model - confirm in the skranger documentation.
with shap_patch():
    explainer = shap.TreeExplainer(model)

## Functions to calculate explanations

In [12]:
import os

path = './results'

# Make sure the output directory is there; only report when it already is.
if os.path.exists(path):
    print("The folder already exists")
else:
    os.makedirs(path)

In [13]:
from scripts.calculate_SHAP import extract_preprocessed__calculate__save

## Calculations

### Robert Lewandowski

#### Season 2021

In [14]:
# Raw rows of Robert Lewandowski in season 2021.
is_lewandowski = df_raw['player'] == 'Robert Lewandowski'
is_season_2021 = df_raw['season'] == 2021
subset = df_raw[is_lewandowski & is_season_2021]

In [15]:
# Run the project helper for the current `subset`.
# NOTE(review): presumably writes its outputs under main_dir/<task_hierarchy...> -
# confirm in scripts.calculate_SHAP.
extract_preprocessed__calculate__save(
    main_dir='./results',
    task_hierarchy=['lewandowski', 'season2021'],
    explainer=explainer,
    subset=subset,
    df_preprocessed=df_preprocessed,
    target='status',
)

The folder already exists


#### Season 2020

In [16]:
# Raw rows of Robert Lewandowski in season 2020.
is_lewandowski = df_raw['player'] == 'Robert Lewandowski'
is_season_2020 = df_raw['season'] == 2020
subset = df_raw[is_lewandowski & is_season_2020]

In [17]:
# Run the project helper for the current `subset`.
# NOTE(review): presumably writes its outputs under main_dir/<task_hierarchy...> -
# confirm in scripts.calculate_SHAP.
extract_preprocessed__calculate__save(
    main_dir='./results',
    task_hierarchy=['lewandowski', 'season2020'],
    explainer=explainer,
    subset=subset,
    df_preprocessed=df_preprocessed,
    target='status',
)

The folder already exists


#### Season 2019

In [18]:
# Raw rows of Robert Lewandowski in season 2019.
is_lewandowski = df_raw['player'] == 'Robert Lewandowski'
is_season_2019 = df_raw['season'] == 2019
subset = df_raw[is_lewandowski & is_season_2019]

In [19]:
# Run the project helper for the current `subset`.
# NOTE(review): presumably writes its outputs under main_dir/<task_hierarchy...> -
# confirm in scripts.calculate_SHAP.
extract_preprocessed__calculate__save(
    main_dir='./results',
    task_hierarchy=['lewandowski', 'season2019'],
    explainer=explainer,
    subset=subset,
    df_preprocessed=df_preprocessed,
    target='status',
)

The folder already exists


### EPL 2021

#### All teams

In [20]:
# Raw rows whose league is 'EPL' and whose season is 2021.
is_epl = df_raw.league == 'EPL'
is_season_2021 = df_raw.season == 2021
subset = df_raw[is_epl & is_season_2021]

In [None]:
# Run the project helper for the whole EPL 2021 `subset`.
# The per-team cells below read shaps.csv and y_hat.csv from
# ./results/epl/season2021/all_teams, so this call is expected to produce them there.
extract_preprocessed__calculate__save(
    main_dir='./results',
    task_hierarchy=['epl', 'season2021', 'all_teams'],
    explainer=explainer,
    subset=subset,
    df_preprocessed=df_preprocessed,
    target='status',
)

#### Manchester United

The values for the chosen teams are extracted from the "All teams" results above, since SHAP values were already calculated there for every EPL team.

In [None]:
import os

In [None]:
main_dir = './results'
task_hierarchy = ['epl', 'season2021', 'Manchester_United']

# Output directory for this team: ./results/epl/season2021/Manchester_United
path = os.path.join(main_dir, *task_hierarchy)

# Create the directory on first use; only report when it is already there.
# (The `else` must sit at the same indentation as its `if`.)
if not os.path.exists(path):
    os.makedirs(path)
else:
    print("The folder already exists")

In [None]:
# Rows of `subset` where Manchester United is the home team and h_a == 'h',
# or the away team and h_a == 'a'.
# Each comparison is parenthesised because `&` binds tighter than `==`.
subset_mu = subset[
    ((subset.home_team == 'Manchester United') & (subset.h_a == 'h'))
    | ((subset.away_team == 'Manchester United') & (subset.h_a == 'a'))
]

In [None]:
# Pick the preprocessed rows carrying the same index labels as the team's raw rows.
# NOTE(review): relies on the raw and preprocessed tables sharing index labels - confirm.
indexes = subset_mu.index
X = df_preprocessed.loc[indexes.tolist()]

In [None]:
# Store the team's preprocessed rows and its raw rows in the team directory.
preprocessed_csv = os.path.join(path, 'X_subset_preprocessed.csv')
original_csv = os.path.join(path, 'X_subset_original.csv')
X.to_csv(preprocessed_csv)
subset_mu.to_csv(original_csv)

In [None]:
# Positions (0..n-1) of Manchester United's rows within `subset`.
# Each comparison is parenthesised because `&` binds tighter than `==`.
is_mu_row = (
    ((subset.home_team == 'Manchester United') & (subset.h_a == 'h'))
    | ((subset.away_team == 'Manchester United') & (subset.h_a == 'a'))
)
# The mask still carries `subset`'s original index labels, while the frame below has a
# fresh 0..n-1 index; passing it as a plain array applies it by position, not by label.
indexes_resetted = subset.reset_index(drop=True)[is_mu_row.to_numpy()].index

In [None]:
# Reuse the SHAP values already stored for all EPL teams and keep only this team's rows.
# NOTE(review): assumes shaps.csv holds one row per observation, in the same order as
# `subset` - confirm against scripts.calculate_SHAP.
shaps = pd.read_csv(
    os.path.join(main_dir, 'epl', 'season2021', 'all_teams', 'shaps.csv'),
    index_col=0,
)
# `indexes_resetted` holds row positions; plain `shaps[...]` would treat them as column labels.
shaps = shaps.iloc[list(indexes_resetted)]
shaps.to_csv(os.path.join(path, 'shaps.csv'))

In [None]:
# Reuse the predictions already stored for all EPL teams and keep only this team's rows.
# NOTE(review): assumes y_hat.csv holds one row per observation, in the same order as
# `subset` - confirm against scripts.calculate_SHAP.
y_hat = pd.read_csv(
    os.path.join(main_dir, 'epl', 'season2021', 'all_teams', 'y_hat.csv'),
    index_col=0,
)
# `indexes_resetted` holds row positions; plain `y_hat[...]` would treat them as column labels.
y_hat = y_hat.iloc[list(indexes_resetted)]
y_hat.to_csv(os.path.join(path, 'y_hat.csv'))

#### Manchester City - champion

In [None]:
main_dir = './results'
task_hierarchy = ['epl', 'season2021', 'Manchester_City']

# Output directory for this team: ./results/epl/season2021/Manchester_City
path = os.path.join(main_dir, *task_hierarchy)

# Create the directory on first use; only report when it is already there.
# (The `else` must sit at the same indentation as its `if`.)
if not os.path.exists(path):
    os.makedirs(path)
else:
    print("The folder already exists")

In [None]:
# Rows of `subset` where Manchester City is the home team and h_a == 'h',
# or the away team and h_a == 'a'.
# The variable keeps the name `subset_mu` because the following cells read it under that name.
# Each comparison is parenthesised because `&` binds tighter than `==`.
subset_mu = subset[
    ((subset.home_team == 'Manchester City') & (subset.h_a == 'h'))
    | ((subset.away_team == 'Manchester City') & (subset.h_a == 'a'))
]

In [None]:
# Pick the preprocessed rows carrying the same index labels as the team's raw rows.
# NOTE(review): relies on the raw and preprocessed tables sharing index labels - confirm.
indexes = subset_mu.index
X = df_preprocessed.loc[indexes.tolist()]

In [None]:
# Store the team's preprocessed rows and its raw rows in the team directory.
preprocessed_csv = os.path.join(path, 'X_subset_preprocessed.csv')
original_csv = os.path.join(path, 'X_subset_original.csv')
X.to_csv(preprocessed_csv)
subset_mu.to_csv(original_csv)

In [None]:
# Positions (0..n-1) of Manchester City's rows within `subset`.
# Each comparison is parenthesised because `&` binds tighter than `==`.
is_city_row = (
    ((subset.home_team == 'Manchester City') & (subset.h_a == 'h'))
    | ((subset.away_team == 'Manchester City') & (subset.h_a == 'a'))
)
# The mask still carries `subset`'s original index labels, while the frame below has a
# fresh 0..n-1 index; passing it as a plain array applies it by position, not by label.
indexes_resetted = subset.reset_index(drop=True)[is_city_row.to_numpy()].index

In [None]:
# Reuse the SHAP values already stored for all EPL teams and keep only this team's rows.
# NOTE(review): assumes shaps.csv holds one row per observation, in the same order as
# `subset` - confirm against scripts.calculate_SHAP.
shaps = pd.read_csv(
    os.path.join(main_dir, 'epl', 'season2021', 'all_teams', 'shaps.csv'),
    index_col=0,
)
# `indexes_resetted` holds row positions; plain `shaps[...]` would treat them as column labels.
shaps = shaps.iloc[list(indexes_resetted)]
shaps.to_csv(os.path.join(path, 'shaps.csv'))

In [None]:
# Reuse the predictions already stored for all EPL teams and keep only this team's rows.
# NOTE(review): assumes y_hat.csv holds one row per observation, in the same order as
# `subset` - confirm against scripts.calculate_SHAP.
y_hat = pd.read_csv(
    os.path.join(main_dir, 'epl', 'season2021', 'all_teams', 'y_hat.csv'),
    index_col=0,
)
# `indexes_resetted` holds row positions; plain `y_hat[...]` would treat them as column labels.
y_hat = y_hat.iloc[list(indexes_resetted)]
y_hat.to_csv(os.path.join(path, 'y_hat.csv'))