# Import Required Libraries

In [None]:
# Data Manipulation libraries:
import numpy as np
import pandas as pd
from copy import deepcopy

# Plotting libraries
import mplsoccer
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
import plotly.graph_objects as go

from plotly.subplots import make_subplots

# Load the Data

In [None]:
# Load the match-wise event export into one DataFrame. low_memory=False has
# pandas parse the file in a single pass rather than in chunks, which avoids
# mixed-type column inference.
eventDataLL1920 = pd.read_csv(
    filepath_or_buffer="../data/matchwise_events_data_updated.csv",
    low_memory=False,
)

In [None]:
# Raise the notebook display limits: up to 50 columns and 100 rows are shown
# before pandas truncates a frame.
pd.options.display.max_columns = 50
pd.options.display.max_rows = 100

# Data Preparation

## Separate X and Y of start and end locations

In [None]:
# List the columns whose name contains "goalkeeper".
eventDataLL1920.columns[eventDataLL1920.columns.str.contains("goalkeeper")]

In [None]:
# Distinct character lengths of the non-null goalkeeper end-location strings.
eventDataLL1920["goalkeeper.end_location"].dropna().str.len().unique()

In [None]:
# Start location for any action:
eventDataLL1920["startX"] = eventDataLL1920["location"]\
    .str.split(", ", expand=True)[0].str[1:].apply(pd.to_numeric)
eventDataLL1920["startY"] = eventDataLL1920["location"]\
    .str.split(", ", expand=True)[1].str[:-1].apply(pd.to_numeric)

# End location for GK:
eventDataLL1920["endXGK"] = eventDataLL1920["goalkeeper.end_location"]\
    .str.split(", ", expand=True)[0].str[1:].apply(pd.to_numeric)
eventDataLL1920["endYGK"] = eventDataLL1920["goalkeeper.end_location"]\
    .str.split(", ", expand=True)[1].str[:-1].apply(pd.to_numeric)

## Obtain all the GK and other relevant Columns

In [None]:
# General (non goalkeeper-specific) columns kept next to the GK columns,
# written as one small group per theme.
gkAdditionalCols = (
    ['match_id', 'id']                                  # identifiers
    + ['timestamp', 'minute', 'second']                 # timing
    + ['type.id', 'type.name']                          # event type
    + ['possession_team.id', 'possession_team.name']    # possession
    + ['play_pattern.id', 'play_pattern.name']          # play pattern
    + ['team.id', 'team.name']                          # team
    + ['player.id', 'jersey_number', 'player.name']     # player
    + ['position.id', 'position.name']                  # position
    + ['under_pressure', 'location']                    # context
    + ['started', 'minsPlayed']                         # playing time
    + ['startX', 'startY', 'endXGK', 'endYGK']          # parsed coordinates
)

In [None]:
# Collect, in column order, every column whose name contains "goalkeeper".
gkCols = [colName for colName in eventDataLL1920.columns
          if "goalkeeper" in colName]

In [None]:
# Put the general columns in front of the goalkeeper-specific ones, then
# display the combined list.
gkCols = [*gkAdditionalCols, *gkCols]
gkCols

## Obtain Rows that contain GK Events

In [None]:
# Lookup table of the distinct event type ids and their names.
eventDataLL1920[["type.id", "type.name"]].drop_duplicates()

In [None]:
 # Preview every row whose event type id is 23.
 eventDataLL1920[(eventDataLL1920["type.id"] == 23)]

In [None]:
 # Event type name(s) attached to type id 23.
 eventDataLL1920[(eventDataLL1920["type.id"] == 23)]["type.name"].unique()

In [None]:
# Independent (deep) copy of all rows with event type id 23, so later changes
# to gkData never touch the source frame.
gkData = eventDataLL1920[eventDataLL1920["type.id"].eq(23)].copy(deep=True)

In [None]:
# Keep only the selected general + goalkeeper columns.
gkData = gkData[gkCols]

In [None]:
# Show which columns remain after the selection.
gkData.columns

## Obtain GK Passing Data

In [None]:
# Lookup table of the distinct position ids and their names.
eventDataLL1920[["position.id", "position.name"]].drop_duplicates()

In [None]:
# Position name(s) found on rows with event type id 30 and position id 1
# (the filter used for the goalkeeper passing data).
eventDataLL1920[(eventDataLL1920["type.id"] == 30)
                & (eventDataLL1920["position.id"] == 1)]["position.name"].unique()

In [None]:
# Number of rows per player name for the same filter
# (event type id 30, position id 1).
eventDataLL1920[(eventDataLL1920["type.id"] == 30)
                & (eventDataLL1920["position.id"] == 1)].groupby(["player.name"])["player.name"].count()

In [None]:
# Goalkeeper passing data: rows with event type id 30 played from position id 1.
gkPassData = eventDataLL1920[eventDataLL1920["type.id"].eq(30)
                             & eventDataLL1920["position.id"].eq(1)]

# EDA

## Numerical Exploration

### Exploring Meta Data

In [None]:
# Column dtypes and non-null counts of the GK event data.
gkData.info()

In [None]:
# Summary statistics of the numeric columns, rounded to 1 decimal.
gkData.describe().round(1)

In [None]:
# Summary of the object-typed columns, transposed to one row per column.
gkData.describe(include="object").T.round(1)

In [None]:
# Re-display the selected column list for reference.
gkCols

### Exploring Categorical Columns

In [None]:
# Categorical goalkeeper columns to explore; all share the "goalkeeper." prefix.
catCols = ["goalkeeper." + suffix
           for suffix in ("position.name", "type.name", "body_part.name",
                          "outcome.name", "technique.name", "lost_in_play")]

In [None]:
# Print the distinct values found in each categorical goalkeeper column.
for catCol in catCols:
    distinctValues = gkData[catCol].unique()
    print(catCol, ":", distinctValues, "\n")

## Visual Exploration

### Exploring Categorical Columns

In [None]:
# Bar charts of the percentage share of every category, one subplot per
# categorical goalkeeper column, laid out on a 2 x 3 grid.
nGridCols = 3
# Create an empty subplot:
fig = make_subplots(rows=2, cols=nGridCols,
                    shared_xaxes=False,
                    vertical_spacing=0.3,
                    horizontal_spacing=0.1,
                    subplot_titles=catCols)
# Add one bar trace per categorical column:
for plotIdx, col in enumerate(catCols):
    # Percentage share of each category, rounded to 1 decimal. The Series is
    # used directly (index = category, values = share) because the column
    # names produced by value_counts().reset_index() changed in pandas 2.0
    # ("index" no longer exists there).
    shares = gkData[col].value_counts(normalize=True).multiply(100).round(1)
    # Fill the grid row by row; plotly rows/cols are 1-based.
    gridRow, gridCol = divmod(plotIdx, nGridCols)
    fig.add_trace(go.Bar(x=shares.index,
                         y=shares.values,
                         text=shares.values,
                         name=col),
                  row=gridRow + 1, col=gridCol + 1)
fig.update_traces(textposition='inside', textfont_size=10)
fig.update_layout(height=800)

fig.show()

## Player-Wise GK Passing Exploration

In [None]:
# Lookup table of the distinct pass outcome ids and their names.
eventDataLL1920[["pass.outcome.id", "pass.outcome.name"]].drop_duplicates()

In [None]:
# Preview of the per-keeper passing summary (indexed by player.id):
# first-seen player and team name, number of rows, and number of rows
# whose pass outcome id is null.
gkPassData.groupby(["player.id"]).agg({"player.name": "first",
                                       "team.name": "first",
                                       "type.id": "count",
                                       "pass.outcome.id": lambda x: x.isnull().sum()})

In [None]:
# Per-keeper passing summary, indexed by player.id.
gkPlayerPassData = gkPassData.groupby(["player.id"]).agg({
    "player.name": "first",   # first-seen player name
    "team.name": "first",     # first-seen team name
    "type.id": "count",       # number of pass rows
    # Rows with a null outcome id (presumably completed passes; verify
    # against the data provider's spec).
    "pass.outcome.id": lambda outcomeIds: outcomeIds.isna().sum(),
})

In [None]:
# Total minutes per keeper: keep one row per (player, match) so that each
# match's minsPlayed is added only once, then sum per player.
playerMatchMinsdf = (
    gkPassData
    .drop_duplicates(subset=["player.id", "match_id"])
    .groupby(["player.id"])
    .agg({"minsPlayed": "sum"})
)

In [None]:
# Preview: passing summary with the minutes played placed alongside it.
pd.concat([gkPlayerPassData, playerMatchMinsdf], axis=1)

In [None]:
# Attach minutes played to the passing summary (column-wise, aligned on player.id).
gkPlayerPassData = pd.concat(objs=[gkPlayerPassData, playerMatchMinsdf],
                             axis="columns")

In [None]:
# Passes per 90 minutes = passes / minutes played * 90, rounded to 1 decimal.
passesPerMinute = gkPlayerPassData["type.id"] / gkPlayerPassData["minsPlayed"]
gkPlayerPassData["passesPer90"] = (passesPerMinute * 90).round(1)

In [None]:
# Display the passing summary including passesPer90.
gkPlayerPassData

In [None]:
# Highlight the highest passesPer90 value and show floats with 1 decimal.
# Styler.format(precision=...) replaces Styler.set_precision, which was
# deprecated in pandas 1.3 and removed in pandas 2.0.
gkPlayerPassData.style.highlight_max(subset=["passesPer90"]).format(precision=1)

In [None]:
# Colour-grade the passesPer90 column and show floats with 1 decimal.
# Styler.format(precision=...) replaces Styler.set_precision, which was
# deprecated in pandas 1.3 and removed in pandas 2.0.
gkPlayerPassData.style.background_gradient(subset=["passesPer90"]).format(precision=1)

In [None]:
# Re-display the selected column list for reference.
gkCols

# GK Action Distribution Pitch Map

## Generating Action Distribution Map

In [None]:
# Scatter the start location of every GK event on a full vertical pitch.
pitch = mplsoccer.VerticalPitch(pitch_color='#383838',
                                line_color='#ffffff',
                                pad_bottom=10,
                                line_zorder=2)
fig, ax = pitch.draw(figsize=(6, 8))
# White markers with a red outline, one per event.
scatter = pitch.scatter(gkData["startX"], gkData["startY"],
                        c="white", edgecolors="red", s=100,
                        ax=ax)

In [None]:
# Hexbin density of GK event start locations on a full vertical pitch.
pitch = mplsoccer.VerticalPitch(pitch_color='#383838',
                                line_color='#ffffff',
                                pad_bottom=10,
                                line_zorder=2)
fig, ax = pitch.draw(figsize=(6, 8))
hexbin = pitch.hexbin(gkData["startX"], gkData["startY"],
                      cmap="Reds", gridsize=(15, 15),
                      edgecolors='red', ax=ax)

In [None]:
# Same hexbin density, drawn on a half pitch (half=True).
pitch = mplsoccer.VerticalPitch(half=True,
                                pitch_color='#383838',
                                line_color='#ffffff',
                                pad_bottom=10,
                                line_zorder=2)
fig, ax = pitch.draw(figsize=(6, 8))
hexbin = pitch.hexbin(gkData["startX"], gkData["startY"],
                      cmap="Reds", gridsize=(15, 15),
                      edgecolors='red', ax=ax)

In [None]:
# Hexbin density on a full pitch with both coordinates mirrored
# (120 - x, 80 - y), so the events are drawn at the opposite end.
# NOTE(review): assumes a 120 x 80 coordinate system - confirm.
pitch = mplsoccer.VerticalPitch(pitch_color='#383838',
                                line_color='#ffffff',
                                pad_bottom=-10,
                                line_zorder=2)
fig, ax = pitch.draw(figsize=(6, 8))
scatter = pitch.hexbin(120 - gkData["startX"], 80 - gkData["startY"],
                       cmap="Reds", gridsize=(15, 15),
                       edgecolors='red', ax=ax)

In [None]:
# Mirrored hexbin density (120 - x, 80 - y) on a half pitch.
# NOTE(review): assumes a 120 x 80 coordinate system - confirm.
pitch = mplsoccer.VerticalPitch(half=True,
                                pitch_color='#383838',
                                line_color='#ffffff',
                                pad_bottom=-20,
                                line_zorder=2)
fig, ax = pitch.draw(figsize=(6, 8))
scatter = pitch.hexbin(120 - gkData["startX"], 80 - gkData["startY"],
                       cmap="Reds", gridsize=(15, 15),
                       edgecolors='red', ax=ax)

## Player-Wise Action Map Generation

In [None]:
# Display the passing summary again (its index holds the player ids used below).
gkPlayerPassData

In [None]:
# Mirrored half-pitch hexbin of the GK events of a single player.
playerID = 20055
playerPlotData = gkData[gkData["player.id"] == playerID]

pitch = mplsoccer.VerticalPitch(half=True,
                                pitch_color='#383838',
                                line_color='#ffffff',
                                pad_bottom=-10,
                                line_zorder=2)
fig, ax = pitch.draw(figsize=(6, 8))
hexbin = pitch.hexbin(120 - playerPlotData["startX"],
                      80 - playerPlotData["startY"],
                      cmap="Reds", gridsize=(15, 15),
                      edgecolors='red', ax=ax)

In [None]:
# Mirrored half-pitch hexbin of the GK events of a single player.
playerID = 3509
playerPlotData = gkData[gkData["player.id"] == playerID]

pitch = mplsoccer.VerticalPitch(half=True,
                                pitch_color='#383838',
                                line_color='#ffffff',
                                pad_bottom=-10,
                                line_zorder=2)
fig, ax = pitch.draw(figsize=(6, 8))
hexbin = pitch.hexbin(120 - playerPlotData["startX"],
                      80 - playerPlotData["startY"],
                      cmap="Reds", gridsize=(15, 15),
                      edgecolors='red', ax=ax)

In [None]:
# (rows, columns) of the passing summary; one row per player id.
gkPlayerPassData.shape

In [None]:
""" Distribution of Pressure on a Pitch Map """
# Set the Pitch Parameters:
pitch = mplsoccer.VerticalPitch(pitch_color='#101010',
                                line_color='#ffffff',
                                half=True,
                                line_zorder=2,
                                pad_bottom=-15)
# Draw the pitch grid according to the set Pitch Parameters:
fig, axs = pitch.grid(nrows=6, ncols=4,
                      axis=False, figheight=40,
                      space=0.2, grid_height=0.98, grid_width=0.9,
                      title_height=0, endnote_height=0)

playerIDs = gkPlayerPassData.index
for idx, ax in enumerate(axs["pitch"].flat):
    if idx < len(playerIDs):
        # Get the data for the current player in the loop:
        playerData = gkData[gkData["player.id"] == playerIDs[idx]]

        # Plotting the Action Distribution:
        hexbin = pitch.hexbin(120-playerData["startX"],
                              80-playerData["startY"],
                              ax=ax, edgecolors='red',
                              gridsize=(15, 15), cmap="Reds")
        
        playerName = playerData["player.name"].unique().item()
        teamName = playerData["team.name"].unique().item()
        totActions = len(playerData)
        ax.set_title(playerName + "\n" + teamName + "\n Actions: " + str(totActions),
                     fontsize=30)

## Player-Wise Save Maps

In [None]:
# Preview the GK events whose goalkeeper type id is 33 (used as saves below).
gkData[gkData["goalkeeper.type.id"] == 33]

In [None]:
# Mirrored half-pitch scatter of every save (goalkeeper type id 33).
saveData = gkData[gkData["goalkeeper.type.id"] == 33]

pitch = mplsoccer.VerticalPitch(half=True,
                                pitch_color='#383838',
                                line_color='#ffffff',
                                pad_bottom=-10,
                                line_zorder=2)
fig, ax = pitch.draw(figsize=(6, 8))
scatter = pitch.scatter(120 - saveData["startX"], 80 - saveData["startY"],
                        c="white", edgecolors="red", s=100,
                        ax=ax)

In [None]:
""" Distribution of Pressure on a Pitch Map """
# Set the Pitch Parameters:
pitch = mplsoccer.VerticalPitch(pitch_color='#101010',
                                line_color='#ffffff',
                                half=True,
                                line_zorder=2,
                                pad_bottom=-10)
# Draw the pitch grid according to the set Pitch Parameters:
fig, axs = pitch.grid(nrows=6, ncols=4,
                      axis=False, figheight=40,
                      space=0.2, grid_height=0.98, grid_width=0.9,
                      title_height=0, endnote_height=0)

playerIDs = gkPlayerPassData.index
for idx, ax in enumerate(axs["pitch"].flat):
    if idx < len(playerIDs):
        # Get the data for the current player in the loop:
        playerData = gkData[(gkData["player.id"] == playerIDs[idx])
                            & (gkData["goalkeeper.type.id"] == 33)]
        if playerData.empty:
            continue
        # Plotting the Action Distribution:
        hexbin = pitch.hexbin(120-playerData["startX"],
                              80-playerData["startY"],
                              ax=ax,
                              gridsize=(15, 15),
                              cmap="Reds")
        
        playerName = playerData["player.name"].unique().item()
        teamName = playerData["team.name"].unique().item()
        totSaves = len(playerData)
        ax.set_title(playerName + "\n" + teamName + "\n Saves: " + str(totSaves),
                     fontsize=30)

# Save Analysis

In [None]:
# Preview of the per-keeper summary (indexed by player.id): first-seen
# player and team name, number of GK events, and number of events whose
# goalkeeper type id is 33 (saves).
gkData.groupby(["player.id"]).agg({"player.name": "first",
                                   "team.name": "first",
                                   "type.id": "count",
                                   "goalkeeper.type.id": lambda x: (x == 33).sum()})

In [None]:
# Per-keeper summary of GK events, indexed by player.id.
gkPlayerData = gkData.groupby(["player.id"]).agg({
    "player.name": "first",   # first-seen player name
    "team.name": "first",     # first-seen team name
    "type.id": "count",       # number of GK events
    # Number of events with goalkeeper type id 33 (saves).
    "goalkeeper.type.id": lambda gkTypeIds: gkTypeIds.eq(33).sum(),
})

In [None]:
# Total minutes per keeper from the GK events: one row per (player, match)
# so each match's minsPlayed is counted once, then summed per player.
playerMatchMinsdf = (
    gkData
    .drop_duplicates(subset=["player.id", "match_id"])
    .groupby(["player.id"])
    .agg({"minsPlayed": "sum"})
)

In [None]:
# Preview: GK summary with the minutes played placed alongside it.
pd.concat([gkPlayerData, playerMatchMinsdf], axis=1)

In [None]:
# Attach minutes played to the GK summary (column-wise, aligned on player.id).
gkPlayerData = pd.concat(objs=[gkPlayerData, playerMatchMinsdf],
                         axis="columns")

In [None]:
# Saves per 90 minutes = saves / minutes played * 90, rounded to 1 decimal.
savesPerMinute = gkPlayerData["goalkeeper.type.id"] / gkPlayerData["minsPlayed"]
gkPlayerData["savesPer90"] = (savesPerMinute * 90).round(1)

In [None]:
# Display the GK summary including savesPer90.
gkPlayerData

In [None]:
# Minutes played per save, rounded to 1 decimal. Keepers without a save
# divide by zero here; the resulting infinities are filtered out further down.
gkPlayerData["minsPerSave"] = (
    gkPlayerData["minsPlayed"] / gkPlayerData["goalkeeper.type.id"]
).round(1)

In [None]:
# Display the GK summary including minsPerSave.
gkPlayerData

In [None]:
# Highlight the highest savesPer90 value and show floats with 1 decimal.
# Styler.format(precision=...) replaces Styler.set_precision, which was
# deprecated in pandas 1.3 and removed in pandas 2.0.
gkPlayerData.style.highlight_max(subset=["savesPer90"]).format(precision=1)

In [None]:
# Colour-grade the savesPer90 column and show floats with 1 decimal.
# Styler.format(precision=...) replaces Styler.set_precision, which was
# deprecated in pandas 1.3 and removed in pandas 2.0.
gkPlayerData.style.background_gradient(subset=["savesPer90"]).format(precision=1)

In [None]:
# Highlight the highest minsPerSave value and show floats with 1 decimal.
# Styler.format(precision=...) replaces Styler.set_precision, which was
# deprecated in pandas 1.3 and removed in pandas 2.0.
gkPlayerData.style.highlight_max(subset=["minsPerSave"]).format(precision=1)

In [None]:
# Preview: infinite values (from dividing by zero saves) turned into NaN.
gkPlayerData.replace(np.inf, np.nan)

In [None]:
# Preview: additionally drop the rows that have no usable minsPerSave.
gkPlayerData.replace(np.inf, np.nan).dropna(subset=["minsPerSave"])

In [None]:
# Independent copy of the GK summary without the keepers whose minsPerSave
# is infinite or missing.
gkPlayerDataFiltered = (
    gkPlayerData
    .replace(np.inf, np.nan)
    .dropna(subset=["minsPerSave"])
    .copy(deep=True)
)

In [None]:
# Highlight the highest minsPerSave value and show floats with 1 decimal.
# Styler.format(precision=...) replaces Styler.set_precision, which was
# deprecated in pandas 1.3 and removed in pandas 2.0.
gkPlayerDataFiltered.style.highlight_max(subset=["minsPerSave"]).format(precision=1)

In [None]:
# Colour-grade the minsPerSave column and show floats with 1 decimal.
# Styler.format(precision=...) replaces Styler.set_precision, which was
# deprecated in pandas 1.3 and removed in pandas 2.0.
gkPlayerDataFiltered.style.background_gradient(subset=["minsPerSave"]).format(precision=1)

# Save Ratings

In [None]:
# Preview the GK events whose goalkeeper type id is 26.
gkData[gkData["goalkeeper.type.id"] == 26]

In [None]:
# Distinct outcome names recorded for goalkeeper type id 26.
gkData[gkData["goalkeeper.type.id"] == 26]["goalkeeper.outcome.name"].unique()

In [None]:
# Distinct outcome names recorded for goalkeeper type id 33 (saves).
gkData[gkData["goalkeeper.type.id"] == 33]["goalkeeper.outcome.name"].unique()

**Rating Parameters:**
1. Saved Twice
1. Touched Out
1. Success
1. In Play Danger
1. In Play Safe
1. No Touch
1. Touched In

saveRating = (Success * 3) + (Saved Twice * 2) + (In Play Safe * 2) + (Touched Out * 1) + (In Play Danger * -1) + (No Touch * -3) + (Touched In * -2)

NOTE: All the columns involved in the rating must be expressed on a per-90-minutes basis.

In [None]:
# Independent copy of the GK events with goalkeeper type id 26 or 33, the
# events the save rating is built from.
saveData = gkData[gkData["goalkeeper.type.id"].isin([26, 33])].copy(deep=True)

In [None]:
# Lookup table of the outcome ids and names that occur in the save data.
saveData[["goalkeeper.outcome.id", "goalkeeper.outcome.name"]].drop_duplicates()

In [None]:
# Per-keeper summary of the save data, indexed by player.id: first-seen
# player and team name, number of events, and one count column per outcome.
# The outcome columns keep the order used by the column-renaming cell below.
outcomeLabels = ["Saved Twice", "Touched Out", "Success",
                 "In Play Danger", "In Play Safe",
                 "No Touch", "Touched In"]
gkSaveData = saveData.groupby(["player.id"]).agg({"player.name": "first",
                                                  "team.name": "first",
                                                  "type.id": "count"})
# Count the outcomes by their *name*, so every column is labelled correctly
# by construction instead of depending on a hand-kept order of outcome ids.
# NOTE(review): the earlier id-based counts were labelled purely by position,
# and the id order did not appear to match the labels (e.g. id 15 looks like
# "Success", not "Saved Twice") - confirm against the outcome id/name lookup.
# Outcomes a keeper never recorded (or keepers without any outcome) count as 0.
outcomeCounts = pd.crosstab(saveData["player.id"],
                            saveData["goalkeeper.outcome.name"])\
    .reindex(index=gkSaveData.index, columns=outcomeLabels, fill_value=0)
gkSaveData = pd.concat([gkSaveData, outcomeCounts], axis=1)

In [None]:
# Display the per-keeper save summary.
gkSaveData

In [None]:
# Flat, readable column labels for the save summary. The labels are applied
# by position, so they must follow the column order of the aggregation above.
# NOTE(review): verify that each outcome label lines up with the outcome it
# is meant to count.
gkSaveData.columns = (
    ["player.name", "team.name", "totActions"]
    + ["Saved Twice", "Touched Out", "Success"]
    + ["In Play Danger", "In Play Safe"]
    + ["No Touch", "Touched In"]
)

In [None]:
# Preview: save summary with the minutes played placed alongside it.
pd.concat([gkSaveData, playerMatchMinsdf], axis=1)

In [None]:
# Attach minutes played to the save summary (column-wise, aligned on player.id).
gkSaveData = pd.concat(objs=[gkSaveData, playerMatchMinsdf],
                       axis="columns")

In [None]:
# Display the save summary including minsPlayed.
gkSaveData

In [None]:
# Drop the rows without a player name (players that only appear in the
# minutes table). Take an explicit copy, as done for the other filtered
# frames in this notebook, because new columns are added to this frame in
# the following cells.
gkSaveDataFiltered = gkSaveData.dropna(subset=["player.name"]).copy()

In [None]:
# Display the filtered save summary.
gkSaveDataFiltered

In [None]:
# Outcome count columns that get a "<name> Per90" companion column.
per90Cols = ["Saved Twice", "Touched Out", "Success",
             "In Play Danger", "In Play Safe",
             "No Touch", "Touched In"]
for outcomeCol in per90Cols:
    # count / minutes played * 90, rounded to 1 decimal
    perMinute = gkSaveDataFiltered[outcomeCol] / gkSaveDataFiltered["minsPlayed"]
    gkSaveDataFiltered[outcomeCol + " Per90"] = (perMinute * 90).round(1)

In [None]:
# First rows of the save summary with the per-90 columns.
gkSaveDataFiltered.head()

In [None]:
# Save rating = weighted sum of the per-90 outcome rates, using the weights
# of the documented formula:
#   (Success * 3) + (Saved Twice * 2) + (In Play Safe * 2) + (Touched Out * 1)
#   + (In Play Danger * -1) + (No Touch * -3) + (Touched In * -2)
# "No Touch" and "Touched In" were previously weighted -1 each, which did not
# match that formula.
saveRatingWeights = {"Success": 3,
                     "Saved Twice": 2,
                     "In Play Safe": 2,
                     "Touched Out": 1,
                     "In Play Danger": -1,
                     "No Touch": -3,
                     "Touched In": -2}
gkSaveDataFiltered["saveRating"] = sum(
    gkSaveDataFiltered[outcome + " Per90"] * weight
    for outcome, weight in saveRatingWeights.items()
)

In [None]:
# Display the save summary including saveRating.
gkSaveDataFiltered

In [None]:
# Highlight the highest saveRating value and show floats with 1 decimal.
# Styler.format(precision=...) replaces Styler.set_precision, which was
# deprecated in pandas 1.3 and removed in pandas 2.0.
gkSaveDataFiltered.style.highlight_max(subset=["saveRating"]).format(precision=1)

In [None]:
# Colour-grade the saveRating column and show floats with 1 decimal.
# Styler.format(precision=...) replaces Styler.set_precision, which was
# deprecated in pandas 1.3 and removed in pandas 2.0.
gkSaveDataFiltered.style.background_gradient(subset=["saveRating"]).format(precision=1)

In [None]:
# Final ranking table: player, team and rating, best rating first.
ratingColumns = ["player.name", "team.name", "saveRating"]
gkSaveDataFinal = gkSaveDataFiltered[ratingColumns].sort_values(
    by="saveRating", ascending=False)

In [None]:
# Colour-grade the saveRating column of the ranking, floats with 1 decimal.
# Styler.format(precision=...) replaces Styler.set_precision, which was
# deprecated in pandas 1.3 and removed in pandas 2.0.
gkSaveDataFinal.style.background_gradient(subset=["saveRating"]).format(precision=1)

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Min-max scaler that maps values linearly onto the range 1 to 10.
scaler = MinMaxScaler(feature_range=(1, 10))

In [None]:
# Preview the scaled ratings; the values are reshaped into a single-column
# 2-D array before being passed to the scaler.
scaler.fit_transform(np.array(gkSaveDataFinal["saveRating"]).reshape(-1, 1))

In [None]:
# Store the save rating rescaled to the scaler's 1-10 range. The ratings are
# handed to the scaler as a single-column 2-D array.
ratingMatrix = gkSaveDataFinal["saveRating"].to_numpy().reshape(-1, 1)
gkSaveDataFinal["saveRatingScaled"] = scaler.fit_transform(ratingMatrix)

In [None]:
# Colour-grade the scaled rating column and show floats with 1 decimal.
# Styler.format(precision=...) replaces Styler.set_precision, which was
# deprecated in pandas 1.3 and removed in pandas 2.0.
gkSaveDataFinal.style.background_gradient(subset=["saveRatingScaled"]).format(precision=1)