# Postmatch Analyzer

### Using Understat data to better understand soccer games through data, visualizations, and analysis

Bryce Frentz  
December 2020

The goal is to use these tools as a way to think about post-match analysis, and possibly to work them into a larger framework later.

In [1]:
from selenium import webdriver
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import probability_wdl as wdl
import numpy as np
from mplsoccer.pitch import Pitch
import soccer_scraper as ss

In [2]:
# Render figures as static inline images.
# NOTE(review): the next cell selects `%matplotlib widget`, which replaces this backend choice.
%matplotlib inline

In [3]:
# Switch to the interactive widget (ipympl) backend; this replaces the inline
# backend selected in the previous cell.
%matplotlib widget

In [4]:
## Get match id
# `match` is pre-set to '' so that an interrupted prompt still leaves a value
# the next cell can replace with its default. The entry is stripped so that
# stray spaces cannot end up in the match URL, and a whitespace-only entry is
# treated the same as an empty one.
match = ''
match = input("Please enter the Understat Match ID: ").strip()

Please enter the Understat Match ID:  14651


In [5]:
## Match id (CHANGE HERE)
# Used only when nothing was entered at the prompt. Other example IDs:
#   '14596'    Manchester United vs Aston Villa 2-1
#   '14620'    Liverpool vs Manchester United 0-0
#   '14607'    Fulham vs Manchester United 1-2
if not match:
    match = '14570'    # Manchester United vs Leeds 6-2

In [6]:
## scrape from given webpage
# Build the Understat match URL from the selected match ID and hand it to the
# project scraper. The returned object is read below through its "a" and "h" keys.
link = "https://understat.com/match/" + match
content = ss.scrape_understat(link)

In [7]:
## Sanity check that the scrape returned data:
## "a" stands for Away and "h" stands for Home.
content.keys()

dict_keys(['a', 'h'])

In [8]:
## Build one shot DataFrame per side from the scraped content
home_df, away_df = (pd.DataFrame(content[side]) for side in ("h", "a"))

## Distinct shooters for each side
players_home = home_df['player'].unique()
players_away = away_df['player'].unique()


In [9]:
## Recast datatypes for shot data

# Target dtype for each raw shot column. Keeping the mapping in one place
# guarantees the home and away frames are converted identically.
# NOTE(review): casting to str turns missing values (e.g. a shot with no
# assisting player) into literal strings - confirm downstream code expects that.
SHOT_DTYPES = {
    'X': float,
    'Y': float,
    'a_goals': int,
    'a_team': str,
    'h_a': str,
    'h_goals': int,
    'h_team': str,
    'id': int,
    'lastAction': str,
    'match_id': int,
    'minute': int,
    'player': str,
    'player_assisted': str,
    'player_id': int,
    'result': str,
    'season': int,
    'shotType': str,
    'situation': str,
    'xG': float,
}


def recast_shots(shots):
    """Return a typed copy of a raw shot frame with a running xG total.

    Parameters
    ----------
    shots : pandas.DataFrame
        Raw shot rows for one team; must contain every column named in
        SHOT_DTYPES plus 'date'.

    Returns
    -------
    pandas.DataFrame
        New frame (the input is left untouched) with the SHOT_DTYPES columns
        cast, 'date' parsed to datetime, and 'cumulative_xg' appended as the
        cumulative sum of 'xG' in row order.

    Raises
    ------
    KeyError
        If an expected column is missing from `shots`.
    """
    typed = shots.astype(SHOT_DTYPES)
    typed['date'] = pd.to_datetime(typed['date'])
    typed['cumulative_xg'] = typed['xG'].cumsum()
    return typed


# Home
home_df = recast_shots(home_df)

# Away
away_df = recast_shots(away_df)

In [10]:
## Stack the home and away shots into one frame ordered by match minute
main_df = (
    pd.concat([home_df, away_df])
    .sort_values("minute")
    .reset_index(drop=True)
)

In [11]:
## Recast types for follow-up analysis
# One astype call with a column -> dtype mapping; columns not listed here
# (such as 'cumulative_xg') are left as they are.
main_df = main_df.astype({
    'X': float,
    'Y': float,
    'a_goals': int,
    'a_team': str,
    'h_a': str,
    'h_goals': int,
    'h_team': str,
    'id': int,
    'lastAction': str,
    'match_id': int,
    'minute': int,
    'player': str,
    'player_assisted': str,
    'player_id': int,
    'result': str,
    'season': int,
    'shotType': str,
    'situation': str,
    'xG': float,
})
main_df['date'] = pd.to_datetime(main_df['date'])

In [12]:
## Set team names
# Every shot row carries the match-level team names and final score, so the
# first row of the home frame is enough. `.iloc[0]` reads that row by position,
# so it keeps working even if the frame's index no longer contains the label 0
# (e.g. after filtering or sorting).
home_name = home_df['h_team'].iloc[0]
away_name = home_df['a_team'].iloc[0]

## Set team goals
home_score = home_df['h_goals'].iloc[0]
away_score = home_df['a_goals'].iloc[0]

## Players
players_away = away_df['player'].unique()
players_home = home_df['player'].unique()


## Shot maps for the teams

In [16]:
# Create shot dataframes
# Take explicit copies of the column subsets: adding columns to a bare slice of
# home_df / away_df raises SettingWithCopyWarning and leaves it ambiguous
# whether the parent frame is affected.
shot_columns = ['id', 'minute', 'X', 'Y', 'player', 'shotType', 'situation', 'result', 'xG']

# X and Y are scaled by 120 and 80 - presumably the pitch dimensions expected
# by the shot-map helpers; confirm against wdl.
home_shots = home_df[shot_columns].copy()
home_shots['pos_x'] = home_shots['X'] * 120
home_shots['pos_y'] = home_shots['Y'] * 80

away_shots = away_df[shot_columns].copy()
away_shots['pos_x'] = away_shots['X'] * 120
away_shots['pos_y'] = away_shots['Y'] * 80

# Create a mask for the shots that are successful vs not
mask_goal_h = home_shots['result'] == 'Goal'
mask_goal_a = away_shots['result'] == 'Goal'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_

In [17]:
## Home shot map
# mask_goal_h marks the home shots whose result is 'Goal'.
wdl.shotMapTeam(home_shots, mask_goal_h, home_name, away_name, home=True)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [18]:
## Away shot map
# mask_goal_a marks the away shots whose result is 'Goal'.
wdl.shotMapTeam(away_shots, mask_goal_a, home_name, away_name, home=False)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [19]:
## Combined shot map
# Passes both teams' shots and goal masks together with the team names and the
# final score.
wdl.shotMapMatch(shots_home=home_shots, shots_away=away_shots, mask_goal_h=mask_goal_h, 
                 mask_goal_a=mask_goal_a, home_name=home_name, away_name=away_name, 
                 home_score=home_score, away_score=away_score)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

## xG Distribution by player

In [13]:
## Get xg distributions from dataframes
def total_xg_by_player(shots):
    """Return the summed xG of each player in `shots`, indexed by player."""
    return shots.groupby(['player'])['xG'].sum()


all_shooters = total_xg_by_player(main_df)
home_shooters = total_xg_by_player(home_df)
away_shooters = total_xg_by_player(away_df)

In [14]:
# Per-player xG totals for the home side
wdl.playerXG(home_shooters, home_name)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [15]:
# Per-player xG totals for the away side
wdl.playerXG(away_shooters, away_name)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

## Team xG Plots for the Match

In [20]:
## Cumulative xG plot for the teams
# Both frames carry the 'cumulative_xg' column added during the recast step.
wdl.gameFlow(home_df, away_df)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [21]:
## Make one with seaborn
# TODO: unfinished seaborn plot. `h_x` and `h_y` are not defined in the visible
# cells, so the lines below cannot simply be uncommented.
# fig, ax = plt.subplots(figsize=(7,4))

# ax = sns.lineplot(h_x, h_y, drawstyle='steps-post')

## Match Result Simulations

In [22]:
# Per-shot xG values for each side; these are the inputs to the simulation below.
h_xg = home_df['xG']
a_xg = away_df['xG']

In [23]:
# Build the simulation grid from both teams' per-shot xG.
# NOTE(review): the printed summary reports 100000 simulated games, so this
# step looks stochastic and no random seed is set in this notebook - confirm
# whether the results are reproducible across runs.
grid = wdl.simGrid(h_xg, a_xg)

In [24]:
# Print the home-win / away-win / draw percentages from the simulation grid.
wdl.gameProbabilities(grid)

Over 100000 simulated games:

Home wins: 99.29%
Away wins: 0.09%
Draw:      0.62%



In [25]:
## Simulated Probability Distribution
# Plotted from the simulation grid and labelled with both team names.
wdl.goalDistribution(grid, home_name, away_name)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [26]:
# Draw the simulation grid, labelled with both team names.
wdl.drawGrid(grid=grid, home_team_name=home_name, away_team_name=away_name)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [27]:
# Print the most likely scoreline from the simulation and whether it matches
# the actual result.
wdl.mostLikelyGameOutcome(grid=grid, home_score=home_score, away_score=away_score, 
                          home_team_name=home_name, away_team_name=away_name)


Most likely outcome:
Manchester United    5+ -  0    Southampton

This was the most likely outcome!


## Non-penalty xG Calculations

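Repeat the same analysis using non-penalty xG (npxG): penalty shots have their xG set to zero before the player totals, game-flow plot, and simulations are recomputed.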

In [28]:
# Non-penalty xG
# npxg equals xG for every shot except penalties, which are set to 0.
# Series.mask builds the column in one step. The previous chained form
# `df['npxg'].loc[mask] = 0` triggers SettingWithCopyWarning and does not
# update the frame when pandas Copy-on-Write is active.
home_df['npxg'] = home_df['xG'].mask(home_df['situation'] == 'Penalty', 0.0)
home_df['npxg_cumulative'] = home_df['npxg'].cumsum()
away_df['npxg'] = away_df['xG'].mask(away_df['situation'] == 'Penalty', 0.0)
away_df['npxg_cumulative'] = away_df['npxg'].cumsum()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


In [29]:
## Get non-penalty xG distributions from dataframes
# NOTE(review): this rebinds home_shooters / away_shooters, replacing the
# full-xG totals computed earlier; re-running the earlier player plots after
# this cell would show non-penalty values instead.
home_shooters = home_df.groupby(['player'])['npxg'].sum()
away_shooters = away_df.groupby(['player'])['npxg'].sum()

In [30]:
# Per-player non-penalty xG totals for the home side
wdl.playerXG(home_shooters, home_name, nonpen=True)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [31]:
# Per-player non-penalty xG totals for the away side
wdl.playerXG(away_shooters, away_name, nonpen=True)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [32]:
## Cumulative non-penalty xG plot for the teams
# Both frames carry the 'npxg' and 'npxg_cumulative' columns added above.
wdl.gameFlow(home_df, away_df, nonpen=True)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [34]:
# Build the simulation grid again, this time from per-shot non-penalty xG.
# NOTE(review): no random seed is set in this notebook - confirm whether the
# simulation results are reproducible across runs.
grid_npxg = wdl.simGrid(home_df['npxg'], away_df['npxg'])

In [35]:
# Print the home-win / away-win / draw percentages for the non-penalty grid.
wdl.gameProbabilities(grid_npxg)

Over 100000 simulated games:

Home wins: 97.95%
Away wins: 0.30%
Draw:      1.75%



In [36]:
## Simulated Probability Distribution (non-penalty xG)
# Plotted from the non-penalty simulation grid and labelled with both team names.
wdl.goalDistribution(grid_npxg, home_name, away_name, nonpen=True)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [37]:
# Draw the non-penalty simulation grid, labelled with both team names.
wdl.drawGrid(grid=grid_npxg, home_team_name=home_name, away_team_name=away_name, nonpen=True)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [41]:
# Print the most likely scoreline from the non-penalty simulation and whether
# it matches the actual result.
wdl.mostLikelyGameOutcome(grid=grid_npxg, home_score=home_score, away_score=away_score, 
                          home_team_name=home_name, away_team_name=away_name)


Most likely outcome:
Manchester United    5+ -  0    Southampton

This was the most likely outcome!
