In [0]:
pip install asyncio aiohttp understat



In [0]:
#importing packages

import asyncio

import json

import aiohttp

import pandas as pd

import re

import ast

import numpy as np

from functools import reduce

from scipy import spatial

import plotly.graph_objects as go

from understat import Understat

In [0]:
#defining function to scrape Understat data (https://understat.com/league/EPL)

async def main(team_title, league="epl", season=2019):
    """Fetch the player statistics of one league season from Understat.

    Parameters
    ----------
    team_title : str
        Accepted for backward compatibility but not used: the request is not
        filtered by team, so whatever Understat returns for the whole league
        is passed through.
    league : str, optional
        League identifier handed to ``get_league_players`` (default ``"epl"``).
    season : int, optional
        Season handed to ``get_league_players`` (default ``2019``).

    Returns
    -------
    str
        The result of ``get_league_players`` serialized with ``json.dumps``.
    """
    # The session is closed automatically when the block exits, even on error.
    async with aiohttp.ClientSession() as session:
        understat = Understat(session)
        players = await understat.get_league_players(league, season)
        return json.dumps(players)

# Event loop handle; a later cell uses loop.run_until_complete(...) to run the `main` coroutine.
loop = asyncio.get_event_loop()

In [0]:
#defining functions to organize data from strings to dictionaries

def splittodict(inputstring):
    """Decode a JSON array of objects into a list of dictionaries.

    The text is parsed with a real JSON parser, so ``null``/``true``/``false``,
    nested objects and braces inside string values are all handled correctly.

    Parameters
    ----------
    inputstring : str
        JSON text, normally an array of objects such as ``'[{"a": "1"}, ...]'``.

    Returns
    -------
    list of dict
        One dictionary per JSON object, in input order. Empty or blank input
        gives an empty list; a single top-level object is wrapped in a list;
        array items that are not objects are skipped.

    Raises
    ------
    ValueError
        If the text is not valid JSON (``json.JSONDecodeError`` is a subclass)
        or if the top-level value is neither an array nor an object.
    """
    if inputstring is None or not inputstring.strip():
        return []

    parsed = json.loads(inputstring)

    if isinstance(parsed, dict):
        return [parsed]
    if not isinstance(parsed, list):
        raise ValueError(
            "expected a JSON array of objects, got " + type(parsed).__name__
        )
    # Only objects are records; ignore any stray scalars in the array.
    return [item for item in parsed if isinstance(item, dict)]
    

In [0]:
#function to transform dictionaries into dataframes

def dicttodf(inputdict):
    """Convert one flat record into a single-row DataFrame.

    Parameters
    ----------
    inputdict : dict
        Mapping of field name to value for a single record.

    Returns
    -------
    pandas.DataFrame
        A frame whose columns are the keys of ``inputdict`` and whose only
        row holds the corresponding values.
    """
    # orient='index' places the keys on the row axis; swapping the axes
    # afterwards moves them to the columns, leaving a single row of values.
    return pd.DataFrame.from_dict(inputdict, orient = 'index').T

In [0]:
#getting Understat data for players in the English Premier League

# Run the scraping coroutine to completion and keep its JSON text.
mainoutput = loop.run_until_complete(main('Manchester United'))

# Decode the JSON text into one dictionary per record ...
outputdicts = splittodict(mainoutput)

# ... and turn every record into its own single-row DataFrame.
outputdfs = [dicttodf(record) for record in outputdicts]

In [0]:
#merging dataframes

# Stack the per-record frames into one table.
# pd.concat does this in a single pass and keeps the rows in input order; columns missing
# from a record are filled with NaN. A pairwise outer merge is quadratic in the number of
# frames, collapses identical rows, and its row order depends on how pandas sorts outer joins.
mergeddataframe = pd.concat(outputdfs, ignore_index = True)

In [0]:
# Display the merged dataset of player statistics (the last expression of a cell is rendered).

mergeddataframe

Unnamed: 0,id,player_name,games,time,goals,xG,assists,xA,shots,key_passes,yellow_cards,red_cards,position,team_title,npg,npxG,xGChain,xGBuildup
0,755,Jamie Vardy,22,1932,17,12.691308803856373,4,2.851450266316533,52,15,2,0,F,Leicester,14,9.64663340896368,12.235153935849667,0.7696032430976629
1,619,Sergio Agüero,18,1109,16,12.247042566537857,3,2.477516859769821,58,17,1,0,F S,Manchester City,14,10.724741786718369,13.7228037789464,2.3636222975328565
2,318,Pierre-Emerick Aubameyang,22,1949,14,10.246303521096706,1,2.7001877576112747,55,16,3,1,F M,Arsenal,13,9.485171549022198,11.226708762347698,2.3485346734523773
3,556,Marcus Rashford,22,1890,14,15.110983923077583,4,3.2046038936823606,74,24,2,0,F M,Manchester United,9,9.782838862389326,12.893335608765483,2.9352949280291796
4,986,Danny Ings,24,1665,14,9.954283555969596,1,1.3589857276529074,58,17,2,0,F M S,Southampton,14,9.954283555969596,10.65714230760932,2.066059786826372
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
462,8222,Japhet Tanganga,2,180,0,0.016901906579732895,0,0,1,0,1,0,D,Tottenham,0,0.016901906579732895,0.2807402275502682,0.2638383209705353
463,8226,Tariq Lamptey,1,26,0,0,0,0.06915584951639175,0,1,0,0,S,Chelsea,0,0,0.06915584951639175,0
464,8239,Takumi Minamino,1,58,0,0.05560074746608734,0,0,1,0,0,0,S,Liverpool,0,0.05560074746608734,0.07584588974714279,0.07584588974714279
465,8254,Indiana Vassilev,2,32,0,0,0,0,0,0,0,0,S,Aston Villa,0,0,0,0


In [0]:
#estimating values of players based on their statistics: goals, assists, expected goal chain

# Synthetic value per player: the sum of three noisy, non-negative terms centred on goals,
# assists and xGChain (standard deviations 5, 3 and 2) plus a constant 5.
VALUE_SEED = 42  # fixed seed so re-running the notebook reproduces the same values
rng = np.random.default_rng(VALUE_SEED)

# Read the inputs by column name instead of by position so the estimate does not
# silently use the wrong statistic if the column order ever changes.
goals = mergeddataframe['goals'].astype(float).to_numpy()
assists = mergeddataframe['assists'].astype(float).to_numpy()
xgchain = mergeddataframe['xGChain'].astype(float).to_numpy()

# One vectorized draw per statistic; `values` stays a plain list, one entry per row.
values = (
    np.abs(rng.normal(goals, 5))
    + np.abs(rng.normal(assists, 3))
    + np.abs(rng.normal(xgchain, 2))
    + 5
).tolist()

In [0]:
# Attach the estimated values as a new 'value' column (one entry per row, in row order).
mergeddataframe['value'] = values

In [0]:
# Write the table to 'datasetofvalues.csv' (a relative path, so it lands in the working directory).
mergeddataframe.to_csv('datasetofvalues.csv')

In [0]:
#defining stats per 90 minutes

def calculate_per90(metric, df=None):
    """Express a cumulative statistic as a rate per 90 minutes played.

    Parameters
    ----------
    metric : str
        Name of the column holding the cumulative statistic.
    df : pandas.DataFrame, optional
        Frame containing ``metric`` and a ``'time'`` column. Defaults to the
        notebook-level ``mergeddataframe`` when omitted.

    Returns
    -------
    pandas.Series
        ``metric / time * 90`` rounded to two decimals, aligned with ``df``.
        Both columns are cast to float first. Rows with zero ``time`` yield
        ``inf`` or ``NaN``, following ordinary float division.
    """
    if df is None:
        df = mergeddataframe
    minutes = df['time'].astype(float)
    per90 = df[metric].astype(float) / minutes * 90
    return per90.round(2)

In [0]:
# (new column, source statistic) pairs, in the order the columns are appended.
per90_columns = [
    ('G90', 'goals'),
    ('xG90', 'xG'),
    ('shots90', 'shots'),
    ('A90', 'assists'),
    ('xA90', 'xA'),
    ('KP90', 'key_passes'),
    ('xGChain90', 'xGChain'),
    ('xGBuildup90', 'xGBuildup'),
]

# Add one per-90 column for each statistic listed above.
for per90_name, source_metric in per90_columns:
    mergeddataframe[per90_name] = calculate_per90(source_metric)

In [0]:
# Display the table again, now including the per-90 columns.
mergeddataframe

Unnamed: 0,id,player_name,games,time,goals,xG,assists,xA,shots,key_passes,yellow_cards,red_cards,position,team_title,npg,npxG,xGChain,xGBuildup,value,G90,xG90,shots90,A90,xA90,KP90,xGChain90,xGBuildup90
0,755,Jamie Vardy,22,1932,17,12.691308803856373,4,2.851450266316533,52,15,2,0,F,Leicester,14,9.64663340896368,12.235153935849667,0.7696032430976629,36.796226,0.79,0.59,2.42,0.19,0.13,0.70,0.57,0.04
1,619,Sergio Agüero,18,1109,16,12.247042566537857,3,2.477516859769821,58,17,1,0,F S,Manchester City,14,10.724741786718369,13.7228037789464,2.3636222975328565,29.446794,1.30,0.99,4.71,0.24,0.20,1.38,1.11,0.19
2,318,Pierre-Emerick Aubameyang,22,1949,14,10.246303521096706,1,2.7001877576112747,55,16,3,1,F M,Arsenal,13,9.485171549022198,11.226708762347698,2.3485346734523773,33.712312,0.65,0.47,2.54,0.05,0.12,0.74,0.52,0.11
3,556,Marcus Rashford,22,1890,14,15.110983923077583,4,3.2046038936823606,74,24,2,0,F M,Manchester United,9,9.782838862389326,12.893335608765483,2.9352949280291796,51.371287,0.67,0.72,3.52,0.19,0.15,1.14,0.61,0.14
4,986,Danny Ings,24,1665,14,9.954283555969596,1,1.3589857276529074,58,17,2,0,F M S,Southampton,14,9.954283555969596,10.65714230760932,2.066059786826372,40.699843,0.76,0.54,3.14,0.05,0.07,0.92,0.58,0.11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
462,8222,Japhet Tanganga,2,180,0,0.016901906579732895,0,0,1,0,1,0,D,Tottenham,0,0.016901906579732895,0.2807402275502682,0.2638383209705353,18.609900,0.00,0.01,0.50,0.00,0.00,0.00,0.14,0.13
463,8226,Tariq Lamptey,1,26,0,0,0,0.06915584951639175,0,1,0,0,S,Chelsea,0,0,0.06915584951639175,0,8.451551,0.00,0.00,0.00,0.00,0.24,3.46,0.24,0.00
464,8239,Takumi Minamino,1,58,0,0.05560074746608734,0,0,1,0,0,0,S,Liverpool,0,0.05560074746608734,0.07584588974714279,0.07584588974714279,8.464196,0.00,0.09,1.55,0.00,0.00,0.00,0.12,0.12
465,8254,Indiana Vassilev,2,32,0,0,0,0,0,0,0,0,S,Aston Villa,0,0,0,0,18.743785,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00


In [0]:
# features used to compare players: the per-90 columns

matchon = [
    'G90', 'xG90', 'shots90', 'A90',
    'xA90', 'KP90', 'xGChain90', 'xGBuildup90',
]
# keep every row, restricted to the feature columns above
matchdf = mergeddataframe.loc[:, matchon]

In [0]:
# Display the feature table used for matching.
matchdf

Unnamed: 0,G90,xG90,shots90,A90,xA90,KP90,xGChain90,xGBuildup90
0,0.79,0.59,2.42,0.19,0.13,0.70,0.57,0.04
1,1.30,0.99,4.71,0.24,0.20,1.38,1.11,0.19
2,0.65,0.47,2.54,0.05,0.12,0.74,0.52,0.11
3,0.67,0.72,3.52,0.19,0.15,1.14,0.61,0.14
4,0.76,0.54,3.14,0.05,0.07,0.92,0.58,0.11
...,...,...,...,...,...,...,...,...
462,0.00,0.01,0.50,0.00,0.00,0.00,0.14,0.13
463,0.00,0.00,0.00,0.00,0.24,3.46,0.24,0.00
464,0.00,0.09,1.55,0.00,0.00,0.00,0.12,0.12
465,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00


In [0]:
# Build a k-d tree over the feature rows so nearest-neighbour queries can be run against it.
# NOTE(review): the columns are passed as-is (no scaling is visible here), so larger-valued
# features such as shots90 presumably weigh more in the distance; confirm this is intended.
tree = spatial.KDTree(matchdf)

In [0]:
#finding players most similar to Sergio Aguero

# Select the reference player's row(s) and keep only the similarity features.
is_aguero = mergeddataframe['player_name'] == 'Sergio Agüero'
aguerodata = mergeddataframe.loc[is_aguero, matchon]

# Ask the tree for the 11 nearest rows to the reference player's feature vector.
results = tree.query(aguerodata, k = 11)

In [0]:
# results[1] holds the neighbour row positions returned by tree.query; take the entry for the
# first (only) query point and display those rows of the full table.
mergeddataframe.iloc[results[1].tolist()[0]]

Unnamed: 0,id,player_name,games,time,goals,xG,assists,xA,shots,key_passes,yellow_cards,red_cards,position,team_title,npg,npxG,xGChain,xGBuildup,value,G90,xG90,shots90,A90,xA90,KP90,xGChain90,xGBuildup90
1,619,Sergio Agüero,18,1109,16,12.247042566537855,3,2.477516859769821,58,17,1,0,F S,Manchester City,14,10.724741786718369,13.7228037789464,2.3636222975328565,29.446794,1.3,0.99,4.71,0.24,0.2,1.38,1.11,0.19
14,5543,Gabriel Jesus,20,1181,9,12.5264705196023,4,1.7887941636145115,60,14,1,0,F S,Manchester City,9,11.765301652252674,16.256307382136583,4.225432701408863,32.488275,0.69,0.95,4.57,0.3,0.14,1.07,1.24,0.32
9,1250,Mohamed Salah,20,1725,11,12.210501208901404,5,4.994511172175407,76,36,1,0,F,Liverpool,9,10.688163474202156,19.389187566936016,5.88127301633358,44.110623,0.57,0.64,3.97,0.26,0.26,1.88,1.01,0.31
3,556,Marcus Rashford,22,1890,14,15.110983923077583,4,3.2046038936823606,74,24,2,0,F M,Manchester United,9,9.782838862389326,12.893335608765485,2.93529492802918,51.371287,0.67,0.72,3.52,0.19,0.15,1.14,0.61,0.14
5,702,Tammy Abraham,23,1873,13,12.809597373008728,3,2.7953314669430256,72,19,2,0,F S,Chelsea,13,12.809597373008728,14.73238542675972,0.8963729348033667,37.27719,0.62,0.62,3.46,0.14,0.13,0.91,0.71,0.04
10,4105,Raúl Jiménez,24,2039,11,10.762845426797869,6,6.384599212557077,79,38,2,0,F S,Wolverhampton Wanderers,8,8.479338884353638,15.522327676415443,3.476310452446341,32.921093,0.49,0.48,3.49,0.26,0.28,1.68,0.69,0.15
115,6842,Alireza Jahanbakhsh,5,250,2,0.511822909116745,0,0.9932915829122066,10,4,0,0,F M S,Brighton,2,0.511822909116745,1.1353568024933338,0.3408942930400371,11.72542,0.72,0.18,3.6,0.0,0.36,1.44,0.41,0.12
6,618,Raheem Sterling,22,1861,11,12.77122126892209,1,5.160340078175068,68,33,4,0,F M S,Manchester City,11,12.010052431374788,22.27445751428604,7.847852718085051,41.601674,0.53,0.62,3.29,0.05,0.25,1.6,1.08,0.38
16,482,Roberto Firmino,23,1911,8,10.80780778080225,4,5.126655466854572,72,32,0,0,F M S,Liverpool,8,10.80780778080225,19.51175420731306,6.8903469033539295,32.093938,0.38,0.51,3.39,0.19,0.24,1.51,0.92,0.32
35,2662,Christian Pulisic,16,1083,5,6.155446246266365,2,2.2562413550913334,40,19,0,0,F M S,Chelsea,5,6.155446246266365,10.000476256012917,4.007628088817,30.393446,0.42,0.51,3.32,0.17,0.19,1.58,0.83,0.33


In [0]:
#comparing the most similar player to Sergio Aguero

# Radar chart comparing the reference player with the closest other row found by the tree.

# Row positions of the neighbours for the first (only) query point.
neighbour_rows = results[1].tolist()[0]

categories = matchon
aguerodataplot = matchdf.iloc[neighbour_rows[0]].values.tolist()
jesusdataplot = matchdf.iloc[neighbour_rows[1]].values.tolist()

# Label each trace with the player's actual name instead of a placeholder,
# so the legend identifies who is being compared.
player_names = mergeddataframe['player_name'].iloc[neighbour_rows[:2]].tolist()

# Keep the 0-5 radial axis as a minimum, but widen it if a value would not fit.
radial_max = max(5, max(aguerodataplot + jesusdataplot))

fig = go.Figure()

for trace_values, trace_name in zip((aguerodataplot, jesusdataplot), player_names):
    fig.add_trace(go.Scatterpolar(
          r = trace_values,
          theta=categories,
          fill='toself',
          name=trace_name
    ))

fig.update_layout(
  title_text=f"{player_names[0]} vs. {player_names[1]} (per-90 statistics)",
  polar=dict(
    radialaxis=dict(
      visible=True,
      range=[0, radial_max]
    )),
  showlegend=True
)

fig.show()