# Gols esperados (Expected Goals) e sua aplicação no futebol

Com o passar dos anos, o futebol vem se tornando cada vez mais um esporte de dados. A análise de dados no futebol vem se tornando cada vez mais comum e, com isso, novas métricas e estatísticas vêm sendo criadas para tentar explicar o que acontece dentro de campo. Equipes do mundo inteiro passaram a medir e analisar os dados de seus jogadores e de seus adversários, com o objetivo de encontrar padrões e tendências que possam ser utilizados para melhorar o desempenho de suas equipes.
A métrica que será abordada neste artigo é o Expected Goals (xG), ou Gols Esperados. O xG é uma métrica que mede a qualidade das finalizações de uma equipe, ou seja, a probabilidade de uma finalização resultar em gol. O xG é calculado com base em diversos fatores, como a distância do chute, o ângulo do chute, a parte do corpo utilizada para o chute, entre outros. O xG é uma métrica que vem sendo cada vez mais utilizada por equipes de futebol, e também por sites de estatísticas, como o [Understat](https://understat.com/), que disponibiliza os dados de xG de diversas ligas europeias.

### Preparando o ambiente

#### Imports

In [39]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
import json 
from sklearn.linear_model import LogisticRegression

# Folder holding the event JSON files. File names are appended to this string
# directly (eventspath + file), so the trailing slash must be kept.
eventspath = "archive/data/events/"

#### Filtrando os chutes nos nossos dados de eventos

In [40]:
# Read the event files in a stable (sorted) order.
eventsfiles = os.listdir(eventspath)
eventsfiles.sort()

# Collect every event whose type is 'Shot' across all files.
chutes = []
for file in eventsfiles:
    # Skip stray directory entries (hidden files, sub-folders) that are not
    # event files and would make json.load fail.
    if not file.endswith('.json'):
        continue
    # The context manager closes each handle (the bare open() call never did).
    # JSON text is UTF-8 by specification, while open() otherwise falls back
    # to the locale encoding and can garble accented player/team names.
    with open(eventspath+file, encoding='utf-8') as arquivo:
        eventos = json.load(arquivo)
    for e in eventos:
        if e['type']['name'] == 'Shot':
            chutes.append(e)
            

In [41]:
# One row per shot event; nested JSON objects stay as dicts inside their columns.
df_chutes = pd.DataFrame(chutes)

# Preview the first rows.
df_chutes.head()


Unnamed: 0,id,index,period,timestamp,minute,second,type,possession,possession_team,play_pattern,team,player,position,location,duration,related_events,shot,under_pressure,out,off_camera
0,becd7956-ce44-479e-8fc9-16a2d1f1f349,137,1,00:02:29.477,2,29,"{'id': 16, 'name': 'Shot'}",6,"{'id': 217, 'name': 'Barcelona'}","{'id': 4, 'name': 'From Throw In'}","{'id': 217, 'name': 'Barcelona'}","{'id': 5503, 'name': 'Lionel Andrés Messi Cucc...","{'id': 24, 'name': 'Left Center Forward'}","[111.5, 52.9]",1.075902,[b3f73933-da7a-42f7-8364-19b1c7b19e9a],"{'statsbomb_xg': 0.07699243, 'end_location': [...",,,
1,9107d374-2942-4876-a14f-1b9f86901c15,262,1,00:05:39.641,5,39,"{'id': 16, 'name': 'Shot'}",12,"{'id': 217, 'name': 'Barcelona'}","{'id': 1, 'name': 'Regular Play'}","{'id': 217, 'name': 'Barcelona'}","{'id': 5211, 'name': 'Jordi Alba Ramos'}","{'id': 6, 'name': 'Left Back'}","[113.9, 26.4]",0.807592,[7fb36c67-4b6c-4c3d-bc52-4e1cd712e790],"{'statsbomb_xg': 0.05166811, 'end_location': [...",,,
2,ddd194ca-08fb-43d0-87c2-33647f975f9f,715,1,00:15:29.059,15,29,"{'id': 16, 'name': 'Shot'}",23,"{'id': 217, 'name': 'Barcelona'}","{'id': 8, 'name': 'From Keeper'}","{'id': 217, 'name': 'Barcelona'}","{'id': 5503, 'name': 'Lionel Andrés Messi Cucc...","{'id': 24, 'name': 'Left Center Forward'}","[93.7, 34.7]",0.979318,[09e64ce6-9c68-4240-ab72-a228322afbb5],"{'statsbomb_xg': 0.016932096, 'end_location': ...",,,
3,86596ddb-d824-4e5e-b18c-b4442e9ce7cf,743,1,00:16:20.072,16,20,"{'id': 16, 'name': 'Shot'}",30,"{'id': 206, 'name': 'Deportivo Alavés'}","{'id': 1, 'name': 'Regular Play'}","{'id': 206, 'name': 'Deportivo Alavés'}","{'id': 6613, 'name': 'Rubén Sobrino Pozuelo'}","{'id': 23, 'name': 'Center Forward'}","[109.2, 39.1]",0.312149,"[07145f44-87ec-4d02-a483-01d212457e5e, 38b2cec...","{'statsbomb_xg': 0.1226044, 'end_location': [1...",True,,
4,3ed2b107-be17-42d5-9d1b-25006a0e55cb,802,1,00:18:16.362,18,16,"{'id': 16, 'name': 'Shot'}",33,"{'id': 217, 'name': 'Barcelona'}","{'id': 2, 'name': 'From Corner'}","{'id': 217, 'name': 'Barcelona'}","{'id': 5246, 'name': 'Luis Alberto Suárez Díaz'}","{'id': 22, 'name': 'Right Center Forward'}","[107.8, 24.7]",0.937618,[8e604d31-f48d-4998-a71a-d58812bd31f8],"{'statsbomb_xg': 0.041750744, 'end_location': ...",,,


#### Pegando dados da coluna 'shot' dos chutes

In [75]:
def seleciona_chaves(shot):
    """Return a new dict holding only the whitelisted keys of a shot dict.

    Keys are emitted in whitelist order; keys absent from ``shot`` are
    simply omitted from the result.
    """
    chaves_selecionadas = (
        'first_time', 'statsbomb_xg', 'deflected', 'technique', 'body_part',
        'type', 'outcome', 'open_goal', 'follows_dribble', 'aerial_won',
    )
    selecionado = {}
    for chave in chaves_selecionadas:
        if chave in shot:
            selecionado[chave] = shot[chave]
    return selecionado

# Reduce each 'shot' dict to the whitelisted keys.
# NOTE(review): this overwrites 'shot' in place and a later cell drops that
# column, so this cell cannot be re-run on its own afterwards.
df_chutes['shot'] = df_chutes['shot'].apply(seleciona_chaves)

In [43]:
# Expand every dict stored in 'shot' into one column per key
df_shot = df_chutes['shot'].apply(pd.Series)

# Prefix the expanded column names with 'shot_'
df_shot.columns = ['shot_' + nome for nome in df_shot.columns]

# Swap the original 'shot' column for the expanded columns
df_chutes = pd.concat([df_chutes.drop(columns='shot'), df_shot], axis=1)

In [44]:
# Preview the frame after 'shot' was expanded into shot_* columns.
df_chutes.head()

Unnamed: 0,id,index,period,timestamp,minute,second,type,possession,possession_team,play_pattern,...,shot_first_time,shot_statsbomb_xg,shot_technique,shot_body_part,shot_type,shot_outcome,shot_aerial_won,shot_deflected,shot_open_goal,shot_follows_dribble
0,becd7956-ce44-479e-8fc9-16a2d1f1f349,137,1,00:02:29.477,2,29,"{'id': 16, 'name': 'Shot'}",6,"{'id': 217, 'name': 'Barcelona'}","{'id': 4, 'name': 'From Throw In'}",...,True,0.076992,"{'id': 91, 'name': 'Half Volley'}","{'id': 40, 'name': 'Right Foot'}","{'id': 87, 'name': 'Open Play'}","{'id': 98, 'name': 'Off T'}",,,,
1,9107d374-2942-4876-a14f-1b9f86901c15,262,1,00:05:39.641,5,39,"{'id': 16, 'name': 'Shot'}",12,"{'id': 217, 'name': 'Barcelona'}","{'id': 1, 'name': 'Regular Play'}",...,True,0.051668,"{'id': 95, 'name': 'Volley'}","{'id': 38, 'name': 'Left Foot'}","{'id': 87, 'name': 'Open Play'}","{'id': 98, 'name': 'Off T'}",,,,
2,ddd194ca-08fb-43d0-87c2-33647f975f9f,715,1,00:15:29.059,15,29,"{'id': 16, 'name': 'Shot'}",23,"{'id': 217, 'name': 'Barcelona'}","{'id': 8, 'name': 'From Keeper'}",...,,0.016932,"{'id': 93, 'name': 'Normal'}","{'id': 38, 'name': 'Left Foot'}","{'id': 87, 'name': 'Open Play'}","{'id': 100, 'name': 'Saved'}",,,,
3,86596ddb-d824-4e5e-b18c-b4442e9ce7cf,743,1,00:16:20.072,16,20,"{'id': 16, 'name': 'Shot'}",30,"{'id': 206, 'name': 'Deportivo Alavés'}","{'id': 1, 'name': 'Regular Play'}",...,,0.122604,"{'id': 93, 'name': 'Normal'}","{'id': 37, 'name': 'Head'}","{'id': 87, 'name': 'Open Play'}","{'id': 98, 'name': 'Off T'}",True,,,
4,3ed2b107-be17-42d5-9d1b-25006a0e55cb,802,1,00:18:16.362,18,16,"{'id': 16, 'name': 'Shot'}",33,"{'id': 217, 'name': 'Barcelona'}","{'id': 2, 'name': 'From Corner'}",...,,0.041751,"{'id': 93, 'name': 'Normal'}","{'id': 40, 'name': 'Right Foot'}","{'id': 87, 'name': 'Open Play'}","{'id': 98, 'name': 'Off T'}",,,,


#### Limpando as colunas desnecessárias

In [45]:
# List the available columns to decide which ones to discard.
print(df_chutes.columns)

Index(['id', 'index', 'period', 'timestamp', 'minute', 'second', 'type',
       'possession', 'possession_team', 'play_pattern', 'team', 'player',
       'position', 'location', 'duration', 'related_events', 'under_pressure',
       'out', 'off_camera', 'shot_first_time', 'shot_statsbomb_xg',
       'shot_technique', 'shot_body_part', 'shot_type', 'shot_outcome',
       'shot_aerial_won', 'shot_deflected', 'shot_open_goal',
       'shot_follows_dribble'],
      dtype='object')


In [46]:
# Discard the identifier / bookkeeping columns that are not kept as features
df_chutes = df_chutes.drop(
    columns=[
        'id', 'index', 'period', 'timestamp', 'type', 'possession',
        'possession_team', 'team', 'player', 'duration', 'off_camera',
        'out', 'related_events',
    ]
)

In [47]:
# Preview the frame after the unused columns were dropped.
df_chutes.head()

Unnamed: 0,minute,second,play_pattern,position,location,under_pressure,shot_first_time,shot_statsbomb_xg,shot_technique,shot_body_part,shot_type,shot_outcome,shot_aerial_won,shot_deflected,shot_open_goal,shot_follows_dribble
0,2,29,"{'id': 4, 'name': 'From Throw In'}","{'id': 24, 'name': 'Left Center Forward'}","[111.5, 52.9]",,True,0.076992,"{'id': 91, 'name': 'Half Volley'}","{'id': 40, 'name': 'Right Foot'}","{'id': 87, 'name': 'Open Play'}","{'id': 98, 'name': 'Off T'}",,,,
1,5,39,"{'id': 1, 'name': 'Regular Play'}","{'id': 6, 'name': 'Left Back'}","[113.9, 26.4]",,True,0.051668,"{'id': 95, 'name': 'Volley'}","{'id': 38, 'name': 'Left Foot'}","{'id': 87, 'name': 'Open Play'}","{'id': 98, 'name': 'Off T'}",,,,
2,15,29,"{'id': 8, 'name': 'From Keeper'}","{'id': 24, 'name': 'Left Center Forward'}","[93.7, 34.7]",,,0.016932,"{'id': 93, 'name': 'Normal'}","{'id': 38, 'name': 'Left Foot'}","{'id': 87, 'name': 'Open Play'}","{'id': 100, 'name': 'Saved'}",,,,
3,16,20,"{'id': 1, 'name': 'Regular Play'}","{'id': 23, 'name': 'Center Forward'}","[109.2, 39.1]",True,,0.122604,"{'id': 93, 'name': 'Normal'}","{'id': 37, 'name': 'Head'}","{'id': 87, 'name': 'Open Play'}","{'id': 98, 'name': 'Off T'}",True,,,
4,18,16,"{'id': 2, 'name': 'From Corner'}","{'id': 22, 'name': 'Right Center Forward'}","[107.8, 24.7]",,,0.041751,"{'id': 93, 'name': 'Normal'}","{'id': 40, 'name': 'Right Foot'}","{'id': 87, 'name': 'Open Play'}","{'id': 98, 'name': 'Off T'}",,,,


In [48]:
# Substituindo as colunas de JSON por valores numéricos

def extrai_id(dicionario):
    """Return the 'id' entry of a dict such as {'id': 16, 'name': 'Shot'}.

    Parameters
    ----------
    dicionario : dict or any
        Cell value to read. Anything that does not expose ``.get`` (for
        example the float NaN pandas stores for a missing cell) is treated
        as "no id".

    Returns
    -------
    The value stored under 'id', or None when the key is absent or the
    input is not dict-like.
    """
    try:
        return dicionario.get('id', None)
    except AttributeError:
        # Missing cells arrive as NaN/None, which have no .get(); without
        # this guard a single missing value aborts the whole column apply.
        return None

# Replace the {'id': ..., 'name': ...} dict in each of these columns by its numeric id
for coluna in ['play_pattern', 'position', 'shot_body_part', 'shot_technique', 'shot_type', 'shot_outcome']:
    df_chutes[coluna] = df_chutes[coluna].apply(extrai_id)

In [49]:
# Dividindo a localização em duas colunas
def extrai_localizacao(lista):
    """Return the first two entries of ``lista`` as an (x, y) tuple."""
    coord_x = lista[0]
    coord_y = lista[1]
    return coord_x, coord_y


# Turn each 'location' entry into an (x, y) tuple and spread it over two new columns, 'locx' and 'locy'
df_chutes[['locx', 'locy']] = df_chutes['location'].apply(extrai_localizacao).apply(pd.Series)

# Drop the original 'location' column now that it has been split
df_chutes = df_chutes.drop('location', axis=1)


In [50]:
# Trocando os valores diferente de 'Goal' por 0 em shot_outcome

def troca_outcome(valor):
    """Map an outcome id to a binary label: 1 when it equals 97, else 0."""
    id_gol = 97
    return 1 if valor == id_gol else 0
    
# Binarise the outcome id column via troca_outcome (1 for id 97, 0 for anything else).
df_chutes['shot_outcome'] = df_chutes['shot_outcome'].apply(troca_outcome)

In [51]:
# Missing values become 0 and True becomes 1, leaving only numeric cells.
df_chutes = df_chutes.replace({np.nan: 0, True: 1})
df_chutes.head()


Unnamed: 0,minute,second,play_pattern,position,under_pressure,shot_first_time,shot_statsbomb_xg,shot_technique,shot_body_part,shot_type,shot_outcome,shot_aerial_won,shot_deflected,shot_open_goal,shot_follows_dribble,locx,locy
0,2,29,4,24,0,1,0.076992,91,40,87,0,0,0,0,0,111.5,52.9
1,5,39,1,6,0,1,0.051668,95,38,87,0,0,0,0,0,113.9,26.4
2,15,29,8,24,0,0,0.016932,93,38,87,0,0,0,0,0,93.7,34.7
3,16,20,1,23,1,0,0.122604,93,37,87,0,1,0,0,0,109.2,39.1
4,18,16,2,22,0,0,0.041751,93,40,87,0,0,0,0,0,107.8,24.7


In [52]:
import math

def calculate_angle(x, y):
    """Angle, in degrees, under which the goal mouth is seen from (x, y).

    The goal posts are fixed at (120, 36) and (120, 44). The result is the
    unsigned angle between the two vectors running from the shot location
    to each post.

    Parameters
    ----------
    x, y : float
        Shot coordinates, in the same coordinate system as the posts.

    Returns
    -------
    numpy.float64
        Absolute angle in degrees.
    """
    # 44 and 36 are the y coordinates of the two goal posts
    g0 = [120, 44]
    p = [x, y]
    g1 = [120, 36]

    v0 = np.array(g0) - np.array(p)
    v1 = np.array(g1) - np.array(p)

    # atan2(cross, dot) yields the signed angle between v0 and v1.
    # math.atan2 replaces np.math.atan2: np.math was only an alias of the
    # stdlib math module, deprecated in NumPy 1.25 and removed in NumPy 2.0,
    # so the computed value is unchanged.
    angle = math.atan2(np.linalg.det([v0, v1]), np.dot(v0, v1))
    return abs(np.degrees(angle))

In [53]:
def calculate_distance(x, y):
    """Distance from (x, y) to the goal mouth.

    The goal mouth is the segment x = 120, 36 <= y <= 44. Inside that y band
    only the horizontal gap counts; outside it the vertical gap to the
    nearer post is added.
    """
    gap_x = 120 - x
    if y < 36:
        gap_y = 36 - y
    elif y > 44:
        gap_y = y - 44
    else:
        gap_y = 0
    return math.sqrt(gap_x**2 + gap_y**2)

In [54]:
# Set the StatsBomb xG aside and remove it from the frame, so it is not among the model's feature columns
statsbomb_xg = df_chutes['shot_statsbomb_xg']
df_chutes = df_chutes.drop('shot_statsbomb_xg', axis=1)
# Geometric features derived row by row from the shot coordinates
df_chutes['angle'] = df_chutes.apply(lambda row: calculate_angle(row['locx'], row['locy']), axis=1)
df_chutes['distance'] = df_chutes.apply(lambda row: calculate_distance(row['locx'], row['locy']), axis=1)


  angle = np.math.atan2(np.linalg.det([v0,v1]),np.dot(v0,v1))


In [55]:
# Preview the frame with the new 'angle' and 'distance' columns.
df_chutes.head()

Unnamed: 0,minute,second,play_pattern,position,under_pressure,shot_first_time,shot_technique,shot_body_part,shot_type,shot_outcome,shot_aerial_won,shot_deflected,shot_open_goal,shot_follows_dribble,locx,locy,angle,distance
0,2,29,4,24,0,1,91,40,87,0,0,0,0,0,111.5,52.9,16.982586,12.306909
1,5,39,1,6,0,1,95,38,87,0,0,0,0,0,113.9,26.4,13.316706,11.374093
2,15,29,8,24,0,0,93,38,87,0,0,0,0,0,93.7,34.7,16.644406,26.33211
3,16,20,1,23,1,0,93,37,87,0,1,0,0,0,109.2,39.1,40.419411,10.8
4,18,16,2,22,0,0,93,40,87,0,0,0,0,0,107.8,24.7,14.895257,16.629191


### Treinando o modelo

In [56]:
# Features: every remaining column except the label
X = df_chutes.drop('shot_outcome', axis=1)
# Label: the binarised shot outcome (1 = goal, 0 = anything else)
y = df_chutes['shot_outcome']

# NOTE(review): the id-coded columns (play_pattern, position, shot_technique,
# shot_body_part, shot_type) enter the model as plain numbers, and no
# train/test split is made before fitting.
# max_iter is set very high, presumably so the solver converges on these
# unscaled features; TODO confirm
mdl = LogisticRegression(max_iter=100000)
mdl.fit(X, y)

In [57]:
# Spot check on the first shot: model probability vs. the StatsBomb xG
teste = df_chutes.iloc[0:1]
print(teste)
# Column 1 of predict_proba is the probability of class 1 (goal)
print((mdl.predict_proba(teste.drop('shot_outcome', axis=1)))[:, 1])
print(statsbomb_xg[0])

   minute  second  play_pattern  position  under_pressure  shot_first_time  \
0       2      29             4        24               0                1   

   shot_technique  shot_body_part  shot_type  shot_outcome  shot_aerial_won  \
0              91              40         87             0                0   

   shot_deflected  shot_open_goal  shot_follows_dribble   locx  locy  \
0               0               0                     0  111.5  52.9   

       angle   distance  
0  16.982586  12.306909  
[0.09836168]
0.07699243


In [58]:
# Predicted goal probability for every shot.
# NOTE(review): X is the same data the model was fitted on, so any error
# computed from yPred is an in-sample figure.
yPred = mdl.predict_proba(X)[:, 1]

### Avaliando o modelo

In [59]:
def calcula_error(y, yPred):
    """Mean absolute difference between ``y`` and ``yPred``."""
    desvios = np.abs(y - yPred)
    return np.mean(desvios)

# Mean absolute error between the 0/1 outcomes and the predicted probabilities
print(calcula_error(y, yPred))

0.16812445689728467


### Aplicando o modelo

In [145]:

## Let's analyse the France vs Croatia match, the 2018 World Cup final

# Load the match events. The context manager closes the file handle (the bare
# open() call never did), and the encoding is pinned because JSON text is
# UTF-8 by specification while open() otherwise uses the locale encoding.
with open('archive/data/events/8658.json', encoding='utf-8') as arquivo_partida:
    eventos = json.load(arquivo_partida)

# DataFrame with every event of the match
df_franca_croacia = pd.DataFrame(eventos)

# Keep only the shot events
chutes = []
for evento in eventos:
    if evento['type']['name'] == 'Shot':
        chutes.append(evento)



# DataFrame with the shots of this match.
# NOTE: this rebinds df_chutes, which held the training frame until here;
# X and y still reference the training data.
df_chutes = pd.DataFrame(chutes)

def extrai_nome(possession_team):
    """Return the value under 'name' in ``possession_team``, or None if absent."""
    nome = possession_team.get('name')
    return nome

# Replace the 'possession_team' dict by the team name
df_chutes['possession_team'] = df_chutes['possession_team'].apply(extrai_nome)

# Select each team's shots by team name
df_franca = df_chutes[df_chutes['possession_team'] == 'France']
df_croacia = df_chutes[df_chutes['possession_team'] == 'Croatia']

In [146]:
def tratar_df(df):
    """Turn a raw shot-events DataFrame into numeric features plus reference xG.

    Applies, in one call, the same preparation steps used on the training
    data: trims and expands the 'shot' dict into shot_* columns, drops the
    identifier columns, replaces id/name dicts by their id, maps missing
    values to 0 and True to 1, splits 'location' into 'locx'/'locy',
    binarises 'shot_outcome' and adds 'angle' and 'distance'.

    Parameters
    ----------
    df : pandas.DataFrame
        Shot events containing at least 'shot', 'location', 'play_pattern',
        'position' and the identifier columns dropped below. The caller's
        frame is left untouched.

    Returns
    -------
    tuple
        ``(features, statsbomb_xg)``: the prepared DataFrame (which still
        includes 'shot_outcome') and the Series of StatsBomb xG values that
        was removed from it.
    """
    # Work on a private copy. Callers pass boolean-mask selections, and
    # assigning into those raised SettingWithCopyWarning and modified data
    # the caller still holds.
    df = df.copy()

    # Trim the 'shot' dict to the selected keys and expand it into shot_* columns
    df['shot'] = df['shot'].apply(seleciona_chaves)
    df_shot = df['shot'].apply(pd.Series)
    df_shot = df_shot.rename(columns = lambda x : 'shot_' + x)
    df = pd.concat([df.drop(['shot'], axis=1), df_shot], axis=1)

    # A small sample may lack some keys entirely: create those columns as
    # missing so the frame always has the full shot_* set
    chaves_selecionadas = ['first_time', 'statsbomb_xg', 'deflected', 'technique', 'body_part', 'type', 'outcome', 'open_goal', 'follows_dribble', 'aerial_won']
    chaves_selecionadas = ['shot_' + chave for chave in chaves_selecionadas]
    for chave in chaves_selecionadas:
        if chave not in df.columns:
            df[chave] = np.nan

    df = df.drop(['id', 'index', 'period', 'timestamp', 'type', 'possession', 'possession_team', 'team', 'player', 'duration', 'related_events'], axis=1)

    # Replace the id/name dicts by their numeric id
    for coluna in ['play_pattern', 'position', 'shot_body_part', 'shot_technique', 'shot_type', 'shot_outcome']:
        df[coluna] = df[coluna].apply(extrai_id)

    # Missing values -> 0, True -> 1
    df = df.replace({np.nan: 0, True: 1})

    # Split the location into two coordinate columns
    df[['locx', 'locy']] = df['location'].apply(extrai_localizacao).apply(pd.Series)
    df = df.drop('location', axis=1)

    df['shot_outcome'] = df['shot_outcome'].apply(troca_outcome)

    # Geometric features derived from the shot coordinates
    df['angle'] = df.apply(lambda row: calculate_angle(row['locx'], row['locy']), axis=1)
    df['distance'] = df.apply(lambda row: calculate_distance(row['locx'], row['locy']), axis=1)

    # Hand the StatsBomb xG back separately instead of leaving it in the frame
    statsbomb_xg = df['shot_statsbomb_xg']
    df = df.drop('shot_statsbomb_xg', axis=1)
    return df, statsbomb_xg

In [147]:
# Prepared frame and the separated StatsBomb xG values for each team
df_franca, xGTestFranca = tratar_df(df_franca)
df_croacia, xGTestCrocia = tratar_df(df_croacia)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['shot'] = df['shot'].apply(seleciona_chaves)
  angle = np.math.atan2(np.linalg.det([v0,v1]),np.dot(v0,v1))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['shot'] = df['shot'].apply(seleciona_chaves)
  angle = np.math.atan2(np.linalg.det([v0,v1]),np.dot(v0,v1))


In [148]:
# Align the match frames with the training feature columns (same set, same order).
# fill_value=0 covers a column that is absent from a single match (e.g. a flag
# no shot had): plain reindex would insert NaN there, which the model cannot
# score, whereas 0 matches the "missing -> 0" convention used for training.
df_franca = df_franca.reindex(columns=X.columns, fill_value=0)
df_croacia = df_croacia.reindex(columns=X.columns, fill_value=0)
df_franca.head()


Unnamed: 0,minute,second,play_pattern,position,under_pressure,shot_first_time,shot_technique,shot_body_part,shot_type,shot_aerial_won,shot_deflected,shot_open_goal,shot_follows_dribble,locx,locy,angle,distance
3,37,56,5,22,0,0,93,38,88,0.0,0.0,0.0,0.0,108.0,40.0,36.869898,12.0
8,46,10,8,22,0,1,93,38,87,0.0,0.0,0.0,0.0,95.0,45.0,17.508266,25.019992
12,51,35,1,12,0,0,93,40,87,0.0,0.0,0.0,0.0,115.0,55.0,9.700392,12.083046
13,58,32,4,9,0,1,93,40,87,0.0,0.0,0.0,0.0,99.0,41.0,21.5226,21.0
14,58,33,4,9,0,1,93,38,87,0.0,0.0,0.0,0.0,102.0,39.0,24.986433,18.0


In [149]:
# Probability of class 1 (goal) for every shot of each team
yPredFranca = mdl.predict_proba(df_franca)[:, 1]
yPredCroacia = mdl.predict_proba(df_croacia)[:, 1]

In [150]:
# Team xG = sum of the per-shot goal probabilities
nossoXgFranca = np.sum(yPredFranca)
nossoXgCroacia = np.sum(yPredCroacia)

In [151]:
# Match xG per team according to our model
print("França: ", nossoXgFranca)
print("Croácia: ", nossoXgCroacia)

França:  0.659588544940829
Croácia:  1.9195787927839474


In [152]:
# Same aggregation over the StatsBomb per-shot xG values
statsbomb_xgFranca = np.sum(xGTestFranca)
statsbomb_xgCroacia = np.sum(xGTestCrocia)

In [153]:
# Match xG per team according to StatsBomb, for comparison with our model
print("França: ", statsbomb_xgFranca)
print("Croácia: ", statsbomb_xgCroacia)

França:  1.097963608
Croácia:  1.4802876897


### Salvando o modelo

In [157]:
from joblib import Parallel, delayed 
import joblib

# Persist the fitted model to 'modelo.joblib' (path relative to the working directory)
joblib.dump(mdl, 'modelo.joblib')

['modelo.joblib']

In [159]:
# Reload the saved model and check that it still scores the France shots.
# NOTE(review): joblib.load unpickles the file, so only load files you trust.
testemdl = joblib.load('modelo.joblib')
testemdl.predict_proba(df_franca)[:, 1]

array([0.22053259, 0.07032823, 0.05408428, 0.05774594, 0.07577513,
       0.13082958, 0.03186066, 0.01843213])