In [144]:
import pandas as pd
import numpy as np
from pathlib import Path
import json

# Root folder (relative to the notebook) that holds every dataset.
DATA_DIR = Path("datasets")
# Sub-folder with the events dataset files (events.csv, ginf.csv, dictionary.json).
EVENTS_DIR = DATA_DIR / "events"

# Explore the [events](https://www.kaggle.com/datasets/secareanualin/football-events/data) dataset

In [145]:
# Load the three files of the events dataset from EVENTS_DIR.
# events.csv and ginf.csv are joined on `id_odsp` further down the notebook.
events_df = pd.read_csv(EVENTS_DIR / "events.csv")
game_info_df = pd.read_csv(EVENTS_DIR / "ginf.csv")

# dictionary.json is used below as {column name: [{"id": ..., "name": ...}, ...]}
# to decode coded columns. Open it in a context manager so the file handle is
# closed deterministically instead of being left to the garbage collector.
with open(EVENTS_DIR / "dictionary.json") as dictionary_file:
    dictionary = json.load(dictionary_file)

In [153]:
# Attach each event to its game's metadata via the shared `id_odsp` key, then
# drop the key and bookkeeping columns in a single call (one copy of the
# merged frame instead of four).
full_df = pd.merge(events_df, game_info_df, on="id_odsp", how="inner").drop(
    columns=["id_odsp", "id_event", "sort_order", "adv_stats"]
)

# Build one nested mapping {column: {id: name}} from the dataset dictionary and
# decode all coded columns in one `replace` pass, rather than copying the whole
# frame once per column. Values without a dictionary entry are left unchanged.
code_to_name = {
    name: {kv["id"]: kv["name"] for kv in entries}
    for name, entries in dictionary.items()
}
full_df = full_df.replace(code_to_name)
full_df

Unnamed: 0,time,text,event_type,event_type2,side,event_team,opponent,player,player2,player_in,...,at,fthg,ftag,odd_h,odd_d,odd_a,odd_over,odd_under,odd_bts,odd_bts_n
0,2,Attempt missed. Mladen Petric (Hamburg) left f...,Attempt,Key Pass,Away,Hamburg SV,Borussia Dortmund,mladen petric,gokhan tore,,...,Hamburg SV,3,1,1.56,4.41,7.42,,,,
1,4,"Corner, Borussia Dortmund. Conceded by Dennis...",Corner,,Home,Borussia Dortmund,Hamburg SV,dennis diekmeier,dennis diekmeier,,...,Hamburg SV,3,1,1.56,4.41,7.42,,,,
2,4,"Corner, Borussia Dortmund. Conceded by Heiko ...",Corner,,Home,Borussia Dortmund,Hamburg SV,heiko westermann,heiko westermann,,...,Hamburg SV,3,1,1.56,4.41,7.42,,,,
3,7,Foul by Sven Bender (Borussia Dortmund).,Foul,,Home,Borussia Dortmund,Hamburg SV,sven bender,,,...,Hamburg SV,3,1,1.56,4.41,7.42,,,,
4,7,Gokhan Tore (Hamburg) wins a free kick in the ...,Free kick won,,Away,Hamburg SV,Borussia Dortmund,gokhan tore,,,...,Hamburg SV,3,1,1.56,4.41,7.42,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
941004,92,Lucas Torreira (Sampdoria) wins a free kick in...,Free kick won,,Away,Sampdoria,Atalanta,lucas torreira,,,...,Sampdoria,1,0,1.79,3.96,5.40,1.9,2.07,1.8,2.16
941005,93,"Corner, Sampdoria. Conceded by Andrea Masiello.",Corner,,Away,Sampdoria,Atalanta,,,,...,Sampdoria,1,0,1.79,3.96,5.40,1.9,2.07,1.8,2.16
941006,93,Attempt missed. Fabio Quagliarella (Sampdoria)...,Attempt,Key Pass,Away,Sampdoria,Atalanta,fabio quagliarella,lucas torreira,,...,Sampdoria,1,0,1.79,3.96,5.40,1.9,2.07,1.8,2.16
941007,94,Alberto Grassi (Atalanta) wins a free kick on ...,Free kick won,,Home,Atalanta,Sampdoria,alberto grassi,,,...,Sampdoria,1,0,1.79,3.96,5.40,1.9,2.07,1.8,2.16


In [154]:
# Number of goal events credited to each player, most prolific first.
full_df.loc[full_df["is_goal"] == 1, "player"].value_counts()

player
lionel messi          205
cristiano ronaldo     198
zlatan ibrahimovic    153
robert lewandowski    124
edinson cavani        121
                     ... 
damir vrancic           1
franck signorino        1
roberto hilbert         1
jean deza               1
kenan kodro             1
Name: count, Length: 3214, dtype: int64

In [157]:
# Count the games each team played as the home side (`ht`) and as the away
# side (`at`); building the frame aligns the two counts on the team name.
frame = {'Home': game_info_df[["ht"]].value_counts(),
         'Away': game_info_df[["at"]].value_counts()}
games_df = pd.DataFrame(frame)
# A team that appears on only one side has NaN in the other column. A plain
# `Home + Away` would make its total NaN, and value_counts would then silently
# drop that team. fill_value=0 treats the missing side as zero games.
games_df['Total'] = games_df["Home"].add(games_df["Away"], fill_value=0)
# How many teams share each games-played total (most common totals first).
games_df.value_counts("Total").head()

Total
211.0    17
38.0     15
212.0    11
76.0     10
187.0     9
Name: count, dtype: int64

In [158]:
# Number of games recorded for each (season, country) pair, largest first.
game_info_df.value_counts(subset=["season", "country"]).head()

season  country
2012    england    380
2014    italy      380
2016    spain      380
        italy      380
        france     380
Name: count, dtype: int64

In [161]:
# Where on the pitch goals were scored from; rows without a location are
# discarded before counting.
goal_locations = full_df.loc[full_df["is_goal"] == 1, "location"]
goal_locations.dropna().value_counts()

location
Centre of the box                 10713
Very close range                   3337
Outside the box                    2555
Penalty spot                       1989
Not recorded                       1438
Right side of the box              1273
Left side of the box               1234
Left side of the six yard box       711
Right side of the six yard box      702
Difficult angle on the left         201
Difficult angle on the right        187
Difficult angle and long range       43
Long range                           31
More than 35 yards                   23
More than 40 yards                    4
Name: count, dtype: int64

In [165]:
# Which part of the goal the scoring shots ended up in.
full_df.loc[full_df["is_goal"] == 1, "shot_place"].value_counts()

shot_place
Bottom left corner     7212
Bottom right corner    6932
Centre of the goal     4446
Top right corner       2157
Top left corner        2023
Name: count, dtype: int64

In [None]:
# Show every event row that was a goal, with all of its columns.
full_df.loc[full_df["is_goal"] == 1]

In [189]:
# Conversion rate per shot placement: goals divided by attempts.
all_attempts = full_df.loc[full_df["event_type"] == "Attempt"]
scored_attempts = all_attempts.loc[all_attempts["is_goal"] == 1]

goals_by_shot_place = scored_attempts.groupby("shot_place")["shot_place"].size()
attempts_by_shot_place = all_attempts.groupby("shot_place")["shot_place"].size()

# The inner join keeps only placements that produced at least one goal.
goal_rates = pd.concat(
    [goals_by_shot_place, attempts_by_shot_place], axis=1, join="inner"
).set_axis(["goals", "attempts"], axis=1)
goal_rates["ratio"] = goal_rates["goals"].div(goal_rates["attempts"])
goal_rates

Unnamed: 0_level_0,goals,attempts,ratio
shot_place,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Bottom left corner,7212,18260,0.394962
Bottom right corner,6932,18748,0.369746
Centre of the goal,4446,25079,0.17728
Top left corner,2023,5983,0.338125
Top right corner,2157,6306,0.342055


## Notes
- The 2017 season is not complete
- The position data for each event is not precise
- Only about a third of events have a location, but moments of play such as goals often do
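- Most goals are scored in the bottom corners, but most attempts are aimed at the centre of the goal, which has the lowest conversion rate (about 18%) of the placements in the table above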