#### Prepared for Gabor's Data Analysis

### Data Analysis for Business, Economics, and Policy
by Gabor Bekes and Gabor Kezdi
 
Cambridge University Press 2021

**[gabors-data-analysis.com](https://gabors-data-analysis.com/)**

 License: Free to share, modify and use for educational purposes. 
 Not to be used for commercial purposes.

### Chapter 03
**CH03C Measuring home team advantage in football**

using the football dataset

version 1.0 2021-05-05

In [5]:
import os
import sys
import warnings

import numpy as np
import pandas as pd
from mizani.formatters import percent_format
from plotnine import *
from plotnine import ggplot, aes, geom_histogram
import matplotlib.pyplot as plt
# Installs a blanket "ignore" filter, so no Python warning is shown for the rest of the session.
# NOTE(review): this also hides library diagnostics (e.g. pandas warnings about assigning
# columns to a filtered frame) - consider narrowing it to specific categories.
warnings.filterwarnings("ignore")

In [6]:
# Load the EPL games table.
# The local Codespaces copy is used when it exists; on any other machine the
# notebook falls back to the OSF download that this cell previously kept as a
# commented-out alternative, so a fresh clone can still run top to bottom.
LOCAL_DATA_PATH = "/workspaces/codespaces-jupyter/data/epl_games.csv"
REMOTE_DATA_URL = "https://osf.io/bdjt5/download"

df = pd.read_csv(LOCAL_DATA_PATH if os.path.exists(LOCAL_DATA_PATH) else REMOTE_DATA_URL)

In [7]:
# Column names of the raw games table, shown as a NumPy array.
df.columns.to_numpy()

array(['div', 'season', 'date', 'team_home', 'team_away', 'points_home',
       'points_away', 'goals_home', 'goals_away'], dtype=object)

In [9]:
# Games of the 2012 season. Taken as an explicit copy because later cells add
# columns to this frame; without the copy those assignments target a filtered
# slice of `df` (chained assignment).
df_new = df.loc[df["season"] == 2012].copy()

In [11]:
# Games of the 2011 season (used as the reference season for team strength).
df_old = df.loc[df["season"].eq(2011)]

In [None]:
# Goals per team in `df_old`: goals scored as the home side plus goals scored as the away side.
home_goals = df_old["goals_home"].groupby(df_old["team_home"]).sum()
away_goals = df_old["goals_away"].groupby(df_old["team_away"]).sum()

# The summed Series has no name, so reset_index() labels its values column 0;
# sort on that column to rank teams from most to fewest goals.
goal_sum = (
    home_goals.add(away_goals)
    .reset_index()
    .sort_values(by=0, ascending=False)
)
goal_sum

Unnamed: 0,team_home,0
8,Man City,93
9,Man United,89
0,Arsenal,74
16,Tottenham,66
4,Chelsea,65
10,Newcastle,56
11,Norwich,52
5,Everton,50
2,Blackburn,48
6,Fulham,48


In [60]:
# Tercile labels, strongest first. They are reversed in the qcut call because
# pd.qcut hands out labels from the lowest bin upwards.
# NOTE(review): `group_labels` was not defined in any visible cell (NameError on a
# fresh kernel); these values are reconstructed from the displayed output - confirm.
group_labels = ["Top", "Middle", "Bottom"]

# Give the columns readable names, then split teams into three equal-sized
# groups by their total goals.
goal_sum_df = goal_sum.rename(columns={"team_home": "team", 0: "goal_sum"})
goal_sum_df["group"] = pd.qcut(goal_sum_df["goal_sum"], q=3, labels=group_labels[::-1])
goal_sum_df

Unnamed: 0,team,goal_sum,group
8,Man City,93,Top
9,Man United,89,Top
0,Arsenal,74,Top
16,Tottenham,66,Top
4,Chelsea,65,Top
10,Newcastle,56,Top
11,Norwich,52,Top
5,Everton,50,Middle
2,Blackburn,48,Middle
6,Fulham,48,Middle


In [95]:
# Team -> strength group lookup, built once and reused for both sides of each fixture.
team_to_group = dict(zip(goal_sum_df["team"], goal_sum_df["group"]))

# Teams missing from the lookup get the label "New".
# assign() returns a new frame, so no column is written onto a filtered slice of `df`.
df_new = df_new.assign(
    group_home=df_new["team_home"].map(team_to_group).fillna("New"),
    group_away=df_new["team_away"].map(team_to_group).fillna("New"),
)
df_new

Unnamed: 0,div,season,date,team_home,team_away,points_home,points_away,goals_home,goals_away,group_home,group_away
1520,E0,2012,18aug2012,West Ham,Aston Villa,3,0,1,0,New,Bottom
1521,E0,2012,18aug2012,Arsenal,Sunderland,1,1,0,0,Top,Bottom
1522,E0,2012,18aug2012,Reading,Stoke,1,1,1,1,New,Bottom
1523,E0,2012,18aug2012,Newcastle,Tottenham,3,0,2,1,Top,Top
1524,E0,2012,18aug2012,Fulham,Norwich,3,0,5,0,Middle,Top
...,...,...,...,...,...,...,...,...,...,...,...
1895,E0,2012,19may2013,Newcastle,Arsenal,0,3,0,1,Top,Top
1896,E0,2012,19may2013,Chelsea,Everton,3,0,2,1,Top,Middle
1897,E0,2012,19may2013,Man City,Norwich,0,3,2,3,Top,Top
1898,E0,2012,19may2013,Swansea,Fulham,0,3,0,3,Bottom,Middle


In [87]:
# Unique home-team names of the 2011 season.
# Note: despite the `df_` prefix this is an array of names, not a DataFrame.
df_2011 = df.loc[df["season"] == 2011, "team_home"].unique()
df_2011

array(['Newcastle', 'Blackburn', 'QPR', 'Liverpool', 'Fulham', 'Wigan',
       'West Brom', 'Stoke', 'Man City', 'Chelsea', 'Everton', 'Swansea',
       'Aston Villa', 'Arsenal', 'Sunderland', 'Bolton', 'Wolves',
       'Norwich', 'Man United', 'Tottenham'], dtype=object)

In [97]:
# Number of games whose home team carries the 'New' label in 'group_home'.
# Summing the boolean mask gives 0 when the label never occurs, whereas
# value_counts()['New'] would raise KeyError in that case.
(df_new["group_home"] == "New").sum()


np.int64(57)

In [99]:
# Number of games whose away team carries the 'New' label in 'group_away'.
# Summing the boolean mask gives 0 when the label never occurs, whereas
# value_counts()['New'] would raise KeyError in that case.
(df_new["group_away"] == "New").sum()


np.int64(57)

## Goal Difference

In [100]:
# Home goal advantage per game: home goals minus away goals (positive = home side scored more).
df_new["home_goaladv"] = df_new["goals_home"].sub(df_new["goals_away"])

In [103]:
# Distribution summary (count, mean, std, quartiles, min/max) of the home goal advantage.
df_new.loc[:, "home_goaladv"].describe()

count    380.000000
mean       0.318421
std        1.717880
min       -6.000000
25%       -1.000000
50%        0.000000
75%        1.000000
max        8.000000
Name: home_goaladv, dtype: float64

In [None]:
# Summary table of the home goal advantage.
# Every statistic is read from `df_new`, the frame on which 'home_goaladv' was
# created; the previous version read most of them from `df`, which has no such
# column and therefore raised KeyError.
goaladv = df_new["home_goaladv"]
n_games = goaladv.shape[0]  # denominator for the percentage rows

pd.DataFrame.from_dict(
    {
        "Statistics": [
            "Mean",
            "Standard deviation",
            "Percent positive",
            "Percent zero",
            "Percent negative",
            "Number of observations",
        ],
        "Value": [
            goaladv.mean(),
            goaladv.std(),
            (goaladv > 0).sum() / n_games * 100,
            (goaladv == 0).sum() / n_games * 100,
            (goaladv < 0).sum() / n_games * 100,
            goaladv.count(),
        ],
    }
).round(1)