# Imports

In [None]:
# imports

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from pandas.plotting import table
import os, glob
from os import path
import shutil
import statsmodels.api as sm
from sklearn import linear_model
import datetime
from scipy import stats
from scipy.stats import norm
from statsmodels.compat import lzip
import statsmodels.stats.api as sms
from math import e

# Anzahl der angezeigten Spalten und Zeilen

In [None]:
# lift the limits on displayed columns and rows so that data frames are shown completely
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [None]:
# restore the pandas defaults for the number of displayed columns and rows
for option_name in ("display.max_columns", "display.max_rows"):
  pd.reset_option(option_name)

In [None]:
# set the maximum displayed column width option to 0
pd.options.display.max_colwidth = 0

In [None]:
# restore the pandas default for the number of displayed columns
# (reset_option only takes the option name; there is no value argument)
pd.reset_option("display.max_columns")

# Daten einlesen

In [None]:
# folder that holds the input files (presumably a Google Drive mounted in Colab - adjust when running elsewhere)
source = "/content/drive/MyDrive/BA/Daten Regression/"

# NOTE: `stats` rebinds the name imported by `from scipy import stats` to the statistics data frame
stats, matches, tables = (
    pd.read_csv(source + file_name)
    for file_name in ("stats.csv", "matches.csv", "tables.csv")
)

# Regression

## Vorbereitung

In [None]:
# drop the players who have an entry in the files but have not participated in the game
# (no minutes recorded and a PlayerID of 14 or higher)
not_participated = stats["Min"].isna() & (stats["PlayerID"] >= 14)
stats.drop(stats[not_participated].index, inplace=True)

# define the relevant columns
relevant_stats_cols_df = stats[["League", "GameID", "PlayerID", "Squad", "Player", "#", "Nation", "Pos", "Age", "Min",
                 "PassesAtt", "Carries", "Sh", "Tkl", "Press",
                 "PassesCmp", "PassesCmp%", "DribblesSucc", "DribblesAtt", "DribblesSucc%", "TklW", "PressSucc", "Press%"]]

# one group per team and game
team_game_keys = ["League", "GameID", "Squad"]

# sum the counts per team and game and derive the team-level percentages "PassesCmp%", "DribblesSucc%", "Press%" from these sums
# (the columns are selected with a list: selecting with a bare tuple of names is no longer supported by pandas)
sum_of_relevant_stats_cols = relevant_stats_cols_df.groupby(team_game_keys)[["PassesAtt", "PassesCmp", "DribblesAtt", "DribblesSucc", "Press", "PressSucc"]].sum()
sum_of_relevant_stats_cols["PassesCmp%"] = ((sum_of_relevant_stats_cols["PassesCmp"] / sum_of_relevant_stats_cols["PassesAtt"]) * 100).round(1)
sum_of_relevant_stats_cols["DribblesSucc%"] = ((sum_of_relevant_stats_cols["DribblesSucc"] / sum_of_relevant_stats_cols["DribblesAtt"]) * 100).round(1)
sum_of_relevant_stats_cols["Press%"] = ((sum_of_relevant_stats_cols["PressSucc"] / sum_of_relevant_stats_cols["Press"]) * 100).round(1)

# create the df with mean values for the teams
mean_of_relevant_stats_cols = relevant_stats_cols_df.groupby(team_game_keys)[["PassesAtt", "Carries", "Sh", "Tkl", "Press", "TklW"]].mean()
merged_mean_of_relevant_stats_cols = mean_of_relevant_stats_cols.merge(sum_of_relevant_stats_cols[["PassesCmp%", "DribblesSucc%", "Press%"]], left_index=True, right_index=True)
merged_mean_of_relevant_stats_cols_res_ind = merged_mean_of_relevant_stats_cols.reset_index()

# attach the weather conditions and the team ranks of the match to every team row
mean_df = pd.merge(merged_mean_of_relevant_stats_cols_res_ind, matches[["GameID", "League", "Temperature", "Rain", "Humidity", "Home Team Rank", "Away Team Rank"]], on = ["GameID", "League"])

In [None]:
# the matches data set calls the club "Eint Frankfurt"; rename it to "Frankfurt", as it is called in the stats data set

for side in ("Home", "Away"):
  matches[side] = matches[side].replace("Eint Frankfurt", "Frankfurt")

In [None]:
# create the data set containing only the home teams: a team row is kept when its squad is the home team of the match
mean_df_home = merged_mean_of_relevant_stats_cols_res_ind.merge(
    matches[["GameID", "League", "Home", "Temperature", "Rain", "Humidity", "Home Team Rank", "Away Team Rank"]],
    left_on=["GameID", "League", "Squad"],
    right_on=["GameID", "League", "Home"],
).drop(columns="Home")

In [None]:
# transform the variables of mean_df: "PassesCmp%" is squared, "Sh", "Tkl", "TklW", "Press%" get the square root
# and "PassesAtt", "Carries", "Press" the natural logarithm
# NOTE: the columns are overwritten, so running this cell a second time transforms them again

mean_df["PassesCmp%"] = np.power(mean_df["PassesCmp%"], 2)
for transform_func, transform_cols in ((np.sqrt, ["Sh", "Tkl", "TklW", "Press%"]),
                                       (np.log, ["PassesAtt", "Carries", "Press"])):
  mean_df[transform_cols] = transform_func(mean_df[transform_cols])

In [None]:
# transform the variables of mean_df_home: "PassesCmp%" is squared, "Sh", "Tkl", "TklW", "Press%" get the square root
# and "PassesAtt", "Carries", "Press" the natural logarithm
# NOTE: the columns are overwritten, so running this cell a second time transforms them again

mean_df_home["PassesCmp%"] = np.power(mean_df_home["PassesCmp%"], 2)
for transform_func, transform_cols in ((np.sqrt, ["Sh", "Tkl", "TklW", "Press%"]),
                                       (np.log, ["PassesAtt", "Carries", "Press"])):
  mean_df_home[transform_cols] = transform_func(mean_df_home[transform_cols])

In [None]:
# section of the table after the aggregation: five rows of mean_df (team-level means per game), starting at index label 804, as LaTeX
# NOTE(review): at this point mean_df has already been transformed by the square / square root / log cell above
mean_df.loc[804:,:].head(5).to_latex()

In [None]:
# section of the table before the aggregation: 30 rows of the player-level relevant_stats_cols_df, starting at index label 11183, as LaTeX
relevant_stats_cols_df.loc[11183:,:].head(30).to_latex()

In [None]:
# player-level regression data: every stats row gets the weather conditions and the team ranks of its match

stats_regr = stats.merge(
    matches[["GameID", "League", "Temperature", "Rain", "Humidity", "Home Team Rank", "Away Team Rank"]],
    on=["GameID", "League"],
)

## Statsmodels Regression

In [None]:
# state the independent and dependent variables 

# independent (weather) variables of the base model
in_vars_base = ["Temperature", "Rain", "Humidity"]
# dependent tactic variables, split by the data frame that holds them (stats vs. matches)
tactic_stats = ["PassesAtt", "Carries", "Sh", "Tkl", "Press"]
tactic_matches = ["Possession Home", "Possession Away"]
# dependent performance variables, split the same way
perf_stats = ["PassesCmp%", "DribblesSucc%", "TklW", "Press%"]
perf_matches = [] # this list is intentionally left empty, as none of the performance variables examined were in the matches data set

### Funktionen

In [None]:
def scatter_plots(stats_df, matches_df, var_type, league, rows, cols, size=(20, 28)):
  """
    Draws a grid of regression scatter plots: one row per dependent variable, one column per weather variable
    ("Temperature", "Rain", "Humidity").

            Parameters:
                    stats_df (data frame)   : Statistics data frame on which the scatter plot is performed
                    matches_df (data frame) : Matches data frame on which the scatter plot is performed (only read for var_type "tactic")
                    var_type (str)          : Type of variable for the scatter plot, "tactic" or "performance"
                    league (str)            : Name and season of the league for the scatter plot
                    rows (int)              : Number of rows for the scatter plot grid (7 are needed for "tactic", 4 for "performance")
                    cols (int)              : Number of columns for the scatter plot grid (3 are needed)
                    size (tuple)            : Figure size as (width, height) (default (20, 28))

            Returns:
                    None, or the string "Variable type unknown!" if var_type is neither "tactic" nor "performance"
  """

  weather_vars = ["Temperature", "Rain", "Humidity"]
  # dependent variables that are read from the matches data frame instead of the stats data frame
  match_level_vars = ("Possession Home", "Possession Away")

  if var_type == "tactic":
    dep_vars = ["PassesAtt", "Carries", "Sh", "Tkl", "Press", "Possession Home", "Possession Away"]
  elif var_type == "performance":
    dep_vars = ["PassesCmp%", "DribblesSucc%", "TklW", "Press%"]
  else:
    return "Variable type unknown!"

  league_stats = stats_df[stats_df["League"] == league]
  # the matches data frame is only needed for the possession variables of the tactic grid
  league_matches = matches_df[matches_df["League"] == league] if var_type == "tactic" else None

  # squeeze=False always yields a 2D axes array, also for a grid with a single row or column
  fig, axes = plt.subplots(rows, cols, figsize=size, squeeze=False)

  for col_idx, weather_var in enumerate(weather_vars):
    for row_idx, dep_var in enumerate(dep_vars):
      plot_data = league_matches if dep_var in match_level_vars else league_stats
      sns.regplot(ax=axes[row_idx, col_idx], x=dep_var, y=weather_var, data=plot_data,
                  fit_reg=True,
                  scatter_kws={"color":"black"}, line_kws={"color":"red"})

In [None]:
def hist_plots(stats_df, matches_df, var_type, league, rows, size=(15, 15)):
  """
    Plots one histogram per dependent variable, stacked in a single column.

            Parameters:
                    stats_df (data frame)   : Statistics data frame on which the histogram is performed
                    matches_df (data frame) : Matches data frame on which the histogram is performed (only read for var_type "tactic")
                    var_type (str)          : Type of variable for the histogram, "tactic" or "performance"
                    league (str)            : Name and season of the league for the histogram
                    rows (int)              : Number of rows for the histogram grid (7 are needed for "tactic", 4 for "performance")
                    size (tuple)            : Figure size as (width, height) (default (15, 15))

            Returns:
                    None, or the string "Variable type unknown!" if var_type is neither "tactic" nor "performance"
  """

  # dependent variables that are read from the matches data frame instead of the stats data frame
  match_level_vars = ("Possession Home", "Possession Away")

  if var_type == "tactic":
    dep_vars = ["PassesAtt", "Carries", "Sh", "Tkl", "Press", "Possession Home", "Possession Away"]
  elif var_type == "performance":
    dep_vars = ["PassesCmp%", "DribblesSucc%", "TklW", "Press%"]
  else:
    return "Variable type unknown!"

  league_stats = stats_df[stats_df["League"] == league]
  # the matches data frame is only needed for the possession variables of the tactic grid
  league_matches = matches_df[matches_df["League"] == league] if var_type == "tactic" else None

  # squeeze=False always yields a 2D axes array of shape (rows, 1), also for rows == 1
  fig, axes = plt.subplots(rows, figsize=size, squeeze=False)

  for row_idx, dep_var in enumerate(dep_vars):
    plot_data = league_matches if dep_var in match_level_vars else league_stats
    sns.histplot(ax=axes[row_idx, 0], x=dep_var, data=plot_data)

In [None]:
def ols_w_summary(df, indep_vars, dep_var, dictionary, league, stat_type):
  """
    Fits an OLS regression, optionally runs one diagnostic on its residuals and appends key figures of the model to a dictionary.

            Parameters:
                    df (data frame)    : Data frame on which the regression is performed
                    indep_vars (list)  : List of the independent variables for the regression
                    dep_var (list)     : List containing the dependent variable for the regression
                    dictionary (dict)  : Dictionary with the keys "dep_var", "rsquared", "rsquared_adj", "nobs", "pvalues", "bse"
                                         and "params", each holding a list that is extended in place
                    league (str)       : Name and season of the league for the regression (only used in the printed diagnostics)
                    stat_type (str)    : Type of the statistical test for the regression assumptions: None, "durbin_watson",
                                         "breusch_pagan", "breusch_pagan_plot", "jarque_bera", "jarque_bera_distplot"
                                         or "jarque_bera_probplot"

            Returns:
                    dictionary (dict): The same dictionary, extended by the resulting values of the regression
  """

  X = sm.add_constant(df[indep_vars])
  y = df[dep_var]
  # rows with missing values are dropped by statsmodels
  results = sm.OLS(y, X, missing="drop").fit()

  def print_model_header():
    # identifies the model a printed diagnostic belongs to
    print(league)
    print(indep_vars)
    print(dep_var)

  if stat_type == "durbin_watson":
    from statsmodels.stats.stattools import durbin_watson
    print_model_header()
    print(durbin_watson(results.resid))
    print("---")

  elif stat_type in ("breusch_pagan", "breusch_pagan_plot"):
    bp_names = ['Lagrange multiplier statistic', 'p-value', 'f-value', 'f p-value']
    bp = lzip(bp_names, sms.het_breuschpagan(results.resid, results.model.exog))
    # only models whose Lagrange multiplier p-value is below 5 % are reported
    if bp[1][1] < 0.05:
      print_model_header()
      print(bp)
      if stat_type == "breusch_pagan":
        print("---")
      else:
        plt.scatter(results.fittedvalues, results.resid)
        plt.xlabel("Fitted values")
        plt.ylabel("Residuals")
        plt.rcParams['figure.dpi'] = 60
        plt.show()

  elif stat_type in ("jarque_bera", "jarque_bera_distplot", "jarque_bera_probplot"):
    jb_name = ['Jarque-Bera', 'Chi^2 two-tail prob.', 'Skew', 'Kurtosis']
    jb = lzip(jb_name, sms.jarque_bera(results.resid))
    # only models whose Jarque-Bera p-value is below 1 % are reported
    if jb[1][1] < 0.01:
      print_model_header()
      print(jb)
      if stat_type == "jarque_bera":
        print("---")
      elif stat_type == "jarque_bera_distplot":
        # NOTE(review): seaborn marks distplot as deprecated; histplot has no direct equivalent of fit=norm
        sns.distplot(results.resid, fit=norm);
        plt.xlabel('Residuals')
        plt.rcParams['figure.dpi'] = 60
        plt.show()
      else:
        # imported locally under an alias because the notebook rebinds the global name `stats` to a data frame
        from scipy import stats as scipy_stats
        scipy_stats.probplot(results.resid, plot=plt)
        plt.rcParams['figure.dpi'] = 60
        plt.show()

  elif stat_type is not None:
    print("Statistics type unknown!")

  dictionary["dep_var"].append(dep_var)
  dictionary["rsquared"].append(round(results.rsquared,3))
  dictionary["rsquared_adj"].append(round(results.rsquared_adj,3))
  dictionary["nobs"].append(results.nobs)
  # one entry per coefficient, constant first
  dictionary["pvalues"].append([round(pval,4) for pval in results.pvalues])
  dictionary["bse"].append([round(bse,3) for bse in results.bse])
  dictionary["params"].append([round(par,3) for par in results.params])

  return dictionary

In [None]:
def make_regression(df_stats, df_matches, in_vars, de_var_stats, de_var_matches, league, stat_type=None):
  """
    Runs one regression per dependent variable via ols_w_summary and collects the model values in a dictionary.

            Parameters:
                    df_stats (data frame)    : Stats data frame on which the regression is performed
                    df_matches (data frame)  : Matches data frame on which the regression is performed
                    in_vars (list)           : List of the independent variables for the regression
                    de_var_stats (list)      : List of the dependent variables that are regressed on the stats data frame
                    de_var_matches (list)    : List of the dependent variables that are regressed on the matches data frame
                    league (str)             : Name and season of the league for the regression
                    stat_type (str)          : Type of the statistical test for the regression assumptions (default None)

            Returns:
                    regr_results_dict (dict): Dictionary containing the resulting values of all regression models
  """

  result_keys = ("dep_var", "rsquared", "rsquared_adj", "nobs", "pvalues", "bse", "params")
  regr_results_dict = {key: [] for key in result_keys}

  # the stats variables are processed first, then the matches variables
  for frame, dependent_vars in ((df_stats, de_var_stats), (df_matches, de_var_matches)):
    for dependent_var in dependent_vars:
      ols_w_summary(frame, in_vars, [dependent_var], regr_results_dict, league, stat_type)

  return regr_results_dict

In [None]:
def dict_to_latex(regr_values_dict, league, in_vars):
  """
    Builds the results table of the regression models as a data frame (call .to_latex() on it to get the latex form).

    Every coefficient is rendered as "<param><stars> (<standard error>)" with "***" for p < 0.001, "**" for p < 0.01
    and "*" for p < 0.05. The passed dictionary is left untouched, so the function can be called repeatedly
    on the same dictionary.

            Parameters:
                    regr_values_dict (dict)   : Dictionary containing the values from the regression models
                                                (keys "dep_var", "rsquared", "rsquared_adj", "nobs", "pvalues", "bse", "params")
                    league (str)              : Name and season of the league for the results
                    in_vars (list)            : List of the independent variables for the results (kept for compatibility;
                                                all stored coefficients of a model are formatted)

            Returns:
                    df (data frame) : Results table indexed by "dep_var" with the columns "rsquared", "rsquared_adj",
                                      "nobs", "params" and "League"
  """

  def significance_stars(pval):
    # marker for the significance level of a p-value
    if pval < 0.001:
      return "***"
    if pval < 0.01:
      return "**"
    if pval < 0.05:
      return "*"
    return ""

  # formatted coefficients are collected in new lists instead of overwriting the entries of the input dictionary
  formatted_params = [
    tuple(str(par) + significance_stars(pval) + " (" + str(bse) + ")"
          for par, pval, bse in zip(model_params, model_pvalues, model_bse))
    for model_params, model_pvalues, model_bse in zip(regr_values_dict["params"],
                                                      regr_values_dict["pvalues"],
                                                      regr_values_dict["bse"])
  ]

  # p-values and standard errors are not part of the table; the remaining keys keep their order
  table_dict = {key: list(values) for key, values in regr_values_dict.items() if key not in ("pvalues", "bse")}
  table_dict["params"] = formatted_params
  table_dict["dep_var"] = [tuple(dep_var) for dep_var in regr_values_dict["dep_var"]]
  table_dict["nobs"] = [int(nobs) for nobs in regr_values_dict["nobs"]]

  df = pd.DataFrame(table_dict)
  df["League"] = league

  return df.set_index("dep_var")

In [None]:
# probability plot of the PD 18/19 "PassesAtt" regression residuals before the transformation of the variable
# (e** undoes the logarithm that was applied to mean_df["PassesAtt"]); the commented-out block draws the residual scatter plot instead

# `stats` was rebound to the statistics data frame when the CSV files were read, so the scipy module is imported again
from scipy import stats

# figure settings are applied before plotting so that they already affect this figure
# (sns.set_style replaces the matplotlib style "seaborn-whitegrid", which newer matplotlib versions no longer ship)
plt.rcParams['figure.dpi'] = 360
sns.set_style("whitegrid")

X_before = sm.add_constant(mean_df[mean_df["League"] == "PD 18-19"][["Temperature", "Rain", "Humidity"]])
y_passesAtt_before = e**(mean_df[mean_df["League"] == "PD 18-19"]["PassesAtt"])
model_passesAtt_before = sm.OLS(y_passesAtt_before, X_before, missing="drop")
results_passesAtt_before = model_passesAtt_before.fit()
predictions_passesAtt_before = results_passesAtt_before.predict(X_before)

stats.probplot(results_passesAtt_before.resid, plot=plt);
plt.xlabel('Theoretische Quantile')
plt.ylabel('Geordnete Werte')
plt.title("Probability Plot vorher")
plt.tight_layout()
plt.savefig("plot_prob_passesAtt_before.png")

#plt.scatter(results_passesAtt_before.fittedvalues, results_passesAtt_before.resid)
#plt.xlabel("Gefitteten Werte")
#plt.ylabel("Residuen")
#plt.title("Scatter Plot der Residuen vorher")
#plt.tight_layout()
#plt.savefig("plot_resid_scatter_passesAtt_before.png")

In [None]:
# probability plot of the PD 18/19 "PassesAtt" regression residuals after the aggregation and transformation of the variable
# (mean_df["PassesAtt"] as stored, i.e. log-transformed); the commented-out block draws the residual scatter plot instead

# `stats` was rebound to the statistics data frame when the CSV files were read, so the scipy module is imported again
from scipy import stats

# figure settings are applied before plotting so that they already affect this figure
# (sns.set_style replaces the matplotlib style "seaborn-whitegrid", which newer matplotlib versions no longer ship)
plt.rcParams['figure.dpi'] = 360
sns.set_style("whitegrid")

X_after = sm.add_constant(mean_df[mean_df["League"] == "PD 18-19"][["Temperature", "Rain", "Humidity"]])
y_passesAtt_after = mean_df[mean_df["League"] == "PD 18-19"]["PassesAtt"]
model_passesAtt_after = sm.OLS(y_passesAtt_after, X_after, missing="drop")
results_passesAtt_after = model_passesAtt_after.fit()
predictions_passesAtt_after = results_passesAtt_after.predict(X_after)

stats.probplot(results_passesAtt_after.resid, plot=plt);
plt.xlabel('Theoretische Quantile')
plt.ylabel('Geordnete Werte')
plt.title("Probability Plot nachher")
plt.tight_layout()
plt.savefig("plot_prob_passesAtt_after.png")

#plt.scatter(results_passesAtt_after.fittedvalues, results_passesAtt_after.resid)
#plt.xlabel("Gefitteten Werte")
#plt.ylabel("Residuen")
#plt.title("Scatter Plot der Residuen nachher")
#plt.tight_layout()
#plt.savefig("plot_resid_scatter_passesAtt_after.png")

### erstes Regressionsmodell

In [None]:
# player-level (stats_regr) and match-level (matches) rows of every league and season for the first regression model

bl_17_18_regr_stats_first = stats_regr.loc[stats_regr["League"].eq("BL 17-18")]
bl_17_18_regr_matches_first = matches.loc[matches["League"].eq("BL 17-18")]

bl_18_19_regr_stats_first = stats_regr.loc[stats_regr["League"].eq("BL 18-19")]
bl_18_19_regr_matches_first = matches.loc[matches["League"].eq("BL 18-19")]

pd_17_18_regr_stats_first = stats_regr.loc[stats_regr["League"].eq("PD 17-18")]
pd_17_18_regr_matches_first = matches.loc[matches["League"].eq("PD 17-18")]

pd_18_19_regr_stats_first = stats_regr.loc[stats_regr["League"].eq("PD 18-19")]
pd_18_19_regr_matches_first = matches.loc[matches["League"].eq("PD 18-19")]

In [None]:
# run the first regression model for every league and season, once for the tactic and once for the performance variables

# diagnostic that is run on every model, one of: None, "durbin_watson", "breusch_pagan", "breusch_pagan_plot",
#                                                "jarque_bera", "jarque_bera_distplot", "jarque_bera_probplot"

stat_type = None

bl_17_18_regr_values_tactic_first = make_regression(
    bl_17_18_regr_stats_first, bl_17_18_regr_matches_first, in_vars_base,
    tactic_stats, tactic_matches, league="BL 17-18", stat_type=stat_type)
bl_17_18_regr_values_performance_first = make_regression(
    bl_17_18_regr_stats_first, bl_17_18_regr_matches_first, in_vars_base,
    perf_stats, perf_matches, league="BL 17-18", stat_type=stat_type)

bl_18_19_regr_values_tactic_first = make_regression(
    bl_18_19_regr_stats_first, bl_18_19_regr_matches_first, in_vars_base,
    tactic_stats, tactic_matches, league="BL 18-19", stat_type=stat_type)
bl_18_19_regr_values_performance_first = make_regression(
    bl_18_19_regr_stats_first, bl_18_19_regr_matches_first, in_vars_base,
    perf_stats, perf_matches, league="BL 18-19", stat_type=stat_type)

pd_17_18_regr_values_tactic_first = make_regression(
    pd_17_18_regr_stats_first, pd_17_18_regr_matches_first, in_vars_base,
    tactic_stats, tactic_matches, league="PD 17-18", stat_type=stat_type)
pd_17_18_regr_values_performance_first = make_regression(
    pd_17_18_regr_stats_first, pd_17_18_regr_matches_first, in_vars_base,
    perf_stats, perf_matches, league="PD 17-18", stat_type=stat_type)

pd_18_19_regr_values_tactic_first = make_regression(
    pd_18_19_regr_stats_first, pd_18_19_regr_matches_first, in_vars_base,
    tactic_stats, tactic_matches, league="PD 18-19", stat_type=stat_type)
pd_18_19_regr_values_performance_first = make_regression(
    pd_18_19_regr_stats_first, pd_18_19_regr_matches_first, in_vars_base,
    perf_stats, perf_matches, league="PD 18-19", stat_type=stat_type)

In [None]:
# combine the tactic results of all leagues and print the table in latex form

print("Taktik")
bl_17_18_regr_values_tactic_df_first = dict_to_latex(bl_17_18_regr_values_tactic_first, "BL 17-18", in_vars_base)
bl_18_19_regr_values_tactic_df_first = dict_to_latex(bl_18_19_regr_values_tactic_first, "BL 18-19", in_vars_base)
pd_17_18_regr_values_tactic_df_first = dict_to_latex(pd_17_18_regr_values_tactic_first, "PD 17-18", in_vars_base)
pd_18_19_regr_values_tactic_df_first = dict_to_latex(pd_18_19_regr_values_tactic_first, "PD 18-19", in_vars_base)
# pd.concat stacks the league tables; DataFrame.append is no longer available in current pandas versions
tactic_df_first = pd.concat([bl_17_18_regr_values_tactic_df_first, bl_18_19_regr_values_tactic_df_first, pd_17_18_regr_values_tactic_df_first, pd_18_19_regr_values_tactic_df_first])
print(tactic_df_first.to_latex())

In [None]:
# combine the performance results of all leagues and print the table in latex form

print("Performance")
bl_17_18_regr_values_performance_df_first = dict_to_latex(bl_17_18_regr_values_performance_first, "BL 17-18", in_vars_base)
bl_18_19_regr_values_performance_df_first = dict_to_latex(bl_18_19_regr_values_performance_first, "BL 18-19", in_vars_base)
pd_17_18_regr_values_performance_df_first = dict_to_latex(pd_17_18_regr_values_performance_first, "PD 17-18", in_vars_base)
pd_18_19_regr_values_performance_df_first = dict_to_latex(pd_18_19_regr_values_performance_first, "PD 18-19", in_vars_base)
# pd.concat stacks the league tables; DataFrame.append is no longer available in current pandas versions
performance_df_first = pd.concat([bl_17_18_regr_values_performance_df_first, bl_18_19_regr_values_performance_df_first, pd_17_18_regr_values_performance_df_first, pd_18_19_regr_values_performance_df_first])
print(performance_df_first.to_latex())

### Basis Regression

#### Modell

In [None]:
# team-level (mean_df) and match-level (matches) rows of every league and season for the base regression model

bl_17_18_regr_stats = mean_df.loc[mean_df["League"].eq("BL 17-18")]
bl_17_18_regr_matches = matches.loc[matches["League"].eq("BL 17-18")]

bl_18_19_regr_stats = mean_df.loc[mean_df["League"].eq("BL 18-19")]
bl_18_19_regr_matches = matches.loc[matches["League"].eq("BL 18-19")]

pd_17_18_regr_stats = mean_df.loc[mean_df["League"].eq("PD 17-18")]
pd_17_18_regr_matches = matches.loc[matches["League"].eq("PD 17-18")]

pd_18_19_regr_stats = mean_df.loc[mean_df["League"].eq("PD 18-19")]
pd_18_19_regr_matches = matches.loc[matches["League"].eq("PD 18-19")]

In [None]:
# run the base regression model for every league and season, once for the tactic and once for the performance variables

# diagnostic that is run on every model, one of: None, "durbin_watson", "breusch_pagan", "breusch_pagan_plot",
#                                                "jarque_bera", "jarque_bera_distplot", "jarque_bera_probplot"

stat_type = None

bl_17_18_regr_values_tactic = make_regression(
    bl_17_18_regr_stats, bl_17_18_regr_matches, in_vars_base,
    tactic_stats, tactic_matches, league="BL 17-18", stat_type=stat_type)
bl_17_18_regr_values_performance = make_regression(
    bl_17_18_regr_stats, bl_17_18_regr_matches, in_vars_base,
    perf_stats, perf_matches, league="BL 17-18", stat_type=stat_type)

bl_18_19_regr_values_tactic = make_regression(
    bl_18_19_regr_stats, bl_18_19_regr_matches, in_vars_base,
    tactic_stats, tactic_matches, league="BL 18-19", stat_type=stat_type)
bl_18_19_regr_values_performance = make_regression(
    bl_18_19_regr_stats, bl_18_19_regr_matches, in_vars_base,
    perf_stats, perf_matches, league="BL 18-19", stat_type=stat_type)

pd_17_18_regr_values_tactic = make_regression(
    pd_17_18_regr_stats, pd_17_18_regr_matches, in_vars_base,
    tactic_stats, tactic_matches, league="PD 17-18", stat_type=stat_type)
pd_17_18_regr_values_performance = make_regression(
    pd_17_18_regr_stats, pd_17_18_regr_matches, in_vars_base,
    perf_stats, perf_matches, league="PD 17-18", stat_type=stat_type)

pd_18_19_regr_values_tactic = make_regression(
    pd_18_19_regr_stats, pd_18_19_regr_matches, in_vars_base,
    tactic_stats, tactic_matches, league="PD 18-19", stat_type=stat_type)
pd_18_19_regr_values_performance = make_regression(
    pd_18_19_regr_stats, pd_18_19_regr_matches, in_vars_base,
    perf_stats, perf_matches, league="PD 18-19", stat_type=stat_type)

In [None]:
# combine the tactic results of all leagues and print the table in latex form

print("Taktik")
bl_17_18_regr_values_tactic_df = dict_to_latex(bl_17_18_regr_values_tactic, "BL 17-18", in_vars_base)
bl_18_19_regr_values_tactic_df = dict_to_latex(bl_18_19_regr_values_tactic, "BL 18-19", in_vars_base)
pd_17_18_regr_values_tactic_df = dict_to_latex(pd_17_18_regr_values_tactic, "PD 17-18", in_vars_base)
pd_18_19_regr_values_tactic_df = dict_to_latex(pd_18_19_regr_values_tactic, "PD 18-19", in_vars_base)
# pd.concat stacks the league tables; DataFrame.append is no longer available in current pandas versions
tactic_df = pd.concat([bl_17_18_regr_values_tactic_df, bl_18_19_regr_values_tactic_df, pd_17_18_regr_values_tactic_df, pd_18_19_regr_values_tactic_df])
print(tactic_df.to_latex())

In [None]:
# combine the performance results of all leagues and print the table in latex form

print("Performance")
bl_17_18_regr_values_performance_df = dict_to_latex(bl_17_18_regr_values_performance, "BL 17-18", in_vars_base)
bl_18_19_regr_values_performance_df = dict_to_latex(bl_18_19_regr_values_performance, "BL 18-19", in_vars_base)
pd_17_18_regr_values_performance_df = dict_to_latex(pd_17_18_regr_values_performance, "PD 17-18", in_vars_base)
pd_18_19_regr_values_performance_df = dict_to_latex(pd_18_19_regr_values_performance, "PD 18-19", in_vars_base)
# pd.concat stacks the league tables; DataFrame.append is no longer available in current pandas versions
performance_df = pd.concat([bl_17_18_regr_values_performance_df, bl_18_19_regr_values_performance_df, pd_17_18_regr_values_performance_df, pd_18_19_regr_values_performance_df])
print(performance_df.to_latex())

#### Lineare Beziehung zwischen den unabhängigen und abhängigen Variablen

In [None]:
# regplot of relationship between PD 18/19 "PassesAtt" and temperature before aggregation and transformation of the variables
# (player-level rows of stats_regr; "PassesAtt" on the x axis, temperature on the y axis)

sns.regplot(x="PassesAtt", y="Temperature", data=stats_regr[stats_regr["League"] == "PD 18-19"], 
                        fit_reg=True, 
                        scatter_kws={"color":"black"}, line_kws={"color":"red"})

plt.xlabel("PassesAtt in der PD 18/19")
plt.ylabel("Temperatur in °C")
plt.title("Zusammenhang von PassesAtt und der Temperatur vorher")
# NOTE(review): set after the figure was drawn - presumably only affects figures created afterwards; confirm
plt.rcParams['figure.dpi'] = 360
plt.tight_layout()
plt.savefig("plot_scatter_passesAtt_before.png")

In [None]:
# regplot of relationship between PD 18/19 "PassesAtt" and temperature after aggregation and transformation of the variables
# (team-level rows of mean_df; "PassesAtt" on the x axis, temperature on the y axis)

sns.regplot(x="PassesAtt", y="Temperature", data=mean_df[mean_df["League"] == "PD 18-19"], 
                        fit_reg=True, 
                        scatter_kws={"color":"black"}, line_kws={"color":"red"})

plt.xlabel("PassesAtt in der PD 18/19")
plt.ylabel("Temperatur in °C")
plt.title("Zusammenhang von PassesAtt und der Temperatur nachher")
# NOTE(review): set after the figure was drawn - presumably only affects figures created afterwards; confirm
plt.rcParams['figure.dpi'] = 360
plt.tight_layout()
plt.savefig("plot_scatter_passesAtt_after.png")

In [None]:
# distribution of PD 18/19 "PassesAtt" before aggregation and transformation of the variables
# (player-level rows of stats_regr)

sns.histplot(x="PassesAtt", data=stats_regr[stats_regr["League"] == "PD 18-19"])

plt.xlabel("PassesAtt in der PD 18/19")
plt.ylabel("Anzahl")
plt.title("Verteilung von PassesAtt vorher")
# NOTE(review): dpi and seaborn style are set after the histogram was drawn - presumably they only
# take full effect for figures created afterwards; confirm
plt.rcParams['figure.dpi'] = 360
plt.tight_layout()
sns.set(style="whitegrid")
sns.despine(left=True)
plt.savefig("plot_hist_passesAtt_before.png")

In [None]:
# distribution of PD 18/19 "PassesAtt" after aggregation and transformation of the variables
# (team-level rows of mean_df)

sns.histplot(x="PassesAtt", data=mean_df[mean_df["League"] == "PD 18-19"])

plt.xlabel("PassesAtt in der PD 18/19")
plt.ylabel("Anzahl")
plt.title("Verteilung von PassesAtt nachher")
# NOTE(review): dpi and seaborn style are set after the histogram was drawn - presumably they only
# take full effect for figures created afterwards; confirm
plt.rcParams['figure.dpi'] = 360
plt.tight_layout()
sns.set(style="whitegrid")
sns.despine(left=True)
plt.savefig("plot_hist_passesAtt_after.png")

##### BL 17/18

In [None]:
# tactic variables against the weather variables, BL 17/18 (7 x 3 grid, default figure size)
scatter_plots(stats_df=mean_df, matches_df=matches, var_type="tactic", league="BL 17-18", rows=7, cols=3)

In [None]:
# performance variables against the weather variables, BL 17/18 (4 x 3 grid)
scatter_plots(stats_df=mean_df, matches_df=matches, var_type="performance", league="BL 17-18", rows=4, cols=3, size=(15, 15))

In [None]:
# histograms of the tactic variables, BL 17/18
hist_plots(stats_df=mean_df, matches_df=matches, var_type="tactic", league="BL 17-18", rows=7, size=(20, 28))

In [None]:
# histograms of the performance variables, BL 17/18
hist_plots(stats_df=mean_df, matches_df=matches, var_type="performance", league="BL 17-18", rows=4, size=(20, 28))

##### BL 18/19

In [None]:
# tactic variables against the weather variables, BL 18/19 (7 x 3 grid, default figure size)
scatter_plots(stats_df=mean_df, matches_df=matches, var_type="tactic", league="BL 18-19", rows=7, cols=3)

In [None]:
# performance variables against the weather variables, BL 18/19 (4 x 3 grid)
scatter_plots(stats_df=mean_df, matches_df=matches, var_type="performance", league="BL 18-19", rows=4, cols=3, size=(15, 15))

In [None]:
# histograms of the tactic variables, BL 18/19
hist_plots(stats_df=mean_df, matches_df=matches, var_type="tactic", league="BL 18-19", rows=7, size=(20, 28))

In [None]:
# histograms of the performance variables, BL 18/19
hist_plots(stats_df=mean_df, matches_df=matches, var_type="performance", league="BL 18-19", rows=4, size=(20, 28))

##### PD 17/18

In [None]:
# tactic variables against the weather variables, PD 17/18 (7 x 3 grid, default figure size)
scatter_plots(stats_df=mean_df, matches_df=matches, var_type="tactic", league="PD 17-18", rows=7, cols=3)

In [None]:
# performance variables against the weather variables, PD 17/18 (4 x 3 grid)
scatter_plots(stats_df=mean_df, matches_df=matches, var_type="performance", league="PD 17-18", rows=4, cols=3, size=(15, 15))

In [None]:
# histograms of the tactic variables, PD 17/18
hist_plots(stats_df=mean_df, matches_df=matches, var_type="tactic", league="PD 17-18", rows=7, size=(20, 28))

In [None]:
# histograms of the performance variables, PD 17/18
hist_plots(stats_df=mean_df, matches_df=matches, var_type="performance", league="PD 17-18", rows=4, size=(15, 15))

##### PD 18/19

In [None]:
# tactic variables against the weather variables, PD 18/19 (7 x 3 grid, default figure size)
scatter_plots(stats_df=mean_df, matches_df=matches, var_type="tactic", league="PD 18-19", rows=7, cols=3)

In [None]:
# performance variables against the weather variables, PD 18/19 (4 x 3 grid)
scatter_plots(stats_df=mean_df, matches_df=matches, var_type="performance", league="PD 18-19", rows=4, cols=3, size=(15, 15))

In [None]:
# histograms of the tactic variables, PD 18/19
hist_plots(stats_df=mean_df, matches_df=matches, var_type="tactic", league="PD 18-19", rows=7, size=(20, 28))

In [None]:
# histograms of the performance variables, PD 18/19
hist_plots(stats_df=mean_df, matches_df=matches, var_type="performance", league="PD 18-19", rows=4, size=(15, 15))

#### Korrelation der unabhängigen Variablen

In [None]:
# VIF values and correlation matrix for the independent variables of the base regression model

from statsmodels.stats.outliers_influence import variance_inflation_factor

for league in ["BL 17-18", "BL 18-19", "PD 17-18", "PD 18-19"]:
  X = mean_df[mean_df["League"] == league][["Temperature", "Rain", "Humidity"]]

  # design matrix for the VIF regressions: complete rows only (as in the regressions, which drop missing values)
  # plus an intercept column; without the intercept the VIFs are based on uncentered regressions and come out inflated
  X_vif = sm.add_constant(X.dropna(), has_constant="add")

  # VIF data frame
  vif_data = pd.DataFrame()
  vif_data["feature"] = X.columns

  # calculating VIF for each feature (the intercept column itself is not reported)
  vif_data["VIF"] = [variance_inflation_factor(X_vif.values, X_vif.columns.get_loc(feature))
                            for feature in X.columns]

  print(league)
  print(vif_data.to_latex())
  print(X.corr())
  print("---")

### Regression mit Top Sechs

#### Modell

In [None]:
# Squads treated as the "top six" of each league and season; the cells below use these
# lists to filter the statistics ("Squad") and matches ("Home"/"Away") data frames.
bl_17_18_top6 = ["Bayern Munich", "Schalke 04", "Hoffenheim", "Dortmund", "Leverkusen", "RB Leipzig"]
bl_18_19_top6 = ["Bayern Munich", "Dortmund", "RB Leipzig", "Leverkusen", "M'Gladbach", "Wolfsburg"]
pd_17_18_top6 = ["Barcelona", "Atlético Madrid", "Real Madrid", "Valencia", "Villarreal", "Betis"]
pd_18_19_top6 = ["Barcelona", "Atlético Madrid", "Real Madrid", "Valencia", "Getafe", "Sevilla"]

In [None]:
# Per-league subsets of mean_df. Explicit copies are taken because these frames are
# modified afterwards; modifying a plain boolean-mask slice raises SettingWithCopyWarning.
mean_df_BL_17_18 = mean_df[mean_df["League"] == "BL 17-18"].copy()
mean_df_BL_18_19 = mean_df[mean_df["League"] == "BL 18-19"].copy()
mean_df_PD_17_18 = mean_df[mean_df["League"] == "PD 17-18"].copy()
mean_df_PD_18_19 = mean_df[mean_df["League"] == "PD 18-19"].copy()

In [None]:
# keep only the rows of the top6 teams in every league data frame

def _keep_top6_squads(league_df, top6):
  """
    Returns a new data frame with only the rows whose "Squad" is one of the top6 teams.

            Parameters:
                    league_df (data frame) : Statistics data frame of one league and season
                    top6 (list)            : Names of the top6 squads of that league and season

            Returns:
                    data frame : Filtered copy of league_df with the helper column "drop" set to True
  """

  in_top6 = league_df["Squad"].isin(top6)

  # The "drop" flag column is kept (True for every remaining row) so the resulting
  # frames have the same columns as before; the input frame itself is not modified.
  return league_df[in_top6].assign(drop=True)


mean_df_BL_17_18 = _keep_top6_squads(mean_df_BL_17_18, bl_17_18_top6)
mean_df_BL_18_19 = _keep_top6_squads(mean_df_BL_18_19, bl_18_19_top6)
mean_df_PD_17_18 = _keep_top6_squads(mean_df_PD_17_18, pd_17_18_top6)
mean_df_PD_18_19 = _keep_top6_squads(mean_df_PD_18_19, pd_18_19_top6)

In [None]:
# stack the four top6 frames; pd.concat replaces DataFrame.append, which newer pandas versions no longer provide
mean_df_top6 = pd.concat([mean_df_BL_17_18, mean_df_BL_18_19, mean_df_PD_17_18, mean_df_PD_18_19])

In [None]:
# Per-league subsets of matches. Explicit copies are taken because these frames are
# modified afterwards; modifying a plain boolean-mask slice raises SettingWithCopyWarning.
matches_BL_17_18 = matches[matches["League"] == "BL 17-18"].copy()
matches_BL_18_19 = matches[matches["League"] == "BL 18-19"].copy()
matches_PD_17_18 = matches[matches["League"] == "PD 17-18"].copy()
matches_PD_18_19 = matches[matches["League"] == "PD 18-19"].copy()

In [None]:
# keep only the matches in which at least one top6 team takes part

def _keep_top6_matches(league_matches, top6):
  """
    Returns a new data frame with only the matches whose home or away team is a top6 team.

            Parameters:
                    league_matches (data frame) : Matches data frame of one league and season
                    top6 (list)                 : Names of the top6 squads of that league and season

            Returns:
                    data frame : Filtered copy of league_matches including the helper columns
                                 "drop home" and "drop away" (True where that side is a top6 team)
  """

  home_in_top6 = league_matches["Home"].isin(top6)
  away_in_top6 = league_matches["Away"].isin(top6)

  # The two flag columns are kept so the resulting frames have the same columns as
  # before; the input frame itself is not modified.
  flagged = league_matches.assign(**{"drop home": home_in_top6, "drop away": away_in_top6})
  return flagged[home_in_top6 | away_in_top6]


matches_BL_17_18 = _keep_top6_matches(matches_BL_17_18, bl_17_18_top6)
matches_BL_18_19 = _keep_top6_matches(matches_BL_18_19, bl_18_19_top6)
matches_PD_17_18 = _keep_top6_matches(matches_PD_17_18, pd_17_18_top6)
matches_PD_18_19 = _keep_top6_matches(matches_PD_18_19, pd_18_19_top6)

In [None]:
# stack the four top6 match frames; pd.concat replaces DataFrame.append, which newer pandas versions no longer provide
matches_top6 = pd.concat([matches_BL_17_18, matches_BL_18_19, matches_PD_17_18, matches_PD_18_19])

In [None]:
# perform regression with top6 teams

# choose the statistical test type for the regression, options are: None, "durbin_watson", "breusch_pagan", "breusch_pagan_plot",
#                                                                   "jarque_bera", "jarque_bera_distplot", "jarque_bera_probplot"

stat_type = None

# argument pairs handed to make_regression: tactic first, performance second
variable_sets = [(tactic_stats, tactic_matches), (perf_stats, perf_matches)]

bl_17_18_regr_top6_values_tactic, bl_17_18_regr_top6_values_performance = [
    make_regression(mean_df_BL_17_18, matches_BL_17_18, in_vars_base, stats_vars, matches_vars, "BL 17-18", stat_type)
    for stats_vars, matches_vars in variable_sets
]

bl_18_19_regr_top6_values_tactic, bl_18_19_regr_top6_values_performance = [
    make_regression(mean_df_BL_18_19, matches_BL_18_19, in_vars_base, stats_vars, matches_vars, "BL 18-19", stat_type)
    for stats_vars, matches_vars in variable_sets
]

pd_17_18_regr_top6_values_tactic, pd_17_18_regr_top6_values_performance = [
    make_regression(mean_df_PD_17_18, matches_PD_17_18, in_vars_base, stats_vars, matches_vars, "PD 17-18", stat_type)
    for stats_vars, matches_vars in variable_sets
]

pd_18_19_regr_top6_values_tactic, pd_18_19_regr_top6_values_performance = [
    make_regression(mean_df_PD_18_19, matches_PD_18_19, in_vars_base, stats_vars, matches_vars, "PD 18-19", stat_type)
    for stats_vars, matches_vars in variable_sets
]

In [None]:
# print the top6 regression results in latex form

print("Top Sechs")
bl_17_18_regr_values_top6_tactic_df = dict_to_latex(bl_17_18_regr_top6_values_tactic, "BL 17-18", in_vars_base)
bl_17_18_regr_values_top6_performance_df = dict_to_latex(bl_17_18_regr_top6_values_performance, "BL 17-18", in_vars_base)

bl_18_19_regr_values_top6_tactic_df = dict_to_latex(bl_18_19_regr_top6_values_tactic, "BL 18-19", in_vars_base)
bl_18_19_regr_values_top6_performance_df = dict_to_latex(bl_18_19_regr_top6_values_performance, "BL 18-19", in_vars_base)

pd_17_18_regr_values_top6_tactic_df = dict_to_latex(pd_17_18_regr_top6_values_tactic, "PD 17-18", in_vars_base)
pd_17_18_regr_values_top6_performance_df = dict_to_latex(pd_17_18_regr_top6_values_performance, "PD 17-18", in_vars_base)

pd_18_19_regr_values_top6_tactic_df = dict_to_latex(pd_18_19_regr_top6_values_tactic, "PD 18-19", in_vars_base)
pd_18_19_regr_values_top6_performance_df = dict_to_latex(pd_18_19_regr_top6_values_performance, "PD 18-19", in_vars_base)

# pd.concat replaces DataFrame.append, which newer pandas versions no longer provide;
# the row order (BL 17-18, BL 18-19, PD 17-18, PD 18-19) is unchanged
top6_tactic_df = pd.concat([bl_17_18_regr_values_top6_tactic_df,
                            bl_18_19_regr_values_top6_tactic_df,
                            pd_17_18_regr_values_top6_tactic_df,
                            pd_18_19_regr_values_top6_tactic_df])

top6_performance_df = pd.concat([bl_17_18_regr_values_top6_performance_df,
                                 bl_18_19_regr_values_top6_performance_df,
                                 pd_17_18_regr_values_top6_performance_df,
                                 pd_18_19_regr_values_top6_performance_df])


print("Taktik")
print(top6_tactic_df.to_latex())

print("Performance")
print(top6_performance_df.to_latex())

#### Lineare Beziehung zwischen den unabhängigen und abhängigen Variablen

##### BL 17/18

In [None]:
scatter_plots(mean_df_top6, matches_top6, "tactic", "BL 17-18", 7, 3)

In [None]:
scatter_plots(mean_df_top6, matches_top6, "performance", "BL 17-18", 4, 3, (15, 15))

In [None]:
hist_plots(mean_df_top6, matches_top6, "tactic", "BL 17-18", 7, (20, 28))

In [None]:
hist_plots(mean_df_top6, matches_top6, "performance", "BL 17-18", 4, (20, 28))

##### BL 18/19

In [None]:
scatter_plots(mean_df_top6, matches_top6, "tactic", "BL 18-19", 7, 3)

In [None]:
scatter_plots(mean_df_top6, matches_top6, "performance", "BL 18-19", 4, 3, (15, 15))

In [None]:
hist_plots(mean_df_top6, matches_top6, "tactic", "BL 18-19", 7, (20, 28))

In [None]:
hist_plots(mean_df_top6, matches_top6, "performance", "BL 18-19", 4, (20, 28))

##### PD 17/18

In [None]:
scatter_plots(mean_df_top6, matches_top6, "tactic", "PD 17-18", 7, 3)

In [None]:
scatter_plots(mean_df_top6, matches_top6, "performance", "PD 17-18", 4, 3, (15, 15))

In [None]:
hist_plots(mean_df_top6, matches_top6, "tactic", "PD 17-18", 7, (20, 28))

In [None]:
hist_plots(mean_df_top6, matches_top6, "performance", "PD 17-18", 4, (20, 28))

##### PD 18/19

In [None]:
scatter_plots(mean_df_top6, matches_top6, "tactic", "PD 18-19", 7, 3)

In [None]:
scatter_plots(mean_df_top6, matches_top6, "performance", "PD 18-19", 4, 3, (15, 15))

In [None]:
hist_plots(mean_df_top6, matches_top6, "tactic", "PD 18-19", 7, (20, 28))

In [None]:
hist_plots(mean_df_top6, matches_top6, "performance", "PD 18-19", 4, (20, 28))

#### Korrelation der unabhängigen Variablen

In [None]:
# VIF values for the independent variables of the top6 regression model

from statsmodels.stats.outliers_influence import variance_inflation_factor

for league in ["BL 17-18", "BL 18-19", "PD 17-18", "PD 18-19"]:
  X = mean_df_top6.loc[mean_df_top6["League"] == league, ["Temperature", "Rain", "Humidity"]]

  # variance_inflation_factor regresses each column on the remaining columns exactly
  # as given. Without an intercept column those auxiliary regressions report the
  # uncentered R^2, which inflates the VIF of every variable with a non-zero mean,
  # so a constant is added here (always, even if a feature happens to be constant).
  design = sm.add_constant(X, has_constant="add")

  # VIF data frame: one row per weather feature (the constant itself is not reported)
  vif_data = pd.DataFrame({
      "feature": X.columns,
      "VIF": [variance_inflation_factor(design.values, design.columns.get_loc(feature))
              for feature in X.columns],
  })

  print(league)
  print(vif_data.to_latex())
  print(X.corr())
  print("---")

### Regression nur mit Heimmannschaften

#### Modell

In [None]:
# statistics (home teams) and matches data set for every league and season

home_league = mean_df_home["League"]
match_league = matches["League"]

bl_17_18_home_regr_stats = mean_df_home.loc[home_league.eq("BL 17-18")]
bl_17_18_home_regr_matches = matches.loc[match_league.eq("BL 17-18")]

bl_18_19_home_regr_stats = mean_df_home.loc[home_league.eq("BL 18-19")]
bl_18_19_home_regr_matches = matches.loc[match_league.eq("BL 18-19")]

pd_17_18_home_regr_stats = mean_df_home.loc[home_league.eq("PD 17-18")]
pd_17_18_home_regr_matches = matches.loc[match_league.eq("PD 17-18")]

pd_18_19_home_regr_stats = mean_df_home.loc[home_league.eq("PD 18-19")]
pd_18_19_home_regr_matches = matches.loc[match_league.eq("PD 18-19")]

In [None]:
# perform regression with home teams

# choose the statistical test type for the regression, options are: None, "durbin_watson", "breusch_pagan", "breusch_pagan_plot",
#                                                                   "jarque_bera", "jarque_bera_distplot", "jarque_bera_probplot"

stat_type = None

# argument pairs handed to make_regression: tactic first, performance second
variable_sets = [(tactic_stats, tactic_matches), (perf_stats, perf_matches)]

bl_17_18_home_regr_values_tactic, bl_17_18_home_regr_values_performance = [
    make_regression(bl_17_18_home_regr_stats, bl_17_18_home_regr_matches, in_vars_base, stats_vars, matches_vars, "BL 17-18", stat_type)
    for stats_vars, matches_vars in variable_sets
]

bl_18_19_home_regr_values_tactic, bl_18_19_home_regr_values_performance = [
    make_regression(bl_18_19_home_regr_stats, bl_18_19_home_regr_matches, in_vars_base, stats_vars, matches_vars, "BL 18-19", stat_type)
    for stats_vars, matches_vars in variable_sets
]

pd_17_18_home_regr_values_tactic, pd_17_18_home_regr_values_performance = [
    make_regression(pd_17_18_home_regr_stats, pd_17_18_home_regr_matches, in_vars_base, stats_vars, matches_vars, "PD 17-18", stat_type)
    for stats_vars, matches_vars in variable_sets
]

pd_18_19_home_regr_values_tactic, pd_18_19_home_regr_values_performance = [
    make_regression(pd_18_19_home_regr_stats, pd_18_19_home_regr_matches, in_vars_base, stats_vars, matches_vars, "PD 18-19", stat_type)
    for stats_vars, matches_vars in variable_sets
]

In [None]:
# combine the tactic data frames of the home regression and print them in latex form

print("Taktik")
bl_17_18_home_regr_values_tactic_df = dict_to_latex(bl_17_18_home_regr_values_tactic, "BL 17-18", in_vars_base)
bl_18_19_home_regr_values_tactic_df = dict_to_latex(bl_18_19_home_regr_values_tactic, "BL 18-19", in_vars_base)
pd_17_18_home_regr_values_tactic_df = dict_to_latex(pd_17_18_home_regr_values_tactic, "PD 17-18", in_vars_base)
pd_18_19_home_regr_values_tactic_df = dict_to_latex(pd_18_19_home_regr_values_tactic, "PD 18-19", in_vars_base)

# pd.concat replaces DataFrame.append, which newer pandas versions no longer provide
home_tactic_df = pd.concat([bl_17_18_home_regr_values_tactic_df,
                            bl_18_19_home_regr_values_tactic_df,
                            pd_17_18_home_regr_values_tactic_df,
                            pd_18_19_home_regr_values_tactic_df])
print(home_tactic_df.to_latex())

In [None]:
# combine the performance data frames of the home regression and print them in latex form

print("Performance")
bl_17_18_home_regr_values_performance_df = dict_to_latex(bl_17_18_home_regr_values_performance, "BL 17-18", in_vars_base)
bl_18_19_home_regr_values_performance_df = dict_to_latex(bl_18_19_home_regr_values_performance, "BL 18-19", in_vars_base)
pd_17_18_home_regr_values_performance_df = dict_to_latex(pd_17_18_home_regr_values_performance, "PD 17-18", in_vars_base)
pd_18_19_home_regr_values_performance_df = dict_to_latex(pd_18_19_home_regr_values_performance, "PD 18-19", in_vars_base)

# pd.concat replaces DataFrame.append, which newer pandas versions no longer provide
home_performance_df = pd.concat([bl_17_18_home_regr_values_performance_df,
                                 bl_18_19_home_regr_values_performance_df,
                                 pd_17_18_home_regr_values_performance_df,
                                 pd_18_19_home_regr_values_performance_df])
print(home_performance_df.to_latex())

#### Lineare Beziehung zwischen den unabhängigen und abhängigen Variablen

##### BL 17/18

In [None]:
scatter_plots(mean_df_home, matches, "tactic", "BL 17-18", 7, 3)

In [None]:
scatter_plots(mean_df_home, matches, "performance", "BL 17-18", 4, 3, (15, 15))

In [None]:
hist_plots(mean_df_home, matches, "tactic", "BL 17-18", 7, (20, 28))

In [None]:
hist_plots(mean_df_home, matches, "performance", "BL 17-18", 4, (20, 28))

##### BL 18/19

In [None]:
scatter_plots(mean_df_home, matches, "tactic", "BL 18-19", 7, 3)

In [None]:
scatter_plots(mean_df_home, matches, "performance", "BL 18-19", 4, 3, (15, 15))

In [None]:
hist_plots(mean_df_home, matches, "tactic", "BL 18-19", 7, (20, 28))

In [None]:
hist_plots(mean_df_home, matches, "performance", "BL 18-19", 4, (20, 28))

##### PD 17/18

In [None]:
scatter_plots(mean_df_home, matches, "tactic", "PD 17-18", 7, 3)

In [None]:
scatter_plots(mean_df_home, matches, "performance", "PD 17-18", 4, 3, (15, 15))

In [None]:
hist_plots(mean_df_home, matches, "tactic", "PD 17-18", 7, (20, 28))

In [None]:
hist_plots(mean_df_home, matches, "performance", "PD 17-18", 4, (20, 28))

##### PD 18/19

In [None]:
scatter_plots(mean_df_home, matches, "tactic", "PD 18-19", 7, 3)

In [None]:
scatter_plots(mean_df_home, matches, "performance", "PD 18-19", 4, 3, (15, 15))

In [None]:
hist_plots(mean_df_home, matches, "tactic", "PD 18-19", 7, (20, 28))

In [None]:
hist_plots(mean_df_home, matches, "performance", "PD 18-19", 4, (20, 28))

#### Korrelation der unabhängigen Variablen

In [None]:
# VIF values for the independent variables of the home team regression model

from statsmodels.stats.outliers_influence import variance_inflation_factor

for league in ["BL 17-18", "BL 18-19", "PD 17-18", "PD 18-19"]:
  X = mean_df_home.loc[mean_df_home["League"] == league, ["Temperature", "Rain", "Humidity"]]

  # variance_inflation_factor regresses each column on the remaining columns exactly
  # as given. Without an intercept column those auxiliary regressions report the
  # uncentered R^2, which inflates the VIF of every variable with a non-zero mean,
  # so a constant is added here (always, even if a feature happens to be constant).
  design = sm.add_constant(X, has_constant="add")

  # VIF data frame: one row per weather feature (the constant itself is not reported)
  vif_data = pd.DataFrame({
      "feature": X.columns,
      "VIF": [variance_inflation_factor(design.values, design.columns.get_loc(feature))
              for feature in X.columns],
  })

  print(league)
  print(vif_data.to_latex())
  print(X.corr())
  print("---")

# Statistiken

In [None]:
# create a data frame containing the kickoff times and the corresponding chosen interval

# one hand-assigned interval per distinct kickoff time, in the sorted order returned
# by np.unique; the list length has to match the number of distinct kickoff times
chosen_interval = (["12:00-14:59"] * 5
                   + ["15:00-17:59"] * 2
                   + ["18:00-20:59"] * 7
                   + ["21:00-23:59"] * 12)

time_dict = {"Time": list(np.unique(matches["Time"])),
             "Chosen Interval": chosen_interval}

time_interval_df = pd.DataFrame(time_dict).set_index("Time")
print(time_interval_df.to_latex())

In [None]:
# statistics of Home/Away Score and Home Team/Away Team Points

def _describe_scores_and_points(league):
  """
    Returns the rounded mean, std, median, min and max of the score and points
    columns for the matches of one league and season.

            Parameters:
                    league (str) : Name and season of the league, e.g. "BL 17-18"

            Returns:
                    data frame : Statistics as rows, score/points columns as columns, rounded to 2 decimals
  """

  league_matches = matches[matches["League"] == league]
  summary = league_matches.describe().loc[["mean", "std", "50%", "min", "max"],
                                          ["Home Score", "Away Score", "Home Team Points", "Away Team Points"]]
  return np.round(summary, 2)


descr_matches_BL_17_18 = _describe_scores_and_points("BL 17-18")
descr_matches_BL_18_19 = _describe_scores_and_points("BL 18-19")
descr_matches_PD_17_18 = _describe_scores_and_points("PD 17-18")
descr_matches_PD_18_19 = _describe_scores_and_points("PD 18-19")

# .T transposes the table (variables as rows); it replaces the deprecated DataFrame.swapaxes
for descr_matches in (descr_matches_BL_17_18, descr_matches_BL_18_19, descr_matches_PD_17_18, descr_matches_PD_18_19):
  print(descr_matches.T.to_latex())

In [None]:
def create_descr(df_stats, df_matches, league, descr_type):
  """
    Prints a LaTeX table with descriptive statistics (mean, std, median, min, max) of the
    tactic or performance variables for one league and season.

            Parameters:
                    df_stats (data frame)   : Statistics data frame on which the description is performed
                    df_matches (data frame) : Matches data frame on which the description is performed
                                              (only used for descr_type "tactic")
                    league (str)            : Name and season of the league for the description
                    descr_type (str)        : Type of variables to describe, "tactic" or "performance"

            Returns:
                    None (the table is printed; for an unknown descr_type only an error message is printed)
  """

  # validate first so that nothing is computed for an unknown description type
  if descr_type not in ("tactic", "performance"):
    print("Description Type Error for {}".format(league))
    return

  summary_rows = ["mean", "std", "50%", "min", "max"]
  league_stats = df_stats[df_stats["League"] == league]

  if descr_type == "tactic":
    league_matches = df_matches[df_matches["League"] == league]
    stats_descr = league_stats.describe().loc[summary_rows, ["PassesAtt", "Carries", "Sh", "Tkl", "Press"]]
    matches_descr = league_matches.describe().loc[summary_rows, ["Possession Home", "Possession Away"]]
    # both summaries share the same row labels, so they are joined side by side on the index
    league_descr = pd.merge(stats_descr, matches_descr, left_index=True, right_index=True)
  else:
    league_descr = league_stats.describe().loc[summary_rows, ["PassesCmp%", "DribblesSucc%", "TklW", "Press%"]]

  print("{} {}:".format(league, descr_type))
  # one row per variable; .T replaces the deprecated DataFrame.swapaxes("index", "columns")
  league_descr = league_descr.T.round(2)
  league_descr["League"] = league
  print(league_descr.to_latex())
  print("---")

In [None]:
# create description tables for the tactic variables of the original version of the data set
# The name "stats" is also bound by "from scipy import stats" in the imports cell, so if the
# error "TypeError: 'module' object is not subscriptable" occurs, the "stats" data set has to
# be read in again - which is what this cell does first.

stats = pd.read_csv(source + "stats.csv")

# same filter as in the preparation cell: rows without minutes and with PlayerID >= 14
not_played = stats["Min"].isna() & (stats["PlayerID"] >= 14)
stats.drop(stats.index[not_played], inplace=True)

for league in ["BL 17-18", "BL 18-19", "PD 17-18", "PD 18-19"]:
  create_descr(stats, matches, league, "tactic")

In [None]:
# create description tables for the performance variables of the original version of the data set

for league in ["BL 17-18", "BL 18-19", "PD 17-18", "PD 18-19"]:
  create_descr(stats, matches, league, "performance")

In [None]:
# create description tables for the tactic variables of the mean_df data set

for league in ["BL 17-18", "BL 18-19", "PD 17-18", "PD 18-19"]:
  create_descr(mean_df, matches, league, "tactic")

In [None]:
# create description tables for the performance variables of the mean_df data set

for league in ["BL 17-18", "BL 18-19", "PD 17-18", "PD 18-19"]:
  create_descr(mean_df, matches, league, "performance")

In [None]:
# create description tables for the tactic variables of the top6 data set

for league in ["BL 17-18", "BL 18-19", "PD 17-18", "PD 18-19"]:
  create_descr(mean_df_top6, matches_top6, league, "tactic")

In [None]:
# create description tables for the performance variables of the top6 data set

for league in ["BL 17-18", "BL 18-19", "PD 17-18", "PD 18-19"]:
  create_descr(mean_df_top6, matches_top6, league, "performance")

In [None]:
# create description tables for the tactic variables of the home data set

for league in ["BL 17-18", "BL 18-19", "PD 17-18", "PD 18-19"]:
  create_descr(mean_df_home, matches, league, "tactic")

In [None]:
# create description tables for the performance variables of the home data set

for league in ["BL 17-18", "BL 18-19", "PD 17-18", "PD 18-19"]:
  create_descr(mean_df_home, matches, league, "performance")

In [None]:
# number of distinct player names per league/season and over the whole data set

spieler_BL_17_18 = stats.loc[stats["League"] == "BL 17-18", "Player"]
spieler_BL_18_19 = stats.loc[stats["League"] == "BL 18-19", "Player"]
spieler_PD_17_18 = stats.loc[stats["League"] == "PD 17-18", "Player"]
spieler_PD_18_19 = stats.loc[stats["League"] == "PD 18-19", "Player"]

# "gesamt" counts distinct names over all leagues, i.e. it is not the sum of the four counts
spieler_gesamt = len(np.unique(stats["Player"]))

player_numbers = [len(np.unique(spieler))
                  for spieler in (spieler_BL_17_18, spieler_BL_18_19, spieler_PD_17_18, spieler_PD_18_19)]
player_numbers.append(spieler_gesamt)

season_texts = ["BL Saison 17/18", "BL Saison 18/19", "PD Saison 17/18", "PD Saison 18/19"]
for season_text, player_count in zip(season_texts, player_numbers):
  print("Anzahl der Spieler in der {}: {}".format(season_text, player_count))
print("Summe der Spieler gesamt: {}".format(spieler_gesamt))

leagues = ["BL 17/18", "BL 18/19", "PD 17/18", "PD 18/19", "Total"]
player_no_dict = {"League":leagues, "Number":player_numbers}

player_no_df = pd.DataFrame(player_no_dict).set_index("League")
print(player_no_df.to_latex())

In [None]:
def weather_plot(weather_data_df_original, weather_type, labels):
  """
    Draws a bar plot of one weather variable (min, max and mean per league and season)
    and saves it as "plot_<weather_type>.png".

            Parameters:
                    weather_data_df_original (data frame) : Data frame with the columns "League", "Type" and the
                                                            weather variable; it is copied and not modified
                    weather_type (str)                    : Name of the weather column to plot
                    labels (list)                         : Text written above the bars, in bar order

            Returns:
                    None
  """

  display_names = {"Min":"Min.", "Max":"Max.", "Mean":"Arithm. Mw"}
  bar_colors = {"Min.":"#b3cde0", "Max.":"#005b96", "Arithm. Mw":"#6497b1"}
  y_axis_texts = {"Temperature":"Temperatur in °C",
                  "Rain":"Regenmenge in mm",
                  "Humidity":"Luftfeuchtigkeit in %"}
  tick_names = ["BL 17/18", "BL 18/19", "PD 17/18", "PD 18/19"]

  # work on a copy so the renamed "Type" values do not leak back to the caller
  plot_df = weather_data_df_original.copy()
  plot_df["Type"] = [display_names.get(type_name, type_name) for type_name in plot_df["Type"]]
  color_per_bar = plot_df["Type"].apply(lambda type_name: bar_colors[type_name])

  value_column = "{}".format(weather_type)
  axes = plot_df.plot(kind="bar", x="League", y=value_column, color=color_per_bar)

  # one tick per group of three bars, placed under the middle bar of each group
  plt.xticks((1, 4, 7, 10), tick_names, rotation="horizontal")
  plt.xlabel("Liga")

  for bar, text in zip(axes.patches, labels):
    # labels of negative bars are written at the mirrored (positive) height
    text_height = abs(bar.get_height()) + 1
    axes.text(bar.get_x() + bar.get_width() / 2, text_height, text, ha='center', va='bottom')

  # legend built by hand: one colored rectangle per bar type
  legend_names = list(bar_colors.keys())
  legend_handles = [plt.Rectangle((0,0),1,1, color=bar_colors[name]) for name in legend_names]
  plt.legend(legend_handles, legend_names, loc="upper left", bbox_to_anchor=(1.05, 1.0))

  if weather_type in y_axis_texts:
    plt.ylabel(y_axis_texts[weather_type])

  sns.set(style="whitegrid")
  plt.rcParams['figure.dpi'] = 360

  plt.tight_layout()
  sns.despine(left=True)

  plt.savefig("plot_{}.png".format(weather_type))

In [None]:
# weather statistics of every league and season

def _min_max_mean(league, column):
  """Returns (min, max, mean) of the given matches column for one league and season."""
  values = matches[matches["League"] == league][column]
  return min(values), max(values), values.mean()


min_temp_BL_17_18, max_temp_BL_17_18, mean_temp_BL_17_18 = _min_max_mean("BL 17-18", "Temperature")
min_temp_BL_18_19, max_temp_BL_18_19, mean_temp_BL_18_19 = _min_max_mean("BL 18-19", "Temperature")
min_temp_PD_17_18, max_temp_PD_17_18, mean_temp_PD_17_18 = _min_max_mean("PD 17-18", "Temperature")
min_temp_PD_18_19, max_temp_PD_18_19, mean_temp_PD_18_19 = _min_max_mean("PD 18-19", "Temperature")

min_rain_BL_17_18, max_rain_BL_17_18, mean_rain_BL_17_18 = _min_max_mean("BL 17-18", "Rain")
min_rain_BL_18_19, max_rain_BL_18_19, mean_rain_BL_18_19 = _min_max_mean("BL 18-19", "Rain")
min_rain_PD_17_18, max_rain_PD_17_18, mean_rain_PD_17_18 = _min_max_mean("PD 17-18", "Rain")
min_rain_PD_18_19, max_rain_PD_18_19, mean_rain_PD_18_19 = _min_max_mean("PD 18-19", "Rain")

min_humi_BL_17_18, max_humi_BL_17_18, mean_humi_BL_17_18 = _min_max_mean("BL 17-18", "Humidity")
min_humi_BL_18_19, max_humi_BL_18_19, mean_humi_BL_18_19 = _min_max_mean("BL 18-19", "Humidity")
min_humi_PD_17_18, max_humi_PD_17_18, mean_humi_PD_17_18 = _min_max_mean("PD 17-18", "Humidity")
min_humi_PD_18_19, max_humi_PD_18_19, mean_humi_PD_18_19 = _min_max_mean("PD 18-19", "Humidity")


# create data frame containing the weather values: three rows (Min, Max, Mean) per league

data = {"League":[league_name
                  for league_name in ("BL 17-18", "BL 18-19", "PD 17-18", "PD 18-19")
                  for _ in range(3)],
        "Type":["Min","Max","Mean"] * 4,
        "Temperature":[min_temp_BL_17_18, max_temp_BL_17_18, mean_temp_BL_17_18,
                       min_temp_BL_18_19, max_temp_BL_18_19, mean_temp_BL_18_19,
                       min_temp_PD_17_18, max_temp_PD_17_18, mean_temp_PD_17_18,
                       min_temp_PD_18_19, max_temp_PD_18_19, mean_temp_PD_18_19],
        "Rain":[min_rain_BL_17_18, max_rain_BL_17_18, mean_rain_BL_17_18,
                min_rain_BL_18_19, max_rain_BL_18_19, mean_rain_BL_18_19,
                min_rain_PD_17_18, max_rain_PD_17_18, mean_rain_PD_17_18,
                min_rain_PD_18_19, max_rain_PD_18_19, mean_rain_PD_18_19],
        "Humidity":[min_humi_BL_17_18, max_humi_BL_17_18, mean_humi_BL_17_18,
                    min_humi_BL_18_19, max_humi_BL_18_19, mean_humi_BL_18_19,
                    min_humi_PD_17_18, max_humi_PD_17_18, mean_humi_PD_17_18,
                    min_humi_PD_18_19, max_humi_PD_18_19, mean_humi_PD_18_19]}

weather_data = pd.DataFrame(data)
weather_data

In [None]:
# labels for the bars in the bar plots: min and max as they are, mean rounded to 2 decimals,
# in the same order as the rows of weather_data

labels_temp = [label
               for lowest, highest, average in ((min_temp_BL_17_18, max_temp_BL_17_18, mean_temp_BL_17_18),
                                                (min_temp_BL_18_19, max_temp_BL_18_19, mean_temp_BL_18_19),
                                                (min_temp_PD_17_18, max_temp_PD_17_18, mean_temp_PD_17_18),
                                                (min_temp_PD_18_19, max_temp_PD_18_19, mean_temp_PD_18_19))
               for label in (lowest, highest, round(average, 2))]

labels_rain = [label
               for lowest, highest, average in ((min_rain_BL_17_18, max_rain_BL_17_18, mean_rain_BL_17_18),
                                                (min_rain_BL_18_19, max_rain_BL_18_19, mean_rain_BL_18_19),
                                                (min_rain_PD_17_18, max_rain_PD_17_18, mean_rain_PD_17_18),
                                                (min_rain_PD_18_19, max_rain_PD_18_19, mean_rain_PD_18_19))
               for label in (lowest, highest, round(average, 2))]

labels_humi = [label
               for lowest, highest, average in ((min_humi_BL_17_18, max_humi_BL_17_18, mean_humi_BL_17_18),
                                                (min_humi_BL_18_19, max_humi_BL_18_19, mean_humi_BL_18_19),
                                                (min_humi_PD_17_18, max_humi_PD_17_18, mean_humi_PD_17_18),
                                                (min_humi_PD_18_19, max_humi_PD_18_19, mean_humi_PD_18_19))
               for label in (lowest, highest, round(average, 2))]


# plot weather statistics

for weather_column, bar_labels in (("Temperature", labels_temp),
                                   ("Rain", labels_rain),
                                   ("Humidity", labels_humi)):
  weather_plot(weather_data, weather_column, bar_labels)

In [None]:
def weather_plot_winter(weather_data_df, weather_type, labels):
  """
    Plots the weather values in the winter months for every league and season
    as a bar chart and saves it as "plot_<weather_type>_winter.png".

            Parameters:
                    weather_data_df (data frame) : Data frame with weather data that should be plotted;
                                                   needs a "League" column and a column named like weather_type
                    weather_type (str)           : Type of weather variable (name of the plotted column)
                    labels (list)                : Contains the values for the labels of the bars in the bar plot,
                                                   in the order of the bars

            Returns:
                    None
  """

  # Select the style BEFORE the figure is created: a style only affects figures and axes
  # created afterwards. Matplotlib 3.6 renamed "seaborn-whitegrid" to
  # "seaborn-v0_8-whitegrid", so use whichever name this installation offers.
  # If neither exists, the currently active style is kept.
  for style_name in ("seaborn-whitegrid", "seaborn-v0_8-whitegrid"):
    if style_name in plt.style.available:
      plt.style.use(style_name)
      break

  leagues = ["BL 17-18", "BL 18-19", "PD 17-18", "PD 18-19"]

  plot = weather_data_df.plot(kind="bar", x="League", y=weather_type)
  plt.xticks((0, 1, 2, 3), leagues)

  # one text label per bar; zip stops at the shorter of bars and labels
  for rect, label in zip(plot.patches, labels):
      # the absolute height is used, so labels of negative bars are drawn above the zero line
      height = abs(rect.get_height())
      plot.text(rect.get_x() + rect.get_width() / 2, height + 1, label, ha='center', va='bottom')

  # no y-axis label is set for any other weather_type
  if weather_type == "Temperature":
    plt.ylabel("Temperature in °C")
  elif weather_type == "Rain":
    plt.ylabel("Amount of rain in mm")
  elif weather_type == "Humidity":
    plt.ylabel("Humidity in %")

  plt.tight_layout()

  plt.gca().spines['right'].set_color('none')
  plt.gca().spines['top'].set_color('none')

  plt.savefig("plot_{}_winter.png".format(weather_type))

In [None]:
# convert the "Date" column values into datetime objects

matches["Date"] = pd.to_datetime(matches["Date"])
matches


# data for mean temperatures in winter months

def _mean_winter_temperature(league, first_year):
  """
    Mean temperature of the matches of one league played in the winter months.

            Parameters:
                    league (str)     : Name and season of the league
                    first_year (int) : Year in which the season starts; the window runs from
                                       December 1st of this year up to (excluding) March 1st of the next

            Returns:
                    mean temperature of the selected matches
  """
  is_league = matches["League"] == league
  not_before_december = matches["Date"] >= datetime.datetime(first_year, 12, 1)
  before_march = matches["Date"] < datetime.datetime(first_year + 1, 3, 1)
  return matches.loc[is_league & not_before_december & before_march, "Temperature"].mean()


mean_temp_winter_BL_17_18 = _mean_winter_temperature("BL 17-18", 2017)
mean_temp_winter_BL_18_19 = _mean_winter_temperature("BL 18-19", 2018)
mean_temp_winter_PD_17_18 = _mean_winter_temperature("PD 17-18", 2017)
mean_temp_winter_PD_18_19 = _mean_winter_temperature("PD 18-19", 2018)


# create a data frame containing the temperature mean of the winter months

data_winter = {
    "League": ["BL 17-18", "BL 18-19", "PD 17-18", "PD 18-19"],
    "Type": ["Mean"] * 4,
    "Temperature": [
        mean_temp_winter_BL_17_18,
        mean_temp_winter_BL_18_19,
        mean_temp_winter_PD_17_18,
        mean_temp_winter_PD_18_19,
    ],
}

winter_df = pd.DataFrame(data_winter)
winter_df

In [None]:
# winter temperature labels for specific columns in the bar plot
# (the winter means rounded to two decimals, in the order of the bars)

labels_temp_winter = [
    round(winter_mean, 2)
    for winter_mean in (
        mean_temp_winter_BL_17_18,
        mean_temp_winter_BL_18_19,
        mean_temp_winter_PD_17_18,
        mean_temp_winter_PD_18_19,
    )
]


# plot mean temperature for winter months across the seasons

weather_plot_winter(winter_df, "Temperature", labels_temp_winter)

In [None]:
def weather_months_plot(weather_data_df, weather_type, labels):
  """
    Plots the weather statistics per month as a bar chart, colored by league and season,
    and saves it as "plot_months_<weather_type>.png".

            Parameters:
                    weather_data_df (data frame) : Data frame with weather data that should be plotted;
                                                   needs a "Months" column, a "Type" column holding one of the
                                                   league names used as color keys below, and a column named
                                                   like weather_type
                    weather_type (str)           : Type of weather variable (name of the plotted column)
                    labels (list)                : Contains the values for the labels of the bars in the bar plot,
                                                   in the order of the bars

            Returns:
                    None
  """

  # Select the style BEFORE the figure is created: a style only affects figures and axes
  # created afterwards. Matplotlib 3.6 renamed "seaborn-whitegrid" to
  # "seaborn-v0_8-whitegrid", so use whichever name this installation offers.
  # If neither exists, the currently active style is kept.
  for style_name in ("seaborn-whitegrid", "seaborn-v0_8-whitegrid"):
    if style_name in plt.style.available:
      plt.style.use(style_name)
      break

  months = ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12"]

  colors = {'BL 17-18':'blue', 'BL 18-19':'red', "PD 17-18":"green", "PD 18-19":"yellow"}
  # one color per row; raises a KeyError for a "Type" value that is not a key of colors
  bar_colors = weather_data_df["Type"].apply(lambda league: colors[league])

  plot = weather_data_df.plot(kind="bar", x="Months", y=weather_type, color=bar_colors)
  # NOTE(review): pandas draws the bars at positions 0..n-1, while the ticks are placed at
  # 1..12 - this looks shifted by one bar; confirm against the layout of the plotted frame.
  plt.xticks((1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12), months)

  # one text label per bar; zip stops at the shorter of bars and labels
  for rect, label in zip(plot.patches, labels):
      # the absolute height is used, so labels of negative bars are drawn above the zero line
      height = abs(rect.get_height())
      plot.text(rect.get_x() + rect.get_width() / 2, height + 3, label, ha='center', va='bottom')

  # legend with one colored patch per league and season
  labels_colors = list(colors.keys())
  handles = [plt.Rectangle((0,0),1,1, color=colors[label]) for label in labels_colors]
  plt.legend(handles, labels_colors)

  # no y-axis label is set for any other weather_type
  if weather_type == "Temperature":
    plt.ylabel("Temperature in °C")
  elif weather_type == "Rain":
    plt.ylabel("Amount of rain in mm")
  elif weather_type == "Humidity":
    plt.ylabel("Humidity in %")

  plt.tight_layout()

  plt.gca().spines['right'].set_color('none')
  plt.gca().spines['top'].set_color('none')

  plt.savefig("plot_months_{}.png".format(weather_type))

In [None]:
def create_months_series(matches_df, league, weather_type):
  """
    Creates a series with mean weather values for every month.

            Parameters:
                    matches_df (data frame) : Matches data frame from which the series is created; its "Date"
                                              column must hold datetime values because the month is read from it
                    league (str)            : Name and season of the league for the series
                    weather_type (str)      : "Temperature" or "Rain"; any other value selects "Humidity"

            Returns:
                    monthly_avg (series) : A series of the monthly averages indexed by month number, rounded
                                           to two decimals for rain and to one decimal otherwise
  """
  # column to average and number of decimals for the requested weather type
  if weather_type == "Temperature":
    column, decimals = "Temperature", 1
  elif weather_type == "Rain":
    column, decimals = "Rain", 2
  else:
    column, decimals = "Humidity", 1

  # work on the frame passed in by the caller, not on the notebook-global `matches`
  months_matches = matches_df.set_index("Date")
  league_matches = months_matches[months_matches["League"] == league]
  months = league_matches.index.month
  monthly_avg = round(league_matches.groupby(months)[column].mean(), decimals)
  return monthly_avg

# start with the monthly temperature means of BL 17-18; its month index defines the rows of the frame
months_df = create_months_series(matches, "BL 17-18", "Temperature").to_frame("Temperature BL 17-18")

# add one "<weather type> <league>" column for every remaining combination, aligned on the month index
months_df = months_df.assign(**{
    "{} {}".format(weather_type, league): create_months_series(matches, league, weather_type)
    for weather_type in ("Temperature", "Rain", "Humidity")
    for league in ("BL 17-18", "BL 18-19", "PD 17-18", "PD 18-19")
    if (weather_type, league) != ("Temperature", "BL 17-18")
})

months_df

In [None]:
# reindex to have the rows ordered according to the months of the season
# (months 8 to 12 followed by months 1 to 5)

months_df = months_df.reindex([*range(8, 13), *range(1, 6)])
months_df

In [None]:
# create a data frame with the index of months_df reset: the month numbers become a regular column
# and the rows are numbered 0..n-1 in season order, which gives the x positions used in the plots

new_months_df = months_df.reset_index()
new_months_df

In [None]:
# plot the mean values for the temperatures, amount of rain and humidity for every month in every league and season

# Configure style and resolution once, BEFORE the first figure is created: both settings
# only apply to figures created afterwards, so setting them after plotting would leave
# the first figure with a different look than the following ones.
sns.set(style="whitegrid")
plt.rcParams['figure.dpi'] = 360


def _plot_monthly_means(columns, annotations, ylabel, file_tag):
  """
    Draws one line per league and season for a block of columns of new_months_df,
    annotates the lines and saves the figure as "plot_months_<file_tag>.png".

            Parameters:
                    columns (slice)    : Positional column slice of new_months_df to plot
                    annotations (list) : One (x, y, rotation) tuple per league and season for the text
                                         label of its line, in data coordinates
                    ylabel (str)       : Label of the y-axis
                    file_tag (str)     : Suffix used in the file name of the saved figure

            Returns:
                    None
  """
  line_colors = ["#005b96", "#e29d3f", "#56b24e", "#b72e20"]
  league_names = ["BL 17/18", "BL 18/19", "PD 17/18", "PD 18/19"]
  season_months = (8, 9, 10, 11, 12, 1, 2, 3, 4, 5)

  new_months_df.iloc[:, columns].plot(color=line_colors, legend=None)
  # rows are numbered 0..9 after the index reset; label them with the month numbers
  plt.xticks(tuple(range(len(season_months))), season_months)

  # hand-placed text labels replace the legend
  for (x, y, rotation), league_name, line_color in zip(annotations, league_names, line_colors):
    plt.text(x = x, y = y, s = league_name, color = line_color, rotation = rotation)

  plt.xlabel("Monat der Saison")
  plt.ylabel(ylabel)
  plt.tight_layout()
  sns.despine(left=True)
  plt.savefig("plot_months_{}.png".format(file_tag))


# temperature
_plot_monthly_means(slice(1, 5),
                    [(7.75, 18, 45), (7.8, 11.8, 17), (4.2, 8, -10), (4.75, 11, 26)],
                    "Temperatur in °C", "temperature")

# rain
_plot_monthly_means(slice(5, 9),
                    [(4, 0.31, -42), (5.25, 0.6, 0), (7, 0.12, -38), (0, 0, 0)],
                    "Regenmenge in mm", "rain")

# humidity
_plot_monthly_means(slice(9, None),
                    [(0.6, 71.6, 11), (3.7, 71.8, -37), (6.8, 65.3, -15), (5, 51.9, -27)],
                    "Luftfeuchtigkeit in %", "humidity")