In [1]:
import pandas as pd
import numpy as np
import json
import os
from tqdm import tqdm
import time
import re
from IPython.display import display
from pathlib import Path 
DATAPATH = Path(r'Data')

In [2]:
def remove_tags(string):
    """
    Clean text from html tag using regex
    """
    result = re.sub('<.*?>','',string)
    return result

In [3]:
def new_date_features(df) :
    
    df['date'] = pd.to_datetime(df['date'])
    df['year'] = df.date.dt.year 
    df['quarter'] = df.date.dt.quarter
    df['month'] = df.date.dt.month  
    df['week'] = df.date.dt.week 
    df['day'] = df.date.dt.day  
    df['weekday'] = df.date.dt.weekday
    
    df['is_monday'] = np.where((df['weekday'] == 0), 1, 0)
    df['is_tuesday'] = np.where((df['weekday'] == 1), 1, 0)
    df['is_wednesday'] = np.where((df['weekday'] == 2), 1, 0)
    df['is_thursday'] = np.where((df['weekday'] == 3), 1, 0)
    df['is_friday'] = np.where((df['weekday'] == 4), 1, 0)
    df['is_saturday'] = np.where((df['weekday'] == 5), 1, 0)
    df['is_sunday'] = np.where((df['weekday'] == 6), 1, 0)
    
    return df

In [126]:
def new_features(df) :
    
    # remove html tags > normalize between 0-1
    df["possession_home"] = df["possession_home"].apply(lambda x : int(remove_tags(str(x)).strip("%"))/100 ) 
    df["possession_away"] = df["possession_away"].apply(lambda x : int(remove_tags(str(x)).strip("%"))/100 )
    # --
    df["shot_on_target_home_raw"] = df["shot_on_target_home"].apply(lambda x : remove_tags(str(x))) 
    df["shot_on_target_away_raw"] = df["shot_on_target_away"].apply(lambda x : remove_tags(str(x)))
    
    # regex values : [shot on target, total shot, prc of shot on target] or [prc of shot on target, total shot, shot on target]
    
    df["shot_on_target_home"] = df["shot_on_target_home_raw"].apply(lambda x : int(re.findall(r"\d+",
                                                                                          re.sub('\d+%',' ',x))[0]))
    df["shot_total_home"] = df["shot_on_target_home_raw"].apply(lambda x : int(re.findall(r"\d+",
                                                                                      re.sub('\d+%',' ',x))[1]))
    df["shot_on_target_away"] = df["shot_on_target_away_raw"].apply(lambda x : int(re.findall(r"\d+",
                                                                                          re.sub('\d+%',' ',x))[0]))
    df["shot_total_away"] = df["shot_on_target_away_raw"].apply(lambda x : int(re.findall(r"\d+",
                                                                                      re.sub('\d+%',' ',x))[1]))
    # --
    df["saves_home_raw"] = df["saves_home"].apply(lambda x : remove_tags(str(x))) 
    df["saves_away_raw"] = df["saves_away"].apply(lambda x : remove_tags(str(x)))
    
    df["saves_home"] = df["saves_home_raw"].apply(lambda x : int(re.findall(r"\d+",
                                                                                          re.sub('\d+%',' ',x))[0]))
    df["saves_total_home"] = df["saves_home_raw"].apply(lambda x : int(re.findall(r"\d+",
                                                                                      re.sub('\d+%',' ',x))[1]))
    df["saves_away"] = df["saves_away_raw"].apply(lambda x : int(re.findall(r"\d+",
                                                                                          re.sub('\d+%',' ',x))[0]))
    df["saves_total_away"] = df["saves_away_raw"].apply(lambda x : int(re.findall(r"\d+",
                                                                                      re.sub('\d+%',' ',x))[1]))
    df["goal_home"] = df["score"].apply(lambda x : int(re.split(r"–|-",x)[0]))
    df["goal_away"] = df["score"].apply(lambda x : int(re.split(r"–|-",x)[1]))
    
    # Victory : 3 pts, Draw : 1 pts and Loose : 0 pts
    df["points_result_home"] = np.where(eda_df['goal_home'] > eda_df['goal_away'] , 3,
                                        np.where(eda_df['goal_home'] == eda_df['goal_away'], 1, 0))
    
    df["points_result_away"] = np.where(eda_df['goal_away'] > eda_df['goal_home'] , 3,
                                        np.where(eda_df['goal_away'] == eda_df['goal_home'], 1, 0))
    return df

In [128]:
def new_cumul_sum_features(df) :
    """
    This are only cumsum features to keep track of features week by week
    TODO : We need to supress thoses column because of data leakage. 
    Solution : remove last row to only keep the cumulative sum before the last row. but if no value (first row) replace by np.NaN
    """
    
    features = ['points_result', 'shot_on_target', 'saves', 'shot_total',
       'saves_total', 'goal', 'points_result_against',
       'shot_on_target_against', 'saves_against', 'shot_total_against',
       'saves_total_against', 'goal_against',]
    
    for feature in features :
        col = {f"cumul_{feature}" :
                           df.groupby(
                            ["season", "team"], sort = False)[f'{feature}'].agg('cumsum')}
        df = df.assign(**col)
        
    df["goal_difference"] = df["cumul_goal"] - df["cumul_goal_against"]
                               
    return df

In [129]:
def new_cumul_average_features(df) :
    
    # Average cumul of point
    features = ['points_result', 'shot_on_target', 'saves', 'shot_total',
       'saves_total', 'goal', 'points_result_against',
       'shot_on_target_against', 'saves_against', 'shot_total_against',
       'saves_total_against', 'goal_against',]
    for feature in features :
    
        col = {f"cumul_average_{feature}" :
                       df.groupby(
                        ["season", "team"], sort = False)[f'{feature}'].transform(
                        lambda x: x.expanding().mean())}

        df = df.assign(**col)
    
    
    
    return df 

### Moving average features

points_result, goal, goal_against, goal-goal_against

In [130]:
def new_moving_average_features(df) :
    
    # moving average for
    features = ['points_result', 'shot_on_target', 'saves', 'shot_total',
       'saves_total', 'goal', 'points_result_against',
       'shot_on_target_against', 'saves_against', 'shot_total_against',
       'saves_total_against', 'goal_against',]
    
    for feature in features :
        for i in [1,2,3,4,5,6] :
            # create a column moving_average_1_point_result
            #  closed = "right" mean that we 
            col = {f"moving_average_{i}_{feature}" :
                   df.groupby(
                    ["season", "team"], sort = False)[f'{feature}'].transform(
                    lambda x: x.rolling(i, closed = 'left').mean())}
            df = df.assign(**col)
    
    return df 

### Fatigue features

In [131]:
def new_fatigues_features(df) :
    
    # moving average for
    for i in [2,3,4,5,6] :
        df["date_timestamp"] = df.groupby(["season", "team"], sort = False)['date'].transform(
            lambda y: y.rolling(i))
        
        col = {f"date_timestamp" :
               df.groupby(["season", "team"], sort = False)['date'].transform(
                   lambda y: y.rolling(i))}
        
        df = df.assign(**col)
        
        df[f"fatigue_{i-1}_match"] = df[f"date_timestamp"].apply(lambda x : (x.iloc[-1] - x.iloc[0]) if len(x)>=i else np.nan)
        df[f"fatigue_{i-1}_match"] = df[f"fatigue_{i-1}_match"].apply(lambda x : abs(x.days) if x != np.nan else np.nan)
        
    df = df.drop("date_timestamp", axis = 1)
        
    return df
        

# Production

In [132]:
filepath = './Ligue-1-2015-2022.csv'

In [133]:
df_raw = pd.read_csv(DATAPATH /filepath)

In [134]:
features = ['gameweek', 'dayofweek', 'date', 'start_time', 'home_team',
 'score', 'away_team',
 'season', 'possession_home', 'possession_away',
 'shot_on_target_home', 'shot_on_target_away', 'saves_home',
 'saves_away'
           ]

In [135]:
eda_df = df_raw
eda_df = eda_df[features].dropna(axis = 0)
eda_df = new_date_features(eda_df)
eda_df = new_features(eda_df)

  df['week'] = df.date.dt.week


In [136]:
#SEASON = "2015-2016"
#df_unique = eda_df.query(f"(season == '{SEASON}')").drop_duplicates()
df_unique = eda_df.drop_duplicates()

In [137]:
features_home = ["gameweek", "season",
                 "date", "start_time", "home_team",
                 "points_result_home", "shot_on_target_home",
                "saves_home", "shot_total_home", "saves_total_home",
                "goal_home",
                "points_result_away", "shot_on_target_away", 
                "saves_away", "shot_total_away", "saves_total_away",
                "goal_away"]

features_away = ["gameweek", "season", 
                 "date", "start_time","away_team",
                 "points_result_away", "shot_on_target_away", 
                "saves_away", "shot_total_away", "saves_total_away",
                "goal_away",
                "points_result_home", "shot_on_target_home",
                "saves_home", "shot_total_home", "saves_total_home",
                "goal_home",]

In [138]:
# We split home and away match for each team and concatenate after
home_df = df_unique[features_home].groupby(features_home).first().reset_index()
home_df.rename(columns=lambda x: x.replace("_home",'') if "_home" in x else(x.replace("home_",'') if "home_" in x else x), inplace = True)
home_df.rename(columns=lambda x: x.replace("_away",'_against') if "_away" in x else(x.replace("away_",'against_') if "away_" in x else x), inplace = True)

away_df = df_unique[features_away].groupby(features_away).first().reset_index()
away_df.rename(columns=lambda x: x.replace("_away",'') if "_away" in x else(x.replace("away_",'') if "away_" in x else x), inplace = True)
away_df.rename(columns=lambda x: x.replace("_home",'_against') if "_home" in x else(x.replace("home_",'against_') if "home_" in x else x), inplace = True)

concat_team = pd.concat([home_df, away_df])
concat_team = concat_team.sort_values(by = ['season', "gameweek"])

In [139]:
league_table = new_cumul_sum_features(concat_team)
league_table = new_cumul_average_features(league_table)
league_table = new_moving_average_features(league_table)
league_table = new_fatigues_features(league_table)
league_table_test = league_table.copy()

In [140]:
with pd.option_context('display.max_columns', None) :
    display(league_table[league_table['team']== 'Paris S-G'])

Unnamed: 0,gameweek,season,date,start_time,team,points_result,shot_on_target,saves,shot_total,saves_total,goal,points_result_against,shot_on_target_against,saves_against,shot_total_against,saves_total_against,goal_against,cumul_points_result,cumul_shot_on_target,cumul_saves,cumul_shot_total,cumul_saves_total,cumul_goal,cumul_points_result_against,cumul_shot_on_target_against,cumul_saves_against,cumul_shot_total_against,cumul_saves_total_against,cumul_goal_against,goal_difference,cumul_average_points_result,cumul_average_shot_on_target,cumul_average_saves,cumul_average_shot_total,cumul_average_saves_total,cumul_average_goal,cumul_average_points_result_against,cumul_average_shot_on_target_against,cumul_average_saves_against,cumul_average_shot_total_against,cumul_average_saves_total_against,cumul_average_goal_against,moving_average_1_points_result,moving_average_2_points_result,moving_average_3_points_result,moving_average_4_points_result,moving_average_5_points_result,moving_average_6_points_result,moving_average_1_shot_on_target,moving_average_2_shot_on_target,moving_average_3_shot_on_target,moving_average_4_shot_on_target,moving_average_5_shot_on_target,moving_average_6_shot_on_target,moving_average_1_saves,moving_average_2_saves,moving_average_3_saves,moving_average_4_saves,moving_average_5_saves,moving_average_6_saves,moving_average_1_shot_total,moving_average_2_shot_total,moving_average_3_shot_total,moving_average_4_shot_total,moving_average_5_shot_total,moving_average_6_shot_total,moving_average_1_saves_total,moving_average_2_saves_total,moving_average_3_saves_total,moving_average_4_saves_total,moving_average_5_saves_total,moving_average_6_saves_total,moving_average_1_goal,moving_average_2_goal,moving_average_3_goal,moving_average_4_goal,moving_average_5_goal,moving_average_6_goal,moving_average_1_points_result_against,moving_average_2_points_result_against,moving_average_3_points_result_against,moving_average_4_points_result_against,moving_average_5_points_result_against,moving_average_6_points_result_against,moving_average_1_shot_on_target_against,moving_average_2_shot_on_target_against,moving_average_3_shot_on_target_against,moving_average_4_shot_on_target_against,moving_average_5_shot_on_target_against,moving_average_6_shot_on_target_against,moving_average_1_saves_against,moving_average_2_saves_against,moving_average_3_saves_against,moving_average_4_saves_against,moving_average_5_saves_against,moving_average_6_saves_against,moving_average_1_shot_total_against,moving_average_2_shot_total_against,moving_average_3_shot_total_against,moving_average_4_shot_total_against,moving_average_5_shot_total_against,moving_average_6_shot_total_against,moving_average_1_saves_total_against,moving_average_2_saves_total_against,moving_average_3_saves_total_against,moving_average_4_saves_total_against,moving_average_5_saves_total_against,moving_average_6_saves_total_against,moving_average_1_goal_against,moving_average_2_goal_against,moving_average_3_goal_against,moving_average_4_goal_against,moving_average_5_goal_against,moving_average_6_goal_against,fatigue_1_match,fatigue_2_match,fatigue_3_match,fatigue_4_match,fatigue_5_match
0,1.0,2015-2016,2015-08-07,20:30,Paris S-G,3,3,2,7,2,1,0,2,1,12,3,0,3,3,2,7,2,1,0,2,1,12,3,0,1,3.000000,3.000000,2.000000,7.000000,2.000000,1.000000,0.000000,2.000000,1.000000,12.000000,3.000000,0.000000,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
79,2.0,2015-2016,2015-08-16,21:00,Paris S-G,3,6,2,18,2,2,0,2,4,8,6,0,6,9,4,25,4,3,0,4,5,20,9,0,3,3.000000,4.500000,2.000000,12.500000,2.000000,1.500000,0.000000,2.000000,2.500000,10.000000,4.500000,0.000000,3.0,,,,,,3.0,,,,,,2.0,,,,,,7.0,,,,,,2.0,,,,,,1.0,,,,,,0.0,,,,,,2.0,,,,,,1.0,,,,,,12.0,,,,,,3.0,,,,,,0.0,,,,,,9.0,,,,
140,3.0,2015-2016,2015-08-21,20:30,Paris S-G,3,5,3,9,3,1,0,3,4,10,5,0,9,14,7,34,7,4,0,7,9,30,14,0,4,3.000000,4.666667,2.333333,11.333333,2.333333,1.333333,0.000000,2.333333,3.000000,10.000000,4.666667,0.000000,3.0,3.0,,,,,6.0,4.5,,,,,2.0,2.0,,,,,18.0,12.5,,,,,2.0,2.0,,,,,2.0,1.5,,,,,0.0,0.0,,,,,2.0,2.0,,,,,4.0,2.5,,,,,8.0,10.0,,,,,6.0,4.5,,,,,0.0,0.0,,,,,5.0,14.0,,,
219,4.0,2015-2016,2015-08-30,21:00,Paris S-G,3,8,2,19,2,3,0,2,5,3,8,0,12,22,9,53,9,7,0,9,14,33,22,0,7,3.000000,5.500000,2.250000,13.250000,2.250000,1.750000,0.000000,2.250000,3.500000,8.250000,5.500000,0.000000,3.0,3.0,3.000000,,,,5.0,5.5,4.666667,,,,3.0,2.5,2.333333,,,,9.0,13.5,11.333333,,,,3.0,2.5,2.333333,,,,1.0,1.5,1.333333,,,,0.0,0.0,0.000000,,,,3.0,2.5,2.333333,,,,4.0,4.0,3.000000,,,,10.0,9.0,10.000000,,,,5.0,5.5,4.666667,,,,0.0,0.0,0.000000,,,,9.0,14.0,23.0,,
280,5.0,2015-2016,2015-09-11,20:30,Paris S-G,1,5,1,19,3,2,1,3,3,10,5,2,13,27,10,72,12,9,1,12,17,43,27,2,7,2.600000,5.400000,2.000000,14.400000,2.400000,1.800000,0.200000,2.400000,3.400000,8.600000,5.400000,0.400000,3.0,3.0,3.000000,3.0,,,8.0,6.5,6.333333,5.50,,,2.0,2.5,2.333333,2.25,,,19.0,14.0,15.333333,13.25,,,2.0,2.5,2.333333,2.25,,,3.0,2.0,2.000000,1.75,,,0.0,0.0,0.000000,0.00,,,2.0,2.5,2.333333,2.25,,,5.0,4.5,4.333333,3.50,,,3.0,6.5,7.000000,8.25,,,8.0,6.5,6.333333,5.50,,,0.0,0.0,0.000000,0.00,,,12.0,21.0,26.0,35.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2311,34.0,2021-2022,2022-04-23,21:00,Paris S-G,1,6,2,18,3,1,1,3,5,8,6,1,78,171,102,484,131,76,18,131,102,372,171,31,45,2.294118,5.029412,3.000000,14.235294,3.852941,2.235294,0.529412,3.852941,3.000000,10.941176,5.029412,0.911765,3.0,3.0,3.000000,3.0,2.4,2.500000,8.0,5.0,6.333333,6.50,6.2,5.833333,1.0,0.5,1.333333,1.75,2.4,2.833333,15.0,10.5,12.333333,13.00,13.0,12.833333,1.0,1.0,2.000000,2.50,3.4,3.666667,3.0,2.5,3.666667,4.00,3.2,3.166667,0.0,0.0,0.000000,0.00,0.6,0.500000,1.0,1.0,2.000000,2.50,3.4,3.666667,5.0,3.0,3.333333,3.00,3.4,3.000000,8.0,5.5,7.000000,8.00,8.8,9.833333,8.0,5.0,6.333333,6.50,6.2,5.833333,0.0,0.5,0.666667,0.75,1.2,1.000000,3.0,6.0,14.0,20.0,34.0
2369,35.0,2021-2022,2022-04-29,21:00,Paris S-G,1,6,0,14,2,3,1,2,3,14,6,3,79,177,102,498,133,79,19,133,105,386,177,34,45,2.257143,5.057143,2.914286,14.228571,3.800000,2.257143,0.542857,3.800000,3.000000,11.028571,5.057143,0.971429,1.0,2.0,2.333333,2.5,2.6,2.166667,6.0,7.0,5.333333,6.25,6.4,6.166667,2.0,1.5,1.000000,1.50,1.8,2.333333,18.0,16.5,13.000000,13.75,14.0,13.833333,3.0,2.0,1.666667,2.25,2.6,3.333333,1.0,2.0,2.000000,3.00,3.4,2.833333,1.0,0.5,0.333333,0.25,0.2,0.666667,3.0,2.0,1.666667,2.25,2.6,3.333333,5.0,5.0,3.666667,3.75,3.4,3.666667,8.0,8.0,6.333333,7.25,8.0,8.666667,6.0,7.0,5.333333,6.25,6.4,6.166667,1.0,0.5,0.666667,0.75,0.8,1.166667,6.0,9.0,12.0,20.0,26.0
2436,36.0,2021-2022,2022-05-08,20:45,Paris S-G,1,6,1,16,2,2,1,2,5,8,6,2,80,183,103,514,135,81,20,135,110,394,183,36,45,2.222222,5.083333,2.861111,14.277778,3.750000,2.250000,0.555556,3.750000,3.055556,10.944444,5.083333,1.000000,1.0,1.0,1.666667,2.0,2.2,2.333333,6.0,6.0,6.666667,5.50,6.2,6.333333,0.0,1.0,1.000000,0.75,1.2,1.500000,14.0,16.0,15.666667,13.25,13.8,14.000000,2.0,2.5,2.000000,1.75,2.2,2.500000,3.0,2.0,2.333333,2.25,3.0,3.333333,1.0,1.0,0.666667,0.50,0.4,0.333333,2.0,2.5,2.000000,1.75,2.2,2.500000,3.0,4.0,4.333333,3.50,3.6,3.333333,14.0,11.0,10.000000,8.25,8.6,9.000000,6.0,6.0,6.666667,5.50,6.2,6.333333,3.0,2.0,1.333333,1.25,1.2,1.166667,9.0,15.0,18.0,21.0,29.0
2497,37.0,2021-2022,2022-05-14,21:00,Paris S-G,3,8,3,14,3,4,0,3,5,10,8,0,83,191,106,528,138,85,20,138,115,404,191,36,49,2.243243,5.162162,2.864865,14.270270,3.729730,2.297297,0.540541,3.729730,3.108108,10.918919,5.162162,0.972973,1.0,1.0,1.000000,1.5,1.8,2.000000,6.0,6.0,6.000000,6.50,5.6,6.166667,1.0,0.5,1.000000,1.00,0.8,1.166667,16.0,15.0,16.000000,15.75,13.8,14.166667,2.0,2.0,2.333333,2.00,1.8,2.166667,2.0,2.5,2.000000,2.25,2.2,2.833333,1.0,1.0,1.000000,0.75,0.6,0.500000,2.0,2.0,2.333333,2.00,1.8,2.166667,5.0,4.0,4.333333,4.50,3.8,3.833333,8.0,11.0,10.000000,9.50,8.2,8.500000,6.0,6.0,6.000000,6.50,5.6,6.166667,2.0,2.5,2.000000,1.50,1.4,1.333333,6.0,15.0,21.0,24.0,27.0


# DEV - Study on Ligue 1 data

In [41]:
filepath = './Ligue-1-2015-2022.csv'

In [508]:
df_raw = pd.read_csv(filepath)

In [509]:
features = ['gameweek', 'dayofweek', 'date', 'start_time', 'home_team',
 'score', 'away_team',
 'season', 'possession_home', 'possession_away',
 'shot_on_target_home', 'shot_on_target_away', 'saves_home',
 'saves_away'
           ]

In [554]:
eda_df = df_raw
eda_df = eda_df[features].dropna(axis = 0)
eda_df = new_date_features(eda_df)
eda_df = new_features(eda_df)

  df['week'] = df.date.dt.week


In [555]:
with pd.option_context('display.max_columns', None) :
    display(eda_df.head())

Unnamed: 0,gameweek,dayofweek,date,start_time,home_team,score,away_team,season,possession_home,possession_away,shot_on_target_home,shot_on_target_away,saves_home,saves_away,year,quarter,month,week,day,weekday,is_monday,is_tuesday,is_wednesday,is_thursday,is_friday,is_saturday,is_sunday,shot_on_target_home_raw,shot_on_target_away_raw,shot_total_home,shot_total_away,saves_home_raw,saves_away_raw,saves_total_home,saves_total_away,goal_home,goal_away,points_result_home,points_result_away
0,1.0,Fri,2015-08-07,20:30,Lille,0–1,Paris S-G,2015-2016,0.52,0.48,2,3,1,2,2015,3,8,32,7,4,0,0,0,0,1,0,0,2 of 12 — 17%,43% — 3 of 7,12,7,1 of 3 — 33%,100% — 2 of 2,3,2,0,1,0,3
1,1.0,Sat,2015-08-08,21:00,Montpellier,0–2,Angers,2015-2016,0.63,0.37,4,6,4,4,2015,3,8,32,8,5,0,0,0,0,0,1,0,4 of 11 — 36%,38% — 6 of 16,11,16,4 of 6 — 66%,100% — 4 of 4,6,4,0,2,0,3
2,1.0,Sat,2015-08-08,21:00,Nantes,1–0,Guingamp,2015-2016,0.56,0.44,4,4,4,4,2015,3,8,32,8,5,0,0,0,0,0,1,0,4 of 9 — 44%,40% — 4 of 10,9,10,4 of 4 — 100%,100% — 4 of 4,4,4,1,0,3,0
3,1.0,Sat,2015-08-08,21:00,Troyes,0–0,Gazélec Ajaccio,2015-2016,0.53,0.47,4,1,1,4,2015,3,8,32,8,5,0,0,0,0,0,1,0,4 of 12 — 33%,14% — 1 of 7,12,7,1 of 1 — 100%,100% — 4 of 4,1,4,0,0,1,1
4,1.0,Sat,2015-08-08,21:00,Nice,1–2,Monaco,2015-2016,0.43,0.57,2,6,4,1,2015,3,8,32,8,5,0,0,0,0,0,1,0,2 of 6 — 33%,33% — 6 of 18,6,18,4 of 6 — 66%,50% — 1 of 2,6,2,1,2,0,3


### Dev

In [180]:
eda_df.head()

Unnamed: 0,gameweek,dayofweek,date,start_time,home_team,score,away_team,season,possession_home,possession_away,shot_on_target_home,shot_on_target_away,saves_home,saves_away,year,quarter,month,week,day,weekday,is_monday,is_tuesday,is_wednesday,is_thursday,is_friday,is_saturday,is_sunday,shot_on_target_home_raw,shot_on_target_away_raw,shot_total_home,shot_total_away,saves_home_raw,saves_away_raw,saves_total_home,saves_total_away,score_home,score_away
0,1.0,Fri,2015-08-07,20:30,Lille,0–1,Paris S-G,2015-2016,0.52,0.48,2,3,1,2,2015,3,8,32,7,4,0,0,0,0,1,0,0,2 of 12 — 17%,43% — 3 of 7,12,7,1 of 3 — 33%,100% — 2 of 2,3,2,0,1
1,1.0,Sat,2015-08-08,21:00,Montpellier,0–2,Angers,2015-2016,0.63,0.37,4,6,4,4,2015,3,8,32,8,5,0,0,0,0,0,1,0,4 of 11 — 36%,38% — 6 of 16,11,16,4 of 6 — 66%,100% — 4 of 4,6,4,0,2
2,1.0,Sat,2015-08-08,21:00,Nantes,1–0,Guingamp,2015-2016,0.56,0.44,4,4,4,4,2015,3,8,32,8,5,0,0,0,0,0,1,0,4 of 9 — 44%,40% — 4 of 10,9,10,4 of 4 — 100%,100% — 4 of 4,4,4,1,0
3,1.0,Sat,2015-08-08,21:00,Troyes,0–0,Gazélec Ajaccio,2015-2016,0.53,0.47,4,1,1,4,2015,3,8,32,8,5,0,0,0,0,0,1,0,4 of 12 — 33%,14% — 1 of 7,12,7,1 of 1 — 100%,100% — 4 of 4,1,4,0,0
4,1.0,Sat,2015-08-08,21:00,Nice,1–2,Monaco,2015-2016,0.43,0.57,2,6,4,1,2015,3,8,32,8,5,0,0,0,0,0,1,0,2 of 6 — 33%,33% — 6 of 18,6,18,4 of 6 — 66%,50% — 1 of 2,6,2,1,2


In [269]:
TEAM = 'Paris S-G'
SEASON = '2015-2016' 
sgl_team = eda_df.query(f"(home_team == '{TEAM}' or away_team == '{TEAM}') and (season == '{SEASON}')")

In [249]:
sgl_team.head()

Unnamed: 0,gameweek,dayofweek,date,start_time,home_team,score,away_team,season,possession_home,possession_away,shot_on_target_home,shot_on_target_away,saves_home,saves_away,year,quarter,month,week,day,weekday,is_monday,is_tuesday,is_wednesday,is_thursday,is_friday,is_saturday,is_sunday,shot_on_target_home_raw,shot_on_target_away_raw,shot_total_home,shot_total_away,saves_home_raw,saves_away_raw,saves_total_home,saves_total_away,score_home,score_away,points_result_home,points_result_away
0,1.0,Fri,2015-08-07,20:30,Lille,0–1,Paris S-G,2015-2016,0.52,0.48,2,3,1,2,2015,3,8,32,7,4,0,0,0,0,1,0,0,2 of 12 — 17%,43% — 3 of 7,12,7,1 of 3 — 33%,100% — 2 of 2,3,2,0,1,0,3
19,2.0,Sun,2015-08-16,21:00,Paris S-G,2–0,Gazélec Ajaccio,2015-2016,0.7,0.3,6,2,2,4,2015,3,8,33,16,6,0,0,0,0,0,0,1,6 of 18 — 33%,25% — 2 of 8,18,8,2 of 2 — 100%,66% — 4 of 6,2,6,2,0,3,0
20,3.0,Fri,2015-08-21,20:30,Montpellier,0–1,Paris S-G,2015-2016,0.36,0.64,3,5,4,3,2015,3,8,34,21,4,0,0,0,0,1,0,0,3 of 10 — 30%,56% — 5 of 9,10,9,4 of 5 — 80%,100% — 3 of 3,5,3,0,1,0,3
39,4.0,Sun,2015-08-30,21:00,Monaco,0–3,Paris S-G,2015-2016,0.29,0.71,2,8,5,2,2015,3,8,35,30,6,0,0,0,0,0,0,1,2 of 3 — 67%,42% — 8 of 19,3,19,5 of 8 — 62%,100% — 2 of 2,8,2,0,3,0,3
40,5.0,Fri,2015-09-11,20:30,Paris S-G,2–2,Bordeaux,2015-2016,0.71,0.29,5,3,1,3,2015,3,9,37,11,4,0,0,0,0,1,0,0,5 of 19 — 26%,30% — 3 of 10,19,10,1 of 3 — 33%,60% — 3 of 5,3,5,2,2,1,1


In [250]:
teams = sgl_team["home_team"].unique().tolist()

In [240]:
team = [teams[1]]

In [270]:
for t in team :
    #away_event = sgl_team[sgl_team["away_team"] == t] # away game for one team 
    #home_event = sgl_team[sgl_team["home_team"] == t] # home game for one team 
    sgl_team["away_cumulatif_point"] = sgl_team["points_result_away"].cumsum() # cumul point away
    sgl_team["home_cumulatif_point"] = sgl_team["points_result_home"].cumsum() # cumul point home

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sgl_team["away_cumulatif_point"] = sgl_team["points_result_away"].cumsum() # cumul point away
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sgl_team["home_cumulatif_point"] = sgl_team["points_result_home"].cumsum() # cumul point home


In [320]:
sgl_season = eda_df.query(f"(season == '{SEASON}')")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = value


In [330]:
sgl_season.head()

Unnamed: 0,gameweek,dayofweek,date,start_time,home_team,score,away_team,season,possession_home,possession_away,shot_on_target_home,shot_on_target_away,saves_home,saves_away,year,quarter,month,week,day,weekday,is_monday,is_tuesday,is_wednesday,is_thursday,is_friday,is_saturday,is_sunday,shot_on_target_home_raw,shot_on_target_away_raw,shot_total_home,shot_total_away,saves_home_raw,saves_away_raw,saves_total_home,saves_total_away,score_home,score_away,points_result_home,points_result_away,away_home_cumulatif_point,away_away_cumulatif_point,home_home_cumulatif_point,home_away_cumulatif_point
0,1.0,Fri,2015-08-07,20:30,Lille,0–1,Paris S-G,2015-2016,0.52,0.48,2,3,1,2,2015,3,8,32,7,4,0,0,0,0,1,0,0,2 of 12 — 17%,43% — 3 of 7,12,7,1 of 3 — 33%,100% — 2 of 2,3,2,0,1,0,3,,,,
1,1.0,Sat,2015-08-08,21:00,Montpellier,0–2,Angers,2015-2016,0.63,0.37,4,6,4,4,2015,3,8,32,8,5,0,0,0,0,0,1,0,4 of 11 — 36%,38% — 6 of 16,11,16,4 of 6 — 66%,100% — 4 of 4,6,4,0,2,0,3,,,,
2,1.0,Sat,2015-08-08,21:00,Nantes,1–0,Guingamp,2015-2016,0.56,0.44,4,4,4,4,2015,3,8,32,8,5,0,0,0,0,0,1,0,4 of 9 — 44%,40% — 4 of 10,9,10,4 of 4 — 100%,100% — 4 of 4,4,4,1,0,3,0,,,,
3,1.0,Sat,2015-08-08,21:00,Troyes,0–0,Gazélec Ajaccio,2015-2016,0.53,0.47,4,1,1,4,2015,3,8,32,8,5,0,0,0,0,0,1,0,4 of 12 — 33%,14% — 1 of 7,12,7,1 of 1 — 100%,100% — 4 of 4,1,4,0,0,1,1,,,,
4,1.0,Sat,2015-08-08,21:00,Nice,1–2,Monaco,2015-2016,0.43,0.57,2,6,4,1,2015,3,8,32,8,5,0,0,0,0,0,1,0,2 of 6 — 33%,33% — 6 of 18,6,18,4 of 6 — 66%,50% — 1 of 2,6,2,1,2,0,3,,,,


In [None]:
for t in teams :
    print(t)
    sgl_team_away = sgl_season.query(f"(away_team == '{t}')")
    sgl_team_home = sgl_season.query(f"(home_team == '{t}')")
    sgl_team_away["away_cumulatif_point"] = sgl_team_away["points_result_away"].cumsum() # cumul point away
    sgl_team_home["home_cumulatif_point"] = sgl_team_home["points_result_home"].cumsum() # cumul point home
    x
    sgl_season.at[sgl_team_home.index,'home_cumulatif_point']= sgl_team_home["home_home_cumulatif_point"]
    sgl_season.at[sgl_team_away.index,'away_cumulatif_point']= sgl_team_away["away_away_cumulatif_point"]
    
    
    
    sgl_season.at[sgl_team_home.index,'home_cumulatif_point']= sgl_team_home["away_home_cumulatif_point"]
    sgl_season.at[sgl_team_away.index,'away_cumulatif_point']= sgl_team_away["away_away_cumulatif_point"]
    

Lille
Paris S-G
Montpellier
Monaco
Reims
Nantes
Bastia
Rennes
Lorient
Angers
Nice
Caen
Toulouse
Saint-Étienne
Marseille
Lyon
Troyes
Guingamp
Gazélec Ajaccio
Bordeaux


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sgl_team_away["away_cumulatif_point"] = sgl_team_away["points_result_away"].cumsum() # cumul point away
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sgl_team_home["home_cumulatif_point"] = sgl_team_home["points_result_home"].cumsum() # cumul point home


In [351]:
sgl_season.head()

Unnamed: 0,gameweek,dayofweek,date,start_time,home_team,score,away_team,season,possession_home,possession_away,shot_on_target_home,shot_on_target_away,saves_home,saves_away,year,quarter,month,week,day,weekday,is_monday,is_tuesday,is_wednesday,is_thursday,is_friday,is_saturday,is_sunday,shot_on_target_home_raw,shot_on_target_away_raw,shot_total_home,shot_total_away,saves_home_raw,saves_away_raw,saves_total_home,saves_total_away,score_home,score_away,points_result_home,points_result_away,away_home_cumulatif_point,away_away_cumulatif_point,home_home_cumulatif_point,home_away_cumulatif_point
0,1.0,Fri,2015-08-07,20:30,Lille,0–1,Paris S-G,2015-2016,0.52,0.48,2,3,1,2,2015,3,8,32,7,4,0,0,0,0,1,0,0,2 of 12 — 17%,43% — 3 of 7,12,7,1 of 3 — 33%,100% — 2 of 2,3,2,0,1,0,3,,,,
1,1.0,Sat,2015-08-08,21:00,Montpellier,0–2,Angers,2015-2016,0.63,0.37,4,6,4,4,2015,3,8,32,8,5,0,0,0,0,0,1,0,4 of 11 — 36%,38% — 6 of 16,11,16,4 of 6 — 66%,100% — 4 of 4,6,4,0,2,0,3,,,,
2,1.0,Sat,2015-08-08,21:00,Nantes,1–0,Guingamp,2015-2016,0.56,0.44,4,4,4,4,2015,3,8,32,8,5,0,0,0,0,0,1,0,4 of 9 — 44%,40% — 4 of 10,9,10,4 of 4 — 100%,100% — 4 of 4,4,4,1,0,3,0,,,,
3,1.0,Sat,2015-08-08,21:00,Troyes,0–0,Gazélec Ajaccio,2015-2016,0.53,0.47,4,1,1,4,2015,3,8,32,8,5,0,0,0,0,0,1,0,4 of 12 — 33%,14% — 1 of 7,12,7,1 of 1 — 100%,100% — 4 of 4,1,4,0,0,1,1,,,,
4,1.0,Sat,2015-08-08,21:00,Nice,1–2,Monaco,2015-2016,0.43,0.57,2,6,4,1,2015,3,8,32,8,5,0,0,0,0,0,1,0,2 of 6 — 33%,33% — 6 of 18,6,18,4 of 6 — 66%,50% — 1 of 2,6,2,1,2,0,3,,,,


In [349]:
aa = sgl_season.groupby(['gameweek', 'dayofweek'])[["gameweek", "dayofweek"]].first()

# Reset the index of the grouped dataframe

## Create a ranking league table, reconstruct points/goal week by week team by team for all season

In [639]:
SEASON = "2015-2016"

In [640]:
df_unique = eda_df.query(f"(season == '{SEASON}')").drop_duplicates()

In [641]:
df_unique.head()

Unnamed: 0,gameweek,dayofweek,date,start_time,home_team,score,away_team,season,possession_home,possession_away,shot_on_target_home,shot_on_target_away,saves_home,saves_away,year,quarter,month,week,day,weekday,is_monday,is_tuesday,is_wednesday,is_thursday,is_friday,is_saturday,is_sunday,shot_on_target_home_raw,shot_on_target_away_raw,shot_total_home,shot_total_away,saves_home_raw,saves_away_raw,saves_total_home,saves_total_away,goal_home,goal_away,points_result_home,points_result_away
0,1.0,Fri,2015-08-07,20:30,Lille,0–1,Paris S-G,2015-2016,0.52,0.48,2,3,1,2,2015,3,8,32,7,4,0,0,0,0,1,0,0,2 of 12 — 17%,43% — 3 of 7,12,7,1 of 3 — 33%,100% — 2 of 2,3,2,0,1,0,3
1,1.0,Sat,2015-08-08,21:00,Montpellier,0–2,Angers,2015-2016,0.63,0.37,4,6,4,4,2015,3,8,32,8,5,0,0,0,0,0,1,0,4 of 11 — 36%,38% — 6 of 16,11,16,4 of 6 — 66%,100% — 4 of 4,6,4,0,2,0,3
2,1.0,Sat,2015-08-08,21:00,Nantes,1–0,Guingamp,2015-2016,0.56,0.44,4,4,4,4,2015,3,8,32,8,5,0,0,0,0,0,1,0,4 of 9 — 44%,40% — 4 of 10,9,10,4 of 4 — 100%,100% — 4 of 4,4,4,1,0,3,0
3,1.0,Sat,2015-08-08,21:00,Troyes,0–0,Gazélec Ajaccio,2015-2016,0.53,0.47,4,1,1,4,2015,3,8,32,8,5,0,0,0,0,0,1,0,4 of 12 — 33%,14% — 1 of 7,12,7,1 of 1 — 100%,100% — 4 of 4,1,4,0,0,1,1
4,1.0,Sat,2015-08-08,21:00,Nice,1–2,Monaco,2015-2016,0.43,0.57,2,6,4,1,2015,3,8,32,8,5,0,0,0,0,0,1,0,2 of 6 — 33%,33% — 6 of 18,6,18,4 of 6 — 66%,50% — 1 of 2,6,2,1,2,0,3


In [642]:
features_home = ["gameweek", "season",
                 "date", "start_time", "home_team",
                 "points_result_home", "shot_on_target_home",
                "saves_home", "shot_total_home", "saves_total_home",
                "goal_home",
                "points_result_away", "shot_on_target_away", 
                "saves_away", "shot_total_away", "saves_total_away",
                "goal_away"]

features_away = ["gameweek", "season", 
                 "date", "start_time","away_team",
                 "points_result_away", "shot_on_target_away", 
                "saves_away", "shot_total_away", "saves_total_away",
                "goal_away",
                "points_result_home", "shot_on_target_home",
                "saves_home", "shot_total_home", "saves_total_home",
                "goal_home",]

In [655]:
# We split home and away match for each team and concatenate after
home_df = df_unique[features_home].groupby(features_home).first().reset_index()
home_df.rename(columns=lambda x: x.replace("_home",'') if "_home" in x else(x.replace("home_",'') if "home_" in x else x), inplace = True)
home_df.rename(columns=lambda x: x.replace("_away",'_against') if "_away" in x else(x.replace("away_",'against_') if "away_" in x else x), inplace = True)

away_df = df_unique[features_away].groupby(features_away).first().reset_index()
away_df.rename(columns=lambda x: x.replace("_away",'') if "_away" in x else(x.replace("away_",'') if "away_" in x else x), inplace = True)
away_df.rename(columns=lambda x: x.replace("_home",'_against') if "_home" in x else(x.replace("home_",'against_') if "home_" in x else x), inplace = True)

concat_team = pd.concat([home_df, away_df])
concat_team = concat_team.sort_values(by = ['gameweek', "season"])

In [656]:
concat_team.head()

Unnamed: 0,gameweek,season,date,start_time,team,points_result,shot_on_target,saves,shot_total,saves_total,goal,points_result_against,shot_on_target_against,saves_against,shot_total_against,saves_total_against,goal_against
0,1.0,2015-2016,2015-08-07,20:30,Lille,0,2,1,12,3,0,3,3,2,7,2,1
1,1.0,2015-2016,2015-08-08,21:00,Bastia,3,3,2,11,3,2,0,3,0,6,3,1
2,1.0,2015-2016,2015-08-08,21:00,Marseille,0,6,3,18,4,0,3,4,5,8,6,1
3,1.0,2015-2016,2015-08-08,21:00,Montpellier,0,4,4,11,6,0,3,6,4,16,4,2
4,1.0,2015-2016,2015-08-08,21:00,Nantes,3,4,4,9,4,1,0,4,4,10,4,0


### Cumulative features

### Dev

In [989]:
league_table = new_cumul_sum_features(concat_team)
league_table = new_cumul_average_features(league_table)
league_table = new_moving_average_features(league_table)
league_table = new_fatigues_features(league_table)
league_table_test = league_table.copy()

In [990]:
league_table[league_table["team"] == "Paris S-G"].tail()

Unnamed: 0,gameweek,season,date,start_time,team,points_result,shot_on_target,saves,shot_total,saves_total,goal,points_result_against,shot_on_target_against,saves_against,shot_total_against,saves_total_against,goal_against,cumul_points_result,cumul_shot_on_target,cumul_saves,cumul_shot_total,cumul_goal,cumul_points_result_against,cumul_shot_on_target_against,cumul_saves_against,cumul_shot_total_against,cumul_goal_against,goal_difference,cumul_average_points_result,cumul_average_shot_on_target,cumul_average_save,cumul_average_shot_total,cumul_average_goal,cumul_average_points_result_against,cumul_average_shot_on_target_against,cumul_average_save_against,cumul_average_shot_total_against,cumul_average_goal_against,moving_average_2_points_result,moving_average_2_goal,moving_average_2_points_result_against,moving_average_2_goal_against,moving_average_3_points_result,moving_average_3_goal,moving_average_3_points_result_against,moving_average_3_goal_against,moving_average_6_points_result,moving_average_6_goal,moving_average_6_points_result_against,moving_average_6_goal_against,fatigue_1_match,fatigue_2_match,fatigue_3_match
331,34.0,2015-2016,2016-04-16,17:00,Paris S-G,3,9,7,21,8,6,0,8,4,15,9,0,86,219,89,499,89,11,108,125,326,18,71,2.529412,6.441176,2.617647,14.676471,2.617647,0.323529,3.176471,3.676471,9.588235,0.529412,3.0,4.0,0.0,0.0,3.0,4.0,0.0,0.333333,2.166667,3.5,0.666667,0.5,7.0,14.0,27.0
349,35.0,2015-2016,2016-05-11,20:30,Paris S-G,1,2,2,9,3,1,1,3,1,8,2,1,87,221,91,508,90,12,111,126,334,19,71,2.485714,6.314286,2.6,14.514286,2.571429,0.342857,3.171429,3.6,9.542857,0.542857,2.0,3.5,0.5,0.5,2.333333,3.0,0.333333,0.333333,2.166667,3.666667,0.666667,0.666667,25.0,32.0,39.0
350,36.0,2015-2016,2016-04-29,20:30,Paris S-G,3,12,2,27,2,4,0,2,7,9,12,0,90,233,93,535,94,12,113,133,343,19,75,2.5,6.472222,2.583333,14.861111,2.611111,0.333333,3.138889,3.694444,9.527778,0.527778,2.0,2.5,0.5,0.5,2.333333,3.666667,0.333333,0.333333,2.166667,2.833333,0.666667,0.666667,12.0,13.0,20.0
365,37.0,2015-2016,2016-05-07,21:00,Paris S-G,3,6,0,9,1,4,0,1,2,7,6,0,93,239,93,544,98,12,114,135,350,19,79,2.513514,6.459459,2.513514,14.702703,2.648649,0.324324,3.081081,3.648649,9.459459,0.513514,3.0,4.0,0.0,0.0,2.333333,3.0,0.333333,0.333333,2.666667,3.5,0.166667,0.333333,8.0,4.0,21.0
375,38.0,2015-2016,2016-05-14,21:00,Paris S-G,3,13,4,20,3,4,0,3,9,13,13,0,96,252,97,564,102,12,117,144,363,19,83,2.526316,6.631579,2.552632,14.842105,2.684211,0.315789,3.078947,3.789474,9.552632,0.5,3.0,4.0,0.0,0.0,3.0,4.0,0.0,0.0,2.666667,3.5,0.166667,0.166667,7.0,15.0,3.0


# DEV - Update original CSV with transformed data

## Add ID for each GAME and TEAM

   - GAME_ID : unique ID for each game
   - TEAM_ID : unique ID for each team 

In [32]:
with pd.option_context('display.max_columns', None) :
    display(eda_df.head())

Unnamed: 0,gameweek,dayofweek,date,start_time,home_team,score,away_team,season,possession_home,possession_away,shot_on_target_home,shot_on_target_away,saves_home,saves_away,year,quarter,month,week,day,weekday,is_monday,is_tuesday,is_wednesday,is_thursday,is_friday,is_saturday,is_sunday,shot_on_target_home_raw,shot_on_target_away_raw,shot_total_home,shot_total_away,saves_home_raw,saves_away_raw,saves_total_home,saves_total_away,goal_home,goal_away,points_result_home,points_result_away
0,1.0,Fri,2015-08-07,20:30,Lille,0–1,Paris S-G,2015-2016,0.52,0.48,2,3,1,2,2015,3,8,32,7,4,0,0,0,0,1,0,0,2 of 12 — 17%,43% — 3 of 7,12,7,1 of 3 — 33%,100% — 2 of 2,3,2,0,1,0,3
1,1.0,Sat,2015-08-08,21:00,Montpellier,0–2,Angers,2015-2016,0.63,0.37,4,6,4,4,2015,3,8,32,8,5,0,0,0,0,0,1,0,4 of 11 — 36%,38% — 6 of 16,11,16,4 of 6 — 66%,100% — 4 of 4,6,4,0,2,0,3
2,1.0,Sat,2015-08-08,21:00,Nantes,1–0,Guingamp,2015-2016,0.56,0.44,4,4,4,4,2015,3,8,32,8,5,0,0,0,0,0,1,0,4 of 9 — 44%,40% — 4 of 10,9,10,4 of 4 — 100%,100% — 4 of 4,4,4,1,0,3,0
3,1.0,Sat,2015-08-08,21:00,Troyes,0–0,Gazélec Ajaccio,2015-2016,0.53,0.47,4,1,1,4,2015,3,8,32,8,5,0,0,0,0,0,1,0,4 of 12 — 33%,14% — 1 of 7,12,7,1 of 1 — 100%,100% — 4 of 4,1,4,0,0,1,1
4,1.0,Sat,2015-08-08,21:00,Nice,1–2,Monaco,2015-2016,0.43,0.57,2,6,4,1,2015,3,8,32,8,5,0,0,0,0,0,1,0,2 of 6 — 33%,33% — 6 of 18,6,18,4 of 6 — 66%,50% — 1 of 2,6,2,1,2,0,3


In [145]:
def new_unique_id(df) :
    
    # GAME_ID : YEAR_id, ex : 2022380 : last game of 2022
    df["dummy_id"] = 1
    df["id"] = df.groupby('season')["dummy_id"].agg('cumsum')
    df["GAME_ID"] = df.apply(lambda x : int(f"{x['year']}{x['id']:03d}"), axis = 1)
    
    df = df.drop(["dummy_id", "id"], axis = 1)
    
    return df
    

In [146]:
eda_df = new_unique_id(eda_df)
league_table["TEAM_ID"] = league_table['team'].astype('category').cat.codes

### Add league table attributes

In [160]:
with pd.option_context('display.max_columns', None) :
    display(league_table.head(1))

Unnamed: 0,gameweek,season,date,start_time,team,points_result,shot_on_target,saves,shot_total,saves_total,goal,points_result_against,shot_on_target_against,saves_against,shot_total_against,saves_total_against,goal_against,cumul_points_result,cumul_shot_on_target,cumul_saves,cumul_shot_total,cumul_goal,cumul_points_result_against,cumul_shot_on_target_against,cumul_saves_against,cumul_shot_total_against,cumul_goal_against,goal_difference,cumul_average_points_result,cumul_average_shot_on_target,cumul_average_save,cumul_average_shot_total,cumul_average_goal,cumul_average_points_result_against,cumul_average_shot_on_target_against,cumul_average_save_against,cumul_average_shot_total_against,cumul_average_goal_against,moving_average_2_points_result,moving_average_2_goal,moving_average_2_points_result_against,moving_average_2_goal_against,moving_average_3_points_result,moving_average_3_goal,moving_average_3_points_result_against,moving_average_3_goal_against,moving_average_6_points_result,moving_average_6_goal,moving_average_6_points_result_against,moving_average_6_goal_against,fatigue_1_match,fatigue_2_match,fatigue_3_match,TEAM_ID
0,1.0,2015-2016,2015-08-07,20:30,Lille,0,2,1,12,3,0,3,3,2,7,2,1,0,2,1,12,0,3,3,2,7,1,-1,0.0,2.0,1.0,12.0,0.0,3.0,3.0,2.0,7.0,1.0,,,,,,,,,,,,,,,,11


In [176]:
features_league_table = ['gameweek', 'season', 'team',
                         'cumul_points_result', 'cumul_shot_on_target', 'cumul_saves',
       'cumul_shot_total', 'cumul_goal', 'cumul_points_result_against',
       'cumul_shot_on_target_against', 'cumul_saves_against',
       'cumul_shot_total_against', 'cumul_goal_against', 'goal_difference',
       'cumul_average_points_result', 'cumul_average_shot_on_target',
       'cumul_average_save', 'cumul_average_shot_total', 'cumul_average_goal',
       'cumul_average_points_result_against',
       'cumul_average_shot_on_target_against', 'cumul_average_save_against',
       'cumul_average_shot_total_against', 'cumul_average_goal_against',
       'moving_average_2_points_result', 'moving_average_2_goal',
       'moving_average_2_points_result_against',
       'moving_average_2_goal_against', 'moving_average_3_points_result',
       'moving_average_3_goal', 'moving_average_3_points_result_against',
       'moving_average_3_goal_against', 'moving_average_6_points_result',
       'moving_average_6_goal', 'moving_average_6_points_result_against',
       'moving_average_6_goal_against', 'fatigue_1_match', 'fatigue_2_match',
       'fatigue_3_match', 'TEAM_ID']

In [191]:
# We merge at first the features of the HOME teams with the suffix HOMME
games = eda_df.merge(
    league_table[features_league_table].add_suffix('_HOME'), how = 'left', 
    left_on = ['gameweek', 'home_team', 'season'],
    right_on = ['gameweek_HOME', 'team_HOME', 'season_HOME'])

games = games.merge(
    league_table[features_league_table].add_suffix('_AWAY'), how = 'left', 
    left_on = ['gameweek', 'away_team', 'season'],
    right_on = ['gameweek_AWAY', 'team_AWAY', 'season_AWAY'])

In [194]:
features_sort = ['GAME_ID', 'TEAM_ID_HOME', 'TEAM_ID_AWAY', 'season', 'gameweek', 'date', 'start_time', 'home_team',
       'score', 'away_team', 'possession_home',
       'possession_away', 'shot_on_target_home', 'shot_on_target_away',
       'saves_home', 'saves_away', 'year', 'quarter', 'month', 'week',
       'day', 'weekday', 'is_monday', 'is_tuesday', 'is_wednesday',
       'is_thursday', 'is_friday', 'is_saturday', 'is_sunday',
       'shot_total_home', 'shot_total_away',
       'saves_total_home', 'saves_total_away',
       'goal_home', 'goal_away', 'points_result_home',
       'points_result_away','cumul_points_result_HOME',
       'cumul_shot_on_target_HOME', 'cumul_saves_HOME',
       'cumul_shot_total_HOME', 'cumul_goal_HOME',
       'cumul_points_result_against_HOME',
       'cumul_shot_on_target_against_HOME', 'cumul_saves_against_HOME',
       'cumul_shot_total_against_HOME', 'cumul_goal_against_HOME',
       'goal_difference_HOME', 'cumul_average_points_result_HOME',
       'cumul_average_shot_on_target_HOME', 'cumul_average_save_HOME',
       'cumul_average_shot_total_HOME', 'cumul_average_goal_HOME',
       'cumul_average_points_result_against_HOME',
       'cumul_average_shot_on_target_against_HOME',
       'cumul_average_save_against_HOME',
       'cumul_average_shot_total_against_HOME',
       'cumul_average_goal_against_HOME',
       'moving_average_2_points_result_HOME',
       'moving_average_2_goal_HOME',
       'moving_average_2_points_result_against_HOME',
       'moving_average_2_goal_against_HOME',
       'moving_average_3_points_result_HOME',
       'moving_average_3_goal_HOME',
       'moving_average_3_points_result_against_HOME',
       'moving_average_3_goal_against_HOME',
       'moving_average_6_points_result_HOME',
       'moving_average_6_goal_HOME',
       'moving_average_6_points_result_against_HOME',
       'moving_average_6_goal_against_HOME', 'fatigue_1_match_HOME',
       'fatigue_2_match_HOME', 'fatigue_3_match_HOME',
       'cumul_points_result_AWAY', 'cumul_shot_on_target_AWAY',
       'cumul_saves_AWAY', 'cumul_shot_total_AWAY', 'cumul_goal_AWAY',
       'cumul_points_result_against_AWAY',
       'cumul_shot_on_target_against_AWAY', 'cumul_saves_against_AWAY',
       'cumul_shot_total_against_AWAY', 'cumul_goal_against_AWAY',
       'goal_difference_AWAY', 'cumul_average_points_result_AWAY',
       'cumul_average_shot_on_target_AWAY', 'cumul_average_save_AWAY',
       'cumul_average_shot_total_AWAY', 'cumul_average_goal_AWAY',
       'cumul_average_points_result_against_AWAY',
       'cumul_average_shot_on_target_against_AWAY',
       'cumul_average_save_against_AWAY',
       'cumul_average_shot_total_against_AWAY',
       'cumul_average_goal_against_AWAY',
       'moving_average_2_points_result_AWAY',
       'moving_average_2_goal_AWAY',
       'moving_average_2_points_result_against_AWAY',
       'moving_average_2_goal_against_AWAY',
       'moving_average_3_points_result_AWAY',
       'moving_average_3_goal_AWAY',
       'moving_average_3_points_result_against_AWAY',
       'moving_average_3_goal_against_AWAY',
       'moving_average_6_points_result_AWAY',
       'moving_average_6_goal_AWAY',
       'moving_average_6_points_result_against_AWAY',
       'moving_average_6_goal_against_AWAY', 'fatigue_1_match_AWAY',
       'fatigue_2_match_AWAY', 'fatigue_3_match_AWAY']

In [195]:
games = games[features_sort]

In [196]:
with pd.option_context('display.max_columns', None) :
    display(games.head(1))

Unnamed: 0,GAME_ID,TEAM_ID_HOME,TEAM_ID_AWAY,season,gameweek,date,start_time,home_team,score,away_team,possession_home,possession_away,shot_on_target_home,shot_on_target_away,saves_home,saves_away,year,quarter,month,week,day,weekday,is_monday,is_tuesday,is_wednesday,is_thursday,is_friday,is_saturday,is_sunday,shot_total_home,shot_total_away,saves_total_home,saves_total_away,goal_home,goal_away,points_result_home,points_result_away,cumul_points_result_HOME,cumul_shot_on_target_HOME,cumul_saves_HOME,cumul_shot_total_HOME,cumul_goal_HOME,cumul_points_result_against_HOME,cumul_shot_on_target_against_HOME,cumul_saves_against_HOME,cumul_shot_total_against_HOME,cumul_goal_against_HOME,goal_difference_HOME,cumul_average_points_result_HOME,cumul_average_shot_on_target_HOME,cumul_average_save_HOME,cumul_average_shot_total_HOME,cumul_average_goal_HOME,cumul_average_points_result_against_HOME,cumul_average_shot_on_target_against_HOME,cumul_average_save_against_HOME,cumul_average_shot_total_against_HOME,cumul_average_goal_against_HOME,moving_average_2_points_result_HOME,moving_average_2_goal_HOME,moving_average_2_points_result_against_HOME,moving_average_2_goal_against_HOME,moving_average_3_points_result_HOME,moving_average_3_goal_HOME,moving_average_3_points_result_against_HOME,moving_average_3_goal_against_HOME,moving_average_6_points_result_HOME,moving_average_6_goal_HOME,moving_average_6_points_result_against_HOME,moving_average_6_goal_against_HOME,fatigue_1_match_HOME,fatigue_2_match_HOME,fatigue_3_match_HOME,cumul_points_result_AWAY,cumul_shot_on_target_AWAY,cumul_saves_AWAY,cumul_shot_total_AWAY,cumul_goal_AWAY,cumul_points_result_against_AWAY,cumul_shot_on_target_against_AWAY,cumul_saves_against_AWAY,cumul_shot_total_against_AWAY,cumul_goal_against_AWAY,goal_difference_AWAY,cumul_average_points_result_AWAY,cumul_average_shot_on_target_AWAY,cumul_average_save_AWAY,cumul_average_shot_total_AWAY,cumul_average_goal_AWAY,cumul_average_points_result_against_AWAY,cumul_average_shot_on_target_against_AWAY,cumul_average_save_against_AWAY,cumul_average_shot_total_against_AWAY,cumul_average_goal_against_AWAY,moving_average_2_points_result_AWAY,moving_average_2_goal_AWAY,moving_average_2_points_result_against_AWAY,moving_average_2_goal_against_AWAY,moving_average_3_points_result_AWAY,moving_average_3_goal_AWAY,moving_average_3_points_result_against_AWAY,moving_average_3_goal_against_AWAY,moving_average_6_points_result_AWAY,moving_average_6_goal_AWAY,moving_average_6_points_result_against_AWAY,moving_average_6_goal_against_AWAY,fatigue_1_match_AWAY,fatigue_2_match_AWAY,fatigue_3_match_AWAY
0,2015001,11,22,2015-2016,1.0,2015-08-07,20:30,Lille,0–1,Paris S-G,0.52,0.48,2,3,1,2,2015,3,8,32,7,4,0,0,0,0,1,0,0,12,7,3,2,0,1,0,3,0,2,1,12,0,3,3,2,7,1,-1,0.0,2.0,1.0,12.0,0.0,3.0,3.0,2.0,7.0,1.0,,,,,,,,,,,,,,,,3,3,2,7,1,0,2,1,12,0,1,3.0,3.0,2.0,7.0,1.0,0.0,2.0,1.0,12.0,0.0,,,,,,,,,,,,,,,


## TODO

  - [X] Faire un csv du classement by week/season
  - [X] cumul des points sur les 2,3,6,10 derniers matchs 
  - [X] add features indicating the time gaps between the match days of previous matches for the home and away teams, 
    and how long they played 3 matches and 6 matches,as an reflection of fatique factor
  - [X] average points got, average goal scored, average goal conceded and average goal difference up to that match as previous n-match form
  - [X] add ID for each : GAME, HOME TEAM, AWAY TEAM
