# Objective

The purpose of this notebook is to build a model able to predict the number of goals a team is going to score
at the next game.

Features are Team plays at home/away, number of goals it scored the last 5 games, during all season so far, the leg, its position, the number of points it has as well as the opponent's rank, number of points, goals conceded last 5 games and number of goals conceded so far.

Then use the probabilities obtained to simulate the last N games of a championship.

In [1]:
import pandas as pd
import numpy as np
import math
import plotly.express as px
from copy import deepcopy

In [2]:
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
def rolling_mean_n_performance(df, window=5, performance_col='goals_scored'):
    dg = df.sort_values(by=['leg'])[['season', 'team', performance_col]].groupby(
        by=['season', 'team'])[performance_col].rolling(window=window, min_periods=1).mean().reset_index()
    
    new_col_name = f'rolling_{window}_games_avg_{performance_col}'
    df[new_col_name] = dg.set_index('level_2')[performance_col]
    return df

def get_past_feature(df, feat_col, team=True):

    merge_col = 'team' if team else 'opponent'    
    tmp_df = deepcopy(df[['season', 'leg', merge_col, feat_col]])
    tmp_df.loc[:,'next_leg'] = tmp_df['leg'] +1

    tmp_df.rename(columns={'leg': 'previous_leg', 
                           'next_leg':'leg', 
                           feat_col:f'previous_{merge_col}_{feat_col}'},
                  inplace=True)

    df = df.merge(tmp_df, how='left', on=['leg', 'season', merge_col])
    df.drop(columns=['previous_leg'], inplace=True)
    return df

In [4]:
def prepare_data(csv_path, championship, rolling=5):
    df = pd.read_csv(csv_path).drop(columns='Unnamed: 0')
    df['championship'] = championship
    df['goal_diff'] = df['goals_scored'] - df['goals_conceded']
    # cumulative
    df['cum_pts'] = df[['season', 'team', 'nb_points']].groupby(
    by=['season', 'team']).cumsum()
    
    df['cum_goal_diff'] = df[['season', 'team', 'goal_diff']].groupby(
    by=['season', 'team']).cumsum()
    
    df['cum_goals_scored'] = df[['season', 'team', 'goals_scored']].groupby(
    by=['season', 'team']).cumsum()
    
    df['cum_goals_conceded'] = df['cum_goals_scored']-df['cum_goal_diff']
    df['rank'] = df[['season', 'leg', 'cum_pts', 'cum_goal_diff', 'cum_goals_scored']].sort_values(
        by=['cum_pts', 'cum_goal_diff', 'cum_goals_scored'], ascending=False).groupby(
        by=['season', 'leg']).cumcount() + 1
    
    df['avg_goals_scored_since_season_start'] = df['cum_goals_scored'].div(df['leg'])
    df['avg_goals_conceded_since_season_start'] = df['cum_goals_conceded'].div(df['leg'])
    df['avg_cum_pts_since_season_start'] = df['cum_pts'].div(df['leg'])
    
    # removed unwanted useless seasons
    data = deepcopy(df[df.season > '2003-2004'])
    data.reset_index(drop=True, inplace=True)
    
    leg_max = data.leg.max()
    
    end_season = data[data.leg==leg_max].rename(columns={'rank':'final_rank', 'cum_pts': 'final_cum_pts'})
    data = data.merge(end_season[['season', 'team', 'final_rank', 'final_cum_pts']], on=['season', 'team'])
    
    # rolling mean
    cols = ['goals_conceded', 'goals_scored', 'nb_points']
    for c in cols:
        data = rolling_mean_n_performance(df=data, window=rolling, performance_col=c)
    # past features
    past_features = {'rank': [True, False], 
                     'rolling_5_games_avg_goals_scored' : [True],
                     'rolling_5_games_avg_goals_conceded': [False],
                     'avg_goals_scored_since_season_start': [True],
                     'avg_goals_conceded_since_season_start': [False],
                     'goals_scored': [True],
                     'goals_conceded': [False],
                     'rolling_5_games_avg_nb_points': [True, False],
                     'nb_points': [True, False]
                    }
    # print(f'length {len(data)}')
    for col, is_team_ll in past_features.items():
        for is_team in is_team_ll:
            # print(f"is_team ={is_team}, col = {col}")
            data = get_past_feature(df=data, feat_col=col, team=is_team)
    
    return data

In [5]:
def get_pivoted(data: pd.DataFrame, break_leg: int, value_col: str = 'goals_scored'):
    
    df = deepcopy(data[data.leg <= break_leg])
    df.rolling_5_games_avg_nb_points = [y if x!=x else x for x, y in 
           zip(df.rolling_5_games_avg_nb_points, df.avg_cum_pts_since_season_start)]
    
    df_pivot = df.pivot_table(index=['season', 'team'], 
                              columns='leg', 
                              values=[value_col]).reset_index()
    
    df_pivot.columns = [f'leg_{l}' if l!='' else n for n, l in df_pivot.columns]
    
    final = df[['season', 'team', 'final_rank', 'final_cum_pts']].drop_duplicates()
    df_last_leg = df[df.leg==break_leg][['season', 
                                          'team', 
                                          'rank', 
                                          'rolling_5_games_avg_nb_points', 
                                          'avg_cum_pts_since_season_start', 
                                          'cum_pts']].reset_index(drop=True)
    
    df_pivot = df_pivot.merge(df_last_leg, on=['season', 'team'])
    
    return df_pivot.merge(final, on=['season', 'team'])

In [6]:
# hist on goal scored

In [7]:
# Preprocessing

In [8]:
championship_csv = {'ligue-1': 'ligue-1_data_2002_2019',
                   'ligue-2': 'ligue-2_data_2002_2019',
                   'serie-A': 'serie-a_data_2004_2019',
                   'bundesliga': 'bundesliga_data_2004_2019',
                   'premier-league': 'premier-league_data_2004_2019',
                   'liga':'liga_data_2004_2019'}

In [9]:
data_df = prepare_data(championship_csv['ligue-1'], championship='ligue-1')

In [10]:
all_data_dfs = [prepare_data(csv_path=path, championship=champ) for champ, path in championship_csv.items()]

In [11]:
all_data_df = pd.concat(all_data_dfs)

In [12]:
data_exploitable_df = deepcopy(all_data_df[all_data_df.leg > 1]).reset_index()

In [13]:
#data_df.head()

In [14]:
data_exploitable_df.head()

Unnamed: 0,index,country,season,leg,team,play,goals_scored,opponent,goals_conceded,nb_points,...,previous_team_rolling_5_games_avg_goals_scored,previous_opponent_rolling_5_games_avg_goals_conceded,previous_team_avg_goals_scored_since_season_start,previous_opponent_avg_goals_conceded_since_season_start,previous_team_goals_scored,previous_opponent_goals_conceded,previous_team_rolling_5_games_avg_nb_points,previous_opponent_rolling_5_games_avg_nb_points,previous_team_nb_points,previous_opponent_nb_points
0,1,France,2004-2005,2,Lyon,Home,1,Sochaux,1,1,...,0.0,1.0,0.0,1.0,0.0,1.0,1.0,3.0,1.0,3.0
1,2,France,2004-2005,3,Lyon,Away,1,Metz,1,1,...,0.5,1.5,0.5,1.5,1.0,2.0,1.0,1.5,1.0,0.0
2,3,France,2004-2005,4,Lyon,Home,1,Lille,0,3,...,0.666667,1.333333,0.666667,1.333333,1.0,0.0,1.0,1.666667,1.0,1.0
3,4,France,2004-2005,5,Lyon,Away,2,Rennes,1,3,...,0.75,1.25,0.75,1.25,1.0,1.0,1.5,1.0,3.0,1.0
4,5,France,2004-2005,6,Lyon,Home,0,Bastia,0,1,...,1.0,1.2,1.0,1.2,2.0,1.0,1.8,1.8,3.0,0.0


In [16]:
from collections import Counter

In [21]:
from sklearn.model_selection import train_test_split

In [22]:
from imblearn.over_sampling import RandomOverSampler, SMOTE

In [23]:
from sklearn.preprocessing import MinMaxScaler

#### data prep

In [24]:
no_use_cols = ['index', 'country', 'season', 'team', 'opponent', 
               'previous_team_rolling_5_games_avg_goals_scored_binned']
usable_cols = [c for c in data_exploitable_df.columns if c not in no_use_cols]
usable_cols = ['leg', 'play'] + [c for c in usable_cols if c.startswith('previous')] + ['goals_scored']

In [25]:
data_df = deepcopy(data_exploitable_df.loc[:,usable_cols])

In [26]:
data_df = deepcopy(data_df[data_df.goals_scored < 5])

In [27]:
# data_df

Unnamed: 0,leg,play,previous_team_rank,previous_opponent_rank,previous_team_rolling_5_games_avg_goals_scored,previous_opponent_rolling_5_games_avg_goals_conceded,previous_team_avg_goals_scored_since_season_start,previous_opponent_avg_goals_conceded_since_season_start,previous_team_goals_scored,previous_opponent_goals_conceded,previous_team_rolling_5_games_avg_nb_points,previous_opponent_rolling_5_games_avg_nb_points,previous_team_nb_points,previous_opponent_nb_points,goals_scored
0,2,Home,12.0,2.0,0.000000,1.000000,0.000000,1.000000,0.0,1.0,1.0,3.000000,1.0,3.0,1
1,3,Away,14.0,8.0,0.500000,1.500000,0.500000,1.500000,1.0,2.0,1.0,1.500000,1.0,0.0,1
2,4,Home,15.0,4.0,0.666667,1.333333,0.666667,1.333333,1.0,0.0,1.0,1.666667,1.0,1.0,1
3,5,Away,6.0,13.0,0.750000,1.250000,0.750000,1.250000,1.0,1.0,1.5,1.000000,3.0,1.0,2
4,6,Home,4.0,5.0,1.000000,1.200000,1.000000,1.200000,2.0,1.0,1.8,1.800000,3.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64405,34,Away,1.0,18.0,2.000000,1.600000,2.484848,1.484848,2.0,2.0,2.2,0.600000,3.0,1.0,2
64406,35,Home,1.0,8.0,2.000000,2.400000,2.470588,1.352941,2.0,4.0,2.2,0.600000,3.0,0.0,1
64407,36,Away,1.0,12.0,1.400000,1.000000,2.428571,1.114286,1.0,0.0,2.6,0.600000,3.0,1.0,0
64408,37,Home,1.0,18.0,1.000000,1.200000,2.361111,1.416667,0.0,2.0,2.0,0.600000,0.0,0.0,2


In [28]:
# encode play variable
data_df['play'] = [1 if p=='Home' else 0 for p in data_df.play]

In [29]:
# split train-test-valid : 0.8x.95-0.2x.95-0.05
features_data = data_df.drop('goals_scored', axis=1).values
target_data = data_df['goals_scored'].values

In [30]:
mm_scaler = MinMaxScaler(feature_range=(0,100))

In [31]:
scale = False

In [32]:
if scale:
    feature_data = mm_scaler.fit_transform(features_data)

In [33]:
X_model, X_valid, y_model, y_valid = train_test_split(features_data, 
                                                      target_data, 
                                                      test_size=0.05, 
                                                      random_state=42,
                                                      stratify=target_data
                                                     )

X_train, X_test, y_train, y_test = train_test_split(X_model,
                                                    y_model,
                                                    test_size=.2,
                                                    random_state=61,
                                                    stratify=y_model
                                                   )

In [None]:
oversample = False

In [34]:
oversampler = SMOTE(random_state=42) #RandomOverSampler()

In [35]:
if oversample:
    X_train_over, y_train_over = oversampler.fit_resample(X_train, y_train)
else:
    X_train_over, y_train_over = X_train, y_train

In [36]:
len(y_train_over)

83785

In [37]:
len(y_train)

48198

In [38]:
Counter(y_test)

Counter({2: 2670, 0: 3542, 1: 4189, 3: 1206, 4: 443})

 # Bayesian Prediction

In [105]:
markov_df = deepcopy(data_df[['previous_team_goals_scored', 'previous_opponent_goals_conceded',
                                        'goals_scored']])
markov_df.loc[:,'previous_team_goals_scored'] = markov_df.previous_team_goals_scored.astype(int)
markov_df.loc[:,'previous_opponent_goals_conceded'] = markov_df.previous_opponent_goals_conceded.astype(int)

In [106]:
markov_df

Unnamed: 0,previous_team_goals_scored,previous_opponent_goals_conceded,goals_scored
0,0,1,1
1,1,2,1
2,1,0,1
3,1,1,2
4,2,1,0
...,...,...,...
64405,2,2,2
64406,2,4,1
64407,1,0,0
64408,0,2,2


In [125]:
markov_features_data = markov_df[['previous_team_goals_scored', 'previous_opponent_goals_conceded']].values
markov_target_data = markov_df['goals_scored'].values

markov_X_model, markov_X_valid, markov_y_model, markov_y_valid = train_test_split(markov_features_data, 
                                                                                  markov_target_data, 
                                                                                  test_size=0.05, 
                                                                                  random_state=42,
                                                                                  stratify=target_data
                                                                                 )

markov_X_train, markov_X_test, markov_y_train, markov_y_test = train_test_split(markov_X_model,
                                                                                markov_y_model,
                                                                                test_size=.2,
                                                                                random_state=61,
                                                                                stratify=y_model
                                                                               )

In [126]:
markov_X_train

array([[1, 1],
       [0, 2],
       [0, 1],
       ...,
       [2, 5],
       [0, 2],
       [2, 1]])

In [127]:
markov_X_train[:,0]

array([1, 0, 0, ..., 2, 0, 2])

In [128]:
mark_df = pd.DataFrame(data={'source_team': markov_X_train[:,0], 
                             'source_opponent': markov_X_train[:,1],
                             'team_score': markov_y_train, 
                             'qty' : [1]*len(markov_X_train)})

In [150]:
fake_df = pd.DataFrame(data={'source_team': markov_X_train[:,0], 
                             'source_opponent': markov_X_train[:,1],
                             'team_score': [-1]*len(markov_y_train), 
                             'qty' : [0]*len(markov_X_train)})

In [178]:
bayes_df = pd.concat([mark_df, fake_df], ignore_index=True)

In [152]:
test_df

Unnamed: 0,source_team,source_opponent,team_score,qty
0,1,1,1,1
1,0,2,0,1
2,0,1,1,1
3,1,1,0,1
4,3,2,1,1
...,...,...,...,...
96391,0,2,-1,0
96392,0,7,-1,0
96393,2,5,-1,0
96394,0,2,-1,0


In [195]:
dg = bayes_df[['team_score', 'qty']].groupby(by=['team_score']).sum().reset_index()

In [199]:
dg['proportion'] = dg.qty / dg.qty.sum()

In [200]:
dg

Unnamed: 0,team_score,qty,prpportion,proportion
0,-1,0,0.0,0.0
1,0,14166,0.293913,0.293913
2,1,16757,0.34767,0.34767
3,2,10681,0.221607,0.221607
4,3,4825,0.100108,0.100108
5,4,1769,0.036703,0.036703


In [220]:
get_general_distribution(df=bayes_df, dest_col='team_score', qty_col='qty')

Unnamed: 0,team_score,qty,proportion,cumulative_distribution
0,-1,0,0.0,0.0
1,0,14166,0.293913,0.293913
2,1,16757,0.34767,0.641583
3,2,10681,0.221607,0.863189
4,3,4825,0.100108,0.963297
5,4,1769,0.036703,1.0


In [214]:
def get_general_distribution(df, dest_col, qty_col):
    dg = bayes_df[[dest_col, qty_col]].groupby(by=[dest_col]).sum().reset_index()
    dg['proportion'] = getattr(dg, qty_col) / getattr(dg, qty_col).sum()
    dg['cumulative_distribution'] = dg['proportion'].cumsum().values
    
    return dg

In [133]:
def get_bayesian_df(df, source_cols: list, dest_col: str, qty_col: str):
    edge_cnt = df[source_cols + [dest_col, qty_col]].groupby(by=source_cols + [dest_col]).sum().reset_index()
    edge_cnt.rename(columns={qty_col: 'edge_cnt'}, inplace=True)
    
    source_degree = mark_df[source_cols + [qty_col]].groupby(by=source_cols).sum().reset_index()
    source_degree.rename(columns={qty_col: 'degree'}, inplace=True)
    transition_df = edge_cnt.merge(source_degree, how='left', on=source_cols)
    transition_df['proportion'] = transition_df['edge_cnt'].div(transition_df['degree'])
    transition_df['cumulative_prop'] = transition_df[source_cols + ['proportion']
                                                ].groupby(by=source_cols).cumsum().values
    
    return transition_df

In [183]:
only_team_df = get_bayesian_df(df=bayes_df, source_cols=['source_team'], dest_col='team_score', qty_col='qty')

In [184]:
team_opp_df = get_bayesian_df(df=bayes_df, source_cols=['source_team', 'source_opponent'], 
                              dest_col='team_score', qty_col='qty')

In [163]:
from random import random
from functools import reduce

In [239]:
def predict_score(bayes_df, general_distribution, source_inputs:dict, cumulative_col='cumulative_prop',
                  target_col='team_score'):
    
    dice_result = random()
    # print(dice_result)
    filters = [getattr(bayes_df, cumulative_col)<= dice_result]
    filters += [getattr(bayes_df, col) == val for col, val in source_inputs.items()]
    final_filter = reduce(lambda f1, f2: f1&f2, filters)
    # print(final_filter)
    try:
        prediction = list(bayes_df.loc[final_filter, target_col])[-1] + 1
    except IndexError:
        dice_result = random()
        prediction = list(
            general_distribution.loc[general_distribution.cumulative_distribution <= dice_result, 
                                     'team_score']
        )[-1] + 1
    return prediction

In [215]:
general_distribution_df = get_general_distribution(df=bayes_df, dest_col='team_score', qty_col='qty')

In [189]:
only_team_pred = np.array(
    [predict_score(bayes_df=only_team_df,
                   general_distribution=general_distribution_df, 
                   source_inputs={'source_team' : gs},
                   target_col='team_score') 
     for gs in markov_X_test[:,0]]
)

In [240]:
team_opp_pred = np.array(
    [predict_score(bayes_df=team_opp_df, 
                   general_distribution=general_distribution_df,
                   source_inputs={'source_team' : gs, 'source_opponent': gc}, 
                   target_col='team_score') 
     for gs, gc in markov_X_test]
)

In [221]:
confusion_matrix(y_true=markov_y_test, y_pred=only_team_pred)

array([[1046, 1215,  824,  350,  107],
       [1259, 1414,  935,  435,  146],
       [ 802,  940,  597,  231,  100],
       [ 361,  429,  275,   94,   47],
       [ 133,  151,  104,   40,   15]])

In [192]:
accuracy_score(y_true=markov_y_test, y_pred=only_team_pred)

0.26273858921161825

In [241]:
team_opp_pred

array([1, 0, 1, ..., 0, 2, 1])

In [242]:
confusion_matrix(y_true=markov_y_test, y_pred=team_opp_pred)

array([[1032, 1215,  797,  369,  129],
       [1255, 1425,  968,  403,  138],
       [ 765,  953,  601,  263,   88],
       [ 342,  401,  297,  116,   50],
       [ 109,  158,  118,   42,   16]])

In [243]:
accuracy_score(y_true=markov_y_test, y_pred=team_opp_pred)

0.26473029045643154

In [245]:
only_team_df

Unnamed: 0,source_team,team_score,edge_cnt,degree,proportion,cumulative_prop
0,0,-1,0,14087,0.0,0.0
1,0,0,4241,14087,0.301058,0.301058
2,0,1,4955,14087,0.351743,0.6528
3,0,2,3119,14087,0.22141,0.87421
4,0,3,1313,14087,0.093207,0.967417
5,0,4,459,14087,0.032583,1.0
6,1,-1,0,16496,0.0,0.0
7,1,0,4939,16496,0.299406,0.299406
8,1,1,5836,16496,0.353783,0.653189
9,1,2,3589,16496,0.217568,0.870757


In [246]:
team_opp_df

Unnamed: 0,source_team,source_opponent,team_score,edge_cnt,degree,proportion,cumulative_prop
0,0,0,-1,0,3954,0.000000,0.000000
1,0,0,0,1123,3954,0.284016,0.284016
2,0,0,1,1410,3954,0.356601,0.640617
3,0,0,2,916,3954,0.231664,0.872281
4,0,0,3,359,3954,0.090794,0.963075
...,...,...,...,...,...,...,...
350,9,1,1,1,1,1.000000,1.000000
351,9,2,-1,0,1,0.000000,0.000000
352,9,2,2,1,1,1.000000,1.000000
353,10,0,-1,0,1,0.000000,0.000000
