In [None]:
import pandas as pd
from tqdm import tqdm
tqdm.pandas()
pd.options.display.max_columns=None

import numpy as np

import tensorflow as tf
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import SparseCategoricalCrossentropy


from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay


import matplotlib.pyplot as plt
import wandb
from wandb.keras import WandbCallback
from sklearn.preprocessing import StandardScaler, Normalizer

In [None]:
def cm_metrics(true_label,pred_label, labels_):
  """Print accuracy and per-class recall / precision / F1, then plot the confusion matrix.

  Args:
    true_label: ground-truth class labels.
    pred_label: predicted class labels, aligned with ``true_label``.
    labels_: list of class labels; fixes the row/column order of the matrix.

  Returns:
    None. Everything is reported through ``print`` and a matplotlib figure.

  The per-class dictionaries are keyed by the row index in the confusion
  matrix (0..n_classes-1), not by the label value itself.
  """
  cm = confusion_matrix(true_label, pred_label, labels=labels_)
  cmDisp = ConfusionMatrixDisplay(cm, display_labels=labels_)
  n_classes = len(cm)
  # Row sums are the true counts per class, column sums the predicted counts.
  recall = {i: cm[i][i] / cm[i].sum() for i in range(n_classes)}
  precision = {i: cm[i][i] / cm[:, i].sum() for i in range(n_classes)}
  # Harmonic mean of recall and precision.
  F1_score = {i: 2 / (1/recall[i] + 1/precision[i]) for i in range(n_classes)}
  # Sum of the diagonal over the total: valid for any number of classes
  # (the previous expression only added the first three diagonal cells).
  accuracy = np.trace(cm) / cm.sum()
  print('accuracy: ', accuracy)
  print('Recall: \n',recall,'\nPrecision: \n', precision,'\nF1-score: \n',F1_score)
  cmDisp.plot()
  plt.show()
  return None

In [None]:
# Compact integer dtypes requested for the home-team history columns.
home_history_dtypes = {
    'home_id': np.int32,
    'game_id': np.int32,
    'H_goals': np.int8,
    'H_goals_conceded': np.int8,
    'H_HorA': np.int8,
    'H_yellow_cards': np.int8,
    'H_red_cards': np.int8,
}
home_history = pd.read_csv(
    '../data/train_LSTM/home_history_full.csv',
    index_col=0,
    dtype=home_history_dtypes,
)

# Same layout for the away-team history, with the A_ prefix.
away_history_dtypes = {
    'away_id': np.int32,
    'game_id': np.int32,
    'A_goals': np.int8,
    'A_goals_conceded': np.int8,
    'A_HorA': np.int8,
    'A_yellow_cards': np.int8,
    'A_red_cards': np.int8,
}
away_history = pd.read_csv(
    '../data/train_LSTM/away_history_full.csv',
    index_col=0,
    dtype=away_history_dtypes,
)

In [None]:
# Compact integer dtypes requested for the head-to-head history columns.
game_history_dtypes = {
    'game_history_id': np.int32,
    'game_id': np.int32,
    'H_goals': np.int8,
    'H_goals_conceded': np.int8,
    'H_HorA': np.int8,
    'H_yellow_cards': np.int8,
    'H_red_cards': np.int8,
    'A_yellow_cards': np.int8,
    'A_red_cards': np.int8,
}
game_history = pd.read_csv(
    '../data/train_LSTM/game_history_full.csv',
    index_col=0,
    dtype=game_history_dtypes,
)

In [None]:
# Load the per-game training table; the first CSV column is used as the index.
train_df = pd.read_csv('../data/training_data/train_final_6_bis.csv', index_col = 0)
# Disabled: cast of every column except the first and the last to float32.
#train_df = train_df.astype(dtype={x:np.float32 for x in train_df.columns[1:-1]})


In [None]:
# Load the "best players" table, then store every column except the first as float16.
train_best = pd.read_csv('../data/training_data/train_best_final_6.csv', index_col=0)
best_value_columns = train_best.columns[1:]
train_best = train_best.astype({column: np.float16 for column in best_value_columns})

## Sélection des matchs dans home_history, away_history et game_history

In [None]:

# Every sequence is left-padded with all-zero rows up to a fixed length, and each
# real row gets a leading 1 so the model can tell real matches from padding.
#
# NOTE(review): the original comment stated that np.flip(..., axis=1) orders the
# matches from oldest to most recent. axis=1 actually reverses the feature
# columns inside each row; the row (match) order is left untouched. The
# behaviour is kept unchanged here because previously trained weights may rely
# on this layout -- confirm before switching to axis=0.
# The author also observed that this inversion has little impact on model
# performance.

def _pad_history(rows, seq_len, width):
    """Turn a per-game history frame into a fixed-size int8 array.

    Args:
        rows: DataFrame holding only the feature columns for one game.
        seq_len: number of rows in the returned array.
        width: number of columns in the returned array (features + 1 flag).

    Returns:
        ``(seq_len, width)`` int8 array. Padding rows are all zeros and come
        first; real rows start with 1 followed by the column-reversed features.

    Raises:
        ValueError: if ``rows`` holds more than ``seq_len`` rows.
    """
    n_lines = rows.shape[0]
    if n_lines == 0:
        return np.zeros((seq_len, width)).astype(np.int8)
    if n_lines > seq_len:
        # Same exception type NumPy raised for the negative padding size, but
        # with a message that points at the actual cause.
        raise ValueError(
            f"history has {n_lines} rows but at most {seq_len} are supported; "
            "duplicated games must be removed first"
        )
    flagged = np.hstack((np.ones((n_lines, 1)), np.flip(rows.to_numpy(), axis=1)))
    padding = np.zeros((seq_len - n_lines, width))
    return np.vstack((padding, flagged)).astype(np.int8)


def make_home_train(game_id):
    """Return the (10, 6) int8 home-team history array for ``game_id``."""
    rows = home_history.loc[home_history.game_id == game_id, :].drop(columns=['home_id', 'game_id'])
    return _pad_history(rows, seq_len=10, width=6)


def make_away_train(game_id):
    """Return the (10, 6) int8 away-team history array for ``game_id``."""
    rows = away_history.loc[away_history.game_id == game_id, :].drop(columns=['away_id', 'game_id'])
    return _pad_history(rows, seq_len=10, width=6)


def make_history_train(game_id):
    """Return the (5, 8) int8 head-to-head history array for ``game_id``."""
    rows = game_history.loc[game_history.game_id == game_id, :].drop(columns=['game_history_id', 'game_id'])
    return _pad_history(rows, seq_len=5, width=8)



In [None]:
# WARNING: some games appear twice in games.csv, which gives them more history
# rows than the fixed sequence length allows. There are only a handful of them,
# so they are displayed and then dropped from each history table in place.
# (For game_history the original author was unsure whether the same applies.)
for history_df, id_col, max_rows in (
    (home_history, 'home_id', 10),
    (away_history, 'away_id', 10),
    (game_history, 'game_history_id', 5),
):
    # Number of history rows recorded for each game.
    tt = history_df.groupby('game_id')[id_col].count().reset_index()
    overfull = tt.loc[tt[id_col] > max_rows, :]
    display(overfull)
    rows_to_drop = history_df.loc[history_df.game_id.isin(overfull['game_id']), :].index
    history_df.drop(rows_to_drop, inplace=True)



In [None]:
# Build one fixed-size history array per training game, then stack them into
# 3-D tensors (one slice per game, in train_df row order).
game_ids = train_df.game_id.astype(np.int32)

train_home = np.stack(game_ids.progress_apply(make_home_train).values)
train_away = np.stack(game_ids.progress_apply(make_away_train).values)
train_history = np.stack(game_ids.progress_apply(make_history_train).values)


In [None]:
# Display the shapes of the three history tensors and of the two player tables.
# NOTE(review): presumably all five should share the same first dimension -- confirm.
train_home.shape, train_away.shape, train_history.shape, train_best.shape, train_df.shape

In [None]:
def _encode_home_result(value):
    """Map a raw result to a class id: positive -> 0, zero -> 1, anything else -> 2."""
    if value > 0:
        return 0
    if value == 0:
        return 1
    return 2

# The column is overwritten with its own encoding, so this cell must run exactly
# once per kernel session: a second run would re-encode the class ids.
train_df['Home_result'] = train_df['Home_result'].map(_encode_home_result)

## Normalisation

In [None]:
# Columns left out of the /100 scaling: the four team-line columns for each side
# (H_ then A_), plus the game identifier and the target.
team_lines = ['GK', 'attack', 'defense', 'midfield']
col_list = [f'{side}{line}' for side in ('H_', 'A_') for line in team_lines]
col_list += ['game_id', 'Home_result']

In [None]:
# Divide every column not listed in col_list by 100.
# The frame is modified in place, so re-running this cell divides again.
scaled_columns = [column for column in train_df.columns if column not in col_list]
train_df.loc[:, scaled_columns] = train_df.loc[:, scaled_columns] / 100


In [None]:
# Divide every column except the first by 100.
# The frame is modified in place, so re-running this cell divides again.
best_scaled_columns = train_best.columns[1:]
train_best.loc[:, best_scaled_columns] = train_best.loc[:, best_scaled_columns] / 100

## Préparons les données pour l'entraînement du modèle

In [None]:
# Store the three history tensors as float16.
train_home, train_away, train_history = (
    tensor.astype(np.float16)
    for tensor in (train_home, train_away, train_history)
)

In [None]:
# Target vector and the two flat feature matrices (identifier/target columns removed).
y = train_df.Home_result.values
best_players = train_best.drop(columns=['game_id']).to_numpy()
players = train_df.drop(columns=['game_id', 'Home_result']).to_numpy()

# One stratified 80/20 split applied to all arrays at once so rows stay aligned.
split_parts = train_test_split(
    train_home, train_away, train_history, best_players, players, y,
    test_size=0.2, random_state=0, stratify=y,
)
(XH_train, XH_test,
 XA_train, XA_test,
 XG_train, XG_test,
 best_train, best_test,
 players_train, players_test,
 y_train, y_test) = split_parts

In [None]:
class DataGenerator(tf.keras.utils.Sequence):
  """Batch provider for the five-input model (histories, players, best players).

  Args:
    xh_data, xa_data, xg_data: home, away and head-to-head history arrays.
    xbest_data: best-player feature matrix.
    xplayer_data: player feature matrix.
    y_data: labels, aligned row by row with the inputs.
    batch_size: target batch size; rows are split into
      ``ceil(n / batch_size)`` nearly equal batches.

  Each item is ``([xh, xa, xg, xplayers, xbest], y)``. Note that the output
  order puts players before best players, unlike the constructor arguments.
  """
  def __init__(self, xh_data, xa_data, xg_data, xbest_data, xplayer_data, y_data, batch_size):
    # Lets the Keras base class set up its own state where it defines any.
    super().__init__()
    self.xh,self.xa, self.xg, self.xbest, self.xplayers, self.y = \
        xh_data, xa_data, xg_data, xbest_data, xplayer_data, y_data
    self.batch_size = batch_size
    # Kept as an int: np.ceil alone returns a float.
    self.num_batches = int(np.ceil(len(xh_data) / batch_size))
    self.batch_idx = np.array_split(np.arange(len(xh_data)), self.num_batches)

  def __len__(self):
    """Number of batches per epoch."""
    return len(self.batch_idx)

  def __getitem__(self, idx):
    """Return batch ``idx`` as ``(inputs, labels)``; features are cast to float32."""
    rows = self.batch_idx[idx]
    # Same float32 cast as DataGenerator2, so both generators feed identical dtypes.
    batch_xh = self.xh[rows].astype(np.float32)
    batch_xa = self.xa[rows].astype(np.float32)
    batch_xg = self.xg[rows].astype(np.float32)
    batch_xbest = self.xbest[rows].astype(np.float32)
    batch_xplayers = self.xplayers[rows].astype(np.float32)
    batch_y = self.y[rows]
    return [batch_xh, batch_xa, batch_xg, batch_xplayers, batch_xbest], batch_y

train_generator = DataGenerator(XH_train, XA_train, XG_train, best_train, players_train, y_train, batch_size = 128)
val_generator =   DataGenerator(XH_test,  XA_test,  XG_test,  best_test,  players_test,  y_test,  batch_size = 32)


In [None]:
class DataGenerator2(tf.keras.utils.Sequence):
  """Batch provider for the four-input model (histories and players only).

  Args:
    xh_data, xa_data, xg_data: home, away and head-to-head history arrays.
    xplayer_data: player feature matrix.
    y_data: labels, aligned row by row with the inputs.
    batch_size: target batch size; rows are split into
      ``ceil(n / batch_size)`` nearly equal batches.

  Each item is ``([xh, xa, xg, xplayers], y)`` with every array cast to float32.
  """
  def __init__(self, xh_data, xa_data, xg_data, xplayer_data, y_data, batch_size):
    # Lets the Keras base class set up its own state where it defines any.
    super().__init__()
    self.xh,self.xa, self.xg, self.xplayers, self.y = \
        xh_data, xa_data, xg_data, xplayer_data, y_data
    self.batch_size = batch_size
    # Kept as an int: np.ceil alone returns a float.
    self.num_batches = int(np.ceil(len(xh_data) / batch_size))
    self.batch_idx = np.array_split(np.arange(len(xh_data)), self.num_batches)

  def __len__(self):
    """Number of batches per epoch."""
    return len(self.batch_idx)

  def __getitem__(self, idx):
    """Return batch ``idx`` as ``(inputs, labels)``, all cast to float32."""
    rows = self.batch_idx[idx]
    batch_xh = self.xh[rows].astype(np.float32)
    batch_xa = self.xa[rows].astype(np.float32)
    batch_xg = self.xg[rows].astype(np.float32)
    batch_xplayers = self.xplayers[rows].astype(np.float32)
    batch_y = self.y[rows].astype(np.float32)
    return [batch_xh, batch_xa, batch_xg, batch_xplayers], batch_y

train_generator2 = DataGenerator2(XH_train, XA_train, XG_train, players_train, y_train, batch_size = 128)
val_generator2 =   DataGenerator2(XH_test,  XA_test,  XG_test,  players_test,  y_test,  batch_size = 32)

# Modèle

In [None]:
# Combined L1/L2 weight regulariser used by the layers defined further down.
from tensorflow.keras.regularizers import L1L2
# Load a previously saved Keras model from disk; its layer weights are copied
# into the new models later in this notebook.
LSTM_model = tf.keras.models.load_model('../data/models/H_A_game_history_LSTM')

In [None]:
# Display the shape of the training part of the player feature matrix.
players_train.shape

In [None]:
# Take the first batch, pick its input list ([0]) and show the shape of the 4th input ([3]).
next(iter(train_generator2))[0][3].shape

In [None]:

# Statement order matters in this cell: the weight-transfer step elsewhere in the
# notebook addresses layers of the resulting model, so do not reorder casually.

# Input 1: home history sequence, 10 steps x 6 features, summarised by an LSTM.
home_input = tf.keras.Input(shape = (10,6) , name = 'home_train_input')
home_LSTM_1 = tf.keras.layers.LSTM(32, kernel_regularizer = L1L2(l1=0.01, l2=0.01), name = 'home_LSTM_1')(home_input)

# Input 2: away history sequence, same shape and regularisation.
away_input = tf.keras.Input(shape = (10,6), name = 'away_train_input')
away_LSTM_1 = tf.keras.layers.LSTM(32, kernel_regularizer = L1L2(l1=0.01, l2=0.01), name = 'away_LSTM_1')(away_input)

h_a = tf.keras.layers.Concatenate()([home_LSTM_1,away_LSTM_1])
# Disabled dense/dropout stack on the home+away summary (kept for reference):
#h_a = tf.keras.layers.Dropout(0.234375)(h_a)
#Dense1 = tf.keras.layers.Dense(64,'relu',name='Dense1')(h_a)
#Dense1 = tf.keras.layers.Dropout(0.234375)(Dense1)
#Dense2 = tf.keras.layers.Dense(16,'relu',name='Dense2')(Dense1)
#Dense2 = tf.keras.layers.Dropout(0.125)(Dense2)


# Input 3: head-to-head history sequence, 5 steps x 8 features.
game_input = tf.keras.Input(shape = (5,8), name = 'game_train_input')
game_LSTM_1 = tf.keras.layers.LSTM(32, kernel_regularizer = L1L2(l1=0.001, l2=0.02), name = 'game_LSTM_1')(game_input)


h_a_g = tf.keras.layers.Concatenate()([h_a,game_LSTM_1])
# Disabled dense/dropout stage on the three LSTM summaries (kept for reference):
#Dense3 = tf.keras.layers.Dense(64,'relu',name='Dense3')(h_a_g)
#Dense3 = tf.keras.layers.Dropout(0.125)(Dense3)

# Input 4: flat player features. `shape` must be a tuple; `(192)` is just the
# integer 192, so the 1-tuple `(192,)` is used.
player_input = tf.keras.Input(shape = (192,), name = 'player_input')
# Disabled dense/dropout stage on the player features (kept for reference):
#Dense1 = tf.keras.layers.Dense(64,'relu', kernel_regularizer=L1L2(l1=0.001, l2=0.001),name='Dense1')(player_input)
#Dense1 = tf.keras.layers.Dropout(0.2)(Dense1)


# Final concatenation: LSTM summaries + raw player features -> dense head.
h_a_g_p = tf.keras.layers.Concatenate()([h_a_g,player_input])
Dense4 = tf.keras.layers.Dense(64,'relu', kernel_regularizer=L1L2(l1=0.001, l2=0.001),name='Dense4')(h_a_g_p)
Dense4 = tf.keras.layers.Dropout(0.2)(Dense4)



# Three-way softmax output.
Dense_output = tf.keras.layers.Dense(3, 'softmax', name='output')(Dense4)

In [None]:
# Assemble the four-input model; the input order here is the order in which
# arrays must be supplied to fit/predict.
model_1 = tf.keras.Model(inputs=[home_input,away_input,game_input,player_input], outputs=Dense_output)

In [None]:
# Adam optimiser, sparse categorical cross-entropy loss (integer class labels)
# and the matching accuracy metric.
opt = Adam(learning_rate=0.0003)
loss = SparseCategoricalCrossentropy()
metric = tf.keras.metrics.SparseCategoricalAccuracy()
model_1.compile(
    loss=loss,
    optimizer=opt,
    metrics=[metric],
)

In [None]:
# List the layer names of both models, in `.layers` order, to line up the
# layers whose weights are copied across.
print([x.name for x in model_1.layers])
print([x.name for x in LSTM_model.layers])

In [None]:
#on va chercher les poids du modèle déjà entrainé
model_1.layers[2].set_weights(LSTM_model.layers[2].get_weights())
model_1.layers[3].set_weights(LSTM_model.layers[3].get_weights())
model_1.layers[6].set_weights(LSTM_model.layers[11].get_weights())
#model_1.layers[8].set_weights(LSTM_model.layers[8].get_weights())
#model_1.layers[9].set_weights(LSTM_model.layers[11].get_weights())
#model_1.layers[11].set_weights(LSTM_model.layers[11].get_weights())
#model_1.layers[12].set_weights(LSTM_model.layers[13].get_weights())




In [None]:
# Per-class weights, all equal to 1.
# NOTE(review): this dict is not passed to any fit() call visible in this notebook.
class_weight = {0: 1.,
                1: 1,
                2: 1.}

In [None]:
# Start a Weights & Biases run for this training; project and entity are hard-coded.
wandb.init(project="foot_LSTM_AVGplayers", entity="padda")


In [None]:
# Train model_1 for 500 epochs on the four-input generator, validating on its
# test-split counterpart; WandbCallback is the only callback attached.
#wandb.init(project="foot_LSTM_AVGplayers", entity="padda")
model_1.fit(
    train_generator2,
    epochs=500,
    validation_data=val_generator2,
    callbacks=[WandbCallback()])

In [None]:
# Predicted class = index of the largest softmax output, on the test split and
# then on the training split.
test_inputs_1 = [XH_test, XA_test, XG_test, players_test]
train_inputs_1 = [XH_train, XA_train, XG_train, players_train]
y_test_pred = np.argmax(model_1.predict(test_inputs_1), axis=-1)
y_train_pred = np.argmax(model_1.predict(train_inputs_1), axis=-1)

# Report metrics for the test split first, then the training split.
for truth, predicted in ((y_test, y_test_pred), (y_train, y_train_pred)):
    cm_metrics(truth, predicted, [0,1,2])

In [None]:
# Save the trained model under the same ../data root that every other path in
# this notebook uses (the previous 'data/models/...' resolved inside the
# notebook's own directory).
model_1.save('../data/models/H_A_G_P_reg')

# Modèle 2

In [None]:
# Take the first batch, pick its input list ([0]) and show the shape of the 5th input ([4]).
next(iter(train_generator))[0][4].shape

In [None]:

# Statement order matters in this cell: the weight-transfer step elsewhere in the
# notebook addresses layers of the resulting model, so do not reorder casually.

# Input 1: home history sequence, 10 steps x 6 features, summarised by an LSTM.
home_input = tf.keras.Input(shape = (10,6) , name = 'home_train_input')
home_LSTM_1 = tf.keras.layers.LSTM(32, kernel_regularizer = L1L2(l1=0.01, l2=0.01), name = 'home_LSTM_1')(home_input)

# Input 2: away history sequence, same shape and regularisation.
away_input = tf.keras.Input(shape = (10,6), name = 'away_train_input')
away_LSTM_1 = tf.keras.layers.LSTM(32, kernel_regularizer = L1L2(l1=0.01, l2=0.01), name = 'away_LSTM_1')(away_input)

h_a = tf.keras.layers.Concatenate()([home_LSTM_1,away_LSTM_1])
# Disabled dense/dropout stack on the home+away summary (kept for reference):
#h_a = tf.keras.layers.Dropout(0.234375)(h_a)
#Dense1 = tf.keras.layers.Dense(64,'relu',name='Dense1')(h_a)
#Dense1 = tf.keras.layers.Dropout(0.234375)(Dense1)
#Dense2 = tf.keras.layers.Dense(16,'relu',name='Dense2')(Dense1)
#Dense2 = tf.keras.layers.Dropout(0.125)(Dense2)


# Input 3: head-to-head history sequence, 5 steps x 8 features.
game_input = tf.keras.Input(shape = (5,8), name = 'game_train_input')
game_LSTM_1 = tf.keras.layers.LSTM(32, kernel_regularizer = L1L2(l1=0.001, l2=0.02), name = 'game_LSTM_1')(game_input)


h_a_g = tf.keras.layers.Concatenate()([h_a,game_LSTM_1])
# Disabled dense/dropout stage on the three LSTM summaries (kept for reference):
#Dense3 = tf.keras.layers.Dense(64,'relu',name='Dense3')(h_a_g)
#Dense3 = tf.keras.layers.Dropout(0.125)(Dense3)

# Input 4: flat player features, compressed by a regularised dense layer.
# `shape` must be a tuple; `(192)` is just the integer 192, so `(192,)` is used.
player_input = tf.keras.Input(shape = (192,), name = 'player_input')
Dense1 = tf.keras.layers.Dense(64,'relu', kernel_regularizer=L1L2(l1=0.001, l2=0.001),name='Dense1')(player_input)
Dense1 = tf.keras.layers.Dropout(0.2)(Dense1)


# Input 5: flat best-player features, same treatment (1-tuple shape as above).
best_player_input = tf.keras.Input(shape = (144,), name = 'best_player_input')
Dense2 = tf.keras.layers.Dense(64,'relu', kernel_regularizer=L1L2(l1=0.001, l2=0.001),name='Dense2')(best_player_input)
Dense2 = tf.keras.layers.Dropout(0.2)(Dense2)


# Final concatenation: LSTM summaries + both player branches -> dense head.
h_a_g_p = tf.keras.layers.Concatenate()([h_a_g, Dense1, Dense2])
Dense4 = tf.keras.layers.Dense(128,'relu', kernel_regularizer=L1L2(l1=0.001, l2=0.001),name='Dense4')(h_a_g_p)
Dense4 = tf.keras.layers.Dropout(0.2)(Dense4)



# Three-way softmax output.
Dense_output = tf.keras.layers.Dense(3, 'softmax', name='output')(Dense4)

In [None]:
# Assemble the five-input model; the input order here (players before best
# players) is the order in which arrays must be supplied to fit/predict.
model_2 = tf.keras.Model(inputs=[home_input,away_input,game_input,player_input,best_player_input], outputs=Dense_output)

In [None]:
# Adam optimiser, sparse categorical cross-entropy loss (integer class labels)
# and the matching accuracy metric.
opt = Adam(learning_rate=0.0003)
loss = SparseCategoricalCrossentropy()
metric = tf.keras.metrics.SparseCategoricalAccuracy()
model_2.compile(
    loss=loss,
    optimizer=opt,
    metrics=[metric],
)

In [None]:
# List the layer names of both models, in `.layers` order and separated by a
# blank line, to line up the layers whose weights are copied across.
print([x.name for x in model_2.layers])
print()
print([x.name for x in LSTM_model.layers])

In [None]:
# Copy the weights of the already-trained model into the three LSTM layers.
# Target layers are looked up by the names given when the graph was built, so
# the copy does not depend on the positional order of model_2.layers.
# Source layers are still addressed by position because the names inside the
# saved model are not visible in this notebook -- check them against the
# printed layer list.
model_2.get_layer('home_LSTM_1').set_weights(LSTM_model.layers[2].get_weights())
model_2.get_layer('away_LSTM_1').set_weights(LSTM_model.layers[3].get_weights())
model_2.get_layer('game_LSTM_1').set_weights(LSTM_model.layers[11].get_weights())


In [None]:
# Start a Weights & Biases run for this training; project and entity are hard-coded.
wandb.init(project="foot_LSTM_AVGplayers", entity="padda")

In [None]:
# Train model_2 for 500 epochs on the five-input generator, validating on its
# test-split counterpart; WandbCallback is the only callback attached.
model_2.fit(
    train_generator,
    epochs=500,
    validation_data=val_generator,
    callbacks=[WandbCallback()]
)


In [None]:
# Predicted class = index of the largest softmax output, on the test split and
# then on the training split.
test_inputs_2 = [XH_test, XA_test, XG_test, players_test, best_test]
train_inputs_2 = [XH_train, XA_train, XG_train, players_train, best_train]
y_test_pred = np.argmax(model_2.predict(test_inputs_2), axis=-1)
y_train_pred = np.argmax(model_2.predict(train_inputs_2), axis=-1)

# Report metrics for the test split first, then the training split.
for truth, predicted in ((y_test, y_test_pred), (y_train, y_train_pred)):
    cm_metrics(truth, predicted, [0,1,2])