# ⚽ Modélisation et Analyse de Données Footballistiques

* Les imports : 

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from selenium import webdriver
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC



## 🟢 matches : 

* Importer dataframe (matches) : 

In [2]:
# les matches : 
df_matches = pd.read_csv('../data/data_scraper/matchs_infos.csv')
df_matches.head(5)

Unnamed: 0,Squad,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,xG,xGA,Poss,Attendance,Captain,Formation,Opp Formation,Referee
0,Liverpool,2024-08-17,12:30,Premier League,Matchweek 1,Sat,Away,W,2,0,Ipswich Town,2.6,0.5,62.0,30014,Virgil van Dijk,4-2-3-1,4-2-3-1,Tim Robinson
1,Liverpool,2024-08-25,16:30,Premier League,Matchweek 2,Sun,Home,W,2,0,Brentford,2.5,0.5,62.0,60017,Virgil van Dijk,4-2-3-1,4-4-2,Stuart Attwell
2,Liverpool,2024-09-01,16:00,Premier League,Matchweek 3,Sun,Away,W,3,0,Manchester Utd,1.8,1.4,47.0,73738,Virgil van Dijk,4-2-3-1,4-2-3-1,Anthony Taylor
3,Liverpool,2024-09-14,15:00,Premier League,Matchweek 4,Sat,Home,L,0,1,Nott'ham Forest,0.9,0.4,68.0,60344,Virgil van Dijk,4-2-3-1,4-2-3-1,Michael Oliver
4,Liverpool,2024-09-17,21:00 (20:00),Champions Lg,League phase,Tue,Away,W,3,1,it Milan,3.1,0.6,51.0,59826,Virgil van Dijk,4-2-3-1,4-2-3-1,Espen Eskås



### 2️⃣ Transformation des données (matches)

#### ➕ Nettoyage : suppression ou traitement des valeurs manquantes et incohérentes.

##### 🔎 Compter le nombre des manquantes dans dataframes (matches)

* visualisation général : 

In [3]:
print(f"Le nombres total des matches : {df_matches.shape}")
print(f"Les colonnes de table des matches : {df_matches.columns}")


Le nombres total des matches : (975, 19)
Les colonnes de table des matches : Index(['Squad', 'Date', 'Time', 'Comp', 'Round', 'Day', 'Venue', 'Result',
       'GF', 'GA', 'Opponent', 'xG', 'xGA', 'Poss', 'Attendance', 'Captain',
       'Formation', 'Opp Formation', 'Referee'],
      dtype='object')


* Identifier les manquants : 

In [4]:
print(f"Le nombre des manquantes par colonnes : \n{df_matches.isnull().sum()}")
print(f"Le nombre total des manquantes dans les colonnes : \n{df_matches.isnull().sum().sum()}")


Le nombre des manquantes par colonnes : 
Squad              0
Date               0
Time               0
Comp               0
Round              0
Day                0
Venue              0
Result             0
GF                 0
GA                 0
Opponent           0
xG               126
xGA              126
Poss              22
Attendance         6
Captain            0
Formation          0
Opp Formation      0
Referee            5
dtype: int64
Le nombre total des manquantes dans les colonnes : 
285


* gestion des manquantes : 

In [5]:
df_matches_clean = df_matches.copy()

# Remplacement des valeurs manquantes
df_matches_clean['xG'] = df_matches_clean['xG'].fillna(0)
df_matches_clean['xGA'] = df_matches_clean['xGA'].fillna(0)
df_matches_clean['Attendance'] = df_matches_clean['Attendance'].fillna('0')
df_matches_clean['Referee'] = df_matches_clean['Referee'].fillna('Inconnu')
df_matches_clean['Poss'] = df_matches_clean['Poss'].fillna('Inconnu')


print(f"Le nombre des manquantes par colonnes apres le remplacement des valeurs manquantes par 0 : \n{df_matches_clean.isnull().sum()}")


Le nombre des manquantes par colonnes apres le remplacement des valeurs manquantes par 0 : 
Squad            0
Date             0
Time             0
Comp             0
Round            0
Day              0
Venue            0
Result           0
GF               0
GA               0
Opponent         0
xG               0
xGA              0
Poss             0
Attendance       0
Captain          0
Formation        0
Opp Formation    0
Referee          0
dtype: int64


* identifier et supprimer les doubleons : 

In [6]:
print(f"Le nombre total des duplications dans la table des matches : {df_matches.duplicated().sum()}")

df_matches_clean = df_matches_clean.drop_duplicates()

Le nombre total des duplications dans la table des matches : 0


* 🔎 Corriger les incohérences dans les données

In [7]:
# Vérification
incoherences = df_matches_clean[
    ((df_matches_clean['GF'] > df_matches_clean['GA']) & (df_matches_clean['Result'] != 'W')) |
    ((df_matches_clean['GF'] < df_matches_clean['GA']) & (df_matches_clean['Result'] != 'L')) |
    ((df_matches_clean['GF'] == df_matches_clean['GA']) & (df_matches_clean['Result'] != 'D'))
]
pd.DataFrame(incoherences.head(5))


Unnamed: 0,Squad,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,xG,xGA,Poss,Attendance,Captain,Formation,Opp Formation,Referee
86,Arsenal,2025-01-12,15:00 (16:00),FA Cup,Third round proper,Sun,Home,D,1 (3),1 (5),Manchester Utd,0.0,0.0,Inconnu,60109,Martin Ødegaard,4-3-3,3-4-3,Andy Madley
114,Manchester City,2024-08-10,15:00,FA Community Shield,FA Community Shield,Sat,Home,D,1 (7),1 (6),Manchester Utd,0.0,0.0,56.0,78146,Rúben Dias,4-3-3,4-2-3-1,Jarred Gillett
230,Newcastle Utd,2024-08-28,20:00,EFL Cup,Second round,Wed,Away,D,1 (4),1 (3),Nott'ham Forest,0.0,0.0,59.0,23083,Kieran Trippier,4-3-3,3-4-1-2,Samuel Allison
335,Nott'ham Forest,2024-08-28,20:00,EFL Cup,Second round,Wed,Home,D,1 (3),1 (4),Newcastle Utd,0.0,0.0,42.0,23083,Willy Boly,3-4-1-2,4-3-3,Samuel Allison
359,Nott'ham Forest,2025-02-11,20:00 (21:00),FA Cup,Fourth round proper,Tue,Away,D,2 (4),2 (2),Exeter City,0.0,0.0,77.0,8330,Ryan Yates,4-2-3-1,5-4-1,Andrew Kitchen


In [8]:
df_matches_clean.loc[df_matches_clean['GF'] > df_matches_clean['GA'], 'Result'] = 'W'
df_matches_clean.loc[df_matches_clean['GF'] < df_matches_clean['GA'], 'Result'] = 'L'
df_matches_clean.loc[df_matches_clean['GF'] == df_matches_clean['GA'], 'Result'] = 'D'



#### ➕ Standardisation des données (matches)

* Normalisation des noms de colonnes

In [9]:
df_matches_clean.columns = df_matches_clean.columns.str.strip().str.replace(' ', '_')


In [10]:
df_matches_clean.columns

Index(['Squad', 'Date', 'Time', 'Comp', 'Round', 'Day', 'Venue', 'Result',
       'GF', 'GA', 'Opponent', 'xG', 'xGA', 'Poss', 'Attendance', 'Captain',
       'Formation', 'Opp_Formation', 'Referee'],
      dtype='object')

* Format de la date et de l’heure

In [11]:
# Time
df_matches_clean['Time'] = df_matches_clean['Time'].str.extract(r'(\d{2}:\d{2})')

# Date
df_matches_clean['Date'] = pd.to_datetime(df_matches_clean['Date'], errors='coerce')



* convertir de Attendance comme un int : 

In [12]:
print("Avant : \n",df_matches['Attendance'].head(4))
df_matches_clean['Attendance'] = df_matches_clean['Attendance'].str.replace(',', '').astype(int)

print("Apres : \n",df_matches_clean['Attendance'].head(4))


Avant : 
 0    30,014
1    60,017
2    73,738
3    60,344
Name: Attendance, dtype: object
Apres : 
 0    30014
1    60017
2    73738
3    60344
Name: Attendance, dtype: int64


##### 📍 Gerer les matches Home/Away : 

In [13]:
df_matches_clean['MatchKey'] = df_matches_clean.apply(
    lambda row: f"{row['Date']}_" + '_'.join(sorted([row['Squad'], row['Opponent']])),
    axis=1
)

df_matches_clean.head(3)

Unnamed: 0,Squad,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,xG,xGA,Poss,Attendance,Captain,Formation,Opp_Formation,Referee,MatchKey
0,Liverpool,2024-08-17,12:30,Premier League,Matchweek 1,Sat,Away,W,2,0,Ipswich Town,2.6,0.5,62.0,30014,Virgil van Dijk,4-2-3-1,4-2-3-1,Tim Robinson,2024-08-17 00:00:00_Ipswich Town_Liverpool
1,Liverpool,2024-08-25,16:30,Premier League,Matchweek 2,Sun,Home,W,2,0,Brentford,2.5,0.5,62.0,60017,Virgil van Dijk,4-2-3-1,4-4-2,Stuart Attwell,2024-08-25 00:00:00_Brentford_Liverpool
2,Liverpool,2024-09-01,16:00,Premier League,Matchweek 3,Sun,Away,W,3,0,Manchester Utd,1.8,1.4,47.0,73738,Virgil van Dijk,4-2-3-1,4-2-3-1,Anthony Taylor,2024-09-01 00:00:00_Liverpool_Manchester Utd


* Garder la première ligne trouvée.

In [14]:
data_matches_clean = df_matches_clean.drop_duplicates(subset='MatchKey', keep='first').reset_index(drop=True)



print(df_matches_clean.shape)
print(data_matches_clean.shape)

(975, 20)
(559, 20)


##### 📍 sauvegarder les matches cleaned dans un nouveau fichier CSV 

In [15]:

data_matches_clean.to_csv('../data/data_cleaned/matches_clean.csv', index=False)


***
***

## 🟢 players : 

* Importer dataframe (players) : 

In [16]:
# les players : 
df_players = pd.read_csv('../data/data_scraper/players_infos.csv')
df_players.head(5)

Unnamed: 0,Squad,Player,Nation,Pos,Age,MP,Starts,Min,90s,Gls,Ast,G+A,G-PK,PK,PKatt,CrdY,CrdR
0,Liverpool,Mohamed Salah,eg EGY,FW,32.0,38,38,3371,37.5,29.0,18.0,47.0,20.0,9.0,9.0,1.0,0.0
1,Liverpool,Virgil van Dijk,nl NED,DF,33.0,37,37,3330,37.0,3.0,1.0,4.0,3.0,0.0,0.0,5.0,0.0
2,Liverpool,Ryan Gravenberch,nl NED,MF,22.0,37,37,3160,35.1,0.0,4.0,4.0,0.0,0.0,0.0,6.0,1.0
3,Liverpool,Alexis Mac Allister,ar ARG,MF,25.0,35,30,2599,28.9,5.0,5.0,10.0,5.0,0.0,0.0,6.0,0.0
4,Liverpool,Ibrahima Konaté,fr FRA,DF,25.0,31,30,2560,28.4,1.0,2.0,3.0,1.0,0.0,0.0,5.0,0.0



### 2️⃣ Transformation des données (players)

#### ➕ Nettoyage : suppression ou traitement des valeurs manquantes et incohérentes.

##### 🔎 Compter le nombre des manquantes dans dataframes (players)

* visualisation général : 

In [17]:
print(f"Le nombres total des players : {df_players.shape}")
print(f"Les colonnes de table des players : {df_players.columns}")


Le nombres total des players : (702, 17)
Les colonnes de table des players : Index(['Squad', 'Player', 'Nation', 'Pos', 'Age', 'MP', 'Starts', 'Min', '90s',
       'Gls', 'Ast', 'G+A', 'G-PK', 'PK', 'PKatt', 'CrdY', 'CrdR'],
      dtype='object')


* Identifier les manquants : 

In [18]:
print(f"Le nombre des manquantes par colonnes : \n{df_players.isnull().sum()}")
print(f"Le nombre total des manquantes dans les colonnes : \n{df_players.isnull().sum().sum()}")


Le nombre des manquantes par colonnes : 
Squad       0
Player      0
Nation      6
Pos         0
Age         5
MP          0
Starts      0
Min       128
90s       128
Gls       128
Ast       128
G+A       128
G-PK      128
PK        128
PKatt     128
CrdY      128
CrdR      128
dtype: int64
Le nombre total des manquantes dans les colonnes : 
1291


* gestion des manquantes : 

In [19]:
df_players_clean = df_players.copy()
print(f"Median age = {df_players_clean['Age'].median()}\n")

df_players_clean['Nation'] = df_players_clean['Nation'].fillna('Inconnu')
df_players_clean['Age'] = df_players_clean['Age'].fillna(df_players_clean['Age'].median())
# df_players_clean['Min'] = df_players_clean['Min'].fillna(0.0)
df_players_clean = df_players_clean.fillna(0)

print(f"Le nombre des manquantes par colonnes apres le remplacement des valeurs manquantes par 0 : \n{df_players_clean.isnull().sum()}")


Median age = 24.0

Le nombre des manquantes par colonnes apres le remplacement des valeurs manquantes par 0 : 
Squad     0
Player    0
Nation    0
Pos       0
Age       0
MP        0
Starts    0
Min       0
90s       0
Gls       0
Ast       0
G+A       0
G-PK      0
PK        0
PKatt     0
CrdY      0
CrdR      0
dtype: int64


* identifier et supprimer les doubleons : 

In [20]:
print(f"Le nombre total des duplications dans la table des players : {df_players_clean.duplicated().sum()}")

df_matches_clean = df_players.drop_duplicates()

Le nombre total des duplications dans la table des players : 0


* 🔎 Corriger les incohérences dans les données


#### ➕ Standardisation des données (players)

* Normalisation des noms de colonnes

In [21]:
df_players_clean.columns = df_players_clean.columns.str.strip().str.replace(' ', '_')


* convertir de Minute comme un int : 

In [22]:
print("Avant : \n",df_players['Min'].head(4))
df_players_clean['Min'] = df_players_clean['Min'].str.replace(',', '').astype(float)
df_players_clean['Min'] = df_players_clean['Min'].fillna(0.0)

print("Apres : \n",df_players_clean['Min'].head(4))


Avant : 
 0    3,371
1    3,330
2    3,160
3    2,599
Name: Min, dtype: object
Apres : 
 0    3371.0
1    3330.0
2    3160.0
3    2599.0
Name: Min, dtype: float64


#### 📍 sauvegarder les players cleaned dans un nouveau fichier CSV 

In [23]:
data_players_clean = df_players_clean.copy()

print(data_players_clean.isnull().sum())

data_players_clean.to_csv('../data/data_cleaned/players_clean.csv', index=False)


Squad     0
Player    0
Nation    0
Pos       0
Age       0
MP        0
Starts    0
Min       0
90s       0
Gls       0
Ast       0
G+A       0
G-PK      0
PK        0
PKatt     0
CrdY      0
CrdR      0
dtype: int64


In [24]:
df_players_clean.dtypes

Squad      object
Player     object
Nation     object
Pos        object
Age       float64
MP          int64
Starts      int64
Min       float64
90s       float64
Gls       float64
Ast       float64
G+A       float64
G-PK      float64
PK        float64
PKatt     float64
CrdY      float64
CrdR      float64
dtype: object