# Data preprocess

Clean data to keep interresting features

In [1]:
import pandas as pd
import numpy as np
import sys
from pathlib import Path

# Ajouter les dossiers src et config au path pour utiliser les modules du projet
sys.path.append(str(Path.cwd().parent / "src"))
sys.path.append(str(Path.cwd().parent / "config"))

print("✅ Modules importés avec succès")
print(f"📁 Répertoire de travail: {Path.cwd()}")
print(f"📁 Répertoire parent: {Path.cwd().parent}")

✅ Modules importés avec succès
📁 Répertoire de travail: g:\Mon Drive\BeCode\becodeorg-classroom-thomas5-football-prediction-football_prediction\notebooks
📁 Répertoire parent: g:\Mon Drive\BeCode\becodeorg-classroom-thomas5-football-prediction-football_prediction


In [17]:
def clean_data(df):
    # Filtrer uniquement la Jupiler Pro League belge
    df = df[df['Div'] == 'B1']  # B1 = Jupiler Pro League

    # Keep only specified columns
    df = df[['Div', 'Date', 'Time', 'HomeTeam', 'AwayTeam', 'FTHG', 'FTAG', 'FTR', 'HS', 'AS', 'HST', 'AST', 'HC', 'AC']]
    # Drop rows with missing data across all columns
    df = df.dropna()
    return df


df = pd.read_csv("../data/raw/data_2023_2026.csv") 

df_clean = clean_data(df.copy())
df_clean.head()

  df = pd.read_csv("../data/raw/data_2023_2026.csv")


Unnamed: 0,Div,Date,Time,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HS,AS,HST,AST,HC,AC
0,B1,25/07/2025,19:45,Antwerp,St. Gilloise,1.0,1.0,D,8.0,17.0,4.0,8.0,2.0,5.0
1,B1,26/07/2025,15:00,Dender,Cercle Brugge,0.0,0.0,D,11.0,10.0,3.0,2.0,3.0,3.0
2,B1,26/07/2025,17:15,Waregem,Mechelen,1.0,1.0,D,21.0,7.0,5.0,2.0,9.0,6.0
3,B1,26/07/2025,19:45,RAAL La Louviere,Standard,0.0,2.0,A,14.0,9.0,2.0,4.0,5.0,1.0
4,B1,27/07/2025,12:30,Anderlecht,Westerlo,5.0,2.0,H,25.0,10.0,10.0,4.0,8.0,3.0


1. Feature Engineering - Statistiques par équipe

In [18]:
def create_team_features(df, n_matches=5):
    """
    Crée des features basées sur les N derniers matchs de chaque équipe
    """
    df['Date'] = pd.to_datetime(df['Date'], dayfirst=True, errors='coerce')  # Détection automatique du format
    df = df.sort_values(['Date', 'Time'])
    
    features_list = []
    
    for idx, row in df.iterrows():
        home_team = row['HomeTeam']
        away_team = row['AwayTeam']
        match_date = row['Date']
        
        # Récupérer les N derniers matchs pour chaque équipe AVANT ce match
        home_history = get_team_history(df, home_team, match_date, n_matches, idx)
        away_history = get_team_history(df, away_team, match_date, n_matches, idx)
        
        # Créer les features
        match_features = {
            'Date': match_date,
            'HomeTeam': home_team,
            'AwayTeam': away_team,
            'Target': row['FTR'],  # H, D, A
            
            # Features équipe domicile
            'home_avg_goals_scored': home_history['goals_scored'].mean() if len(home_history) > 0 else 0,
            'home_avg_goals_conceded': home_history['goals_conceded'].mean() if len(home_history) > 0 else 0,
            'home_avg_shots': home_history['shots'].mean() if len(home_history) > 0 else 0,
            'home_avg_shots_target': home_history['shots_target'].mean() if len(home_history) > 0 else 0,
            'home_avg_corners': home_history['corners'].mean() if len(home_history) > 0 else 0,
            'home_win_rate': (home_history['result'] == 'W').mean() if len(home_history) > 0 else 0,
            'home_form': calculate_form(home_history),
            
            # Features équipe extérieur
            'away_avg_goals_scored': away_history['goals_scored'].mean() if len(away_history) > 0 else 0,
            'away_avg_goals_conceded': away_history['goals_conceded'].mean() if len(away_history) > 0 else 0,
            'away_avg_shots': away_history['shots'].mean() if len(away_history) > 0 else 0,
            'away_avg_shots_target': away_history['shots_target'].mean() if len(away_history) > 0 else 0,
            'away_avg_corners': away_history['corners'].mean() if len(away_history) > 0 else 0,
            'away_win_rate': (away_history['result'] == 'W').mean() if len(away_history) > 0 else 0,
            'away_form': calculate_form(away_history),
        }
        
        features_list.append(match_features)
    
    return pd.DataFrame(features_list)

def get_team_history(df, team, match_date, n_matches, current_idx):
    """Récupère l'historique d'une équipe avant une date donnée"""
    # Matchs où l'équipe joue à domicile ou à l'extérieur, avant la date du match actuel
    team_matches = df[(df.index < current_idx) & 
                     ((df['HomeTeam'] == team) | (df['AwayTeam'] == team))]
    
    # Prendre les N derniers matchs
    team_matches = team_matches.tail(n_matches)
    
    history = []
    for _, match in team_matches.iterrows():
        if match['HomeTeam'] == team:  # Équipe joue à domicile
            goals_scored = match['FTHG']
            goals_conceded = match['FTAG']
            shots = match['HS']
            shots_target = match['HST']
            corners = match['HC']
            result = 'W' if match['FTR'] == 'H' else 'D' if match['FTR'] == 'D' else 'L'
        else:  # Équipe joue à l'extérieur
            goals_scored = match['FTAG']
            goals_conceded = match['FTHG']
            shots = match['AS']
            shots_target = match['AST']
            corners = match['AC']
            result = 'W' if match['FTR'] == 'A' else 'D' if match['FTR'] == 'D' else 'L'
        
        history.append({
            'goals_scored': goals_scored,
            'goals_conceded': goals_conceded,
            'shots': shots,
            'shots_target': shots_target,
            'corners': corners,
            'result': result
        })
    
    return pd.DataFrame(history)

def calculate_form(history):
    """Calcule la forme récente (points sur les derniers matchs)"""
    if len(history) == 0:
        return 0
    
    points = []
    for _, match in history.iterrows():
        if match['result'] == 'W':
            points.append(3)
        elif match['result'] == 'D':
            points.append(1)
        else:
            points.append(0)
    
    return sum(points) / len(points)  # Moyenne de points par match

2. Features additionnelles

In [19]:
def add_additional_features(df):
    """Ajoute des features supplémentaires"""
    # Différence de forme entre les équipes
    df['form_difference'] = df['home_form'] - df['away_form']
    
    # Différence de buts marqués/encaissés
    df['goal_difference'] = (df['home_avg_goals_scored'] - df['home_avg_goals_conceded']) - \
                           (df['away_avg_goals_scored'] - df['away_avg_goals_conceded'])
    
    # Efficacité des tirs
    df['home_shot_efficiency'] = df['home_avg_shots_target'] / (df['home_avg_shots'] + 1)  # +1 pour éviter division par 0
    df['away_shot_efficiency'] = df['away_avg_shots_target'] / (df['away_avg_shots'] + 1)
    
    # Avantage du terrain (pourcentage de victoires à domicile vs extérieur)
    df['home_advantage'] = df['home_win_rate'] - df['away_win_rate']
    
    return df

3. Gestion des données manquantes et validation

In [20]:
def validate_and_clean_features(df):
    """Nettoie et valide le dataset final"""
    # Supprimer les matchs sans historique suffisant
    min_history_threshold = 3  # Au moins 3 matchs d'historique
    df = df.dropna()
    
    # Encoder la variable cible
    target_mapping = {'H': 0, 'D': 1, 'A': 2}  # Home, Draw, Away
    df['Target_encoded'] = df['Target'].map(target_mapping)
    
    # Supprimer les outliers extrêmes si nécessaire
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    for col in numeric_cols:
        if col != 'Target_encoded':
            Q1 = df[col].quantile(0.01)
            Q3 = df[col].quantile(0.99)
            df = df[(df[col] >= Q1) & (df[col] <= Q3)]
    
    return df

# Utilisation complète
df_features = create_team_features(df_clean, n_matches=5)
df_features = add_additional_features(df_features)
df_final = validate_and_clean_features(df_features)

print(f"Dataset final: {df_final.shape}")
print(f"Distribution des classes: \n{df_final['Target'].value_counts()}")
df_final.head()

Dataset final: (663, 24)
Distribution des classes: 
Target
H    288
D    191
A    184
Name: count, dtype: int64


Unnamed: 0,Date,HomeTeam,AwayTeam,Target,home_avg_goals_scored,home_avg_goals_conceded,home_avg_shots,home_avg_shots_target,home_avg_corners,home_win_rate,...,away_avg_shots_target,away_avg_corners,away_win_rate,away_form,form_difference,goal_difference,home_shot_efficiency,away_shot_efficiency,home_advantage,Target_encoded
0,2023-01-06,Standard,St Truiden,D,0.4,0.2,8.6,2.6,3.6,0.2,...,5.2,5.0,0.6,2.0,-0.8,-0.4,0.270833,0.371429,-0.4,1
1,2023-01-07,Eupen,Charleroi,A,0.8,1.4,10.2,2.8,3.4,0.2,...,5.4,4.8,0.6,2.2,-1.6,-1.2,0.25,0.355263,-0.4,2
2,2023-01-07,Cercle Brugge,Westerlo,A,1.2,2.0,13.8,4.4,5.2,0.2,...,5.0,7.4,0.4,1.4,-0.6,-0.8,0.297297,0.328947,-0.2,2
3,2023-01-07,Oostende,Seraing,A,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
4,2023-01-07,Antwerp,Gent,H,1.2,1.8,10.2,4.0,3.8,0.2,...,3.2,4.8,0.0,0.0,1.0,2.0,0.357143,0.285714,0.2,0


4. Export en CSV

In [21]:
# Vérifier la structure des dossiers et l'existence des fichiers
raw_data_path = Path("../data/raw/data_2023_2026.csv")
processed_data_path = Path("../data/processed/data_2023_2026_final.csv")

print("🔍 Vérification des chemins:")
print(f"📊 Dataset brut: {raw_data_path}")
print(f"   Existe: {'✅' if raw_data_path.exists() else '❌'}")
print(f"📈 Dataset traité: {processed_data_path}")
print(f"   Dossier parent existe: {'✅' if processed_data_path.parent.exists() else '❌'}")

# Créer le dossier processed s'il n'existe pas
processed_data_path.parent.mkdir(parents=True, exist_ok=True)
print("✅ Dossier processed vérifié/créé")

🔍 Vérification des chemins:
📊 Dataset brut: ..\data\raw\data_2023_2026.csv
   Existe: ✅
📈 Dataset traité: ..\data\processed\data_2023_2026_final.csv
   Dossier parent existe: ✅
✅ Dossier processed vérifié/créé


In [22]:
# Export du dataset final
df_final.to_csv("../data/processed/data_2023_2026_final.csv", index=False)
print("Dataset final exporté avec succès.")

Dataset final exporté avec succès.
