In [4]:
import pandas as pd
import os
import datetime
import time
import requests
from bs4 import BeautifulSoup
from PIL import Image
import io

class FootballDataScraper:

    def __init__(self, config_file):
        
        self.config_file = config_file
        self.headers = {'User-Agent': 'Mozilla/5.0'}
        self.championnats = self.load_configs()
        self.data_folder = "Database"
        self.logos_folder = "Logos"
        self.futur_data_folder = "Futur_data"
        self.last_request_time = None
        self.request_interval = 3  # secondes
        self.max_seasons = 6

    def load_configs(self):
        # loading config of championship containing which championship are used and their urls
        return pd.read_csv(self.config_file)

    def scrape_or_update(self):
        all_data = {}
        for _, config in self.championnats.iterrows():
            championnat = config['Championnat']
            url = config['URL']
            csv_file_path = os.path.join(self.data_folder, f"{championnat}_data.csv")

            if os.path.exists(csv_file_path):
                data = pd.read_csv(csv_file_path)
                last_update = pd.to_datetime((data['Date'] + ' ' + data['Time']).max())
                if datetime.datetime.now() > last_update + datetime.timedelta(hours = 2):
                    updated_data = self.update_data(url, data)
                    all_data[championnat] = updated_data[0][updated_date["Comp"]==championnat]
                    futur_data = self.save_futur_data(updated_data[1], championnat)
                    self.save_data(futur_data, os.path.join(self.futur_data_folder, f"{championnat}_futur_data.csv"))
                    
            else:
                data = self.scrape_data(url)
                all_data[championnat] = data[data["Comp"]==championnat]

            self.save_data(all_data[championnat], csv_file_path)
        return all_data

    def scrape_data(self, url):
        
        all_seasons_data = []

        for saison in range(self.max_seasons):

            self.rate_limit()

            response = requests.get(url, headers=self.headers)
            soup = BeautifulSoup(response.text, 'html.parser')
            
            teams_urls = ["https://fbref.com" + equipe.get("href") 
                        for equipe in soup.select("table.stats_table")[0].find_all("a") 
                        if "squads" in equipe.get("href", "")]

            url = f"https://fbref.com{soup.find('a', class_='button2 prev').get('href')}"

            for team_url in teams_urls:

                self.rate_limit()
                team_response = requests.get(team_url, headers=self.headers)
                try:
                    team_data = pd.read_html(team_response.text, match="Scores")[0]
                except ValueError as e:
                    # Gestion de l'exception si aucune table n'est trouvée
                    print(f"Aucune table trouvée dans {team_url} - Erreur: {e}")
                    continue  # Passe à l'itération suivante de la boucle

                team_name = team_url.split("/")[-1].replace("-Stats", "").replace("-", " ")
                team_data["Team"] = team_name

                self.scrape_and_save_logo(team_url, team_name)

                # Récupération des URLs pour les statistiques détaillées
                url_stats = {
                    f"https://fbref.com{a.get('href')}" 
                    for a in BeautifulSoup(team_response.text, 'html.parser').find_all("a") 
                    if "matchlogs/all_comps" in a.get('href', '') and 
                    any(substring in a.get('href', '') for substring in ["passing/", "shooting", "possession/", "defense/", "keeper"])
                }

                # Traitement des statistiques détaillées
                for stats_url in url_stats:

                    self.rate_limit()

                    stats_response = requests.get(stats_url, headers=self.headers)

                    try:
                        detailed_stats = pd.read_html(stats_response.text)[0]
                    except ValueError as e:
                        print(f"Aucune table trouvée dans {stats_url} - Erreur: {e}")
                        continue  # Passe à l'itération suivante de la boucle

                    # Nettoyage des colonnes du DataFrame
                    if detailed_stats.columns.nlevels > 1:
                        detailed_stats.columns = [f"{col}_{branch}" 
                                                if "For" not in col and "Unnamed:" not in col 
                                                else f"{branch}" 
                                                for col, branch in detailed_stats.columns]

                    columns_to_drop = ["Time", "Comp", "Round", "Day", "Venue", "Result", "GF", "GA", "Opponent", "Poss"] + [col for col in detailed_stats.columns if 'Report' in col]
                    columns_to_drop = [col for col in columns_to_drop if col in detailed_stats.columns]

                    detailed_stats.drop(columns_to_drop, axis=1, inplace=True)

                    team_data = team_data.merge(detailed_stats, on="Date")

                # Ajout des données de l'équipe au résultat global
                all_seasons_data.append(team_data)

        return pd.concat(all_seasons_data, ignore_index=True)

    def update_data(self, url, data):

        futur_data = []
        all_data = []
        response = requests.get(url, headers=self.headers)
        soup = BeautifulSoup(response.text, 'html.parser')

        teams_urls = ["https://fbref.com" + equipe.get("href") 
                    for equipe in soup.select("table.stats_table")[0].find_all("a") 
                    if "squads" in equipe.get("href", "")]

        # Traitement similaire à scrape_ligue1_data() pour chaque équipe
        for team_url in teams_urls:

            self.rate_limit()

            team_response = requests.get(team_url, headers=self.headers)
            try:
                team_data = pd.read_html(team_response.text, match="Scores")[0]
            except ValueError as e:
                # Gestion de l'exception si aucune table n'est trouvée
                print(f"Aucune table trouvée dans {team_url} - Erreur: {e}")
                continue  # Passe à l'itération suivante de la boucle

            team_name = team_url.split("/")[-1].replace("-Stats", "").replace("-", " ")
            team_data["Team"] = team_name

            futur_data.append(team_data)


            # Récupération des URLs pour les statistiques détaillées
            url_stats = {
                f"https://fbref.com{a.get('href')}" 
                for a in BeautifulSoup(team_response.text, 'html.parser').find_all("a") 
                if "matchlogs/all_comps" in a.get('href', '') and 
                any(substring in a.get('href', '') for substring in ["passing/", "shooting", "possession/", "defense/", "keeper"])
            }

            for stats_url in url_stats:

                self.rate_limit()

                stats_response = requests.get(stats_url, headers=self.headers)
                
                try:
                    detailed_stats = pd.read_html(stats_response.text)[0]
                except ValueError as e:
                    print(f"Aucune table trouvée dans {stats_url} - Erreur: {e}")
                    continue  # Passe à l'itération suivante de la boucle


                # Nettoyage des colonnes du DataFrame
                if detailed_stats.columns.nlevels > 1:
                    detailed_stats.columns = [f"{col}_{branch}" 
                                            if "For" not in col and "Unnamed:" not in col 
                                            else f"{branch}" 
                                            for col, branch in detailed_stats.columns]

                columns_to_drop = ["Time", "Comp", "Round", "Day", "Venue", "Result", "GF", "GA", "Opponent", "Poss"] + [col for col in detailed_stats.columns if 'Report' in col]
                columns_to_drop = [col for col in columns_to_drop if col in detailed_stats.columns]

                detailed_stats.drop(columns_to_drop, axis=1, inplace=True)

                team_data = team_data.merge(detailed_stats, on="Date")

            # Ajout des données de l'équipe au résultat global
            all_data.append(team_data)

        new_data = pd.concat(all_data, ignore_index=True)

        # Concaténation de la base initiale et de la base nouvelle
        concatenated_df = pd.concat([data, new_data])
        
        # Suppression des doublons basée sur les colonnes "Date", "Team" et "Opponent"
        concatenated_df = concatenated_df.drop_duplicates(subset=['Date', 'Team', 'Opponent'])
            
        
        # Retourne les données concaténées de toutes les équipes
        return concatenated_df(ignore_index=True), pd.concat(futur_data, ignore_index=True)

    def save_data(self, data, file_path):
        # Enregistrer les données dans un fichier CSV
        data.to_csv(file_path, index=False)

    def rate_limit(self):
        if self.last_request_time is not None:
            elapsed_time = time.time() - self.last_request_time
            if elapsed_time < self.request_interval:
                time.sleep(self.request_interval - elapsed_time)
        self.last_request_time = time.time()

    def access_data(self, championnat):
        csv_file_path = os.path.join(self.data_folder, f"{championnat}_data.csv")
        
        if os.path.exists(csv_file_path):
            data = pd.read_csv(csv_file_path)
            return data
        else:
            print(f"Les données pour le championnat '{championnat}' n'existent pas.")

    def scrape_and_save_logo(self, page_url, file_name):

        file_path = os.path.join(self.logos_folder, file_name + '.png')
        if os.path.exists(file_path):
            print(f"Le logo existe déjà à {file_path}")
            return

        try:
            self.rate_limit()
            response = requests.get(page_url, headers=self.headers)
            soup = BeautifulSoup(response.text, 'html.parser')
            logo_img = soup.find('img', {'class': 'teamlogo'})

            if logo_img and logo_img.get('src'):
                logo_url = logo_img['src']
                self.rate_limit()
                img_response = requests.get(logo_url)

                if img_response.status_code == 200:
                    # Convertir l'image en PNG
                    image = Image.open(io.BytesIO(img_response.content))
                    file_path = os.path.join(self.logos_folder, file_name)
                    if not file_path.endswith('.png'):
                        file_path += '.png'
                    image.save(file_path, 'PNG')
                    print(f"Logo sauvegardé en PNG à {file_path}")
                else:
                    print("Erreur lors du téléchargement de l'image")
            else:
                print("Logo non trouvé sur la page")

        except Exception as e:
            print(f"Une erreur s'est produite : {e}")

    def save_futur_data(self, data, championnat):

        """
        À partir du scrapping on récupère un DataFrame qui contient les futurs journées, il faut le mettre en forme.
        args: DataFrame, le mapping ligue1
        """

        # Supprimer les lignes où les colonnes 'Date', 'Time' et 'Round' sont manquantes.
        data.dropna(subset=["Date", "Time", "Round"], inplace=True)
        data['DateTime'] = pd.to_datetime((data['Date'] + ' ' + data['Time']))
        data = data[data["Comp"] == championnat]

        data = data[data['DateTime'] >= datetime.now()].sort_values(by="DateTime")


        if data.empty != True:
                first_date = data['DateTime'].iloc[0]
        else:
            return None
        
        ten_days = timedelta(days=10) + first_date

        # Filtrer pour garder seulement les matchs programmés dans les 10 jours suivant la 'premiere_date_proche'.
        data = data[data['DateTime'] <= ten_days]

        if data.empty == True:
            return None
        else:# Retourner le DataFrame s'il n'est pas vide, sinon retourner None.
            csv_file_path = os.path.join(self.futur_data_folder, f"{championnat}_futur_data.csv")
            if os.path.exists(csv_file_path):
                futur_data = pd.read_csv(csv_file_path)
                last_update = pd.to_datetime((futur_data['Date'] + ' ' + futur_data['Time']).max())
                if last_update == pd.to_datetime((data['Date'] + ' ' + data['Time']).max()):
                    return futur_data
                else:
                    return pd.concat([futur_data, data]).drop_duplicates(subset=["Team", "Opponent", "Date"])
            else:
                return data

            

    






        


    









In [5]:
allo = FootballDataScraper("config.csv")
allo.scrape_or_update()

  team_data = pd.read_html(team_response.text, match="Scores")[0]
  detailed_stats = pd.read_html(stats_response.text)[0]
  detailed_stats = pd.read_html(stats_response.text)[0]
  detailed_stats = pd.read_html(stats_response.text)[0]
  detailed_stats = pd.read_html(stats_response.text)[0]
  detailed_stats = pd.read_html(stats_response.text)[0]
  team_data = pd.read_html(team_response.text, match="Scores")[0]
  detailed_stats = pd.read_html(stats_response.text)[0]
  detailed_stats = pd.read_html(stats_response.text)[0]
  detailed_stats = pd.read_html(stats_response.text)[0]
  detailed_stats = pd.read_html(stats_response.text)[0]
  detailed_stats = pd.read_html(stats_response.text)[0]
  team_data = pd.read_html(team_response.text, match="Scores")[0]


In [134]:
a = allo.access_data("La Liga")

In [135]:
a

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,...,Challenges_Att,Challenges_Tkl%,Challenges_Lost,Blocks_Blocks,Blocks_Sh,Blocks_Pass,Int,Tkl+Int,Clr,Err
0,2023-08-12,17:00,La Liga,Matchweek 1,Sat,Away,D,1.0,1.0,Real Sociedad,...,16.0,43.8,9.0,10.0,2.0,8.0,4.0,19,14.0,0.0
1,2023-08-20,19:00,La Liga,Matchweek 2,Sun,Home,W,3.0,0.0,Getafe,...,15.0,73.3,4.0,9.0,1.0,8.0,4.0,20,26.0,0.0
2,2023-08-26,21:30,La Liga,Matchweek 3,Sat,Away,W,2.0,1.0,Sevilla,...,11.0,45.5,6.0,13.0,4.0,9.0,8.0,19,47.0,0.0
3,2023-09-03,14:00,La Liga,Matchweek 4,Sun,Home,W,1.0,0.0,Las Palmas,...,17.0,41.2,10.0,14.0,4.0,10.0,8.0,26,12.0,0.0
4,2023-09-18,21:00,La Liga,Matchweek 5,Mon,Away,W,4.0,2.0,Granada,...,22.0,68.2,7.0,11.0,2.0,9.0,2.0,23,23.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5083,2024-01-21,18:30,La Liga,Matchweek 21,Sun,Home,L,2.0,4.0,Barcelona,...,18.0,50.0,9.0,10.0,2.0,8.0,7.0,23,29.0,0.0
5084,2024-01-21,14:00,La Liga,Matchweek 21,Sun,Away,L,2.0,3.0,Osasuna,...,9.0,55.6,4.0,8.0,1.0,7.0,9.0,19,16.0,0.0
5085,2024-01-21,14:00,La Liga,Matchweek 21,Sun,Home,W,3.0,2.0,Getafe,...,14.0,42.9,8.0,11.0,3.0,8.0,8.0,18,46.0,1.0
5086,2024-01-21,21:00,La Liga,Matchweek 21,Sun,Away,L,1,5,Girona,...,16.0,56.3,7.0,9.0,1.0,8.0,5.0,24,14.0,0.0


In [105]:
b = pd.read_csv("Data_copy/Ligue 1_data.csv")
a = a[b.columns]

In [114]:
a["Date"] = pd.to_datetime(a["Date"])
b["Date"] = pd.to_datetime(b["Date"])

In [122]:
a.sort_values(by="Date", ascending = False).head(30)

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,...,Passes_Launch%,Passes_AvgLen,Goal Kicks_Att,Goal Kicks_Launch%,Goal Kicks_AvgLen,Crosses_Opp,Crosses_Stp,Crosses_Stp%,Sweeper_#OPA,Sweeper_AvgDist
4606,2024-01-21,17:30,Coupe de France,Round of 32,Sun,Home,L,1,3,Strasbourg,...,,,,,,,,,,
4605,2024-01-21,17:30,Coupe de France,Round of 32,Sun,Away,D,3 (11),3 (12),FC Rouen,...,,,,,,,,,,
4600,2024-01-21,17:30,Coupe de France,Round of 32,Sun,Away,D,2 (4),2 (5),Sochaux,...,,,,,,,,,,
4601,2024-01-21,21:05,Coupe de France,Round of 32,Sun,Away,D,1 (8),1 (9),Rennes,...,,,,,,,,,,
4602,2024-01-21,17:30,Coupe de France,Round of 32,Sun,Away,W,3.0,1.0,Clermont Foot,...,,,,,,,,,,
4603,2024-01-21,21:05,Coupe de France,Round of 32,Sun,Home,D,1 (9),1 (8),Marseille,...,,,,,,,,,,
4604,2024-01-21,17:30,Coupe de France,Round of 32,Sun,Away,W,1.0,0.0,Châteauroux,...,,,,,,,,,,
4599,2024-01-21,17:30,Coupe de France,Round of 32,Sun,Away,W,1.0,0.0,RCFF Colombes 92,...,,,,,,,,,,
46,2024-01-20,17:30,Coupe de France,Round of 32,Sat,Away,W,3,2,Bordeaux,...,,,,,,,,,,
66,2024-01-20,17:30,Coupe de France,Round of 32,Sat,Away,W,2.0,1.0,Trélissac,...,,,,,,,,,,


In [123]:
pd.concat([a, b]).drop_duplicates(subset=["Date", "Team", "Opponent"], keep=False)

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,...,Passes_Launch%,Passes_AvgLen,Goal Kicks_Att,Goal Kicks_Launch%,Goal Kicks_AvgLen,Crosses_Opp,Crosses_Stp,Crosses_Stp%,Sweeper_#OPA,Sweeper_AvgDist
4599,2024-01-21,17:30,Coupe de France,Round of 32,Sun,Away,W,1.0,0.0,RCFF Colombes 92,...,,,,,,,,,,
4600,2024-01-21,17:30,Coupe de France,Round of 32,Sun,Away,D,2 (4),2 (5),Sochaux,...,,,,,,,,,,
4601,2024-01-21,21:05,Coupe de France,Round of 32,Sun,Away,D,1 (8),1 (9),Rennes,...,,,,,,,,,,
4602,2024-01-21,17:30,Coupe de France,Round of 32,Sun,Away,W,3.0,1.0,Clermont Foot,...,,,,,,,,,,
4603,2024-01-21,21:05,Coupe de France,Round of 32,Sun,Home,D,1 (9),1 (8),Marseille,...,,,,,,,,,,
4604,2024-01-21,17:30,Coupe de France,Round of 32,Sun,Away,W,1.0,0.0,Châteauroux,...,,,,,,,,,,
4605,2024-01-21,17:30,Coupe de France,Round of 32,Sun,Away,D,3 (11),3 (12),FC Rouen,...,,,,,,,,,,
4606,2024-01-21,17:30,Coupe de France,Round of 32,Sun,Home,L,1,3,Strasbourg,...,,,,,,,,,,


In [120]:
a

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,...,Passes_Launch%,Passes_AvgLen,Goal Kicks_Att,Goal Kicks_Launch%,Goal Kicks_AvgLen,Crosses_Opp,Crosses_Stp,Crosses_Stp%,Sweeper_#OPA,Sweeper_AvgDist
0,2023-08-12,21:00,Ligue 1,Matchweek 1,Sat,Home,D,0.0,0.0,Lorient,...,0.0,21.5,3.0,0.0,24.0,2.0,0.0,0.0,0.0,
1,2023-08-19,21:00,Ligue 1,Matchweek 2,Sat,Away,D,1.0,1.0,Toulouse,...,11.8,25.4,6.0,16.7,25.2,6.0,0.0,0.0,0.0,7.0
2,2023-08-26,21:00,Ligue 1,Matchweek 3,Sat,Home,W,3.0,1.0,Lens,...,15.6,26.3,8.0,12.5,19.4,16.0,1.0,6.3,0.0,8.8
3,2023-09-03,20:45,Ligue 1,Matchweek 4,Sun,Away,W,4.0,1.0,Lyon,...,11.5,23.8,8.0,12.5,20.3,13.0,1.0,7.7,0.0,6.5
4,2023-09-15,21:00,Ligue 1,Matchweek 5,Fri,Home,L,2.0,3.0,Nice,...,6.7,22.4,2.0,0.0,19.0,2.0,0.0,0.0,0.0,11.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4602,2024-01-21,17:30,Coupe de France,Round of 32,Sun,Away,W,3.0,1.0,Clermont Foot,...,,,,,,,,,,
4603,2024-01-21,21:05,Coupe de France,Round of 32,Sun,Home,D,1 (9),1 (8),Marseille,...,,,,,,,,,,
4604,2024-01-21,17:30,Coupe de France,Round of 32,Sun,Away,W,1.0,0.0,Châteauroux,...,,,,,,,,,,
4605,2024-01-21,17:30,Coupe de France,Round of 32,Sun,Away,D,3 (11),3 (12),FC Rouen,...,,,,,,,,,,
