### **Scraping Data from FBref**

In [82]:
import os
import warnings
from io import StringIO

import numpy as np
import pandas as pd
import requests
# Hide FutureWarnings
warnings.simplefilter(action='ignore', category=FutureWarning)

#### Player Data 

In [None]:
# web scraping setup (FBref now uses Cloudflare, so curl_cffi's browser
# impersonation is used for the request)
from curl_cffi import requests as cureq
from bs4 import BeautifulSoup
import pandas as pd
import time, random

url = 'https://fbref.com/en/comps/Big5/stats/players/Big-5-European-Leagues-Stats'

# basic headers (impersonate helps but explicit headers are useful)
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
                  " (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
    "Accept-Language": "en-US,en;q=0.9",
    "Referer": "https://www.google.com/",
}

# Fetch the page once; `resp` is consumed by the parsing cell below.
resp = cureq.get(url, headers=headers, impersonate="chrome", timeout=30)

# Stop the notebook here on any non-200 reply. Merely printing the failure would
# let the following cells try to parse an error/challenge page and fail later
# with a much less obvious message.
if resp.status_code != 200:
    raise RuntimeError(f"Failed to retrieve page. Status code: Error {resp.status_code}")

print("Successfully retrieved page. Status code: {}".format(resp.status_code))


Successfully retrieved page. Status code: 200


In [84]:
# Parse the first table found in the downloaded markup. The text is wrapped in
# StringIO because passing a literal HTML string to pd.read_html is deprecated
# (the FutureWarning filter at the top of the notebook hides that warning).
df = pd.read_html(StringIO(resp.text))[0]
df.head()

Unnamed: 0_level_0,Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Playing Time,Playing Time,...,Per 90 Minutes,Per 90 Minutes,Per 90 Minutes,Per 90 Minutes,Per 90 Minutes,Per 90 Minutes,Per 90 Minutes,Per 90 Minutes,Per 90 Minutes,Unnamed: 37_level_0
Unnamed: 0_level_1,Rk,Player,Nation,Pos,Squad,Comp,Age,Born,MP,Starts,...,Ast,G+A,G-PK,G+A-PK,xG,xAG,xG+xAG,npxG,npxG+xAG,Matches
0,1,Brenden Aaronson,us USA,"FW,MF",Leeds United,eng Premier League,25-011,2000,10,7,...,0.0,0.14,0.14,0.14,0.23,0.18,0.41,0.23,0.41,Matches
1,2,Jones El-Abdellaoui,ma MAR,"MF,FW",Celta Vigo,es La Liga,19-294,2006,3,0,...,0.0,0.0,0.0,0.0,0.56,0.05,0.61,0.56,0.61,Matches
2,3,Himad Abdelli,dz ALG,MF,Angers,fr Ligue 1,25-350,1999,7,5,...,0.0,0.21,0.0,0.0,0.22,0.06,0.28,0.06,0.12,Matches
3,4,Ali Abdi,tn TUN,"DF,MF",Nice,fr Ligue 1,31-317,1993,4,4,...,0.0,0.0,0.0,0.0,0.0,0.14,0.14,0.0,0.14,Matches
4,5,Salis Abdul Samed,gh GHA,MF,Nice,fr Ligue 1,25-221,2000,8,5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Matches


In [85]:
# Collapse the two-level column header into single strings by joining both
# levels with a space (e.g. 'Playing Time' + 'MP' -> 'Playing Time MP').
flat_names = []
for levels in df.columns:
    joined = ' '.join(levels)
    flat_names.append(joined.strip())
df.columns = flat_names

# Renumber the rows from 0 and discard the previous index.
df = df.reset_index(drop=True)
df.head()

Unnamed: 0,Unnamed: 0_level_0 Rk,Unnamed: 1_level_0 Player,Unnamed: 2_level_0 Nation,Unnamed: 3_level_0 Pos,Unnamed: 4_level_0 Squad,Unnamed: 5_level_0 Comp,Unnamed: 6_level_0 Age,Unnamed: 7_level_0 Born,Playing Time MP,Playing Time Starts,...,Per 90 Minutes Ast,Per 90 Minutes G+A,Per 90 Minutes G-PK,Per 90 Minutes G+A-PK,Per 90 Minutes xG,Per 90 Minutes xAG,Per 90 Minutes xG+xAG,Per 90 Minutes npxG,Per 90 Minutes npxG+xAG,Unnamed: 37_level_0 Matches
0,1,Brenden Aaronson,us USA,"FW,MF",Leeds United,eng Premier League,25-011,2000,10,7,...,0.0,0.14,0.14,0.14,0.23,0.18,0.41,0.23,0.41,Matches
1,2,Jones El-Abdellaoui,ma MAR,"MF,FW",Celta Vigo,es La Liga,19-294,2006,3,0,...,0.0,0.0,0.0,0.0,0.56,0.05,0.61,0.56,0.61,Matches
2,3,Himad Abdelli,dz ALG,MF,Angers,fr Ligue 1,25-350,1999,7,5,...,0.0,0.21,0.0,0.0,0.22,0.06,0.28,0.06,0.12,Matches
3,4,Ali Abdi,tn TUN,"DF,MF",Nice,fr Ligue 1,31-317,1993,4,4,...,0.0,0.0,0.0,0.0,0.0,0.14,0.14,0.0,0.14,Matches
4,5,Salis Abdul Samed,gh GHA,MF,Nice,fr Ligue 1,25-221,2000,8,5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Matches


In [86]:
# Build the final column names: headers that still carry pandas' placeholder
# for a missing top level ('Unnamed: N_level_0 <name>') are reduced to their
# last word; every other header is kept unchanged.
new_columns = [
    name.split()[-1] if 'level_0' in name else name
    for name in df.columns
]

# Apply the new names, then replace missing cells with 0.
df.columns = new_columns
df = df.fillna(0)

df.head()

Unnamed: 0,Rk,Player,Nation,Pos,Squad,Comp,Age,Born,Playing Time MP,Playing Time Starts,...,Per 90 Minutes Ast,Per 90 Minutes G+A,Per 90 Minutes G-PK,Per 90 Minutes G+A-PK,Per 90 Minutes xG,Per 90 Minutes xAG,Per 90 Minutes xG+xAG,Per 90 Minutes npxG,Per 90 Minutes npxG+xAG,Matches
0,1,Brenden Aaronson,us USA,"FW,MF",Leeds United,eng Premier League,25-011,2000,10,7,...,0.0,0.14,0.14,0.14,0.23,0.18,0.41,0.23,0.41,Matches
1,2,Jones El-Abdellaoui,ma MAR,"MF,FW",Celta Vigo,es La Liga,19-294,2006,3,0,...,0.0,0.0,0.0,0.0,0.56,0.05,0.61,0.56,0.61,Matches
2,3,Himad Abdelli,dz ALG,MF,Angers,fr Ligue 1,25-350,1999,7,5,...,0.0,0.21,0.0,0.0,0.22,0.06,0.28,0.06,0.12,Matches
3,4,Ali Abdi,tn TUN,"DF,MF",Nice,fr Ligue 1,31-317,1993,4,4,...,0.0,0.0,0.0,0.0,0.0,0.14,0.14,0.0,0.14,Matches
4,5,Salis Abdul Samed,gh GHA,MF,Nice,fr Ligue 1,25-221,2000,8,5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Matches


In [None]:
# Drop the rows whose 'Pos' cell is the literal text 'Pos' (header rows picked
# up while scraping). The filter has to use 'Pos': the 'Position' column is only
# created further down, so filtering on it raises KeyError on a fresh kernel.
# .copy() makes the filtered result an independent frame, so the column
# assignments below do not write into a slice of the original.
df = df[df['Pos'] != 'Pos'].copy()

# Keep only the first two characters of Age, e.g. '25-011' -> '25'.
df['Age'] = df['Age'].str[:2]
df['Position'] = df['Pos']
# Nation looks like 'us USA'; keep the second token.
df['Nation'] = df['Nation'].str.split(' ').str.get(1)
# Comp looks like 'eng Premier League'; splitting once on the first space keeps
# the whole league name after the country prefix, whatever its word count
# (one-word names such as 'Bundesliga' need no special-case fallback).
df['League'] = df['Comp'].str.split(' ', n=1).str.get(1)
df = df.drop(columns=['Comp', 'Rk', 'Pos', 'Matches'])

In [89]:
# Columns holding text; the conversion loop skips every name listed here.
text_cols = [
    "Player",
    "Nation",
    "League",
    "Squad",
    "Position",
]

def clean_numeric(s):
    """Convert a Series of scraped text values to numbers.

    Each value is cast to ``str`` and stripped; thousands separators (","),
    percent signs and plus signs are removed; a parenthesised value such as
    "(5)" is rewritten as "-5". Placeholder strings for missing data and
    anything else that is not a number become NaN.

    Parameters
    ----------
    s : pandas.Series
        Column to convert; values may be of any type.

    Returns
    -------
    pandas.Series
        Numeric Series with the same index as ``s``.
    """
    missing_tokens = ["—", "–", "-", "N/A", "na", "None", ""]

    # Run every string operation while the values are still guaranteed to be
    # strings. Replacing placeholders with NaN first can turn a column made up
    # only of placeholders into a float column, on which `.str` raises.
    s = s.astype(str).str.strip()
    s = s.str.replace(",", "", regex=False)
    s = s.str.replace("%", "", regex=False)
    s = s.str.replace("+", "", regex=False)
    s = s.str.replace(r"\((.*?)\)", r"-\1", regex=True)

    # Blank out the placeholders explicitly, then let to_numeric coerce any
    # remaining non-numeric text to NaN.
    s = s.where(~s.isin(missing_tokens))
    return pd.to_numeric(s, errors="coerce")

# Run clean_numeric over every column that is not listed in text_cols.
numeric_cols = [name for name in df.columns if name not in text_cols]
for name in numeric_cols:
    df[name] = clean_numeric(df[name])

In [90]:
filename = 'playerstats.csv'

# Delete an export left over from a previous run before writing the new one.
stale_export_present = os.path.exists(filename)
if stale_export_present:
    os.remove(filename)

# Write the cleaned table without the DataFrame index column.
df.to_csv(filename, index=False)
print(f"Data successfully exported to {filename}")

Data successfully exported to playerstats.csv
