In [None]:
import requests
from bs4 import BeautifulSoup, Comment
import pandas as pd
import numpy as np
from lxml import etree, html
import re
from io import StringIO
import os
import glob

DATA_DIR = os.path.join('data', 'fbref')

def get_soup(url, timeout=30):
    """Download *url* and return its HTML parsed as a ``BeautifulSoup`` tree.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely.

    Returns:
        A ``BeautifulSoup`` object built with the ``html.parser`` backend.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not answer within *timeout*.
    """
    # Send a browser-like User-Agent instead of the default requests one.
    headers = {'User-Agent': ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) '
                              'Chrome/39.0.2171.95 Safari/537.36')}
    r = requests.get(url, headers=headers, timeout=timeout)
    # Fail here, with the status code, rather than later while parsing an
    # error page that does not contain the expected table.
    r.raise_for_status()
    # The raw bytes are handed to BeautifulSoup, which works out the encoding
    # itself; setting ``r.encoding`` would only affect ``r.text``.
    return BeautifulSoup(r.content, 'html.parser')

def get_url(stat, season='2022-2023', comp_id=37, comp_name='Belgian-Pro-League'):
    """Build the fbref URL of a competition-wide stats page.

    With the default arguments the result is identical to the previously
    hard-coded 2022-2023 Belgian Pro League URL.

    Args:
        stat: Name of the stats page, e.g. ``'stats'`` or ``'keepers'``.
        season: Season label used twice in the URL, e.g. ``'2022-2023'``.
        comp_id: Numeric competition id used in the URL path.
        comp_name: Hyphenated competition name used in the last URL segment.
            NOTE(review): other competitions are assumed to follow the same
            URL layout as the default one; confirm before relying on it.

    Returns:
        The full URL as a string.
    """
    prefix = f'https://fbref.com/en/comps/{comp_id}/{season}/'
    suffix = f'/{season}-{comp_name}-Stats'
    return f'{prefix}{stat}{suffix}'

def flatten_cols(df):
    """Replace the columns of *df* with flat, snake_case names.

    For multi-level columns the first two levels are joined as
    ``<level0>_<level1>``; a level-0 label starting with ``'Unnamed'`` is
    treated as empty so only the level-1 label is kept. Single-level columns
    are normalised on their own. Labels are lower-cased, ``%`` becomes
    ``_percent``, ``+/-`` becomes ``_plus_minus``, every other run of
    non-alphanumeric characters becomes a single ``_`` and trailing
    underscores are stripped.

    Args:
        df: DataFrame whose columns should be renamed. It is modified in
            place.

    Returns:
        The same DataFrame object, with its columns renamed.
    """
    if df.columns.nlevels > 1:
        groups = df.columns.get_level_values(0)
        names = df.columns.get_level_values(1)
    else:
        # No grouping header at all: behave as if every group were unnamed.
        groups = [''] * len(df.columns)
        names = df.columns

    flat = []
    for group, name in zip(groups, names):
        # str() so that non-string labels (e.g. integers) do not break the
        # string methods below.
        group = str(group)
        group = '' if group.startswith('Unnamed') else group.replace(' ', '_').lower()
        name = str(name).replace(' ', '_').lower()
        col = f'{group}_{name}' if group else name
        # Spell out the symbols before the generic clean-up would turn them
        # into anonymous underscores.
        col = col.replace('%', '_percent').replace('+/-', '_plus_minus')
        flat.append(re.sub(r'[^0-9a-zA-Z]+', '_', col).rstrip('_'))
    df.columns = flat
    return df

def extract_stats(url):
    """Return the first table found inside an HTML comment of the page at *url*.

    The page is downloaded with ``get_soup``, its HTML comments are scanned
    for one containing a ``<table`` tag, that comment is parsed with
    ``pandas.read_html`` and the resulting columns are flattened with
    ``flatten_cols``.

    Args:
        url: Address of the page holding the commented-out table.

    Returns:
        A DataFrame with flat, snake_case column names.

    Raises:
        ValueError: If no HTML comment on the page contains a ``<table`` tag.
    """
    soup = get_soup(url)
    comments = soup.find_all(string=lambda text: isinstance(text, Comment))
    # Require an actual opening tag: a comment that merely mentions the word
    # "table" holds nothing read_html could parse.
    table_comments = [str(comment) for comment in comments if '<table' in str(comment)]
    if not table_comments:
        raise ValueError(f'No commented-out <table> found at {url}')
    df = pd.read_html(StringIO(table_comments[0]))[0]
    return flatten_cols(df)

def stats_to_parquet(stat, directory):
    """Download one stats table and store it as ``<directory>/<stat>.parquet``.

    Args:
        stat: Name of the stats page; passed to ``get_url`` and also used as
            the stem of the output file name.
        directory: Folder the parquet file is written to. It is created if
            it does not exist yet.

    Returns:
        None. The table is written to disk.
    """
    url = get_url(stat)
    df = extract_stats(url)
    # Drop rows whose 'rk' cell is the literal 'Rk' (presumably header rows
    # repeated inside the table body).
    df = df[df['rk'] != 'Rk'].copy()
    df.drop(['rk', 'matches'], axis='columns', inplace=True)
    # A first run on a fresh checkout has no output folder yet; to_parquet
    # does not create it.
    if directory:
        os.makedirs(directory, exist_ok=True)
    file_name = os.path.join(directory, f'{stat}.parquet')
    df.to_parquet(file_name)

In [None]:
# Fetch the 'playingtime' table and write playingtime.parquet into DATA_DIR.
stats_to_parquet(stat='playingtime', directory=DATA_DIR)

In [None]:
# Fetch the 'stats' table and write stats.parquet into DATA_DIR.
stats_to_parquet(stat='stats', directory=DATA_DIR)

In [None]:
# Fetch the 'keepers' table and write keepers.parquet into DATA_DIR.
stats_to_parquet(stat='keepers', directory=DATA_DIR)

In [None]:
# Fetch the 'keepersadv' table and write keepersadv.parquet into DATA_DIR.
stats_to_parquet(stat='keepersadv', directory=DATA_DIR)

In [None]:
# Fetch the 'shooting' table and write shooting.parquet into DATA_DIR.
stats_to_parquet(stat='shooting', directory=DATA_DIR)

In [None]:
# Fetch the 'passing' table and write passing.parquet into DATA_DIR.
stats_to_parquet(stat='passing', directory=DATA_DIR)

In [None]:
# Fetch the 'passing_types' table and write passing_types.parquet into DATA_DIR.
stats_to_parquet(stat='passing_types', directory=DATA_DIR)

In [None]:
# Fetch the 'gca' table and write gca.parquet into DATA_DIR.
stats_to_parquet(stat='gca', directory=DATA_DIR)

In [None]:
# Fetch the 'defense' table and write defense.parquet into DATA_DIR.
stats_to_parquet(stat='defense', directory=DATA_DIR)

In [None]:
# Fetch the 'possession' table and write possession.parquet into DATA_DIR.
stats_to_parquet(stat='possession', directory=DATA_DIR)

In [None]:
# Fetch the 'misc' table and write misc.parquet into DATA_DIR.
stats_to_parquet(stat='misc', directory=DATA_DIR)

In [None]:
# Combine the per-table parquet files in DATA_DIR into one wide player table:
# the 'playingtime' table is the base and every other table is left-joined
# onto it by player and squad.

# Only parquet files are considered, and they are sorted because glob returns
# paths in arbitrary order, which would make the output column order vary.
files = sorted(glob.glob(os.path.join(DATA_DIR, '*.parquet')))
base_files = [f for f in files if 'playingtime' in os.path.basename(f)]
if not base_files:
    raise FileNotFoundError(f"No 'playingtime' parquet file found in {DATA_DIR}")
df = pd.read_parquet(base_files[0])
files = [f for f in files if 'playingtime' not in os.path.basename(f)]

join_keys = ['player', 'squad']
for f in files:
    df_temp = pd.read_parquet(f)
    # Columns already present in the combined table are left out of the
    # right-hand side, so the first table that supplied a column wins and no
    # suffixed duplicates have to be cleaned up afterwards.
    new_cols = [c for c in df_temp.columns if c in join_keys or c not in df.columns]
    df = df.merge(df_temp[new_cols], on=join_keys, how='left')
df.to_parquet('player_stats_112_2022.parquet')