In [31]:
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import pandas as pd
from io import StringIO
# Raise pandas' 'display.max_rows' option to 500 for this session.
pd.set_option('display.max_rows', 500)

In [None]:
# --- Parameters: which competition and season to download -------------------
league = "Premier-League"
season = "2023-2024"

# fbref.com "Scores and Fixtures" page for each supported league; the season
# string is interpolated into both the path and the page slug.
LEAGUE_URLS = {
    'Premier-League': f'https://fbref.com/en/comps/9/{season}/schedule/{season}-Premier-League-Scores-and-Fixtures',
    'La-Liga': f'https://fbref.com/en/comps/12/{season}/schedule/{season}-La-Liga-Scores-and-Fixtures'
}

url = LEAGUE_URLS.get(league)
if not url:
    # Fail fast: carrying on with url=None would only crash later, inside
    # requests.get, with a far less helpful message.
    raise ValueError(f"Unknown league: {league}.")

# The timeout keeps the cell from hanging forever if the server never answers.
response = requests.get(url, timeout=30)
print(str(response.status_code))

if response.status_code != 200:
    # Stop here instead of parsing an error page as if it were the schedule.
    raise RuntimeError(
        f"Error getting response from http request. (status {response.status_code})"
    )

# Splitting the URL on '/' puts the competition number (the piece right after
# 'comps') at index 5; it is part of the schedule table's HTML id.
url_league_num = url.split('/')[5]
table_id = f'sched_{season}_{url_league_num}_1'
print(table_id)

soup = BeautifulSoup(response.text, 'html.parser')
schedule_table = soup.find('table', {'id': table_id})
if schedule_table is None:
    # soup.find returns None when nothing matches; surface that immediately
    # rather than letting a later cell try to parse the string "None".
    raise LookupError(f"No table with id {table_id!r} found at {url}.")

In [70]:
# Turn the located <table> element back into HTML text and let pandas parse it.
# read_html returns a list of DataFrames; the first one is used.
parsed_tables = pd.read_html(StringIO(str(schedule_table)))
# Discard rows in which every cell is missing.
df = parsed_tables[0].dropna(how='all')


Unnamed: 0,Wk,Day,Date,Time,Home,xG,Score,xG.1,Away,Attendance,Venue,Referee,Match Report,Notes
0,1.0,Fri,2023-08-11,20:00,Burnley,0.3,0–3,1.9,Manchester City,21572.0,Turf Moor,Craig Pawson,Match Report,
1,1.0,Sat,2023-08-12,12:30,Arsenal,0.8,2–1,1.2,Nott'ham Forest,59984.0,Emirates Stadium,Michael Oliver,Match Report,
2,1.0,Sat,2023-08-12,15:00,Everton,2.7,0–1,1.5,Fulham,39940.0,Goodison Park,Stuart Attwell,Match Report,
3,1.0,Sat,2023-08-12,15:00,Sheffield Utd,0.5,0–1,1.9,Crystal Palace,31194.0,Bramall Lane,John Brooks,Match Report,
4,1.0,Sat,2023-08-12,15:00,Brighton,4.0,4–1,1.5,Luton Town,31872.0,The American Express Community Stadium,David Coote,Match Report,


In [71]:
# Extract the two goal counts from the 'Score' text (e.g. "0–3").
# str.extract with two capture groups always yields exactly two columns, so
# the assignment cannot fail with a column-count mismatch the way
# split(expand=True) does when a score string carries extra numbers or
# surrounding text. The separator is any run of characters that are not
# digits, whitespace or parentheses; rows without a match become NaN.
score_parts = df['Score'].str.extract(r'(\d+)\s*[^\d\s()]+\s*(\d+)')
df['home_score'] = score_parts[0]
df['away_score'] = score_parts[1]

# Normalise the column names used from here on.
df = df.rename(columns={'Home': 'home',
                        'Away': 'away',
                        'Date': 'match_date'})

# Tag every row with the league and season chosen in the parameters cell.
df['league'] = league
df['season'] = season

# Columns to keep when the frame is trimmed down.
COLS_KEEP = ['match_date', 'league', 'season', 'home', 'away', 'home_score', 'away_score']
df.head()

Unnamed: 0,Wk,Day,match_date,Time,home,xG,Score,xG.1,away,Attendance,Venue,Referee,Match Report,Notes,home_score,away_score,league,season
0,1.0,Fri,2023-08-11,20:00,Burnley,0.3,0–3,1.9,Manchester City,21572.0,Turf Moor,Craig Pawson,Match Report,,0,3,Premier-League,2023-2024
1,1.0,Sat,2023-08-12,12:30,Arsenal,0.8,2–1,1.2,Nott'ham Forest,59984.0,Emirates Stadium,Michael Oliver,Match Report,,2,1,Premier-League,2023-2024
2,1.0,Sat,2023-08-12,15:00,Everton,2.7,0–1,1.5,Fulham,39940.0,Goodison Park,Stuart Attwell,Match Report,,0,1,Premier-League,2023-2024
3,1.0,Sat,2023-08-12,15:00,Sheffield Utd,0.5,0–1,1.9,Crystal Palace,31194.0,Bramall Lane,John Brooks,Match Report,,0,1,Premier-League,2023-2024
4,1.0,Sat,2023-08-12,15:00,Brighton,4.0,4–1,1.5,Luton Town,31872.0,The American Express Community Stadium,David Coote,Match Report,,4,1,Premier-League,2023-2024


In [72]:
# Keep only the columns listed in COLS_KEEP. The explicit .copy() makes df an
# independent frame rather than a slice of the wider one, so later column
# assignments on it do not trigger pandas' SettingWithCopyWarning.
df = df[COLS_KEEP].copy()
df.head()

Unnamed: 0,match_date,league,season,home,away,home_score,away_score
0,2023-08-11,Premier-League,2023-2024,Burnley,Manchester City,0,3
1,2023-08-12,Premier-League,2023-2024,Arsenal,Nott'ham Forest,2,1
2,2023-08-12,Premier-League,2023-2024,Everton,Fulham,0,1
3,2023-08-12,Premier-League,2023-2024,Sheffield Utd,Crystal Palace,0,1
4,2023-08-12,Premier-League,2023-2024,Brighton,Luton Town,4,1


In [73]:
# Parse the date strings (invalid values are coerced instead of raising) and
# keep only the calendar-date part.
parsed_dates = pd.to_datetime(df['match_date'], errors='coerce')
df['match_date'] = parsed_dates.dt.date

# Convert both score columns to plain integers.
# NOTE(review): values that cannot be parsed as numbers are replaced by 0
# before the cast, so rows without a score end up as 0 - confirm that this is
# the intended treatment.
for score_col in ('home_score', 'away_score'):
    numeric_scores = pd.to_numeric(df[score_col], errors='coerce')
    df[score_col] = numeric_scores.fillna(0).astype(int)

In [74]:
# Show the first five rows of the current frame.
df.head(n=5)

Unnamed: 0,match_date,league,season,home,away,home_score,away_score
0,2023-08-11,Premier-League,2023-2024,Burnley,Manchester City,0,3
1,2023-08-12,Premier-League,2023-2024,Arsenal,Nott'ham Forest,2,1
2,2023-08-12,Premier-League,2023-2024,Everton,Fulham,0,1
3,2023-08-12,Premier-League,2023-2024,Sheffield Utd,Crystal Palace,0,1
4,2023-08-12,Premier-League,2023-2024,Brighton,Luton Town,4,1
