***The focus of this project is to scrape Premier League team data, such as scores and fixtures, and then predict the scores of the matches.***


Data until 07-05-2023

Importing Libraries

In [1]:
import time
import warnings
from io import StringIO

warnings.filterwarnings("ignore")

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import requests
import seaborn as sns
from bs4 import BeautifulSoup
from google.colab import files

Getting request to scrape data

In [2]:
""" PL website has denied the request to scrape data so I had to scrape from FBREF.com..
a website devoted to tracking statistics for football teams and players from around the world. """
# League overview page on FBref; the standings table on it links to every team page.
standings_url = 'https://fbref.com/en/comps/9/Premier-League-Stats'
# Fetching a page is a read-only operation, so it is requested with GET (not POST).
# `url` holds the HTTP response; displaying it shows the status code.
url = requests.get(standings_url)
url

<Response [200]>

In [3]:
# Download the league standings page; `data.text` is parsed in the following cells.
# The timeout keeps the notebook from hanging forever on a stalled connection.
data = requests.get(standings_url, timeout=30)
# Stop here with a clear HTTPError if the site answered with 4xx/5xx (for example when
# it blocks scrapers) instead of failing later while looking for tables in an error page.
data.raise_for_status()

In [4]:
# Parse the downloaded HTML. The parser is named explicitly so that the resulting tree
# does not depend on which optional parsers (lxml, html5lib) happen to be installed.
soup = BeautifulSoup(data.text, "html.parser")

We will extract only the team links from the page. Each of these links leads to a team's scores and fixtures in all competitions.

In [5]:
# CSS selector 'table.stats_table' = <table> tags that carry the class "stats_table".
# The first match on the page is used as the standings table.
standings_table = soup.select('table.stats_table')[0] # tag name then class name

Now that we have parsed the standings table, we will extract each team's link from it.

In [6]:
# find_all('a') returns every anchor (<a>) tag inside the standings table;
# anchor tags are the elements that represent hyperlinks.
links = standings_table.find_all('a')

In [7]:
# The href (hypertext reference) attribute holds each hyperlink's destination.
# The anchors are read straight from `standings_table` rather than from `links`, so
# re-running this cell never calls .get() on the strings left by a previous run.
links = [anchor.get('href') for anchor in standings_table.find_all('a')]
links

['/en/squads/18bb7c10/Arsenal-Stats',
 '/en/matches/c1739ced/Arsenal-Aston-Villa-April-14-2024-Premier-League',
 '/en/matches/22881ea2/Wolverhampton-Wanderers-Arsenal-April-20-2024-Premier-League',
 '/en/matches/3435dfcc/North-West-London-Derby-Arsenal-Chelsea-April-23-2024-Premier-League',
 '/en/matches/d98c9a99/North-London-Derby-Tottenham-Hotspur-Arsenal-April-28-2024-Premier-League',
 '/en/matches/00bcfc31/Arsenal-Bournemouth-May-4-2024-Premier-League',
 '/en/players/bc7dc64d/Bukayo-Saka',
 '/en/players/98ea5115/David-Raya',
 '/en/squads/b8fd03ef/Manchester-City-Stats',
 '/en/matches/fca82852/Crystal-Palace-Manchester-City-April-6-2024-Premier-League',
 '/en/matches/40128bc4/Manchester-City-Luton-Town-April-13-2024-Premier-League',
 '/en/matches/45bb8cac/Brighton-and-Hove-Albion-Manchester-City-April-25-2024-Premier-League',
 '/en/matches/80bbb25e/Nottingham-Forest-Manchester-City-April-28-2024-Premier-League',
 '/en/matches/5a9032bf/Manchester-City-Wolverhampton-Wanderers-May-4-20

In [8]:
# Most of the collected links point to matches or players; only the links to the
# team (squad) pages are needed. An anchor without an href yields None, so such
# entries are skipped before the substring test.
links = [href for href in links if href and '/squads/' in href]

In [9]:
links

['/en/squads/18bb7c10/Arsenal-Stats',
 '/en/squads/b8fd03ef/Manchester-City-Stats',
 '/en/squads/822bd0ba/Liverpool-Stats',
 '/en/squads/8602292d/Aston-Villa-Stats',
 '/en/squads/361ca564/Tottenham-Hotspur-Stats',
 '/en/squads/b2b47a98/Newcastle-United-Stats',
 '/en/squads/cff3d9bb/Chelsea-Stats',
 '/en/squads/19538871/Manchester-United-Stats',
 '/en/squads/7c21e445/West-Ham-United-Stats',
 '/en/squads/4ba7cbea/Bournemouth-Stats',
 '/en/squads/d07537b9/Brighton-and-Hove-Albion-Stats',
 '/en/squads/8cec06e1/Wolverhampton-Wanderers-Stats',
 '/en/squads/fd962109/Fulham-Stats',
 '/en/squads/47c64c55/Crystal-Palace-Stats',
 '/en/squads/d3fd31cc/Everton-Stats',
 '/en/squads/cd051869/Brentford-Stats',
 '/en/squads/e4a775cb/Nottingham-Forest-Stats',
 '/en/squads/e297cd13/Luton-Town-Stats',
 '/en/squads/943e8050/Burnley-Stats',
 '/en/squads/1df6b87e/Sheffield-United-Stats']

In [10]:
# The hrefs are site-relative paths; prefix the domain to obtain complete URLs.
team_urls = [f'https://fbref.com{squad_path}' for squad_path in links]

In [None]:
team_urls

['https://fbref.com/en/squads/18bb7c10/Arsenal-Stats',
 'https://fbref.com/en/squads/b8fd03ef/Manchester-City-Stats',
 'https://fbref.com/en/squads/822bd0ba/Liverpool-Stats',
 'https://fbref.com/en/squads/8602292d/Aston-Villa-Stats',
 'https://fbref.com/en/squads/361ca564/Tottenham-Hotspur-Stats',
 'https://fbref.com/en/squads/19538871/Manchester-United-Stats',
 'https://fbref.com/en/squads/b2b47a98/Newcastle-United-Stats',
 'https://fbref.com/en/squads/cff3d9bb/Chelsea-Stats',
 'https://fbref.com/en/squads/7c21e445/West-Ham-United-Stats',
 'https://fbref.com/en/squads/4ba7cbea/Bournemouth-Stats',
 'https://fbref.com/en/squads/8cec06e1/Wolverhampton-Wanderers-Stats',
 'https://fbref.com/en/squads/d07537b9/Brighton-and-Hove-Albion-Stats',
 'https://fbref.com/en/squads/fd962109/Fulham-Stats',
 'https://fbref.com/en/squads/47c64c55/Crystal-Palace-Stats',
 'https://fbref.com/en/squads/d3fd31cc/Everton-Stats',
 'https://fbref.com/en/squads/cd051869/Brentford-Stats',
 'https://fbref.com/en/s

Let's get the stats for the team at the top of the table. We will try it with one team first, then apply the same steps to the rest.

In [11]:
# First entry of team_urls, i.e. the first squad link found in the standings table.
top_url = team_urls[0]

In [12]:
# Download the team page. Note that `data` now refers to this response and no longer
# to the standings page.
data = requests.get(top_url, timeout=30)
# Fail immediately on an HTTP error (e.g. when the site throttles the scraper) rather
# than later, when no table can be found in the error page.
data.raise_for_status()

We want the Scores & Fixtures table

In [13]:
# pandas.read_html scans the page for <table> elements and returns the ones whose
# text matches the given string; the first such table is kept.
# The HTML is wrapped in StringIO because passing a literal HTML string to read_html
# is deprecated in recent pandas releases.
top_matches = pd.read_html(StringIO(data.text), match='Scores & Fixtures')[0]

In [14]:
top_matches.head(10)

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,xG,xGA,Poss,Attendance,Captain,Formation,Referee,Match Report,Notes
0,2023-08-06,16:00,Community Shield,FA Community Shield,Sun,Neutral,D,1 (4),1 (1),Manchester City,,,45.0,81145.0,Martin Ødegaard,4-3-3,Stuart Attwell,Match Report,Arsenal won on penalty kicks following normal ...
1,2023-08-12,12:30,Premier League,Matchweek 1,Sat,Home,W,2,1,Nott'ham Forest,0.8,1.2,78.0,59984.0,Martin Ødegaard,4-3-3,Michael Oliver,Match Report,
2,2023-08-21,20:00,Premier League,Matchweek 2,Mon,Away,W,1,0,Crystal Palace,2.0,1.0,53.0,24189.0,Martin Ødegaard,4-3-3,David Coote,Match Report,
3,2023-08-26,15:00,Premier League,Matchweek 3,Sat,Home,D,2,2,Fulham,3.2,0.6,71.0,59961.0,Martin Ødegaard,4-3-3,Paul Tierney,Match Report,
4,2023-09-03,16:30,Premier League,Matchweek 4,Sun,Home,W,3,1,Manchester Utd,2.3,0.9,55.0,60192.0,Martin Ødegaard,4-3-3,Anthony Taylor,Match Report,
5,2023-09-17,16:30,Premier League,Matchweek 5,Sun,Away,W,1,0,Everton,1.0,0.3,74.0,39217.0,Martin Ødegaard,4-3-3,Simon Hooper,Match Report,
6,2023-09-20,20:00,Champions Lg,Group stage,Wed,Home,W,4,0,nl PSV Eindhoven,2.3,0.5,58.0,58860.0,Martin Ødegaard,4-3-3,Felix Zwayer,Match Report,
7,2023-09-24,14:00,Premier League,Matchweek 6,Sun,Home,D,2,2,Tottenham,1.8,1.4,47.0,60156.0,Martin Ødegaard,4-3-3,Robert Jones,Match Report,
8,2023-09-27,19:45,EFL Cup,Third round,Wed,Away,W,1,0,Brentford,,,60.0,16688.0,Jorginho,4-3-3,Darren Bond,Match Report,
9,2023-09-30,15:00,Premier League,Matchweek 7,Sat,Away,W,4,0,Bournemouth,3.4,0.6,57.0,11193.0,Martin Ødegaard,4-3-3,Michael Salisbury,Match Report,


In [15]:
# Parse the team page. Naming the parser keeps the result independent of which
# optional parsers (lxml, html5lib) are installed in the environment.
soup = BeautifulSoup(data.text, "html.parser")

In [16]:
# Collect every anchor (<a>) tag of the team page.
links = soup.find_all('a')

In [17]:
# Extract the href of every anchor on the team page. The anchors are taken from `soup`
# rather than from `links`, so the cell can be re-run without calling .get() on strings.
links = [anchor.get("href") for anchor in soup.find_all('a')]

In [18]:
# Keep only the all-competitions match-log links for shooting, passing and possession.
# Anchors without an href (None) are discarded by the `href and ...` test.
links = [
    href
    for href in links
    if href and any(
        section in href
        for section in ('all_comps/shooting/', 'all_comps/passing/', 'all_comps/possession/')
    )
]

In [19]:
links

['/en/squads/18bb7c10/2023-2024/matchlogs/all_comps/shooting/Arsenal-Match-Logs-All-Competitions',
 '/en/squads/18bb7c10/2023-2024/matchlogs/all_comps/passing/Arsenal-Match-Logs-All-Competitions',
 '/en/squads/18bb7c10/2023-2024/matchlogs/all_comps/possession/Arsenal-Match-Logs-All-Competitions',
 '/en/squads/18bb7c10/2023-2024/matchlogs/all_comps/shooting/Arsenal-Match-Logs-All-Competitions',
 '/en/squads/18bb7c10/2023-2024/matchlogs/all_comps/passing/Arsenal-Match-Logs-All-Competitions',
 '/en/squads/18bb7c10/2023-2024/matchlogs/all_comps/possession/Arsenal-Match-Logs-All-Competitions',
 '/en/squads/18bb7c10/2023-2024/matchlogs/all_comps/shooting/Arsenal-Match-Logs-All-Competitions',
 '/en/squads/18bb7c10/2023-2024/matchlogs/all_comps/passing/Arsenal-Match-Logs-All-Competitions',
 '/en/squads/18bb7c10/2023-2024/matchlogs/all_comps/possession/Arsenal-Match-Logs-All-Competitions',
 '/en/squads/18bb7c10/2023-2024/matchlogs/all_comps/shooting/Arsenal-Match-Logs-All-Competitions',
 '/en/s

In [20]:
# The filtered list repeats each link several times and its order merely mirrors the
# page layout, so every match-log link is selected by the section name in its path
# instead of by its position in the list.
shooting_link = next(href for href in links if 'all_comps/shooting/' in href)
passing_link = next(href for href in links if 'all_comps/passing/' in href)
possession_link = next(href for href in links if 'all_comps/possession/' in href)

shooting_data = requests.get(f'https://fbref.com{shooting_link}')
passing_data = requests.get(f'https://fbref.com{passing_link}')
possession_data = requests.get(f'https://fbref.com{possession_link}')

In [23]:
# Read the first table matching each section name from the three match-log pages.
# StringIO is used because passing a literal HTML string to read_html is deprecated
# in recent pandas releases.
top_shooting = pd.read_html(StringIO(shooting_data.text), match='Shooting')[0]
top_passing = pd.read_html(StringIO(passing_data.text), match='Passing')[0]
top_possession = pd.read_html(StringIO(possession_data.text), match='Possession')[0]

In [24]:
top_shooting.head()

Unnamed: 0_level_0,For Arsenal,For Arsenal,For Arsenal,For Arsenal,For Arsenal,For Arsenal,For Arsenal,For Arsenal,For Arsenal,For Arsenal,...,Standard,Standard,Standard,Standard,Expected,Expected,Expected,Expected,Expected,Unnamed: 25_level_0
Unnamed: 0_level_1,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,...,Dist,FK,PK,PKatt,xG,npxG,npxG/Sh,G-xG,np:G-xG,Match Report
0,2023-08-06,16:00,Community Shield,FA Community Shield,Sun,Neutral,D,1 (4),1 (1),Manchester City,...,,,0,0,,,,,,Match Report
1,2023-08-12,12:30,Premier League,Matchweek 1,Sat,Home,W,2,1,Nott'ham Forest,...,19.1,0.0,0,0,0.8,0.8,0.06,1.2,1.2,Match Report
2,2023-08-21,20:00,Premier League,Matchweek 2,Mon,Away,W,1,0,Crystal Palace,...,16.4,0.0,1,1,2.0,1.2,0.09,-1.0,-1.2,Match Report
3,2023-08-26,15:00,Premier League,Matchweek 3,Sat,Home,D,2,2,Fulham,...,13.8,0.0,1,1,3.2,2.4,0.14,-1.2,-1.4,Match Report
4,2023-09-03,16:30,Premier League,Matchweek 4,Sun,Home,W,3,1,Manchester Utd,...,15.0,0.0,0,0,2.3,2.3,0.13,0.7,0.7,Match Report


In [25]:
top_passing.head()

Unnamed: 0_level_0,For Arsenal,For Arsenal,For Arsenal,For Arsenal,For Arsenal,For Arsenal,For Arsenal,For Arsenal,For Arsenal,For Arsenal,...,Long,Unnamed: 24_level_0,Unnamed: 25_level_0,Unnamed: 26_level_0,Unnamed: 27_level_0,Unnamed: 28_level_0,Unnamed: 29_level_0,Unnamed: 30_level_0,Unnamed: 31_level_0,Unnamed: 32_level_0
Unnamed: 0_level_1,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,...,Cmp%,Ast,xAG,xA,KP,1/3,PPA,CrsPA,PrgP,Match Report
0,2023-08-06,16:00,Community Shield,FA Community Shield,Sun,Neutral,D,1 (4),1 (1),Manchester City,...,,1,,,,,,,,Match Report
1,2023-08-12,12:30,Premier League,Matchweek 1,Sat,Home,W,2,1,Nott'ham Forest,...,63.6,2,0.6,0.9,13.0,61.0,13.0,1.0,58.0,Match Report
2,2023-08-21,20:00,Premier League,Matchweek 2,Mon,Away,W,1,0,Crystal Palace,...,53.5,0,0.9,0.7,8.0,46.0,13.0,2.0,40.0,Match Report
3,2023-08-26,15:00,Premier League,Matchweek 3,Sat,Home,D,2,2,Fulham,...,71.1,1,2.0,3.1,15.0,63.0,25.0,4.0,93.0,Match Report
4,2023-09-03,16:30,Premier League,Matchweek 4,Sun,Home,W,3,1,Manchester Utd,...,55.4,3,2.0,1.2,15.0,50.0,12.0,0.0,45.0,Match Report


In [26]:
# The match-log tables have a two-row header (a group row such as "For Arsenal" or
# "Standard" above the stat names). Drop the group row and keep the stat names.
# The nlevels guard makes the cell safe to re-run: once the header is single-level,
# droplevel() would raise because at least one level must remain.
for frame in (top_shooting, top_passing, top_possession):
    if frame.columns.nlevels > 1:
        frame.columns = frame.columns.droplevel()

In [27]:
top_shooting.head()

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,...,Dist,FK,PK,PKatt,xG,npxG,npxG/Sh,G-xG,np:G-xG,Match Report
0,2023-08-06,16:00,Community Shield,FA Community Shield,Sun,Neutral,D,1 (4),1 (1),Manchester City,...,,,0,0,,,,,,Match Report
1,2023-08-12,12:30,Premier League,Matchweek 1,Sat,Home,W,2,1,Nott'ham Forest,...,19.1,0.0,0,0,0.8,0.8,0.06,1.2,1.2,Match Report
2,2023-08-21,20:00,Premier League,Matchweek 2,Mon,Away,W,1,0,Crystal Palace,...,16.4,0.0,1,1,2.0,1.2,0.09,-1.0,-1.2,Match Report
3,2023-08-26,15:00,Premier League,Matchweek 3,Sat,Home,D,2,2,Fulham,...,13.8,0.0,1,1,3.2,2.4,0.14,-1.2,-1.4,Match Report
4,2023-09-03,16:30,Premier League,Matchweek 4,Sun,Home,W,3,1,Manchester Utd,...,15.0,0.0,0,0,2.3,2.3,0.13,0.7,0.7,Match Report


In [28]:
top_passing.columns

Index(['Date', 'Time', 'Comp', 'Round', 'Day', 'Venue', 'Result', 'GF', 'GA',
       'Opponent', 'Cmp', 'Att', 'Cmp%', 'TotDist', 'PrgDist', 'Cmp', 'Att',
       'Cmp%', 'Cmp', 'Att', 'Cmp%', 'Cmp', 'Att', 'Cmp%', 'Ast', 'xAG', 'xA',
       'KP', '1/3', 'PPA', 'CrsPA', 'PrgP', 'Match Report'],
      dtype='object')

In [29]:
top_passing.head()

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,...,Cmp%,Ast,xAG,xA,KP,1/3,PPA,CrsPA,PrgP,Match Report
0,2023-08-06,16:00,Community Shield,FA Community Shield,Sun,Neutral,D,1 (4),1 (1),Manchester City,...,,1,,,,,,,,Match Report
1,2023-08-12,12:30,Premier League,Matchweek 1,Sat,Home,W,2,1,Nott'ham Forest,...,63.6,2,0.6,0.9,13.0,61.0,13.0,1.0,58.0,Match Report
2,2023-08-21,20:00,Premier League,Matchweek 2,Mon,Away,W,1,0,Crystal Palace,...,53.5,0,0.9,0.7,8.0,46.0,13.0,2.0,40.0,Match Report
3,2023-08-26,15:00,Premier League,Matchweek 3,Sat,Home,D,2,2,Fulham,...,71.1,1,2.0,3.1,15.0,63.0,25.0,4.0,93.0,Match Report
4,2023-09-03,16:30,Premier League,Matchweek 4,Sun,Home,W,3,1,Manchester Utd,...,55.4,3,2.0,1.2,15.0,50.0,12.0,0.0,45.0,Match Report


In [30]:
top_possession.head()

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,...,TotDist,PrgDist,PrgC,1/3,CPA,Mis,Dis,Rec,PrgR,Match Report
0,2023-08-06,16:00,Community Shield,FA Community Shield,Sun,Neutral,D,1 (4),1 (1),Manchester City,...,,,,,,,,,,Match Report
1,2023-08-12,12:30,Premier League,Matchweek 1,Sat,Home,W,2,1,Nott'ham Forest,...,3482.0,1908.0,34.0,24.0,6.0,10.0,9.0,712.0,58.0,Match Report
2,2023-08-21,20:00,Premier League,Matchweek 2,Mon,Away,W,1,0,Crystal Palace,...,2182.0,1143.0,21.0,12.0,8.0,8.0,8.0,432.0,40.0,Match Report
3,2023-08-26,15:00,Premier League,Matchweek 3,Sat,Home,D,2,2,Fulham,...,2742.0,1518.0,28.0,17.0,11.0,18.0,8.0,639.0,92.0,Match Report
4,2023-09-03,16:30,Premier League,Matchweek 4,Sun,Home,W,3,1,Manchester Utd,...,2127.0,1083.0,25.0,14.0,7.0,11.0,5.0,484.0,43.0,Match Report


In [31]:
top_possession.columns

Index(['Date', 'Time', 'Comp', 'Round', 'Day', 'Venue', 'Result', 'GF', 'GA',
       'Opponent', 'Poss', 'Touches', 'Def Pen', 'Def 3rd', 'Mid 3rd',
       'Att 3rd', 'Att Pen', 'Live', 'Att', 'Succ', 'Succ%', 'Tkld', 'Tkld%',
       'Carries', 'TotDist', 'PrgDist', 'PrgC', '1/3', 'CPA', 'Mis', 'Dis',
       'Rec', 'PrgR', 'Match Report'],
      dtype='object')

We need to merge all of this data into a single DataFrame.

In [32]:
# Join the selected shooting, passing and possession stats onto the fixtures table,
# matching rows on the match date.
# - top_passing has several columns named 'Cmp', 'Att' and 'Cmp%'; selecting those
#   labels returns all of them, so the result contains same-named columns.
# - 'Poss' exists in both top_matches and top_possession, so pandas suffixes the two
#   copies as 'Poss_x' and 'Poss_y'.
top_data = (
    top_matches
    .merge(top_shooting[['Date','Sh','SoT','G/Sh','PK','PKatt']], on="Date")
    .merge(top_passing[['Date','Ast','xA','Cmp', 'Att', 'Cmp%','KP','PPA','PrgP']], on="Date")
    .merge(top_possession[['Date','Poss', 'Touches','Att 3rd', 'Att Pen','PrgDist']], on="Date")
)

In [33]:
top_data.head(2)

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,...,Cmp%,Cmp%.1,KP,PPA,PrgP,Poss_y,Touches,Att 3rd,Att Pen,PrgDist
0,2023-08-06,16:00,Community Shield,FA Community Shield,Sun,Neutral,D,1 (4),1 (1),Manchester City,...,,,,,,45.0,,,,
1,2023-08-12,12:30,Premier League,Matchweek 1,Sat,Home,W,2,1,Nott'ham Forest,...,91.6,63.6,13.0,13.0,58.0,78.0,902.0,238.0,30.0,1908.0


In [34]:
# The merge carried over several columns that share the names 'Cmp', 'Att' and 'Cmp%'.
# Keep only the first occurrence of every column name and drop the later same-named
# columns (presumably the first occurrence is the overall/total passing figure -
# TODO confirm against the passing table's header groups).
top_data = top_data.loc[:, ~top_data.columns.duplicated()]
top_data

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,...,Att,Cmp%,KP,PPA,PrgP,Poss_y,Touches,Att 3rd,Att Pen,PrgDist
0,2023-08-06,16:00,Community Shield,FA Community Shield,Sun,Neutral,D,1 (4),1 (1),Manchester City,...,,,,,,45.0,,,,
1,2023-08-12,12:30,Premier League,Matchweek 1,Sat,Home,W,2,1,Nott'ham Forest,...,807.0,88.7,13.0,13.0,58.0,78.0,902.0,238.0,30.0,1908.0
2,2023-08-21,20:00,Premier League,Matchweek 2,Mon,Away,W,1,0,Crystal Palace,...,527.0,82.7,8.0,13.0,40.0,53.0,627.0,199.0,31.0,1143.0
3,2023-08-26,15:00,Premier League,Matchweek 3,Sat,Home,D,2,2,Fulham,...,741.0,86.9,15.0,25.0,93.0,71.0,845.0,336.0,57.0,1518.0
4,2023-09-03,16:30,Premier League,Matchweek 4,Sun,Home,W,3,1,Manchester Utd,...,583.0,84.0,15.0,12.0,45.0,55.0,680.0,246.0,41.0,1083.0
5,2023-09-17,16:30,Premier League,Matchweek 5,Sun,Away,W,1,0,Everton,...,721.0,87.0,9.0,16.0,60.0,74.0,797.0,223.0,28.0,1209.0
6,2023-09-20,20:00,Champions Lg,Group stage,Wed,Home,W,4,0,nl PSV Eindhoven,...,618.0,87.9,16.0,16.0,49.0,58.0,726.0,196.0,41.0,1262.0
7,2023-09-24,14:00,Premier League,Matchweek 6,Sun,Home,D,2,2,Tottenham,...,423.0,81.8,8.0,5.0,32.0,47.0,529.0,174.0,25.0,833.0
8,2023-09-27,19:45,EFL Cup,Third round,Wed,Away,W,1,0,Brentford,...,,,,,,60.0,,,,
9,2023-09-30,15:00,Premier League,Matchweek 7,Sat,Away,W,4,0,Bournemouth,...,564.0,87.8,11.0,11.0,31.0,57.0,686.0,163.0,35.0,1130.0


In [35]:
top_data.columns

Index(['Date', 'Time', 'Comp', 'Round', 'Day', 'Venue', 'Result', 'GF', 'GA',
       'Opponent', 'xG', 'xGA', 'Poss_x', 'Attendance', 'Captain', 'Formation',
       'Referee', 'Match Report', 'Notes', 'Sh', 'SoT', 'G/Sh', 'PK', 'PKatt',
       'Ast', 'xA', 'Cmp', 'Att', 'Cmp%', 'KP', 'PPA', 'PrgP', 'Poss_y',
       'Touches', 'Att 3rd', 'Att Pen', 'PrgDist'],
      dtype='object')

In [36]:
# Keep an explicit list of columns. Compared with the columns shown above, the only
# one left out is 'Poss_y' (the possession copy that came from top_possession);
# 'Poss_x' from the fixtures table is retained.
top_data = top_data[['Date', 'Time', 'Comp', 'Round', 'Day', 'Venue', 'Result', 'GF', 'GA',
       'Opponent', 'xG', 'xGA', 'Poss_x', 'Attendance', 'Captain', 'Formation',
       'Referee', 'Match Report', 'Notes', 'Sh', 'SoT', 'G/Sh', 'PK', 'PKatt',
       'Ast', 'xA', 'Cmp', 'Att', 'Cmp%', 'KP', 'PPA', 'PrgP','Touches', 'Att 3rd', 'Att Pen', 'PrgDist']]

Let's do the same thing but for all the PL teams

In [37]:
# Season labels, newest first; the scraping loop below uses one label per season page.
years = [2024, 2023, 2022]

In [38]:
years

[2024, 2023, 2022]

In [39]:
# Accumulator for the scraping loop below: one DataFrame per team and season is appended.
all_matches = []

In [40]:
# Starting page for the scraping loop: the league overview page whose standings table
# links to the team pages.
standings_url = 'https://fbref.com/en/comps/9/Premier-League-Stats'

In [41]:
# Scrape the Premier League match logs of every team for each season in `years`.
#
# NOTE: the first entry of `years` labels the season shown at `standings_url`; every
# further entry labels the season reached through that page's "previous season" link.
# The label is the year in which the season ends (e.g. 2024 for matches from 2023-24).

BASE_URL = 'https://fbref.com'
# Pause after EVERY request, not just once per team.
# NOTE(review): FBref's bot policy is believed to allow roughly 10 requests per
# minute - confirm the current limit before lowering this value.
REQUEST_DELAY_SECONDS = 6


def _fetch_html(url):
    """Return the HTML of `url`, then wait so that consecutive requests stay spaced out.

    Raises requests.HTTPError if the server answers with a 4xx/5xx status, so a blocked
    or throttled request is reported as such instead of as a missing table later on.
    """
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    time.sleep(REQUEST_DELAY_SECONDS)
    return response.text


def _read_stat_table(html, match):
    """Return the first table in `html` whose text matches `match`, with the top header level dropped."""
    # StringIO: passing a literal HTML string to read_html is deprecated in recent pandas.
    table = pd.read_html(StringIO(html), match=match)[0]
    table.columns = table.columns.droplevel()
    return table


def _matchlog_url(hrefs, section):
    """Return the absolute URL of the all-competitions match log for `section`.

    The link is looked up by the section name in its path, so the result does not depend
    on the order in which the links appear on the team page.
    Raises LookupError if the page has no such link.
    """
    marker = f'all_comps/{section}/'
    for href in hrefs:
        if href and marker in href:
            return f'{BASE_URL}{href}'
    raise LookupError(f'No {section} match-log link found on the team page')


# Re-running this cell must not append every season a second time.
all_matches.clear()
# Walk back through the seasons with a local variable so `standings_url` keeps pointing
# at the starting page.
season_url = standings_url

for year in years:
    standings_soup = BeautifulSoup(_fetch_html(season_url), "html.parser")
    standings_table = standings_soup.select('table.stats_table')[0]

    # Only the links to the team (squad) pages are needed; anchors without href are skipped.
    table_hrefs = [anchor.get("href") for anchor in standings_table.find_all('a')]
    team_urls = [f'{BASE_URL}{href}' for href in table_hrefs if href and '/squads/' in href]

    # Remember where the previous season lives before moving on to the team pages.
    previous_season = standings_soup.select("a.prev")[0].get("href")
    season_url = f"{BASE_URL}{previous_season}"

    for team_url in team_urls:
        # Team name taken from the last URL segment, e.g. "Manchester-City-Stats" -> "Manchester City".
        team_name = team_url.split("/")[-1].replace("-Stats", "").replace("-", " ")

        team_html = _fetch_html(team_url)
        matches = pd.read_html(StringIO(team_html), match="Scores & Fixtures")[0]

        team_soup = BeautifulSoup(team_html, "html.parser")
        page_hrefs = [anchor.get("href") for anchor in team_soup.find_all('a')]

        shooting = _read_stat_table(_fetch_html(_matchlog_url(page_hrefs, 'shooting')), 'Shooting')
        passing = _read_stat_table(_fetch_html(_matchlog_url(page_hrefs, 'passing')), 'Passing')
        possession = _read_stat_table(_fetch_html(_matchlog_url(page_hrefs, 'possession')), 'Possession')

        try:
            # 'Poss' exists in both `matches` and `possession`, so the merged frame
            # carries it as 'Poss_x' / 'Poss_y'.
            team_data = (
                matches
                .merge(shooting[['Date', 'Sh']], on="Date")
                .merge(passing[['Date', 'Ast']], on="Date")
                .merge(possession[['Date', 'Poss']], on="Date")
            )
        except ValueError:
            # Skip this team when the tables cannot be merged (the original intent:
            # avoid an error in case one of the tables is empty).
            continue

        # Premier League matches only, tagged with season and team for later analysis.
        # .assign() builds a new frame, so no column is set on a filtered slice.
        team_data = (
            team_data[team_data["Comp"] == "Premier League"]
            .assign(Season=year, Team=team_name)
        )
        all_matches.append(team_data)

In [42]:
# Stack the per-team, per-season frames into one DataFrame. Each frame keeps its own
# row labels here; the index is reset in a later cell.
match_df = pd.concat(all_matches)

In [43]:
match_df

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,...,Captain,Formation,Referee,Match Report,Notes,Sh,Ast,Poss_y,Season,Team
1,2023-08-12,12:30,Premier League,Matchweek 1,Sat,Home,W,2,1,Nott'ham Forest,...,Martin Ødegaard,4-3-3,Michael Oliver,Match Report,,15.0,2.0,78.0,2024,Arsenal
2,2023-08-21,20:00,Premier League,Matchweek 2,Mon,Away,W,1,0,Crystal Palace,...,Martin Ødegaard,4-3-3,David Coote,Match Report,,13.0,0.0,53.0,2024,Arsenal
3,2023-08-26,15:00,Premier League,Matchweek 3,Sat,Home,D,2,2,Fulham,...,Martin Ødegaard,4-3-3,Paul Tierney,Match Report,,18.0,1.0,71.0,2024,Arsenal
4,2023-09-03,16:30,Premier League,Matchweek 4,Sun,Home,W,3,1,Manchester Utd,...,Martin Ødegaard,4-3-3,Anthony Taylor,Match Report,,17.0,3.0,55.0,2024,Arsenal
5,2023-09-17,16:30,Premier League,Matchweek 5,Sun,Away,W,1,0,Everton,...,Martin Ødegaard,4-3-3,Simon Hooper,Match Report,,13.0,1.0,74.0,2024,Arsenal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38,2022-04-30,15:00,Premier League,Matchweek 35,Sat,Away,L,0,2,Aston Villa,...,Grant Hanley,4-2-3-1,John Brooks,Match Report,,9.0,0.0,55.0,2022,Norwich City
39,2022-05-08,14:00,Premier League,Matchweek 36,Sun,Home,L,0,4,West Ham,...,Grant Hanley,4-2-3-1,Robert Jones,Match Report,,8.0,0.0,37.0,2022,Norwich City
40,2022-05-11,19:45,Premier League,Matchweek 21,Wed,Away,L,0,3,Leicester City,...,Grant Hanley,4-1-4-1,Simon Hooper,Match Report,,9.0,0.0,35.0,2022,Norwich City
41,2022-05-15,14:00,Premier League,Matchweek 37,Sun,Away,D,1,1,Wolves,...,Grant Hanley,3-4-3,Tony Harrington,Match Report,,11.0,1.0,36.0,2022,Norwich City


In [45]:
match_df.tail(10)

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,...,Captain,Formation,Referee,Match Report,Notes,Sh,Ast,Poss_y,Season,Team
33,2022-03-13,14:00,Premier League,Matchweek 29,Sun,Away,L,1,2,Leeds United,...,Ben Gibson,4-3-3,Stuart Attwell,Match Report,,12.0,1.0,48.0,2022,Norwich City
34,2022-04-02,15:00,Premier League,Matchweek 31,Sat,Away,D,0,0,Brighton,...,Grant Hanley,4-1-2-1-2◆,Simon Hooper,Match Report,,6.0,0.0,37.0,2022,Norwich City
35,2022-04-10,14:00,Premier League,Matchweek 32,Sun,Home,W,2,0,Burnley,...,Grant Hanley,4-2-3-1,Michael Oliver,Match Report,,17.0,1.0,45.0,2022,Norwich City
36,2022-04-16,15:00,Premier League,Matchweek 33,Sat,Away,L,2,3,Manchester Utd,...,Grant Hanley,4-2-3-1,Andy Madley,Match Report,,15.0,2.0,39.0,2022,Norwich City
37,2022-04-23,15:00,Premier League,Matchweek 34,Sat,Home,L,0,3,Newcastle Utd,...,Grant Hanley,4-2-3-1,Chris Kavanagh,Match Report,,5.0,0.0,46.0,2022,Norwich City
38,2022-04-30,15:00,Premier League,Matchweek 35,Sat,Away,L,0,2,Aston Villa,...,Grant Hanley,4-2-3-1,John Brooks,Match Report,,9.0,0.0,55.0,2022,Norwich City
39,2022-05-08,14:00,Premier League,Matchweek 36,Sun,Home,L,0,4,West Ham,...,Grant Hanley,4-2-3-1,Robert Jones,Match Report,,8.0,0.0,37.0,2022,Norwich City
40,2022-05-11,19:45,Premier League,Matchweek 21,Wed,Away,L,0,3,Leicester City,...,Grant Hanley,4-1-4-1,Simon Hooper,Match Report,,9.0,0.0,35.0,2022,Norwich City
41,2022-05-15,14:00,Premier League,Matchweek 37,Sun,Away,D,1,1,Wolves,...,Grant Hanley,3-4-3,Tony Harrington,Match Report,,11.0,1.0,36.0,2022,Norwich City
42,2022-05-22,16:00,Premier League,Matchweek 38,Sun,Home,L,0,5,Tottenham,...,Grant Hanley,5-4-1,Chris Kavanagh,Match Report,,9.0,0.0,41.0,2022,Norwich City


In [46]:
# Replace the repeated per-team row labels with a fresh 0..n-1 index (the old labels are discarded).
match_df = match_df.reset_index(drop=True)

In [47]:
# Write the combined data to matches.csv in the working directory. With to_csv's default
# index=True, the row index is written as an unnamed first column.
match_df.to_csv("matches.csv")

In [48]:
# Hand the CSV to the browser for download; `files` comes from google.colab, so this
# line only works when the notebook runs in Colab.
files.download('matches.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>