### Getting Premier League match results data (2018-19 season)

In [1]:
# libraries
import pandas as pd
import numpy as np
import time
import re

In [2]:
# Load the fbref "Scores and Fixtures" URLs, one per line, for each
# Premier League club in the 2018-19 season.
# The context manager guarantees the file handle is closed after reading.
with open('urls/pl_club_results.txt', encoding='utf-8') as url_file:
    # Drop blank lines (e.g. a trailing newline or an accidental gap) so they
    # never turn into a request for an empty URL further down.
    url_list = [line.strip() for line in url_file if line.strip()]

# Show the first URL as a quick check that the file was parsed as expected.
print(url_list[0])

https://fbref.com/en/squads/b8fd03ef/2018-2019/matchlogs/c9/schedule/Manchester-City-Scores-and-Fixtures-Premier-League


In [3]:
# NOTE(review): urlparse is not used anywhere in this cell; the import is kept
# only in case another cell relies on it.
from urllib.parse import urlparse

# Set the delay between requests (in seconds)
delay_between_requests = 5  # Adjust this value as needed

# Points awarded for each result letter. Built once: it does not change
# between clubs.
result_mapping = {"W": 3, "D": 1, "L": 0}

# The club name is the URL segment between "schedule/" and the fixed suffix.
club_name_pattern = re.compile(r"schedule/(.*?)-Scores-and-Fixtures-Premier-League")

# One dataframe per successfully downloaded club page
dfs = []

for request_number, url in enumerate(url_list):
    # Pause before every request except the first. Throttling here, instead of
    # at the end of the success path, means a failed request is also followed
    # by a pause, and no time is wasted sleeping after the final URL.
    if request_number > 0:
        time.sleep(delay_between_requests)

    try:
        # Validate the URL before downloading so a malformed entry does not
        # cost a network round trip.
        club_name_match = club_name_pattern.search(url)
        if club_name_match is None:
            raise ValueError("Club name not found in URL pattern.")
        club_name = club_name_match.group(1).replace('-', ' ')

        # Read HTML tables from the URL; the first table on the page is used.
        tables = pd.read_html(url)
        df = tables[0]

        # Tag every row with the club this page belongs to
        df['Club'] = club_name

        # Map result letters to points (letters missing from the mapping
        # become NaN)
        df['Points'] = df['Result'].map(result_mapping)

        dfs.append(df)
    except Exception as e:
        # Best effort: report the failing URL and carry on with the rest.
        print(f"Error reading data from {url}: {str(e)}")

# pd.concat raises an unhelpful "No objects to concatenate" on an empty list,
# so fail with an actionable message of the same exception type instead.
if not dfs:
    raise ValueError(
        "No match tables were downloaded; check the URL list and the errors printed above."
    )

# Concatenate all dataframes into one
total_df = pd.concat(dfs, ignore_index=True)

In [4]:
# Preview the first five rows of the combined frame as a sanity check.
total_df.head(n=5)

Unnamed: 0,Date,Time,Round,Day,Venue,Result,GF,GA,Opponent,xG,xGA,Poss,Attendance,Captain,Formation,Referee,Match Report,Notes,Club,Points
0,2018-08-12,16:00,Matchweek 1,Sun,Away,W,2,0,Arsenal,1.7,0.5,58,59934,Fernandinho,4-2-3-1,Michael Oliver,Match Report,,Manchester City,3
1,2018-08-19,13:30,Matchweek 2,Sun,Home,W,6,1,Huddersfield,4.2,0.7,76,54021,Vincent Kompany,3-1-4-2,Andre Marriner,Match Report,,Manchester City,3
2,2018-08-25,12:30,Matchweek 3,Sat,Away,D,1,1,Wolves,1.6,1.0,71,31322,Vincent Kompany,4-3-3,Martin Atkinson,Match Report,,Manchester City,1
3,2018-09-01,17:30,Matchweek 4,Sat,Home,W,2,1,Newcastle Utd,2.0,0.5,78,53946,David Silva,4-1-3-2,Kevin Friend,Match Report,,Manchester City,3
4,2018-09-15,15:00,Matchweek 5,Sat,Home,W,3,0,Fulham,4.8,0.4,64,53307,David Silva,4-3-3,Stuart Attwell,Match Report,,Manchester City,3


In [5]:
# Work on an independent deep copy so the scraped frame stays untouched.
df_clean_total = total_df.copy(deep=True)

In [6]:
# Move "Club" to the front so every row leads with the team it belongs to.
df_clean_total.insert(0, "Club", df_clean_total.pop("Club"))

# Put "Points" immediately after "GA". The position is derived from the "GA"
# column instead of a hard-coded index, so the placement stays correct if the
# scraped table gains or loses a column. With the current columns this is
# position 9, the same as before.
points_column = df_clean_total.pop("Points")
df_clean_total.insert(df_clean_total.columns.get_loc("GA") + 1, "Points", points_column)

df_clean_total.head()

Unnamed: 0,Club,Date,Time,Round,Day,Venue,Result,GF,GA,Points,Opponent,xG,xGA,Poss,Attendance,Captain,Formation,Referee,Match Report,Notes
0,Manchester City,2018-08-12,16:00,Matchweek 1,Sun,Away,W,2,0,3,Arsenal,1.7,0.5,58,59934,Fernandinho,4-2-3-1,Michael Oliver,Match Report,
1,Manchester City,2018-08-19,13:30,Matchweek 2,Sun,Home,W,6,1,3,Huddersfield,4.2,0.7,76,54021,Vincent Kompany,3-1-4-2,Andre Marriner,Match Report,
2,Manchester City,2018-08-25,12:30,Matchweek 3,Sat,Away,D,1,1,1,Wolves,1.6,1.0,71,31322,Vincent Kompany,4-3-3,Martin Atkinson,Match Report,
3,Manchester City,2018-09-01,17:30,Matchweek 4,Sat,Home,W,2,1,3,Newcastle Utd,2.0,0.5,78,53946,David Silva,4-1-3-2,Kevin Friend,Match Report,
4,Manchester City,2018-09-15,15:00,Matchweek 5,Sat,Home,W,3,0,3,Fulham,4.8,0.4,64,53307,David Silva,4-3-3,Stuart Attwell,Match Report,


In [7]:
# Drop the two trailing columns ("Match Report" and "Notes") by name.
# The previous positional slice (`iloc[:, :-2]`) removed two MORE columns every
# time the cell was re-run; dropping by name with errors="ignore" makes the
# cell idempotent while giving the same result on the first run.
df_clean_total = df_clean_total.drop(columns=["Match Report", "Notes"], errors="ignore")

In [8]:
# Preview the first five rows to check the remaining columns.
df_clean_total.head(n=5)

Unnamed: 0,Club,Date,Time,Round,Day,Venue,Result,GF,GA,Points,Opponent,xG,xGA,Poss,Attendance,Captain,Formation,Referee
0,Manchester City,2018-08-12,16:00,Matchweek 1,Sun,Away,W,2,0,3,Arsenal,1.7,0.5,58,59934,Fernandinho,4-2-3-1,Michael Oliver
1,Manchester City,2018-08-19,13:30,Matchweek 2,Sun,Home,W,6,1,3,Huddersfield,4.2,0.7,76,54021,Vincent Kompany,3-1-4-2,Andre Marriner
2,Manchester City,2018-08-25,12:30,Matchweek 3,Sat,Away,D,1,1,1,Wolves,1.6,1.0,71,31322,Vincent Kompany,4-3-3,Martin Atkinson
3,Manchester City,2018-09-01,17:30,Matchweek 4,Sat,Home,W,2,1,3,Newcastle Utd,2.0,0.5,78,53946,David Silva,4-1-3-2,Kevin Friend
4,Manchester City,2018-09-15,15:00,Matchweek 5,Sat,Home,W,3,0,3,Fulham,4.8,0.4,64,53307,David Silva,4-3-3,Stuart Attwell


In [10]:
# Count how many rows are held for each club.
club_column = df_clean_total["Club"]
club_column.value_counts()

Club
Manchester City             38
Liverpool                   38
Fulham                      38
Cardiff City                38
Brighton and Hove Albion    38
Southampton                 38
Burnley                     38
Bournemouth                 38
Newcastle United            38
Crystal Palace              38
Watford                     38
West Ham United             38
Leicester City              38
Everton                     38
Wolverhampton Wanderers     38
Manchester United           38
Arsenal                     38
Tottenham Hotspur           38
Chelsea                     38
Huddersfield Town           38
Name: count, dtype: int64

In [11]:
# Write the cleaned results to a CSV in the working directory, leaving out
# the row index.
df_clean_total.to_csv(path_or_buf="pl_club_results.csv", index=False)