# Scraping Data for **English Premier League 2023–24**
# In this project I scrape data from [Link](https://en.wikipedia.org/wiki/2023%E2%80%9324_Premier_League) from wikipedia.ue)

## Importing Main Libraries 

In [1]:
import requests
from bs4 import BeautifulSoup
import csv
import pandas as pd

In [2]:
## specific url
url = 'https://en.wikipedia.org/wiki/2023%E2%80%9324_Premier_League'

In [3]:
## get response
response = requests.get(url)

In [4]:
# response content
# response.text

In [5]:
soup = BeautifulSoup(response.text,'html.parser')

In [6]:
## check html page
# soup

In [7]:
## Title
title = soup.find('head').find('title').get_text().split(' - ')[0]
title

'2023–24 Premier League'

# Teams

## Table 1
## Stadiums and locations

In [8]:
table_head_1 = soup.find_all('table')[1].find('tbody').find_all('th')
table_body_1 = soup.find_all('table')[1].find('tbody').find_all('tr')[1:]

In [9]:
## table row
# for row in table_body_1:
#     for r in row:
#         print(r.get_text())
#     print("-"*100)

In [10]:
field_names1 = []
for head in table_head_1:
    field_names1.append(head.getText())
i = 0
for name in field_names1:
    field_names1[i] = name[:len(name)-1]
    i+=1
field_names1
with open('Stadiums_and_locations.csv','w',newline='',encoding='utf-8') as f:
    writer = csv.DictWriter(f,fieldnames=field_names1)
    writer.writeheader()
    for tr in table_body_1:
        row = []
        for td in tr:
            row.append(td.get_text())
        writer.writerow({'Team':row[1].replace('\n',''),
                         'Location':row[3].replace('\n',''),
                         'Stadium':row[5].replace('\n',''),
                         'Capacity[47]':row[7].replace('\n','')})

In [11]:
Stadiums_and_locations = pd.read_csv('Stadiums_and_locations.csv')
Stadiums_and_locations

Unnamed: 0,Team,Location,Stadium,Capacity[47]
0,Arsenal,London (Holloway),Emirates Stadium,"60,704[48]"
1,Aston Villa,Birmingham,Villa Park,42657
2,Bournemouth,Bournemouth,Dean Court,11307
3,Brentford,London (Brentford),Brentford Community Stadium,17250
4,Brighton & Hove Albion,Falmer,Falmer Stadium,31876
5,Burnley,Burnley,Turf Moor,"21,944[49]"
6,Chelsea,London (Fulham),Stamford Bridge,"40,173[50]"
7,Crystal Palace,London (Selhurst),Selhurst Park,25486
8,Everton,Liverpool (Walton),Goodison Park,39414
9,Fulham,London (Fulham),Craven Cottage,24500


## Table 2
## Personnel and kits


In [12]:
table_head_2 = soup.find_all('table')[2].find('tr').find_all('th')
# table_head_2
table_body_2 = soup.find_all('table')[2].find('tbody').find_all('tr')[1:]
# table_body_2

In [13]:
# for tr in table_body_2:
#     for td in tr:
#         print(td)
#     print('-'*150)

In [14]:
field_names2 = []
for head in table_head_2:
    field_names2.append(head.get_text())
i = 0
for name in field_names2:
    field_names2[i] = name[:len(name)-1]
    i+=1
field_names2
with open('Personnel_and_kits.csv','w',newline='',encoding='utf-8') as f:
    writer = csv.DictWriter(f,fieldnames=field_names2)
    writer.writeheader()
    for tr in table_body_2:
        row = []
        for td in tr:
            row.append(td.get_text())
        # print(row)
        writer.writerow({'Team':row[1].replace('\n',''),
                         'Manager':row[3].replace('\n',''),
                         'Captain':row[5].replace('\n',''),
                         'Kit manufacturer':row[7].replace('\n',''),
                         'Shirt sponsor (chest)':row[9].replace('\n',''),
                         'Shirt sponsor (sleeve)':row[11].replace('\n','')})

In [15]:
Personnel_and_kits = pd.read_csv('Personnel_and_kits.csv')
Personnel_and_kits

Unnamed: 0,Team,Manager,Captain,Kit manufacturer,Shirt sponsor (chest),Shirt sponsor (sleeve)
0,Arsenal,Mikel Arteta,Martin Ødegaard[52],Adidas[53],Emirates[54],Visit Rwanda[55]
1,Aston Villa,Unai Emery,John McGinn[56],Castore[57],BK8[58],Trade Nation[59]
2,Bournemouth,Andoni Iraola,Neto[60],Umbro[61],Dafabet[62],DeWalt[63]
3,Brentford,Thomas Frank,Christian Nørgaard[64],Umbro[65],Hollywoodbets[66],PensionBee[67]
4,Brighton & Hove Albion,Roberto De Zerbi,Lewis Dunk[68],Nike[69],American Express[69],Snickers UK[70]
5,Burnley,Vincent Kompany,Jack Cork[71],Umbro[72],W88[73],Uphold[74]
6,Chelsea,Mauricio Pochettino,Reece James[75],Nike[76],Infinite Athlete[77],BingX[78]
7,Crystal Palace,Oliver Glasner,Joel Ward[79],Macron[80],Cinch[81],Kaiyun Sports[82]
8,Everton,Sean Dyche,Séamus Coleman[83],Hummel[84],Stake.com[85],KICK[86]
9,Fulham,Marco Silva,Tom Cairney[87],Adidas[88],SBOTOP[89],WebBeds[90]


## Table 3
## Managerial changes


In [16]:
table_head_3 = soup.find_all('table')[3].find('tr').find_all('th')
# table_head_3
table_body_3 = soup.find_all('table')[3].find('tbody').find_all('tr')[1:]
# table_body_3


In [17]:
# for tr in table_body_3:
#     for td in tr:
#         print(td.get_text())
#     print('-'*150)

In [18]:
field_names3 = []
for head in table_head_3:
    field_names3.append(head.get_text())
i = 0
for name in field_names3:
    field_names3[i] = name[:len(name)-1]
    i+=1
field_names3
with open('Managerial_changes.csv','w',newline='',encoding='utf-8') as f:
    writer = csv.DictWriter(f,fieldnames=field_names3)
    writer.writeheader()
    for tr in table_body_3:
        row = []
        for td in tr:
            row.append(td.get_text())
        # print(row)
        if(len(row)==14):
            writer.writerow({'Team':row[1].replace('\n',''),
                         'Outgoing manager':row[3].replace('\n',''),
                         'Manner of departure':row[5].replace('\n',''),
                         'Date of vacancy':row[7].replace('\n',''),
                         'Position in the table':row[9].replace('\n',''),
                         'Incoming manager':row[11].replace('\n',''),
                         'Date of appointment':row[13].replace('\n','')})
        

In [19]:
Managerial_changes = pd.read_csv('Managerial_changes.csv')
# Managerial_changes

## Table 4
## League table

In [20]:
table_head_4 = soup.find_all('table')[4].find('tr').find_all('th')
# table_head_4
table_body_4 = soup.find_all('table')[4].find('tbody').find_all('tr')[1:]
# table_body_4

In [21]:
# for tr in table_body_4:
#     # for td in tr:
#         # print(td.get_text())
#     print(tr)
#     print('-'*150)

In [22]:
field_names4 = []
for head in table_head_4:
    field_names4.append(head.get_text())
i = 0
for name in field_names4:
    field_names4[i] = name[:len(name)-1]
    i+=1
field_names4.remove('Qualification or relegation')
with open('League_table.csv','w',newline='',encoding='utf-8') as f:
    writer = csv.DictWriter(f,fieldnames=field_names4)
    writer.writeheader()
    for tr in table_body_4:
        row = []
        for td in tr:
            row.append(td.get_text())
        row=row[:20]
        # print(len(row))
        writer.writerow({'Pos':row[1].replace('\n',''),
                         'Team':row[3].replace('\n',''),
                         'Pld':row[5].replace('\n',''),
                         'W':row[7].replace('\n',''),
                         'D':row[9].replace('\n',''),
                         'L':row[11].replace('\n',''),
                         'GF':row[13].replace('\n',''),
                         'GA':row[15].replace('\n',''),
                         'GD':row[17].replace('\n',''),
                         'Pts':row[19].replace('\n','')})
        
        

In [23]:
League_table = pd.read_csv('League_table.csv').set_index('Pos')
League_table

Unnamed: 0_level_0,Team,Pld,W,D,L,GF,GA,GD,Pts
Pos,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,Manchester City (C),38,28,7,3,96,34,+62,91
2,Arsenal,38,28,5,5,91,29,+62,89
3,Liverpool,38,24,10,4,86,41,+45,82
4,Aston Villa,38,20,8,10,76,61,+15,68
5,Tottenham Hotspur,38,20,6,12,74,61,+13,66
6,Chelsea,38,18,9,11,77,63,+14,63
7,Newcastle United,38,18,6,14,85,62,+23,60
8,Manchester United,38,18,6,14,57,58,−1,60
9,West Ham United,38,14,10,14,60,74,−14,52
10,Crystal Palace,38,13,10,15,57,58,−1,49


## Table 5
## Results

In [24]:
table_head_5 = soup.find_all('table')[5].find('tr').find_all('th')
# table_head_5
table_body_5 = soup.find_all('table')[5].find('tbody').find_all('tr')[1:]
# table_body_5

In [25]:
# for tr in table_body_5:
#     for td in tr:
#         print(td.get_text())
#     # print(tr)
#     print('-'*150)

In [26]:
Teams_name = []
field_names5 = []
for head in table_head_5:
    field_names5.append(head.get_text())
i = 0
for name in field_names5:
    field_names5[i] = name[:len(name)-1]
    i+=1
with open('Results.csv','w',newline='',encoding='utf-8') as f:
    writer = csv.DictWriter(f,fieldnames=field_names5)
    writer.writeheader()
    for tr in table_body_5:
        row = []
        for td in tr:
            row.append(td.get_text())
        Teams_name.append(row[1])
        writer.writerow({'Home \\ Away':row[1].replace('\n',''),
                         'ARS':row[3].replace('\n',''),
                         'AVL':row[5].replace('\n',''),
                         'BOU':row[7].replace('\n',''),
                         'BRE':row[9].replace('\n',''),
                         'BHA':row[11].replace('\n',''),
                         'BUR':row[13].replace('\n',''),
                         'CHE':row[15].replace('\n',''),
                         'CRY':row[17].replace('\n',''),
                         'EVE':row[19].replace('\n',''),
                         'FUL':row[5].replace('\n',''),
                         'LIV':row[7].replace('\n',''),
                         'LUT':row[9].replace('\n',''),
                         'MCI':row[11].replace('\n',''),
                         'MUN':row[13].replace('\n',''),
                         'NEW':row[15].replace('\n',''),
                         'NFO':row[17].replace('\n',''),
                         'SHU':row[19].replace('\n',''),
                        'TOT':row[5].replace('\n',''),
                         'WHU':row[7].replace('\n',''),
                         'WOL':row[9].replace('\n','')})

In [27]:
field_names5

['Home \\ Away',
 'ARS',
 'AVL',
 'BOU',
 'BRE',
 'BHA',
 'BUR',
 'CHE',
 'CRY',
 'EVE',
 'FUL',
 'LIV',
 'LUT',
 'MCI',
 'MUN',
 'NEW',
 'NFO',
 'SHU',
 'TOT',
 'WHU',
 'WOL']

In [28]:
Results = pd.read_csv('Results.csv')
Results

Unnamed: 0,Home \ Away,ARS,AVL,BOU,BRE,BHA,BUR,CHE,CRY,EVE,...,LIV,LUT,MCI,MUN,NEW,NFO,SHU,TOT,WHU,WOL
0,Arsenal,—,0–2,3–0,2–1,2–0,3–1,5–0,5–0,2–1,...,3–0,2–1,2–0,3–1,5–0,5–0,2–1,0–2,3–0,2–1
1,Aston Villa,1–0,—,3–1,3–3,6–1,3–2,2–2,3–1,4–0,...,3–1,3–3,6–1,3–2,2–2,3–1,4–0,—,3–1,3–3
2,Bournemouth,0–4,2–2,—,1–2,3–0,2–1,0–0,1–0,2–1,...,—,1–2,3–0,2–1,0–0,1–0,2–1,2–2,—,1–2
3,Brentford,0–1,1–2,2–2,—,0–0,3–0,2–2,1–1,1–3,...,2–2,—,0–0,3–0,2–2,1–1,1–3,1–2,2–2,—
4,Brighton & Hove Albion,0–3,1–0,3–1,2–1,—,1–1,1–2,4–1,1–1,...,3–1,2–1,—,1–1,1–2,4–1,1–1,1–0,3–1,2–1
5,Burnley,0–5,1–3,0–2,2–1,1–1,—,1–4,0–2,0–2,...,0–2,2–1,1–1,—,1–4,0–2,0–2,1–3,0–2,2–1
6,Chelsea,2–2,0–1,2–1,0–2,3–2,2–2,—,2–1,6–0,...,2–1,0–2,3–2,2–2,—,2–1,6–0,0–1,2–1,0–2
7,Crystal Palace,0–1,5–0,0–2,3–1,1–1,3–0,1–3,—,2–3,...,0–2,3–1,1–1,3–0,1–3,—,2–3,5–0,0–2,3–1
8,Everton,0–1,0–0,3–0,1–0,1–1,1–0,2–0,1–1,—,...,3–0,1–0,1–1,1–0,2–0,1–1,—,0–0,3–0,1–0
9,Fulham,2–1,1–2,3–1,0–3,3–0,0–2,0–2,1–1,0–0,...,3–1,0–3,3–0,0–2,0–2,1–1,0–0,1–2,3–1,0–3


# Season statistics

## Table 6
## Top scorers

In [29]:
table_head_6 = soup.find_all('table')[6].find('tr').find_all('th')
# table_head_6
table_body_6 = soup.find_all('table')[6].find('tbody').find_all('tr')[1:]
# table_body_6

In [30]:
# for tr in table_body_6:
#     for td in tr:
#         print(td.get_text())
#     # print(tr)
#     print('-'*150)

In [31]:
field_names6 = []
for head in table_head_6:
    field_names6.append(head.get_text())
i = 0
for name in field_names6:
    field_names6[i] = name[:len(name)-1]
    i+=1
with open('Top_scorers.csv','w',newline='',encoding='utf-8') as f:
    writer = csv.DictWriter(f,fieldnames=field_names6)
    writer.writeheader()
    for tr in table_body_6:
        row = []
        for td in tr:
            row.append(td.get_text())
        # print(row)
        if(len(row)==8):
            rank = row[1].replace('\n','')
            goals = row[7].replace('\n','')
            writer.writerow({'Rank':row[1].replace('\n',''),
                         'Player':row[3].replace('\n',''),
                         'Club':row[5].replace('\n',''),
                         'Goals[144]':row[7].replace('\n','')})
        else :
             writer.writerow({'Rank':rank,
                         'Player':row[1].replace('\n',''),
                         'Club':row[3].replace('\n',''),
                         'Goals[144]':goals})

In [32]:
field_names6

['Rank', 'Player', 'Club', 'Goals[144]']

In [33]:
Top_scorers = pd.read_csv('Top_scorers.csv')
Top_scorers

Unnamed: 0,Rank,Player,Club,Goals[144]
0,1,Erling Haaland,Manchester City,27
1,2,Cole Palmer,Manchester CityChelsea,22
2,3,Alexander Isak,Newcastle United,21
3,4,Phil Foden,Manchester City,19
4,4,Dominic Solanke,Bournemouth,19
5,4,Ollie Watkins,Aston Villa,19
6,7,Mohamed Salah,Liverpool,18
7,8,Son Heung-min,Tottenham Hotspur,17
8,9,Jarrod Bowen,West Ham United,16
9,9,Jean-Philippe Mateta,Crystal Palace,16


## Table 7
## Hat-tricks

In [34]:
table_head_7 = soup.find_all('table')[7].find('tr').find_all('th')
# table_head_7
table_body_7 = soup.find_all('table')[7].find('tbody').find_all('tr')[1:]
# table_body_7

In [35]:
# for tr in table_body_7:
#     for td in tr:
#         print(td.get_text())
#     # print(tr)
#     print('-'*150)

In [36]:
field_names7 = []
for head in table_head_7:
    field_names7.append(head.get_text())
i = 0
for name in field_names7:
    field_names7[i] = name[:len(name)-1] if name[-1]=='\n' else name
    i+=1
with open('Hat_tricks.csv','w',newline='',encoding='utf-8') as f:
    writer = csv.DictWriter(f,fieldnames=field_names7)
    writer.writeheader()
    for tr in table_body_7:
        row = []
        for td in tr:
            row.append(td.get_text())
        print(row)
        if(len(row)==10):
            # For = row[3].replace('\n','')
            # date = row[9].replace('\n','')
            # against = row[7].replace('\n','')
            writer.writerow({'Player':row[1].replace('\n',''),
                             'For':row[3].replace('\n',''),
                             'Against':row[5].replace('\n',''),
                             'Result':row[7].replace('\n',''),
                             'Date':row[9].replace('\n','')})
        # else :
        #         writer.writerow({'Player':row[1].replace('\n',''),
        #         'For':For if row[3] not in Teams_name else row[3].replace('\n',''),
        #         'Against':against if row[5] not in Teams_name else row[5].replace('\n','') ,
        #         'Result':row[7].replace('\n',''),
        #         'Date':date})


['\n', ' Son Heung-min\n', '\n', 'Tottenham Hotspur\n', '\n', 'Burnley\n', '\n', '5–2 (A)[145]\n', '\n', '2 September 2023\n']
['\n', ' Erling Haaland\n', '\n', 'Manchester City\n', '\n', 'Fulham\n', '\n', '5–1 (H)[146]\n']
['\n', ' Evan Ferguson\n', '\n', 'Brighton & Hove Albion\n', '\n', 'Newcastle United\n', '\n', '3–1 (H)[147]\n']
['\n', ' Ollie Watkins\n', '\n', 'Aston Villa\n', '\n', 'Brighton & Hove Albion\n', '\n', '6–1 (H)[148]\n', '\n', '30 September 2023\n']
['\n', ' Eddie Nketiah\n', '\n', 'Arsenal\n', '\n', 'Sheffield United\n', '\n', '5–0 (H)[149]\n', '\n', '28 October 2023\n']
['\n', ' Nicolas Jackson\n', '\n', 'Chelsea\n', '\n', 'Tottenham Hotspur\n', '\n', '4–1 (A)[150]\n', '\n', '6 November 2023\n']
['\n', ' Dominic Solanke\n', '\n', 'Bournemouth\n', '\n', 'Nottingham Forest\n', '\n', '3–2 (A)[151]\n', '\n', '23 December 2023\n']
['\n', ' Chris Wood\n', '\n', 'Nottingham Forest\n', '\n', 'Newcastle United\n', '\n', '3–1 (A)[152]\n', '\n', '26 December 2023\n']
['\n', 

In [37]:
field_names7

['Player', 'For', 'Against', 'Result', 'Date']

In [38]:
Hat_tricks = pd.read_csv('Hat_tricks.csv')
Hat_tricks.loc[1]=['Erling Haaland','Manchester City','Fulham','5–1 (H)[146]','2 September 2023']
Hat_tricks

Unnamed: 0,Player,For,Against,Result,Date
0,Son Heung-min,Tottenham Hotspur,Burnley,5–2 (A)[145],2 September 2023
1,Erling Haaland,Manchester City,Fulham,5–1 (H)[146],2 September 2023
2,Eddie Nketiah,Arsenal,Sheffield United,5–0 (H)[149],28 October 2023
3,Nicolas Jackson,Chelsea,Tottenham Hotspur,4–1 (A)[150],6 November 2023
4,Dominic Solanke,Bournemouth,Nottingham Forest,3–2 (A)[151],23 December 2023
5,Chris Wood,Nottingham Forest,Newcastle United,3–1 (A)[152],26 December 2023
6,Elijah Adebayo,Luton Town,Brighton & Hove Albion,4–0 (H)[153],30 January 2024
7,Matheus Cunha,Wolverhampton Wanderers,Chelsea,4–2 (A)[154],4 February 2024
8,Phil Foden,Manchester City,Brentford,3–1 (A)[155],5 February 2024
9,Phil Foden,Manchester City,Aston Villa,4–1 (H)[157],3 April 2024


## Table 8
## Clean sheets

In [39]:
table_head_8 = soup.find_all('table')[8].find('tr').find_all('th')
# table_head_8
table_body_8 = soup.find_all('table')[8].find('tbody').find_all('tr')[1:]
# table_body_8

In [40]:
# for tr in table_body_8:
#     for td in tr:
#         print(td.get_text('rowspan'))
#     # print(tr)
#     print('-'*150)

In [41]:
rank = 0
number_of_clean_sheet = 0
field_names8 = []
for head in table_head_8:
    field_names8.append(head.get_text())
i = 0
for name in field_names8:
    field_names8[i] = name[:len(name)-1] if name[-1]=='\n' else name
    i+=1
with open('Clean_sheets.csv','w',newline='',encoding='utf-8') as f:
    writer = csv.DictWriter(f,fieldnames=field_names8)
    writer.writeheader()
    for tr in table_body_8:
        row = []
        for td in tr:
            row.append(td.get_text())
            # print(td.get_text())
        # print(row)
        # # print(len(row))
        if(len(row)==8):
            rank = row[1].replace('\n','')
            number_of_clean_sheet = row[7].replace('\n','')
            writer.writerow({'Rank':row[1].replace('\n',''),
                             'Player':row[3].replace('\n',''),
                             'Club':row[5].replace('\n',''),
                             'Cleansheets[162]':row[7].replace('\n','')})

        else:
            writer.writerow({'Rank':rank,
                             'Player':row[1].replace('\n',''),
                             'Club':row[3].replace('\n',''),
                             'Cleansheets[162]':number_of_clean_sheet})
            


In [42]:
field_names8

['Rank', 'Player', 'Club', 'Cleansheets[162]']

In [43]:
Clean_sheets = pd.read_csv('Clean_sheets.csv')
Clean_sheets

Unnamed: 0,Rank,Player,Club,Cleansheets[162]
0,1,David Raya,Arsenal,16
1,2,Jordan Pickford,Everton,13
2,3,Bernd Leno,Fulham,10
3,3,Ederson,Manchester City,10
4,5,André Onana,Manchester United,9
5,6,Alisson,Liverpool,8
6,6,Emiliano Martínez,Aston Villa,8
7,8,Mark Flekken,Brentford,7
8,8,Neto,Bournemouth,7
9,8,Guglielmo Vicario,Tottenham Hotspur,7


### Thank you for your interest and reaching this point