In [1]:
import pandas as pd
import requests as rq
from bs4 import BeautifulSoup as bs
import lxml
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By

In [2]:
html_text = rq.get('https://www.premierleague.com/tables').text
soup = bs(html_text, 'lxml')

In [3]:
thead = soup.find('thead').text

In [4]:
columns = thead.split()

In [5]:
to_be_removed = ['More', 'Pos', 'Pl', 'W', 'D', 'L', 'Pts', 'Form', 'Next']

for item in to_be_removed:
    columns.remove(item)

In [6]:
columns

['Position',
 'Club',
 'Played',
 'Won',
 'Drawn',
 'Lost',
 'GF',
 'GA',
 'GD',
 'Points']

In [7]:
tbody = soup.find('tbody')

In [8]:
expandable = tbody.find_all(class_='expandable')
links = []
for el in expandable:
    a_tag = el.find('a', class_='expandableTeam', href=True)
    links.append(a_tag['href'])
    el.decompose()

links = [f"https://www.premierleague.com{link.replace('overview', 'stats?se=489')}" for link in links]

In [9]:
tr = tbody.find_all('tr')

In [10]:
links

['https://www.premierleague.com/clubs/1/Arsenal/stats?se=489',
 'https://www.premierleague.com/clubs/11/Manchester-City/stats?se=489',
 'https://www.premierleague.com/clubs/23/Newcastle-United/stats?se=489',
 'https://www.premierleague.com/clubs/12/Manchester-United/stats?se=489',
 'https://www.premierleague.com/clubs/21/Tottenham-Hotspur/stats?se=489',
 'https://www.premierleague.com/clubs/10/Liverpool/stats?se=489',
 'https://www.premierleague.com/clubs/34/Fulham/stats?se=489',
 'https://www.premierleague.com/clubs/131/Brighton-and-Hove-Albion/stats?se=489',
 'https://www.premierleague.com/clubs/130/Brentford/stats?se=489',
 'https://www.premierleague.com/clubs/4/Chelsea/stats?se=489',
 'https://www.premierleague.com/clubs/2/Aston-Villa/stats?se=489',
 'https://www.premierleague.com/clubs/6/Crystal-Palace/stats?se=489',
 'https://www.premierleague.com/clubs/26/Leicester-City/stats?se=489',
 'https://www.premierleague.com/clubs/9/Leeds-United/stats?se=489',
 'https://www.premierleague

In [11]:
position = []
teams = []
played = []
won = []
drawn = []
lost = []
gf = []
ga = []
gd = []
points = []

for el in tr:
    td = el.find_all('td')
    position.append(td[1].find('span').text)
    teams.append(td[2].find_all('span')[1].text)
    played.append(td[3].text)
    won.append(td[4].text)
    drawn.append(td[5].text)
    lost.append(td[6].text)
    gf.append(td[7].text)
    ga.append(td[8].text)
    gd.append(td[9].text.split()[0])
    points.append(td[10].text)
    

In [12]:
df = pd.DataFrame(columns=columns)

df[df.columns[0]] = position
df[df.columns[1]] = teams
df[df.columns[2]] = played
df[df.columns[3]] = won
df[df.columns[4]] = drawn
df[df.columns[5]] = lost
df[df.columns[6]] = gf
df[df.columns[7]] = ga
df[df.columns[8]] = gd
df[df.columns[9]] = points

basic_standings = df
basic_standings

Unnamed: 0,Position,Club,Played,Won,Drawn,Lost,GF,GA,GD,Points
0,1,Arsenal,17,14,2,1,40,14,26,44
1,2,Manchester City,17,12,3,2,45,16,29,39
2,3,Newcastle United,18,9,8,1,32,11,21,35
3,4,Manchester United,17,11,2,4,27,20,7,35
4,5,Tottenham Hotspur,18,10,3,5,37,25,12,33
5,6,Liverpool,17,8,4,5,34,22,12,28
6,7,Fulham,18,8,4,6,30,27,3,28
7,8,Brighton and Hove Albion,17,8,3,6,32,25,7,27
8,9,Brentford,18,6,8,4,30,28,2,26
9,10,Chelsea,17,7,4,6,20,19,1,25


In [22]:
goals = []
goals_per_match = []
shots = []
shots_ot = []
shooting_acc = []
pen_scored = []
bigchances = []
hit_woodwork = []

passes = []
passes_per_match = []
pass_acc = []
crosses = []
crosses_acc = []

clean_sheets = []
ga = []
ga_per_match = []
saves = []
tackles = []
tackle_success = []
blocked_shots = []
interceptions = []
clearances = []
headed_clearance = []
duels_won = []
errors_leading_to_goal = []
own_goals = []

yellow_cards = []
red_cards = []
fouls = []
offsides = []

driver = webdriver.Firefox()
driver.get(links[0])

columns = []

container = driver.find_element(by=By.XPATH, value = "//*[@id='mainContent']/div[3]/div/div/ul")
child_els = container.find_elements(by=By.XPATH, value = "//div[@class='statsListBlock']//div[@class='normalStat']")

for child_el in child_els:
    columns.append(" ".join(child_el.find_element(by=By.CLASS_NAME, value = "stat").text.split()[:-1]))

for link in links:
    driver.get(link)
    time.sleep(3)
    stats_divs = driver.find_element(by=By.XPATH, value = "//div[@class='statsListBlock']")

    attack = driver.find_element(by=By.XPATH, value = "/html/body/main/div[3]/div/div/ul/li[1]/div")
    team_play = driver.find_element(by=By.XPATH, value = "/html/body/main/div[3]/div/div/ul/li[2]/div")
    defence = driver.find_element(by=By.XPATH, value = "/html/body/main/div[3]/div/div/ul/li[3]/div")
    discipline = driver.find_element(by=By.XPATH, value = "/html/body/main/div[3]/div/div/ul/li[4]/div")
    
    
    goals.append(attack.find_element(by=By.XPATH, value = '//div[2]/span/span').text.split()[0])
    goals_per_match.append(attack.find_element(by=By.XPATH, value = '//div[3]/span/span').text.split()[0])
    shots.append(attack.find_element(by=By.XPATH, value = '//div[4]/span/span').text.split()[0])
    shooting_acc.append(attack.find_element(by=By.XPATH, value = '//div[5]/span/span').text.split()[0])
    pen_scored.append(attack.find_element(by=By.XPATH, value = '//div[6]/span/span').text.split()[0])
    bigchances.append(attack.find_element(by=By.XPATH, value = '//div[7]/span/span').text.split()[0])
    hit_woodwork.append(attack.find_element(by=By.XPATH, value = '//div[8]/span/span').text.split()[0])
    
    passes.append(team_play.find_element(by=By.XPATH, value = '//div[2]/span/span').text.split()[0])
    passes_per_match.append(team_play.find_element(by=By.XPATH, value = '//div[3]/span/span').text.split()[0])
    pass_acc.append(team_play.find_element(by=By.XPATH, value = '//div[4]/span/span').text.split()[0])
    crosses.append(team_play.find_element(by=By.XPATH, value = '//div[5]/span/span').text.split()[0])
    crosses_acc.append(team_play.find_element(by=By.XPATH, value = '//div[6]/span/span').text.split()[0])
    
    clean_sheets.append(defence.find_element(by=By.XPATH, value = '//div[2]/span/span').text.split()[0])
    ga.append(defence.find_element(by=By.XPATH, value = '//div[3]/span/span').text.split()[0])
    ga_per_match.append(defence.find_element(by=By.XPATH, value = '//div[4]/span/span').text.split()[0])
    saves.append(defence.find_element(by=By.XPATH, value = '//div[5]/span/span').text.split()[0])
    tackles.append(defence.find_element(by=By.XPATH, value = '//div[6]/span/span').text.split()[0])
    tackle_success.append(defence.find_element(by=By.XPATH, value = '//div[7]/span/span').text.split()[0])
    blocked_shots.append(defence.find_element(by=By.XPATH, value = '//div[8]/span/span').text.split()[0])
    interceptions.append(defence.find_element(by=By.XPATH, value = '//div[9]/span/span').text.split()[0])
    clearances.append(defence.find_element(by=By.XPATH, value = '//div[10]/span/span').text.split()[0])
    headed_clearance.append(defence.find_element(by=By.XPATH, value = '//div[11]/span/span').text.split()[0])
    duels_won.append(defence.find_element(by=By.XPATH, value = '//div[12]/span/span').text.split()[0])
    errors_leading_to_goal.append(defence.find_element(by=By.XPATH, value = '//div[13]/span/span').text.split()[0])
    own_goals.append(defence.find_element(by=By.XPATH, value = '//div[14]/span/span').text.split()[0])

    yellow_cards.append(discipline.find_element(by=By.XPATH, value = '//div[2]/span/span').text.split()[0])
    red_cards.append(discipline.find_element(by=By.XPATH, value = '//div[3]/span/span').text.split()[0])
    fouls.append(discipline.find_element(by=By.XPATH, value = '//div[4]/span/span').text.split()[0])
    offsides.append(discipline.find_element(by=By.XPATH, value = '//div[5]/span/span').text.split()[0])
    
driver.quit()

extended_stats = pd.DataFrame(columns = ["Team"] + columns)
extended_stats[extended_stats.columns[0]] = teams
extended_stats[extended_stats.columns[1]] = goals
extended_stats[extended_stats.columns[2]] = goals_per_match
extended_stats[extended_stats.columns[3]] = shots
extended_stats[extended_stats.columns[4]] = shots_ot
extended_stats[extended_stats.columns[5]] = shooting_acc
extended_stats[extended_stats.columns[6]] = pen_scored
extended_stats[extended_stats.columns[7]] = bigchances
extended_stats[extended_stats.columns[8]] = hit_woodwork

extended_stats[extended_stats.columns[9]] = passes
extended_stats[extended_stats.columns[10]] = passes_per_match
extended_stats[extended_stats.columns[11]] = pass_acc
extended_stats[extended_stats.columns[12]] = crosses
extended_stats[extended_stats.columns[13]] = crosses_acc

extended_stats[extended_stats.columns[14]] = clean_sheets
extended_stats[extended_stats.columns[15]] = ga
extended_stats[extended_stats.columns[16]] = ga_per_match
extended_stats[extended_stats.columns[17]] = saves
extended_stats[extended_stats.columns[18]] = tackles
extended_stats[extended_stats.columns[19]] = tackle_success
extended_stats[extended_stats.columns[20]] = blocked_shots
extended_stats[extended_stats.columns[21]] = interceptions
extended_stats[extended_stats.columns[22]] = clearances
extended_stats[extended_stats.columns[23]] = headed_clearance
extended_stats[extended_stats.columns[24]] = duels_won
extended_stats[extended_stats.columns[25]] = errors_leading_to_goal
extended_stats[extended_stats.columns[26]] = own_goals

extended_stats[extended_stats.columns[27]] = yellow_cards
extended_stats[extended_stats.columns[28]] = red_cards
extended_stats[extended_stats.columns[29]] = fouls
extended_stats[extended_stats.columns[30]] = offsides

IndexError: list index out of range

In [None]:
print(extended_stats.columns)
extended_stats

In [None]:
extended_stats.to_csv('extended_stats.csv', index=False)