In [1]:
import pandas as pd
import requests as rq
from bs4 import BeautifulSoup as bs
import lxml
import time

In [2]:
html_text = rq.get('https://www.premierleague.com/tables').text
soup = bs(html_text, 'lxml')

In [3]:
thead = soup.find('thead').text

In [4]:
columns = thead.split()

In [5]:
to_be_removed = ['More', 'Pos', 'Pl', 'W', 'D', 'L', 'Pts', 'Form', 'Next']

for item in to_be_removed:
    columns.remove(item)

In [6]:
columns

['Position',
 'Club',
 'Played',
 'Won',
 'Drawn',
 'Lost',
 'GF',
 'GA',
 'GD',
 'Points']

In [7]:
tbody = soup.find('tbody')

In [8]:
expandable = tbody.find_all(class_='expandable')
links = []
for el in expandable:
    a_tag = el.find('a', class_='expandableTeam', href=True)
    links.append(a_tag['href'])
    el.decompose()

In [9]:
tr = tbody.find_all('tr')

In [10]:
links

['/clubs/1/Arsenal/overview',
 '/clubs/11/Manchester-City/overview',
 '/clubs/23/Newcastle-United/overview',
 '/clubs/12/Manchester-United/overview',
 '/clubs/21/Tottenham-Hotspur/overview',
 '/clubs/10/Liverpool/overview',
 '/clubs/34/Fulham/overview',
 '/clubs/131/Brighton-and-Hove-Albion/overview',
 '/clubs/130/Brentford/overview',
 '/clubs/4/Chelsea/overview',
 '/clubs/6/Crystal-Palace/overview',
 '/clubs/2/Aston-Villa/overview',
 '/clubs/26/Leicester-City/overview',
 '/clubs/9/Leeds-United/overview',
 '/clubs/127/Bournemouth/overview',
 '/clubs/7/Everton/overview',
 '/clubs/25/West-Ham-United/overview',
 '/clubs/15/Nottingham-Forest/overview',
 '/clubs/38/Wolverhampton-Wanderers/overview',
 '/clubs/20/Southampton/overview']

In [11]:
position = []
teams = []
played = []
won = []
drawn = []
lost = []
gf = []
ga = []
gd = []
points = []

for el in tr:
    td = el.find_all('td')
    position.append(td[1].find('span').text)
    teams.append(td[2].find_all('span')[1].text)
    played.append(td[3].text)
    won.append(td[4].text)
    drawn.append(td[5].text)
    lost.append(td[6].text)
    gf.append(td[7].text)
    ga.append(td[8].text)
    gd.append(td[9].text.split()[0])
    points.append(td[10].text)
    

In [12]:
df = pd.DataFrame(columns=columns)

df[df.columns[0]] = position
df[df.columns[1]] = teams
df[df.columns[2]] = played
df[df.columns[3]] = won
df[df.columns[4]] = drawn
df[df.columns[5]] = lost
df[df.columns[6]] = gf
df[df.columns[7]] = ga
df[df.columns[8]] = gd
df[df.columns[9]] = points

basic_standings = df
basic_standings

Unnamed: 0,Position,Club,Played,Won,Drawn,Lost,GF,GA,GD,Points
0,1,Arsenal,17,14,2,1,40,14,26,44
1,2,Manchester City,16,11,3,2,44,16,28,36
2,3,Newcastle United,18,9,8,1,32,11,21,35
3,4,Manchester United,17,11,2,4,27,20,7,35
4,5,Tottenham Hotspur,17,9,3,5,33,25,8,30
5,6,Liverpool,17,8,4,5,34,22,12,28
6,7,Fulham,18,8,4,6,30,27,3,28
7,8,Brighton and Hove Albion,17,8,3,6,32,25,7,27
8,9,Brentford,18,6,8,4,30,28,2,26
9,10,Chelsea,16,7,4,5,20,18,2,25


In [13]:
goals_per_match = []
shots = []
shots_ot = []
shooting_acc = []
pen_scored = []
bigchances = []
hit_woodwork = []

passes = []
passes_per_match = []
pass_acc = []
crosses = []
crosses_acc = []

clean_sheets = []
ga_per_match = []
saves = []
tackles = []
tackle_success = []
blocked_shots = []
interceptions = []
clearances = []
headed_clearance = []
duels_won = []
errors_leading_to_goal = []
own_goals = []

yellow_cards = []
red_cards = []
fouls = []
offsides = []

links = [f"https://www.premierleague.com{link.replace('overview', 'stats?se=489')}" for link in links]

for link in links:
    print(link)
    time.sleep(5)
    html_text = rq.get(link).text
    soup = bs(html_text, 'lxml')
    stats_divs = soup.find_all('div', class_='statsListBlock')
    
    attack = stats_divs[0].find_all('div', class_='normalStat')
    team_play = stats_divs[1].find_all('div', class_='normalStat')
    defence = stats_divs[2].find_all('div', class_='normalStat')
    discipline = stats_divs[3].find_all('div', class_='normalStat')
    
    columns = []
    
    for div in attack:
        columns.append(" ".join(div.find('span', class_='stat').text.split()[:-1]))
    for div in team_play:
        columns.append(" ".join(div.find('span', class_='stat').text.split()[:-1]))
    for div in defence:
        columns.append(" ".join(div.find('span', class_='stat').text.split()[:-1]))
    for div in discipline:
        columns.append(" ".join(div.find('span', class_='stat').text.split()[:-1]))
        
    goals_per_match.append(attack[1].find_all('span')[1].text.split()[0])
    shots.append(attack[2].find_all('span')[1].text.split()[0])
    shots_ot.append(attack[3].find_all('span')[1].text.split()[0])
    shooting_acc.append(attack[4].find_all('span')[1].text.split()[0])
    pen_scored.append(attack[5].find_all('span')[1].text.split()[0])
    bigchances.append(attack[6].find_all('span')[1].text.split()[0])
    hit_woodwork.append(attack[7].find_all('span')[1].text.split()[0])
    
    passes.append(team_play[0].find_all('span')[1].text.split()[0])
    passes_per_match.append(team_play[1].find_all('span')[1].text.split()[0])
    pass_acc.append(team_play[2].find_all('span')[1].text.split()[0])
    crosses.append(team_play[3].find_all('span')[1].text.split()[0])
    crosses_acc.append(team_play[4].find_all('span')[1].text.split()[0])
    
    clean_sheets.append(defence[0].find_all('span')[1].text.split()[0])
    ga_per_match.append(defence[2].find_all('span')[1].text.split()[0])
    saves.append(defence[3].find_all('span')[1].text.split()[0])
    tackles.append(defence[4].find_all('span')[1].text.split()[0])
    tackle_success.append(defence[5].find_all('span')[1].text.split()[0])
    blocked_shots.append(defence[6].find_all('span')[1].text.split()[0])
    interceptions.append(defence[7].find_all('span')[1].text.split()[0])
    clearances.append(defence[8].find_all('span')[1].text.split()[0])
    headed_clearance.append(defence[9].find_all('span')[1].text.split()[0])
    duels_won.append(defence[10].find_all('span')[1].text.split()[0])
    errors_leading_to_goal.append(defence[11].find_all('span')[1].text.split()[0])
    own_goals.append(defence[12].find_all('span')[1].text.split()[0])

    yellow_cards.append(discipline[0].find_all('span')[1].text.split()[0])
    red_cards.append(discipline[1].find_all('span')[1].text.split()[0])
    fouls.append(discipline[2].find_all('span')[1].text.split()[0])
    offsides.append(discipline[3].find_all('span')[1].text.split()[0])
    
extended_stats = pd.DataFrame(columns = columns)
extended_stats[extended_stats.columns[0]] = teams
extended_stats[extended_stats.columns[1]] = goals_per_match
extended_stats[extended_stats.columns[2]] = shots
extended_stats[extended_stats.columns[3]] = shooting_acc
extended_stats[extended_stats.columns[4]] = pen_scored
extended_stats[extended_stats.columns[5]] = bigchances
extended_stats[extended_stats.columns[6]] = hit_woodwork

extended_stats[extended_stats.columns[7]] = passes
extended_stats[extended_stats.columns[8]] = passes_per_match
extended_stats[extended_stats.columns[9]] = pass_acc
extended_stats[extended_stats.columns[10]] = crosses
extended_stats[extended_stats.columns[11]] = crosses_acc

extended_stats[extended_stats.columns[12]] = clean_sheets
extended_stats[extended_stats.columns[13]] = ga_per_match
extended_stats[extended_stats.columns[14]] = saves
extended_stats[extended_stats.columns[15]] = tackles
extended_stats[extended_stats.columns[16]] = tackle_success
extended_stats[extended_stats.columns[17]] = blocked_shots
extended_stats[extended_stats.columns[18]] = interceptions
extended_stats[extended_stats.columns[19]] = clearances
extended_stats[extended_stats.columns[20]] = headed_clearance
extended_stats[extended_stats.columns[21]] = duels_won
extended_stats[extended_stats.columns[22]] = errors_leading_to_goal
extended_stats[extended_stats.columns[23]] = own_goals

extended_stats[extended_stats.columns[24]] = yellow_cards
extended_stats[extended_stats.columns[25]] = red_cards
extended_stats[extended_stats.columns[26]] = fouls
extended_stats[extended_stats.columns[27]] = offsides

https://www.premierleague.com/clubs/1/Arsenal/stats?se=489
https://www.premierleague.com/clubs/11/Manchester-City/stats?se=489
https://www.premierleague.com/clubs/23/Newcastle-United/stats?se=489
https://www.premierleague.com/clubs/12/Manchester-United/stats?se=489
https://www.premierleague.com/clubs/21/Tottenham-Hotspur/stats?se=489
https://www.premierleague.com/clubs/10/Liverpool/stats?se=489
https://www.premierleague.com/clubs/34/Fulham/stats?se=489
https://www.premierleague.com/clubs/131/Brighton-and-Hove-Albion/stats?se=489
https://www.premierleague.com/clubs/130/Brentford/stats?se=489
https://www.premierleague.com/clubs/4/Chelsea/stats?se=489
https://www.premierleague.com/clubs/6/Crystal-Palace/stats?se=489
https://www.premierleague.com/clubs/2/Aston-Villa/stats?se=489
https://www.premierleague.com/clubs/26/Leicester-City/stats?se=489
https://www.premierleague.com/clubs/9/Leeds-United/stats?se=489
https://www.premierleague.com/clubs/127/Bournemouth/stats?se=489
https://www.premie

In [16]:
print(extended_stats.columns)
extended_stats

Index(['Goals', 'Goals per match', 'Shots', 'Shots on target',
       'Shooting accuracy %', 'Penalties scored', 'Big Chances Created',
       'Hit woodwork', 'Passes', 'Passes per match', 'Pass accuracy %',
       'Crosses', 'Cross accuracy %', 'Clean sheets', 'Goals Conceded',
       'Goals conceded per match', 'Saves', 'Tackles', 'Tackle success %',
       'Blocked shots', 'Interceptions', 'Clearances', 'Headed Clearance',
       'Aerial Battles/Duels Won', 'Errors leading to goal', 'Own goals',
       'Yellow cards', 'Red cards', 'Fouls', 'Offsides'],
      dtype='object')


Unnamed: 0,Goals,Goals per match,Shots,Shots on target,Shooting accuracy %,Penalties scored,Big Chances Created,Hit woodwork,Passes,Passes per match,...,Interceptions,Clearances,Headed Clearance,Aerial Battles/Duels Won,Errors leading to goal,Own goals,Yellow cards,Red cards,Fouls,Offsides
0,Arsenal,1.76,9559,36%,72,856,273,333030,284.88,84%,...,6154,43013,127,47,1769,103,1948,1386,,
1,Manchester City,1.74,10204,35%,89,1026,276,348824,356.67,85%,...,5474,40218,72,40,1426,75,1943,1378,,
2,Newcastle United,1.34,6748,33%,43,512,168,205889,195.71,76%,...,5970,38904,89,36,1567,91,1807,1085,,
3,Manchester United,1.89,9572,36%,85,809,237,322986,276.06,83%,...,6891,41315,73,44,1728,69,2230,1450,,
4,Tottenham Hotspur,1.52,9562,35%,65,762,232,299041,255.81,81%,...,7101,43473,129,42,1736,71,2119,1356,,
5,Liverpool,1.76,10664,34%,82,892,297,336525,287.63,82%,...,5677,42566,108,42,1444,61,1891,1385,,
6,Fulham,1.12,4995,33%,36,286,126,165644,281.71,77%,...,4252,25965,64,39,885,42,693,873,,
7,Brighton and Hove Albion,1.08,2423,31%,24,247,55,92732,447.98,80%,...,2251,14243,18,14,315,16,1247,393,,
8,Brentford,1.39,626,38%,10,85,17,21099,376.77,74%,...,606,3796,4,4,86,3,283,100,,
9,Chelsea,1.71,10440,34%,90,774,245,333187,285.26,84%,...,5860,42743,82,41,1832,84,1932,1497,,


In [17]:
extended_stats.to_csv('extended_stats.csv', index=False)