In [1]:
import requests
import json
import pandas as pd
import os

In [2]:
# season_names = ['2014', '2015', '2016', '2017', '2018', '2019']
season_names = ['2019', '2020', '2021', '2022', '2023']

gws = ['3', '5', '10']
# leagues = ['EPL', 'La_liga', 'Bundesliga', 'Serie_A', 'Ligue_1']
leagues = ['EPL', 'La_liga', 'Bundesliga', 'Serie_A', 'Ligue_1']


In [3]:
def scrape_understat(payload):
    #Build request using url, headers (mimicking what Firefox does normally)
    #Works best with verify=True as you won't get the ssl errors. Payload is 
    #taylored for each request
    url = 'https://understat.com/main/getPlayersStats/'
    headers = {'content-type':'application/json; charset=utf-8',
    'Host': 'understat.com',
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:73.0) Gecko/20100101 Firefox/73.0',
    'Accept': 'application/json, text/javascript, */*; q=0.01',
    'Accept-Encoding': 'gzip, deflate, br',
    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'X-Requested-With': 'XMLHttpRequest',
    'Content-Length': '39',
    'Origin': 'https: // understat.com',
    'Connection': 'keep - alive',
    'Referer': 'https: // understat.com / league / EPL'
    }
    response = requests.post(url, data=payload, headers = headers, verify=True)
    response_json = response.json()
    inner_wrapper = response_json['response']
    json_player_data = inner_wrapper['players']
    return json_player_data

In [4]:
def clean_df(player_df, weeks):
    # Get rid of the columns that we don't care about
    #player_df.drop(['yellow_cards','red_cards', 'xGChain','xGBuildup','games','time'], axis=1, inplace=True)
    player_df  = player_df.rename(columns={'goals':'goals_'+weeks,'xG':'xG_'+weeks,'assists':'assists_'+weeks, 'xA':'xA_'+weeks, 'shots':'shots_'+weeks, 'key_passes':
        'key_passes_'+weeks,'npg':'npg_'+weeks,'npxG':'npxG_'+weeks})
    
    return(player_df)

In [6]:
def gw_data(season , league,  no_of_gw):
#     Create Pandas dataframes from each html table
    print('Getting data for last {} matches'.format(no_of_gw))
    json_player_data = scrape_understat({'league':'EPL', 'season':season, 'n_last_matches': no_of_gw})
    gw_table = pd.DataFrame(json_player_data)
    gw_df = clean_df(gw_table,'3wks')
    #Replace Position indentifiers with something more useful
    gw_df['position'] = gw_df['position'].str.slice(0,1)
    position_map = {'D':'DEF', 'F':'FWD', 'M':'MID', 'G':'GK', 'S':'FWD'}
    gw_df = gw_df.replace({'position': position_map})
    # gw_df.to_csv(r'C:\Users\Asus\PycharmProjects\Understat Data Scraper\Data\Player_Data\gw_data\last_{}_gw_data.csv'.format(no_of_gw), encoding='utf-8', index=False)
    gw_df.to_csv(r'data/understat/last_{}_gw_data.csv'.format(no_of_gw), encoding='utf-8', index=False)
    print('last {} matches csv data written'.format(no_of_gw))
    return gw_df

In [19]:
last_3_gw_data_EPL = gw_data(season_names[-1], leagues[0], gws[0])
last_5_gw_data_EPL = gw_data(season_names[-1], leagues[0], gws[1])
last_10_gw_data_EPL = gw_data(season_names[-1], leagues[0], gws[2])

Getting data for last 3 matches
last 3 matches csv data written
Getting data for last 5 matches
last 5 matches csv data written
Getting data for last 10 matches
last 10 matches csv data written


In [7]:
def season_data(season, league):
    print('Getting data for {} season'.format(season))
    json_player_data = scrape_understat({'league': league, 'season':season})
    season_table = pd.DataFrame(json_player_data)
    season_df = clean_df(season_table, 'season')
    # season_df.to_csv(r'C:\Users\Asus\PycharmProjects\Understat Data Scraper\Data\Player_Data\season_data\{}_whole_season_data.csv'.format(season), encoding='utf-8', index=False)
    season_df.to_csv(r'data/understat/{}_whole_season_data.csv'.format(season), encoding='utf-8', index=False)
    print('csv file for {} season written'.format(season))
    return season_df

In [21]:
# season_1415 = season_data(season_names[0], leagues[0])
# season_1516 = season_data(season_names[1], leagues[0])
# season_1617 = season_data(season_names[2], leagues[0])
# season_1718 = season_data(season_names[3], leagues[0])
# season_1819 = season_data(season_names[4], leagues[0])
# season_1920 = season_data(season_names[5], leagues[0])

season_1920 = season_data(season_names[0], leagues[0])
season_2021 = season_data(season_names[1], leagues[0])
season_2122 = season_data(season_names[2], leagues[0])
season_2223 = season_data(season_names[3], leagues[0])
season_2324 = season_data(season_names[4], leagues[0])
# season_1920 = season_data(season_names[5], leagues[0])

Getting data for 2019 season
csv file for 2019 season written
Getting data for 2020 season
csv file for 2020 season written
Getting data for 2021 season
csv file for 2021 season written
Getting data for 2022 season
csv file for 2022 season written
Getting data for 2023 season
csv file for 2023 season written


In [8]:
season_names

['2019', '2020', '2021', '2022', '2023']

In [9]:
csv_list = []
for league in leagues:
    for season_name in season_names:
        df_name = f"{league}_{season_name}".lower()

        globals()[f"{df_name}"] = season_data(season_name, league)
        csv_list.append(df_name)

Getting data for 2019 season
csv file for 2019 season written
Getting data for 2020 season
csv file for 2020 season written
Getting data for 2021 season
csv file for 2021 season written
Getting data for 2022 season
csv file for 2022 season written
Getting data for 2023 season
csv file for 2023 season written
Getting data for 2019 season
csv file for 2019 season written
Getting data for 2020 season
csv file for 2020 season written
Getting data for 2021 season
csv file for 2021 season written
Getting data for 2022 season
csv file for 2022 season written
Getting data for 2023 season
csv file for 2023 season written
Getting data for 2019 season
csv file for 2019 season written
Getting data for 2020 season
csv file for 2020 season written
Getting data for 2021 season
csv file for 2021 season written
Getting data for 2022 season
csv file for 2022 season written
Getting data for 2023 season
csv file for 2023 season written
Getting data for 2019 season
csv file for 2019 season written
Getting 

In [10]:
csv_list

['epl_2019',
 'epl_2020',
 'epl_2021',
 'epl_2022',
 'epl_2023',
 'la_liga_2019',
 'la_liga_2020',
 'la_liga_2021',
 'la_liga_2022',
 'la_liga_2023',
 'bundesliga_2019',
 'bundesliga_2020',
 'bundesliga_2021',
 'bundesliga_2022',
 'bundesliga_2023',
 'serie_a_2019',
 'serie_a_2020',
 'serie_a_2021',
 'serie_a_2022',
 'serie_a_2023',
 'ligue_1_2019',
 'ligue_1_2020',
 'ligue_1_2021',
 'ligue_1_2022',
 'ligue_1_2023']

In [11]:
epl_2019.to_csv("data/understat/epl_2019_whole_season_data.csv")
epl_2020.to_csv("data/understat/epl_2020_whole_season_data.csv")
epl_2021.to_csv("data/understat/epl_2021_whole_season_data.csv")
epl_2022.to_csv("data/understat/epl_2022_whole_season_data.csv")
epl_2023.to_csv("data/understat/epl_2023_whole_season_data.csv")
la_liga_2019.to_csv("data/understat/la_liga_2019_whole_season_data.csv")
la_liga_2020.to_csv("data/understat/la_liga_2020_whole_season_data.csv")
la_liga_2021.to_csv("data/understat/la_liga_2021_whole_season_data.csv")
la_liga_2022.to_csv("data/understat/la_liga_2022_whole_season_data.csv")
la_liga_2023.to_csv("data/understat/la_liga_2023_whole_season_data.csv")
bundesliga_2019.to_csv("data/understat/bundesliga_2019_whole_season_data.csv")
bundesliga_2020.to_csv("data/understat/bundesliga_2020_whole_season_data.csv")
bundesliga_2021.to_csv("data/understat/bundesliga_2021_whole_season_data.csv")
bundesliga_2022.to_csv("data/understat/bundesliga_2022_whole_season_data.csv")
bundesliga_2023.to_csv("data/understat/bundesliga_2023_whole_season_data.csv")
serie_a_2019.to_csv("data/understat/serie_a_2019_whole_season_data.csv")
serie_a_2020.to_csv("data/understat/serie_a_2020_whole_season_data.csv")
serie_a_2021.to_csv("data/understat/serie_a_2021_whole_season_data.csv")
serie_a_2022.to_csv("data/understat/serie_a_2022_whole_season_data.csv")
serie_a_2023.to_csv("data/understat/serie_a_2023_whole_season_data.csv")
ligue_1_2019.to_csv("data/understat/ligue_1_2019_whole_season_data.csv")
ligue_1_2020.to_csv("data/understat/ligue_1_2020_whole_season_data.csv")
ligue_1_2021.to_csv("data/understat/ligue_1_2021_whole_season_data.csv")
ligue_1_2022.to_csv("data/understat/ligue_1_2022_whole_season_data.csv")
ligue_1_2023.to_csv("data/understat/ligue_1_2023_whole_season_data.csv")

In [12]:
season_2324[season_2324["team_title"]=="Arsenal"]

NameError: name 'season_2324' is not defined

In [11]:
season_df

NameError: name 'season_df' is not defined

In [None]:
season_1415

In [None]:
season_1516

In [None]:
season_1617

In [None]:
season_1718

In [None]:
season_1819

In [None]:
season_1920