# Scraping Data from [Understat](https://understat.com/)

In [2]:
# Import libraries:
import json
import pandas as pd
from copy import deepcopy
from bs4 import BeautifulSoup
from urllib.request import urlopen

# Import plotting libraries:
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
from matplotlib.patches import Arc

In [3]:
# Lower and upper bounds for the x and y axes.
# NOTE(review): presumably plot limits for later cells; they are not used in the cells shown here.
x_lims, y_lims = [0, 1.15], [0, 0.74]

In [4]:
# Europe's top five leagues, each written as the "<league>/<season>" path
# segment that is appended to the Understat league URL.
leagues = [
    "{}/2022".format(league_name)
    for league_name in ("EPL", "La_liga", "Bundesliga", "Ligue_1", "Serie_A")
]

In [5]:
# Extract the ids of every player listed on the selected league pages.
def _scrape_league_players(league):
    """Return the table embedded in one Understat league page.

    Parameters
    ----------
    league : str
        Path segment appended to ``https://understat.com/league/``
        (for example ``"EPL/2022"``).

    Returns
    -------
    pandas.DataFrame
        The JSON payload of the page's fourth ``<script>`` tag, flattened
        with ``pd.json_normalize``.

    Raises
    ------
    ValueError
        If the script text contains no backslash or no ``')`` terminator.
    IndexError
        If the page has fewer than four ``<script>`` tags.
    """
    scrape_url = "https://understat.com/league/{}".format(league)
    # The context manager closes the HTTP response once the page is parsed.
    with urlopen(scrape_url) as page_connect:
        page_html = BeautifulSoup(page_connect, "html.parser")
    json_raw_string = page_html.find_all(name="script")[3].text
    # The payload is an escaped string literal: it starts at the first
    # backslash and ends right before the first "')".
    start_ind = json_raw_string.index("\\")
    stop_ind = json_raw_string.index("')")
    json_data = json_raw_string[start_ind:stop_ind]
    # Turn the escape sequences into literal characters so the text is JSON.
    json_data = json_data.encode("utf8").decode("unicode_escape")
    return pd.json_normalize(json.loads(json_data))


# Collect one frame per league and concatenate once, instead of growing a
# frame (seeded with an empty DataFrame) inside the loop.
final_jsonfinal_df = pd.concat(
    [_scrape_league_players(league) for league in leagues],
    ignore_index=True,
)

# Keep each id once, in first-seen order: an id listed under more than one
# league would otherwise be requested once per listing by code that iterates
# over this list.
player_id_list = final_jsonfinal_df["id"].drop_duplicates().to_list()
print(player_id_list)

['8260', '647', '998', '1250', '468', '556', '2517', '7752', '8865', '773', '7322', '2381', '6681', '482', '5543', '6055', '7420', '453', '5232', '6818', '7814', '8379', '8706', '239', '843', '6552', '10720', '10846', '314', '986', '1228', '7698', '10760', '204', '447', '522', '5220', '5786', '6034', '6756', '6854', '10806', '11296', '87', '501', '553', '618', '1679', '1776', '3585', '6049', '6122', '6345', '6853', '8291', '8941', '10177', '531', '620', '675', '700', '750', '1726', '2182', '2203', '3697', '6108', '6857', '7166', '7395', '7700', '8720', '10741', '10804', '10866', '343', '672', '762', '822', '922', '2248', '3635', '5221', '5553', '6063', '6482', '6827', '7365', '7892', '8150', '8327', '9492', '9738', '10405', '10408', '10716', '10743', '11058', '11094', '11297', '755', '757', '833', '910', '1654', '2328', '2335', '4456', '5613', '5956', '6253', '6492', '6630', '6665', '6912', '7768', '8845', '9040', '9678', '9680', '10552', '10715', '10746', '10750', '11317', '486', '500

### Get a shot DataFrame for each player and collect them in a list

In [9]:
def _to_numeric_or_keep(column):
    """Convert ``column`` with ``pd.to_numeric``; return it unchanged on failure.

    Explicit replacement for ``pd.to_numeric(..., errors="ignore")``, which
    current pandas releases deprecate.
    """
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        return column


# Build one shots DataFrame per player id. The list is filled with an explicit
# loop so that frames already fetched stay in `player_shot_df_list` if a later
# request raises.
player_shot_df_list = []
for p_id in player_id_list:
    # Scrape player stats:
    scrape_url = "https://understat.com/player/{}".format(p_id)
    # The context manager closes the HTTP response once the page is parsed.
    with urlopen(scrape_url) as page_connect:
        page_html = BeautifulSoup(page_connect, "html.parser")

    # The payload sits in the fourth <script> tag as an escaped string
    # literal: from the first backslash up to the first "')".
    json_raw_string = page_html.find_all(name="script")[3].text
    start_ind = json_raw_string.index("\\")
    stop_ind = json_raw_string.index("')")

    json_data = json_raw_string[start_ind:stop_ind]
    # Turn the escape sequences into literal characters so the text is JSON.
    json_data = json_data.encode("utf8").decode("unicode_escape")

    shots_df = pd.json_normalize(json.loads(json_data))
    # Columns that parse as numbers become numeric; all others are left as-is.
    shots_df = shots_df.apply(_to_numeric_or_keep)

    # Independent copy, kept under its original name so that any later cell
    # referring to `full_pitch_shots_df` still finds it.
    full_pitch_shots_df = deepcopy(shots_df)

    player_shot_df_list.append(full_pitch_shots_df)

In [49]:
# Stack every player's shots into one frame, keep the rows whose `season`
# equals 2022, write that subset to CSV, and display it.
player_shots = pd.concat(objs=player_shot_df_list, ignore_index=True)
player_shots2022_2023 = player_shots.loc[player_shots["season"].eq(2022)]
player_shots2022_2023.to_csv(path_or_buf="player_shots2022_2023.csv")
player_shots2022_2023

Unnamed: 0,id,minute,result,X,Y,xG,player,h_a,player_id,situation,season,shotType,match_id,h_team,a_team,h_goals,a_goals,date,player_assisted,lastAction
205,479846,20,MissedShots,0.936,0.503,0.484085,Erling Haaland,a,8260,OpenPlay,2022,Head,18211,West Ham,Manchester City,0,2,2022-08-07 15:30:00,Phil Foden,Cross
206,479847,35,Goal,0.885,0.500,0.761169,Erling Haaland,a,8260,Penalty,2022,LeftFoot,18211,West Ham,Manchester City,0,2,2022-08-07 15:30:00,,Standard
207,479854,64,Goal,0.864,0.405,0.328053,Erling Haaland,a,8260,OpenPlay,2022,LeftFoot,18211,West Ham,Manchester City,0,2,2022-08-07 15:30:00,Kevin De Bruyne,Throughball
208,479855,69,MissedShots,0.953,0.552,0.526608,Erling Haaland,a,8260,OpenPlay,2022,Head,18211,West Ham,Manchester City,0,2,2022-08-07 15:30:00,Jack Grealish,Chipped
209,479856,75,BlockedShot,0.894,0.710,0.066443,Erling Haaland,a,8260,OpenPlay,2022,LeftFoot,18211,West Ham,Manchester City,0,2,2022-08-07 15:30:00,Ilkay Gündogan,Pass
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
267770,522509,86,BlockedShot,0.868,0.513,0.030797,Raimonds Krollis,h,11326,OpenPlay,2022,OtherBodyPart,18900,Spezia,Monza,0,2,2023-04-28 18:45:00,Tio Cipot,Cross
267771,505551,90,SavedShot,0.733,0.560,0.017026,Tio Cipot,a,11327,FromCorner,2022,RightFoot,18773,Bologna,Spezia,2,0,2023-01-27 17:30:00,Emmanuel Gyasi,Pass
267772,514577,85,MissedShots,0.744,0.546,0.013358,Tio Cipot,a,11327,SetPiece,2022,LeftFoot,18849,Sassuolo,Spezia,1,0,2023-03-17 17:30:00,,
267773,521476,93,SavedShot,0.923,0.528,0.067531,Alberto Basso,a,11482,OpenPlay,2022,Head,18887,Udinese,Cremonese,3,0,2023-04-23 13:00:00,Emanuele Valeri,Aerial
