In [1]:
#Standard library
import json
import os
import re
from io import StringIO

#Standard Python libraries for data and visualisation
import numpy as np
import pandas as pd
import sklearn as sk
import matplotlib.pyplot as plt
import plotly as ply

#Web scraping libraries
import requests
from IPython.core.display import HTML
from bs4 import BeautifulSoup

# Match Report extraction

In [2]:
def get_match_table(link_string):
    """Download a page and return the HTML of the first table on it.

    Parameters
    ----------
    link_string : str
        URL of the page to fetch.

    Returns
    -------
    str
        Page text from the first ``<table`` up to, but not including, the
        next ``</table>``.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    ValueError
        If no ``<table`` / ``</table>`` is found in the page.
    """
    req = requests.get(link_string)
    # Fail loudly on error responses (e.g. rate limiting) instead of
    # scanning an error page for a table.
    req.raise_for_status()

    html = req.text

    # extract first table in the page
    start = html.index('<table')
    end = html.index('</table>', start)

    return html[start:end]
        
def get_report_links(table):
    """Return absolute fbref URLs found in the table's 'group_start' cells.

    Every ``<td class="left group_start" ...>...</a></td>`` span on a line is
    searched for an ``href``; the href value is prefixed with the site root.
    The original author observed that each link shows up twice, so only
    every other collected link is returned.
    """
    links = []

    for cell in re.finditer('<td class="left group_start".*</a></td>', table):
        href = re.search('href=".*"', cell.group()).group()
        relative = href.replace('href=', '').replace('"', '')
        links.append('https://fbref.com' + relative)

    # keep the 1st, 3rd, 5th, ... entry to drop the duplicates
    return links[0::2]

# Shot Table Extraction

In [3]:
def get_match_timestamp(html):
    """Build a pandas timestamp from the page's venue date and time attributes.

    Reads the first ``data-venue-date="..."`` and ``data-venue-time="..."``
    attributes in ``html`` and parses ``"<date> <time>"`` with
    ``pd.to_datetime``.
    """
    def attribute_value(pattern, prefix):
        # matched text looks like prefix"value"; strip the prefix and quotes
        return re.search(pattern, html).group().replace(prefix, '').replace('"', '')

    date = attribute_value('data-venue-date="[^"]*"', 'data-venue-date=')
    time = attribute_value('data-venue-time="[^"]*"', 'data-venue-time=')

    return pd.to_datetime(date + ' ' + time)

def get_shot_table(link_string):
    """Download a match report and return its shots table HTML and kick-off time.

    Parameters
    ----------
    link_string : str
        URL of the match report page.

    Returns
    -------
    tuple
        ``(table, timestamp)``: ``table`` is the page text from the opening
        tag of the table with id ``shots_all`` up to, but not including, the
        next ``</table>``; ``timestamp`` is the value returned by
        ``get_match_timestamp`` for the page.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    """
    req = requests.get(link_string)
    print(req)
    # Stop here on error responses (e.g. rate limiting) rather than failing
    # later with an unrelated parsing error.
    req.raise_for_status()
    html = req.text

    timestamp = get_match_timestamp(html)

    # extract table with id 'shots_all'
    start = re.search('<table .* id="shots_all"', html).span()[0]
    end = html.index('</table>', start)

    return html[start:end], timestamp

# Season and Team Ids

In [4]:
# Download the Premier League stats page; raise on HTTP errors so that the
# cells below never parse an error page.
response = requests.get('https://fbref.com/en/comps/9/Premier-League-Stats#all_stats_shooting_squads')
response.raise_for_status()
html = response.text

In [5]:
# Offsets of the first table in the downloaded page: from its opening tag
# up to (not including) the next closing tag.
start = html.index('<table')
end = html.index('</table>', start)

In [6]:
# HTML of the first table (closing tag excluded)
table = html[start:end]

In [7]:
def _href_to_parts(href):
    """Strip the href wrapper and '-Stats' suffix, then split the path on '/'."""
    return href.replace('href=', '').replace('"', '').replace('-Stats', '').split('/')


# Every href in the table that mentions 'squads', broken into path components
links = re.findall('href="[^"]*squads[^"]*"', table)
props = np.asarray(list(map(_href_to_parts, links)))

In [8]:
# Column 3 of the split paths is used as the team id, column 4 as the team
# name (hyphens turned back into spaces).
ids = props[:, 3]
names = np.asarray([team.replace('-', ' ') for team in props[:, 4]])

In [9]:
# Season id used in the match-log URLs -> season label
seasons = {
    's10728': '2020-2021',
    's3232': '2019-2020',
    's1889': '2018-2019',
    's1631': '2017-2018',
    's1526': '2016-2017',
    's1467': '2015-2016',
    's733': '2014-2015',
}

# Generate Links

In [10]:
from urllib.parse import urlparse

In [11]:
# Example match-log URL used as a template: further down, its path is split
# on '/' and individual segments are overwritten per team and season.
premier_league_link = 'https://fbref.com/en/squads/7c21e445/2014-2015/matchlogs/s733/shooting/West-Ham-United-Match-Logs-Premier-League'

# Only the path component (everything after the host) is kept
path = urlparse(premier_league_link).path

In [12]:
# Split the template path into its segments. It is recomputed from the URL
# rather than from `path` itself, so re-running this cell does not fail on
# an already-split list.
path = urlparse(premier_league_link).path.split('/')

In [13]:
# Build one match-log path per (season, team) by overwriting the variable
# segments of the template path in place.
team_links = []

for season_id, season_label in seasons.items():
    for team_name, team_id in zip(names, ids):
        path[3], path[4], path[6] = team_id, season_label, season_id
        path[8] = team_name.replace(' ', '-') + '-Match-Logs-Premier-League'

        team_links.append('/'.join(path))

In [14]:
# Turn the paths into absolute URLs. Links that already carry the site root
# are left alone so that re-running this cell does not double the prefix.
team_links = [tl if tl.startswith('https://fbref.com') else 'https://fbref.com' + tl
              for tl in team_links]

# Shot table to df

In [15]:
def get_shot_table_df(report_link):
    """Fetch a match report's shots table and return it as a tidy DataFrame.

    Parameters
    ----------
    report_link : str
        URL of the match report page.

    Returns
    -------
    pandas.DataFrame
        The shots table with flattened column names, all-empty rows dropped,
        ``Minute`` converted to int via ``minute_plus``, an ``Against``
        column inserted as the fourth column and a ``Timestamp`` column
        holding the match timestamp on every row. ``Against`` is ``None``
        when fewer than two squads appear in the table, because the opponent
        cannot be derived from the table in that case.
    """
    table_html, timestamp = get_shot_table(report_link)

    # read_html returns a list of tables; only one table is given.
    # The markup is wrapped in StringIO because passing literal HTML to
    # read_html is deprecated in recent pandas releases.
    table = pd.read_html(StringIO(table_html))[0]

    # flatten the two-level header: drop 'Unnamed...' top levels, otherwise
    # join both levels with a space
    table.columns = [c[1] if 'Unnamed' in c[0] else c[0] + ' ' + c[1] for c in table.columns]

    # explicit copy so the column assignments below never write to a view
    table = table.dropna(how='all').copy()

    # set minutes as ints
    table['Minute'] = [minute_plus(m) for m in table['Minute'].astype(str)]
    table['Minute'] = table['Minute'].astype(int)

    # opposite in match
    squads = table['Squad'].unique()
    if len(squads) >= 2:
        table['Against'] = [squads[0] if s == squads[1] else squads[1] for s in table['Squad']]
    else:
        # only one squad (or none) has shots: the opponent is unknown here
        table['Against'] = None

    cols = table.columns.tolist()

    # move the freshly added 'Against' column to the fourth position
    table = table[cols[:3] + cols[-1:] + cols[3:-1]]

    table['Timestamp'] = timestamp

    return table


def extract_tables(team_link):
    """Return the shot tables of every match report linked from a team page.

    Returns a tuple ``(tables, event_dfs)`` where ``tables`` is a list with
    one shots DataFrame per report link and ``event_dfs`` is always an empty
    list (nothing is ever added to it here).
    """
    report_links = get_report_links(get_match_table(team_link))

    tables = [get_shot_table_df(report_link) for report_link in report_links]
    event_dfs = []

    return tables, event_dfs

# Create Event Tables functions

In [16]:
def soup_get_lines(soup):
    """Return the non-empty text chunks of ``soup`` in document order.

    The text from ``soup.get_text()`` is split into lines, each stripped
    line is further split on double spaces, and every resulting piece is
    stripped; empty pieces are discarded.
    """
    chunks = []

    for raw_line in soup.get_text().splitlines():
        for piece in raw_line.strip().split("  "):
            piece = piece.strip()
            if piece:
                chunks.append(piece)

    return chunks

def get_team_names(full_soup):
    """Return the second and third text chunks of ``full_soup`` as a pair.

    The caller treats these two chunks as the names of the two teams.
    """
    chunks = soup_get_lines(full_soup)

    first_team = chunks[1]
    second_team = chunks[2]

    return first_team, second_team

def event_to_list(event_soup, team_name, opponent_team, is_team_a):
    """Convert one event element into a flat list of event fields.

    Parameters
    ----------
    event_soup : object with ``get_text()``
        The parsed event element.
    team_name, opponent_team : str
        Names appended to the list as the event's team and its opponent.
    is_team_a : bool
        True when the event belongs to the first team; controls the sign of
        the goal difference.

    Returns
    -------
    list
        The event's text chunks padded with ``None`` to at least six entries,
        followed by ``team_name`` and ``opponent_team``. Entry 0 is the
        minute text without its trailing minute mark; entry 1 is replaced by
        the goal difference seen from the event's team.
    """
    # to list
    event = soup_get_lines(event_soup)
    # correct time: the minute mark arrives either as the undecoded entity
    # '&rsquor;' or, when the HTML parser decodes it, as U+2019
    event[0] = event[0].replace('&rsquor;', '').replace('\u2019', '')
    # correct event desc
    event[-1] = event[-1].replace('—\xa0', '')
    # even out length of list
    while len(event) < 6:
        event.append(None)
    # add team name
    event.append(team_name)
    # add opponent team name
    event.append(opponent_team)
    # set score to the difference in goals
    score_parts = event[1].split(':')
    dif = int(score_parts[0]) - int(score_parts[-1])
    # in case of team a: goals_a - goals_b
    event[1] = dif if is_team_a else -dif

    return event

def list_events(link):
    """Download a match report and list the events of both teams.

    Parameters
    ----------
    link : str
        URL of the match report page.

    Returns
    -------
    tuple
        ``(events, timestamp)``: ``events`` holds one list per event (as
        built by ``event_to_list``), first team's events first; ``timestamp``
        is the value returned by ``get_match_timestamp`` for the page.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    """
    req = requests.get(link)

    print(req)

    # Stop on error responses (e.g. rate limiting) instead of failing later
    # with an unrelated parsing error.
    req.raise_for_status()

    html = req.text

    # the events live between the 'events_wrap' div and the 'team_stats' div
    start = re.search('<div[\\s]+id="events_wrap">', html).span()[0]
    end = re.search('<div[\\s]+id="team_stats">', html).span()[0]

    all_events = html[start:end]

    # NOTE(review): no parser is named, so BeautifulSoup picks whichever is
    # installed; naming one would make results reproducible across machines.
    full_soup = BeautifulSoup(all_events)

    team_a, team_b = get_team_names(full_soup)

    # extract team_a events
    a_soups = full_soup.find_all("div", {"class": "event a"})
    a_events = [event_to_list(a_soup, team_a, team_b, True) for a_soup in a_soups]

    # extract team_b events
    b_soups = full_soup.find_all("div", {"class": "event b"})
    b_events = [event_to_list(b_soup, team_b, team_a, False) for b_soup in b_soups]

    events = a_events + b_events

    return events, get_match_timestamp(html)

def minute_plus(minute):
    """Convert a minute string, optionally of the form 'base+added', to an int.

    The base and added parts are summed. When the base minute is greater
    than 45, 100 is added to the result so that those minutes are kept apart
    from the first half (including its added time).
    """
    if '+' in minute:
        parts = minute.split('+')
        base = int(float(parts[0]))
        added = int(float(parts[1]))
    else:
        base = int(float(minute))
        added = 0

    offset = 100 if base > 45 else 0

    return base + added + offset

def get_event_df(link):
    """Return the events of one match report as a DataFrame sorted by minute.

    Rows whose fourth field contains 'for ' (substitutions) are dropped, the
    columns are named, the match timestamp is attached, assist rows are
    normalised, minutes are converted with ``minute_plus`` and a
    'Player Advantage' column is derived from the 'Red Card' rows.
    """
    events, timestamp = list_events(link)

    raw_events = pd.DataFrame(events)

    # drop substitution rows
    keep = np.array(['for ' not in note for note in raw_events[3]], dtype=bool)
    events_df = raw_events[keep].sort_values(0).reset_index(drop=True)

    # name columns
    events_df.columns = ['Minute', 'Score', 'Player', 'Notes', 'SCA 1 Player', 'SCA 1 Event', 'Squad', 'Against']

    # set timestamp
    events_df['Timestamp'] = timestamp

    # move 'assist' to SCA 1 Event
    is_assist = events_df['Notes'] == 'Assist:'
    events_df.loc[is_assist, 'Notes'] = events_df.loc[is_assist, 'SCA 1 Event']
    events_df.loc[is_assist, 'SCA 1 Event'] = 'Assist'

    # set minutes as ints
    events_df.loc[:, 'Minute'] = [minute_plus(m) for m in events_df['Minute']]
    events_df['Minute'] = events_df['Minute'].astype('int64')

    # add player advantage feature: from each red card onwards the carded
    # squad's rows lose one, all other rows gain one
    events_df['Player Advantage'] = np.zeros(len(events_df))
    red_cards = events_df[events_df['Notes'] == 'Red Card']
    for _, red_card in red_cards.iterrows():
        from_card_on = events_df['Minute'] >= red_card['Minute']
        carded_squad = events_df['Squad'] == red_card['Squad']
        events_df.loc[from_card_on & carded_squad, 'Player Advantage'] -= 1
        events_df.loc[from_card_on & ~carded_squad, 'Player Advantage'] += 1

    return events_df.sort_values('Minute').reset_index(drop=True)

def extract_team_event_dfs(team_link):
    """Concatenate the event DataFrames of every match linked from a team page."""
    report_links = get_report_links(get_match_table(team_link))

    event_frames = [get_event_df(report_link) for report_link in report_links]

    return pd.concat(event_frames, ignore_index=True)

# Function for Generating Dataframe

In [17]:
def extract_shooting_data(team_link):
    """Build one DataFrame of shots for every match linked from a team page.

    For each match report the shots table is fetched and two columns are
    added from the match events: ``Score`` (goal difference) and
    ``Player Advantage``. Both are taken from the latest event at or before
    the shot's minute.

    The event values are expressed from the point of view of the event's own
    squad, so they are negated for shots taken by the other squad; every
    shot therefore carries the values from the shooting squad's perspective.

    Parameters
    ----------
    team_link : str
        URL of a team's match-log page.

    Returns
    -------
    pandas.DataFrame
        All matches' shot tables concatenated with a fresh index.
    """
    match_table = get_match_table(team_link)

    report_links = get_report_links(match_table)

    all_dfs = []

    for rl in report_links:
        event_df = get_event_df(rl)

        shot_df = get_shot_table_df(rl)

        shot_df['Score'] = np.zeros(len(shot_df))
        shot_df['Player Advantage'] = np.zeros(len(shot_df))

        shot_squads = set(shot_df['Squad'])

        # events arrive sorted by minute, so later events overwrite earlier ones
        for _, event in event_df.iterrows():
            after = shot_df['Minute'] >= event['Minute']

            # NOTE(review): assumes squad names agree between the events
            # section and the shots table — confirm. When neither name is
            # found among the shots, the raw event values are used as before.
            names_match = event['Squad'] in shot_squads or event['Against'] in shot_squads

            if names_match:
                same_side = shot_df['Squad'] == event['Squad']
                for col in ('Score', 'Player Advantage'):
                    shot_df.loc[after & same_side, col] = event[col]
                    # '0 - x' rather than '-x' so a zero stays 0.0, not -0.0
                    shot_df.loc[after & ~same_side, col] = 0 - event[col]
            else:
                shot_df.loc[after, 'Score'] = event['Score']
                shot_df.loc[after, 'Player Advantage'] = event['Player Advantage']

        all_dfs.append(shot_df)

    concat_df = pd.concat(all_dfs, ignore_index=True)

    return concat_df