In [3]:
import numpy as np
import pandas as pd
import sklearn as sk
import matplotlib.pyplot as plt
import torch as tor
import plotly as ply
import os
import re
import requests
import json
from IPython.core.display import HTML
from bs4 import BeautifulSoup

# Match Report extraction

In [2]:
def get_match_table(link_string):
    req = requests.get(link_string)
    
    html = req.text
    
    # extract first table in the page
    start = html.index('<table')
    end = html[start:].index('</table>') + start
    
    table = html[start:end]
    
    return table
        
def get_report_links(table):
    contain_links = [cl.group() for cl in list(re.finditer('<td class="left group_start".*</a></td>', table))]
    
    links = []
    
    for cl in contain_links:

        link = re.search('href=".*"', cl).group().replace('href=', '').replace('"', '')
        
        link = 'https://fbref.com' + link
        
        links.append(link)
    
    # for some reason there are two of each.
    return links[::2]

# Shot Table Extraction

In [3]:
def get_match_timestamp(html):
    date = re.search('data-venue-date="[^"]*"', html)\
            .group()\
            .replace('data-venue-date=', '')\
            .replace('"', '')
    time = re.search('data-venue-time="[^"]*"', html)\
            .group()\
            .replace('data-venue-time=', '')\
            .replace('"', '')
    
    timestamp = pd.to_datetime(date + ' ' + time)
    
    return timestamp

def get_shot_table(link_string):
    
    req = requests.get(link_string)
    print(req)
    html = req.text
    
    timestamp = get_match_timestamp(html)
    
    # extract table with id 'shots_all'
    start = re.search('<table .* id="shots_all"', html).span()[0]
    end = html[start:].index('</table>') + start
    
    table = html[start:end]
    
    return table, timestamp

# Season and Team Ids

In [4]:
html = requests.get('https://fbref.com/en/comps/9/Premier-League-Stats#all_stats_shooting_squads').text

In [5]:
start = html.index('<table')
end = html[start:].index('</table>') + start

In [6]:
table = html[start: end]

In [7]:
links = re.findall('href="[^"]*squads[^"]*"', table)
props = np.asarray([l.replace('href=', '')
                    .replace('"', '')
                    .replace('-Stats', '')
                    .split('/') for l in links])

In [8]:
ids = props[:, 3]
names = np.asarray([n.replace('-', ' ') for n in props[:, 4]])

In [9]:
seasons = {'s10728':'2020-2021',
          's3232':'2019-2020',
          's1889':'2018-2019',
          's1631':'2017-2018',
          's1526':'2016-2017',
          's1467':'2015-2016',
          's733':'2014-2015'}

# Generate Links

In [10]:
from urllib.parse import urlparse

In [11]:
premier_league_link = 'https://fbref.com/en/squads/7c21e445/2014-2015/matchlogs/s733/shooting/West-Ham-United-Match-Logs-Premier-League'

path = urlparse(premier_league_link).path

In [12]:
path = path.split('/')

In [13]:
team_links = []

for k, v in seasons.items():
    for n, i in zip(names, ids):
        path[3] = i
        path[4] = v
        path[6] = k
        path[8] = n.replace(' ', '-') + '-Match-Logs-Premier-League'
        
        team_links.append('/'.join(path))

In [14]:
team_links = ['https://fbref.com' + tl for tl in team_links]

# Shot table to df

In [15]:
def get_shot_table_df(report_link):

    table_html, timestamp = get_shot_table(report_link)

    # gives a list of tables, only one table is given
    table = pd.read_html(table_html)[0]

    table.columns = [c[1] if 'Unnamed' in c[0] else c[0] + ' ' + c[1] for c in table.columns]

    table = table.dropna(how='all')

    # set minutes as ints
    table['Minute'] = table['Minute'].astype(str)
    table.loc[:, 'Minute'] = [minute_plus(m) for m in table['Minute']]
    table['Minute'] = table['Minute'].astype(float).astype(int)

    # opposite in match
    map_ = table['Squad'].unique()
    table['Against'] = [map_[int(o)] for o in ~(table['Squad'] == map_[1])]

    cols = table.columns.tolist()

    table = table[cols[:3] + cols[-1:] + cols[3:-1]]

    table['Timestamp'] = [timestamp for i in table['Squad']]
    
    return table


def extract_tables(team_link):
    tables = []
    event_dfs = []

    match_table = get_match_table(team_link)

    report_links = get_report_links(match_table)
    
    for rl in report_links:
        table = get_shot_table_df(rl)
        
        tables.append(table)
            
    return tables, event_dfs

# Create Event Tables functions

In [16]:
def soup_get_lines(soup):

    text = soup.get_text()

    # break into lines and remove leading and trailing space on each
    lines = [line.strip() for line in text.splitlines()]
    # break multi-headlines into a line each
    chunks = [phrase.strip() for line in lines for phrase in line.split("  ")]
    # remove empty elements
    chunks = [chunk for chunk in chunks if len(chunk) > 0]
    
    return chunks

def get_team_names(full_soup):
    
    lines = soup_get_lines(full_soup)
    
    return lines[1], lines[2]

def event_to_list(event_soup, team_name, opponent_team, is_team_a):
    # to list
    event = soup_get_lines(event_soup)
    # correct time
    event[0] = event[0].replace('&rsquor;', '')
    # correct event desc
    event[-1] = event[-1].replace('—\xa0', '')
    # even out length of list
    while len(event) < 6:
        event.append(None)
    # add team name
    event.append(team_name)
    # add opponent team name
    event.append(opponent_team)
    # set score to the difference in goals
    a, b = event[1].split(':')[0], event[1].split(':')[-1]
    dif = int(a) - int(b)
    # in case of team a: goals_a - goals_b
    event[1] = dif if is_team_a else -dif
    
    return event

def list_events(link):
    req = requests.get(link)
    
    print(req)
    
    html = req.text
    
    start = re.search('<div[\\s]+id="events_wrap">', html).span()[0]
    
    end = re.search('<div[\\s]+id="team_stats">', html).span()[0]
    
    all_events = html[start:end]
    
    full_soup = BeautifulSoup(all_events)
    
    team_a, team_b = get_team_names(full_soup)
    
    # extract team_a events
    
    a_soups = full_soup.findAll("div", {"class": "event a"})
    
    a_events = [event_to_list(a_soup, team_a, team_b, True) for a_soup in a_soups]
    
    # extract team_b events
    
    b_soups = full_soup.findAll("div", {"class": "event b"})
    
    b_events = [event_to_list(b_soup, team_b, team_a, False) for b_soup in b_soups]
    
    events = a_events + b_events
    
    return events, get_match_timestamp(html)

def minute_plus(minute):
    # to turn minute values with + to ints
    # 100 is added to values greater than 45 to separate the first half
    if '+' in minute:
        ms = minute.split('+')

        if int(float(ms[0])) > 45:
            return int(float(ms[0])) + int(float(ms[1])) + 100
        else:
            return int(float(ms[0])) + int(float(ms[1]))
    else:
        if int(float(minute)) > 45:
            return int(float(minute)) + 100
        else:
            return int(float(minute))

def get_event_df(link):
    
    events, timestamp = list_events(link)    
    
    events_df = pd.DataFrame(events)
    
    # drop substitution rows
    events_df = events_df[np.logical_not(['for ' in e for e in events_df[3]])].sort_values(0).reset_index(drop=True)
    
    # name columns
    events_df.columns = ['Minute', 'Score', 'Player', 'Notes', 'SCA 1 Player', 'SCA 1 Event', 'Squad', 'Against']
    
    # set timestamp
    events_df['Timestamp'] = timestamp
    
    # move 'assist' to SCA 1 Event
    assists = events_df['Notes'] == 'Assist:'
    events_df.loc[assists, 'Notes'] = events_df[assists]['SCA 1 Event']
    events_df.loc[assists, 'SCA 1 Event'] = 'Assist'
    
    # set minutes as ints
    events_df.loc[:, 'Minute'] = [minute_plus(m) for m in events_df.loc[:, 'Minute']]
    events_df['Minute'] = events_df['Minute'].astype('int64')
    
    # add player advantage feature
    events_df['Player Advantage'] = np.zeros(len(events_df))
    red_cards = events_df[events_df['Notes'] == 'Red Card']
    for i, red_card in red_cards.iterrows():
        time = events_df['Minute'] >= red_card['Minute']
        team = events_df['Squad'] == red_card['Squad']
        events_df.loc[np.logical_and(time, team), 'Player Advantage'] -= 1
        events_df.loc[np.logical_and(time, ~team), 'Player Advantage'] += 1
    
    return events_df.sort_values('Minute').reset_index(drop=True)

def extract_team_event_dfs(team_link):
    match_table = get_match_table(team_link)

    report_links = get_report_links(match_table)

    return pd.concat([get_event_df(rl) for rl in report_links], ignore_index=True)

# Function for Generating Dataframe
given team link

In [17]:
def extract_shooting_data(team_link):
    match_table = get_match_table(team_link)

    report_links = get_report_links(match_table)
    
    all_dfs = []
    
    for rl in report_links:
        event_df = get_event_df(rl)
        
        shot_df = get_shot_table_df(rl)
        
        shot_df['Score'] = np.zeros(len(shot_df))
        shot_df['Player Advantage'] = np.zeros(len(shot_df))
        for i, event in event_df.iterrows():
            time = shot_df['Minute'] >= event['Minute']
            shot_df.loc[time, 'Score'] = event['Score']
            shot_df.loc[time, 'Player Advantage'] = event['Player Advantage']
            
        all_dfs.append(shot_df)
        
    concat_df = pd.concat(all_dfs, ignore_index=True)
    
    return concat_df

# Generate all tables

In [18]:
# all_tables = [ ]
# errors = []
# for team_link in team_links:
#     try:
#         all_tables.append(extract_shooting_data(team_link))
#     except:
#         print('error')
#         errors.append(team_link)

In [19]:
# final_df = pd.concat(all_tables, ignore_index=True)
# final_df.to_csv('shot_data_raw.csv')

In [21]:
shot_data_raw = shot_data_raw = pd.read_csv('data/fantasy-league/shot_data_raw.csv', index_col=0)
shot_data_raw = shot_data_raw.drop_duplicates().reset_index(drop=True)

In [22]:
shot_data_raw.head()

Unnamed: 0,Minute,Player,Squad,Against,Outcome,Distance,Body Part,Notes,SCA 1 Player,SCA 1 Event,SCA 2 Player,SCA 2 Event,Timestamp,Score,Player Advantage
0,7,Andros Townsend,Crystal Palace,Manchester Utd,Goal,8.0,Right Foot,,Jeffrey Schlupp,Pass (Live),Tyrick Mitchell,Pass (Live),2020-09-19 17:30:00,1.0,0.0
1,11,Andros Townsend,Crystal Palace,Manchester Utd,Off Target,29.0,Left Foot,,Jeffrey Schlupp,Pass (Live),Wilfried Zaha,Pass (Live),2020-09-19 17:30:00,1.0,0.0
2,13,Timothy Fosu-Mensah,Manchester Utd,Crystal Palace,Blocked,28.0,Right Foot,,Bruno Fernandes,Pass (Live),Marcus Rashford,Pass (Live),2020-09-19 17:30:00,1.0,0.0
3,20,Paul Pogba,Manchester Utd,Crystal Palace,Saved,21.0,Right Foot,,Marcus Rashford,Pass (Live),Luke Shaw,Pass (Live),2020-09-19 17:30:00,1.0,0.0
4,22,Scott McTominay,Manchester Utd,Crystal Palace,Blocked,22.0,Right Foot,,Bruno Fernandes,Pass (Live),Marcus Rashford,Pass (Live),2020-09-19 17:30:00,1.0,0.0


# Penalty Dataframe

In [23]:
# # extract all event tables
# event_table = []
# event_errors = []
# for team_link in team_links:
#     try:
#         event_table.append(extract_team_event_dfs(team_link))
#     except:
#         print('error')
#         event_errors.append(team_link)

# event_df = pd.concat(event_table, ignore_index=True).drop_duplicates().reset_index(drop=True)

In [24]:
event_df = pd.read_csv('data/fantasy-league/events_raw.csv', index_col=0)
event_df['Timestamp'] = pd.to_datetime(event_df['Timestamp'])

In [25]:
# extract penalties
penalty_df = event_df[event_df['Notes'] == 'Penalty Kick'].reset_index()
# adds column to indicate whether a penalty was a goal or not
same_timestamp = penalty_df['Timestamp'].to_numpy() == event_df.iloc[penalty_df['index'] - 1]['Timestamp'].to_numpy()
penalty_df.loc[same_timestamp, 'Goal'] = abs(abs(penalty_df.loc[same_timestamp, 'Score'].to_numpy()) - abs(event_df.iloc[penalty_df.loc[same_timestamp, 'index'] - 1, :]['Score'].to_numpy()))
penalty_df.loc[np.logical_not(same_timestamp), 'Goal'] = penalty_df.loc[np.logical_not(same_timestamp), 'Score']

# Set outcome table
penalty_df['Outcome'] = ['Goal' if g != 0 else 'Blocked' for g in penalty_df['Goal']]
penalty_df = penalty_df.drop(columns=['Goal', 'index'])

penalty_df.head()

Unnamed: 0,Minute,Score,Player,Notes,SCA 1 Player,SCA 1 Event,Squad,Against,Timestamp,Player Advantage,Outcome
0,174,2,Wilfried Zaha,Penalty Kick,Substitute,,Crystal Palace,Manchester United,2020-09-19 17:30:00,0.0,Goal
1,40,1,Neal Maupay,Penalty Kick,Yellow Card,,Brighton & Hove Albion,Manchester United,2020-09-26 12:30:00,0.0,Goal
2,200,1,Bruno Fernandes,Penalty Kick,Yellow Card,,Manchester United,Brighton & Hove Albion,2020-09-26 12:30:00,0.0,Goal
3,2,1,Bruno Fernandes,Penalty Kick,—,,Manchester United,Tottenham Hotspur,2020-10-04 16:30:00,0.0,Goal
4,179,5,Harry Kane,Penalty Kick,Substitute,,Tottenham Hotspur,Manchester United,2020-10-04 16:30:00,1.0,Goal


In [26]:
# combine
shot_data_raw = pd.concat([shot_data_raw, penalty_df], ignore_index=True, axis=0)
shot_data_raw = shot_data_raw.reset_index(drop=True)
shot_data_raw = shot_data_raw.sort_values(['Timestamp', 'Minute'])
shot_data_raw = shot_data_raw.reset_index(drop=True)

In [27]:
shot_data_raw.head()

Unnamed: 0,Minute,Player,Squad,Against,Outcome,Distance,Body Part,Notes,SCA 1 Player,SCA 1 Event,SCA 2 Player,SCA 2 Event,Timestamp,Score,Player Advantage
0,42,Saido Berahino,West Bromwich Albion,Sunderland,Goal,,,Penalty Kick,Yellow Card,,,,2014-08-16 15:00:00,0.0,0.0
1,149,Steven Gerrard,Liverpool,Tottenham Hotspur,Goal,,,Penalty Kick,Yellow Card,,,,2014-08-31 13:30:00,2.0,0.0
2,163,David Nugent,Leicester City,Manchester United,Goal,,,Penalty Kick,Goal,,,,2014-09-21 13:30:00,-1.0,0.0
3,183,Leonardo Ulloa,Leicester City,Manchester United,Goal,,,Penalty Kick,Red Card,,,,2014-09-21 13:30:00,2.0,1.0
4,30,Mile Jedinak,Crystal Palace,Everton,Goal,,,Penalty Kick,Goal,,,,2014-09-21 16:00:00,0.0,0.0


In [28]:
# shot_data_raw.to_csv('shot_penalty_raw.csv')

# Sanitize Data

In [29]:
shot_data_raw = pd.read_csv('data/fantasy-league/shot_penalty_raw.csv', index_col=0)

In [30]:
shot_data_raw.head()

Unnamed: 0,Minute,Player,Squad,Against,Outcome,Distance,Body Part,Notes,SCA 1 Player,SCA 1 Event,SCA 2 Player,SCA 2 Event,Timestamp,Score,Player Advantage
0,42,Saido Berahino,West Bromwich Albion,Sunderland,Goal,,,Penalty Kick,Yellow Card,,,,2014-08-16 15:00:00,0.0,0.0
1,149,Steven Gerrard,Liverpool,Tottenham Hotspur,Goal,,,Penalty Kick,Yellow Card,,,,2014-08-31 13:30:00,2.0,0.0
2,163,David Nugent,Leicester City,Manchester United,Goal,,,Penalty Kick,Goal,,,,2014-09-21 13:30:00,-1.0,0.0
3,183,Leonardo Ulloa,Leicester City,Manchester United,Goal,,,Penalty Kick,Red Card,,,,2014-09-21 13:30:00,2.0,1.0
4,30,Mile Jedinak,Crystal Palace,Everton,Goal,,,Penalty Kick,Goal,,,,2014-09-21 16:00:00,0.0,0.0


In [31]:
shot_data_raw['Timestamp'] = pd.to_datetime(shot_data_raw['Timestamp'])

In [32]:
# remove data from before 2016 since no shot data exists (only penalty data)
shot_data_raw = shot_data_raw[shot_data_raw['Timestamp'] > pd.to_datetime('2016-08-01')].reset_index(drop=True)

In [33]:
# set Timestamp as the first column
timestamp_index = shot_data_raw.columns.tolist().index('Timestamp')
columns = shot_data_raw.columns.tolist()
new_columns = columns[timestamp_index:] + columns[:timestamp_index]
shot_data_raw = shot_data_raw[new_columns]
shot_data_raw.head()

Unnamed: 0,Timestamp,Score,Player Advantage,Minute,Player,Squad,Against,Outcome,Distance,Body Part,Notes,SCA 1 Player,SCA 1 Event,SCA 2 Player,SCA 2 Event
0,2016-08-13 12:30:00,0.0,0.0,147,Riyad Mahrez,Leicester City,Hull City,Goal,,,Penalty Kick,Goal,,,
1,2016-08-13 17:30:00,1.0,0.0,4,Sergio Agüero,Manchester City,Sunderland,Goal,,,Penalty Kick,—,,,
2,2016-08-15 20:00:00,1.0,0.0,147,Eden Hazard,Chelsea,West Ham United,Goal,,,Penalty Kick,Yellow Card,,,
3,2016-08-19 20:00:00,2.0,0.0,152,Zlatan Ibrahimović,Manchester United,Southampton,Goal,,,Penalty Kick,Goal,,,
4,2016-08-20 12:30:00,1.0,0.0,27,Sergio Agüero,Manchester City,Stoke City,Goal,,,Penalty Kick,Yellow Card,,,


## Add threat of player

In [34]:
def split_into_name(name):
    names = re.split('[\\s]+', name)
    first_name = names[0]
    second_name = ' '.join(names[1:])
    
    return first_name, second_name

def get_player_threat(player_csv, player_name):

    f_name_match = player_csv['full_name'] == player_name
    
    threat = player_csv[f_name_match]['threat']
    
    if threat.empty:
        return np.NaN
    else:
        return threat.iloc[0]

In [35]:
matches_2020_21 = np.logical_and(shot_data_raw['Timestamp'] > pd.to_datetime('2020-08-01'), shot_data_raw['Timestamp'] < pd.to_datetime('2021-08-01'))
matches_2019_20 = np.logical_and(shot_data_raw['Timestamp'] > pd.to_datetime('2019-08-01'), shot_data_raw['Timestamp'] < pd.to_datetime('2020-08-01'))
matches_2018_19 = np.logical_and(shot_data_raw['Timestamp'] > pd.to_datetime('2018-08-01'), shot_data_raw['Timestamp'] < pd.to_datetime('2019-08-01'))
matches_2017_18 = np.logical_and(shot_data_raw['Timestamp'] > pd.to_datetime('2017-08-01'), shot_data_raw['Timestamp'] < pd.to_datetime('2018-08-01'))
matches_2016_17 = np.logical_and(shot_data_raw['Timestamp'] > pd.to_datetime('2016-08-01'), shot_data_raw['Timestamp'] < pd.to_datetime('2017-08-01'))

matches_year = [matches_2020_21, matches_2019_20, matches_2018_19, matches_2017_18, matches_2016_17]

In [36]:
players_csv = [pd.read_csv('data/fantasy-league/2020-21/players_raw.csv'),
              pd.read_csv('data/fantasy-league/2019-20/players_raw.csv'),
              pd.read_csv('data/fantasy-league/2018-19/players_raw.csv'),
              pd.read_csv('data/fantasy-league/2017-18/players_raw.csv'),
              pd.read_csv('data/fantasy-league/2016-17/players_raw.csv')]

for csv in players_csv:
    csv['full_name'] = csv['first_name'] + ' ' + csv['second_name']

In [129]:
threats = [[get_player_threat(csv, p) for p in shot_data_raw[year]['Player']] for year, csv in zip(matches_year, players_csv)]

In [130]:
shot_data_raw['Threat'] = np.concatenate(threats)

In [131]:
shot_data_raw.head()

Unnamed: 0,Timestamp,Score,Player Advantage,Minute,Player,Squad,Against,Outcome,Distance,Body Part,Notes,SCA 1 Player,SCA 1 Event,SCA 2 Player,SCA 2 Event,Threat
0,2016-08-13 12:30:00,0.0,0.0,147,Riyad Mahrez,Leicester City,Hull City,Goal,,,Penalty Kick,Goal,,,,720.0
1,2016-08-13 17:30:00,1.0,0.0,4,Sergio Agüero,Manchester City,Sunderland,Goal,,,Penalty Kick,—,,,,720.0
2,2016-08-15 20:00:00,1.0,0.0,147,Eden Hazard,Chelsea,West Ham United,Goal,,,Penalty Kick,Yellow Card,,,,627.0
3,2016-08-19 20:00:00,2.0,0.0,152,Zlatan Ibrahimović,Manchester United,Southampton,Goal,,,Penalty Kick,Goal,,,,627.0
4,2016-08-20 12:30:00,1.0,0.0,27,Sergio Agüero,Manchester City,Stoke City,Goal,,,Penalty Kick,Yellow Card,,,,


In [132]:
# shot_data_raw.to_csv('shot_data.csv')

# Load Data

In [4]:
shot_data = pd.read_csv('data/fantasy-league/shot_data.csv', index_col=0)

In [5]:
shot_data['Timestamp'] = pd.to_datetime(shot_data['Timestamp'])

In [6]:
shot_data['Notes'] = shot_data['Notes'].fillna('normal')

In [7]:
shot_data.head()

Unnamed: 0,Timestamp,Score,Player Advantage,Minute,Player,Squad,Against,Outcome,Distance,Body Part,Notes,SCA 1 Player,SCA 1 Event,SCA 2 Player,SCA 2 Event,Threat
0,2016-08-13 12:30:00,0.0,0.0,147,Riyad Mahrez,Leicester City,Hull City,Goal,,,Penalty Kick,Goal,,,,720.0
1,2016-08-13 17:30:00,1.0,0.0,4,Sergio Agüero,Manchester City,Sunderland,Goal,,,Penalty Kick,—,,,,720.0
2,2016-08-15 20:00:00,1.0,0.0,147,Eden Hazard,Chelsea,West Ham United,Goal,,,Penalty Kick,Yellow Card,,,,627.0
3,2016-08-19 20:00:00,2.0,0.0,152,Zlatan Ibrahimović,Manchester United,Southampton,Goal,,,Penalty Kick,Goal,,,,627.0
4,2016-08-20 12:30:00,1.0,0.0,27,Sergio Agüero,Manchester City,Stoke City,Goal,,,Penalty Kick,Yellow Card,,,,


## Separate Shot by Type    

In [8]:
def shot_data_by_type(type_name, df):
    types = [v for v in df['Notes'].unique() if type_name in v.lower()]
    
    type_df = df[[n in types for n in df['Notes']]]
    type_df = type_df.reset_index(drop=True)
    
    type_goals = type_df['Outcome'] == 'Goal'
    
    type_df = type_df.drop(columns=['Squad', 'Against', 'Outcome', 'Player', 
                                    'Body Part', 'SCA 1 Player', 'SCA 1 Event', 
                                   'SCA 2 Player', 'SCA 2 Event'])
    
    type_df = pd.concat([type_df, pd.get_dummies(type_df['Notes'])], axis=1)
    
    type_df['Goal'] = type_goals.astype(int)
    
    return type_df

In [9]:
types = ['volley', 'header', 'free kick', 'overhead', 'back heel', 'penalty kick']

type_dfs = dict()

all_types = set(shot_data['Notes'].unique())
used = set()

for t in types:
    type_dfs[t] = shot_data_by_type(t, shot_data)
    
    used = used.union(set([v for v in shot_data['Notes'].unique() if t in v.lower()]))

In [10]:
all_types.difference(used)

{'Deflected', 'Lob', 'Open goal', 'normal'}

In [11]:
# remaining to be added to 'normal' shots
rest_of_shots = list(all_types.difference(used))
new_names = list(range(len(rest_of_shots)))

for i in range(len(rest_of_shots)):
    current = rest_of_shots[i]
    new_names[i] = current + ' normal' if not 'normal' == current else current
    
# change the names in the dataframe
for o, n in zip(rest_of_shots, new_names):
    # 'normal' does not change
    if o != n:
        originals = shot_data['Notes'] == o
        shot_data.loc[originals, 'Notes'] = n

In [12]:
type_dfs['normal'] = shot_data_by_type('normal', shot_data)
used = used.union(set([v for v in shot_data['Notes'].unique() if 'normal' in v.lower()]))

# Modeling

In [13]:
volley_df = type_dfs['normal']

In [14]:
volley_df.head()

Unnamed: 0,Timestamp,Score,Player Advantage,Minute,Distance,Notes,Threat,Deflected normal,Lob normal,Open goal normal,normal,Goal
0,2017-08-11 19:45:00,1.0,0.0,2,12.0,normal,109.0,0,0,0,1,1
1,2017-08-11 19:45:00,0.0,0.0,5,2.0,Open goal normal,112.0,0,0,1,0,1
2,2017-08-11 19:45:00,0.0,0.0,8,22.0,normal,381.0,0,0,0,1,0
3,2017-08-11 19:45:00,0.0,0.0,14,23.0,normal,,0,0,0,1,0
4,2017-08-11 19:45:00,0.0,0.0,22,10.0,normal,399.0,0,0,0,1,0


In [15]:
volley_data = volley_df.drop(columns=['Timestamp', 'Notes', 'Minute'])

In [16]:
volley_data.head()

Unnamed: 0,Score,Player Advantage,Distance,Threat,Deflected normal,Lob normal,Open goal normal,normal,Goal
0,1.0,0.0,12.0,109.0,0,0,0,1,1
1,0.0,0.0,2.0,112.0,0,0,1,0,1
2,0.0,0.0,22.0,381.0,0,0,0,1,0
3,0.0,0.0,23.0,,0,0,0,1,0
4,0.0,0.0,10.0,399.0,0,0,0,1,0


In [17]:
volley_data['Threat'] = volley_data['Threat'].fillna(volley_data['Threat'].mean())

In [18]:
volley_data.head()

Unnamed: 0,Score,Player Advantage,Distance,Threat,Deflected normal,Lob normal,Open goal normal,normal,Goal
0,1.0,0.0,12.0,109.0,0,0,0,1,1
1,0.0,0.0,2.0,112.0,0,0,1,0,1
2,0.0,0.0,22.0,381.0,0,0,0,1,0
3,0.0,0.0,23.0,590.79994,0,0,0,1,0
4,0.0,0.0,10.0,399.0,0,0,0,1,0


In [19]:
volley_goals = volley_data['Goal']
volley_data = volley_data.drop(columns=['Goal'])

In [20]:
import random

In [21]:
total = len(volley_goals)

shuffle = np.random.permutation(total)

train_volley_data = volley_data.iloc[shuffle][:int(total*0.9)]
train_volley_labels = volley_goals[shuffle][:int(total*0.9)]

test_volley_data = volley_data.iloc[shuffle][int(total*0.9):]
test_volley_labels = volley_goals[shuffle][int(total*0.9):]


In [22]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(max_iter=180)
lr.fit(train_volley_data, train_volley_labels)
sum(lr.predict(test_volley_data) != test_volley_labels)

286

In [23]:
lr.coef_

array([[ 4.97311263e-01,  2.57674182e-01, -1.00327384e-01,
         6.28343781e-05,  6.13750949e-01,  2.13513615e-01,
         5.69784161e-01, -1.30287075e+00]])

In [24]:
test_volley_data.columns

Index(['Score', 'Player Advantage', 'Distance', 'Threat', 'Deflected normal',
       'Lob normal', 'Open goal normal', 'normal'],
      dtype='object')

In [25]:
pre = lr.predict(test_volley_data)
pre[pre > 0.5] = 1
pre[pre <= 0.5] = 0

In [26]:
sum(pre != test_volley_labels)

286

In [27]:
sum(test_volley_labels[test_volley_labels > 0] == pre[test_volley_labels > 0])

16

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
clf = make_pipeline(StandardScaler(), SVC(kernel='poly', probability=True))
trained = clf.fit(train_volley_data, train_volley_labels * 2 - 1)

In [None]:
pred = trained.predict_proba(test_volley_data)

In [None]:
pred = np.argmax(pred, axis=1)

In [None]:
sum(pred > 0)

In [127]:
1 - sum(pred != test_volley_labels) / len(test_volley_labels)

0.8922204213938412

In [114]:
sum(test_volley_labels[test_volley_labels > 0] == pred[test_volley_labels > 0])

25

In [28]:
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()
clf.fit(train_volley_data, train_volley_labels)
pred = clf.predict_proba(test_volley_data)

In [29]:
1 - sum(np.argmax(pred, axis=1)!= test_volley_labels) / len(test_volley_labels)

0.8804700162074555

In [30]:
sum(test_volley_labels[test_volley_labels > 0] == np.argmax(pred, axis=1)[test_volley_labels > 0])

30

In [31]:
len(test_volley_labels[test_volley_labels > 0])

282

In [47]:
from sklearn.neighbors import KNeighborsClassifier
neigh = KNeighborsClassifier(n_neighbors=1)
neigh.fit(train_volley_data, train_volley_labels)
pred = neigh.predict(test_volley_data)

In [48]:
sum(pred != test_volley_labels)

422

In [49]:
sum(test_volley_labels[test_volley_labels > 0] == pred[test_volley_labels > 0])

56

In [50]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(max_depth=1, random_state=0)
clf.fit(train_volley_data, train_volley_labels)
pred = clf.predict(test_volley_data)

In [51]:
sum(pred != test_volley_labels)

282

In [52]:
sum(test_volley_labels[test_volley_labels > 0] == pred[test_volley_labels > 0])

0