# Stage 1: Build a database

## Basic match dataframe setup

In [1]:
import sqlite3
import pandas as pd
from time import time

start = time()

path = "./"
database = path + 'football_database.sqlite'

conn = sqlite3.connect(database)

df = pd.read_sql_query("""
        SELECT id, country_id, league_id, season, stage, date, 
            match_api_id, home_team_api_id, away_team_api_id, 
            home_team_goal, away_team_goal, home_player_1, home_player_2, 
            home_player_3, home_player_4, home_player_5, 
            home_player_6, home_player_7, home_player_8, 
            home_player_9, home_player_10, home_player_11, 
            away_player_1, away_player_2, away_player_3, 
            away_player_4, away_player_5, away_player_6, 
            away_player_7, away_player_8, away_player_9, 
            away_player_10, away_player_11, goal, shoton, 
            shotoff, foulcommit, card, cross, corner, possession
        FROM match
        """, conn)

df.set_index('id', drop=True, inplace=True)

In [2]:
# drop null values
df.dropna(inplace=True)

#Replace the previous line with this to drop nans in certain columns only
#df.dropna(subset=['home_player_1', 'home_player_2'], inplace=True)

df.shape

(13325, 40)

In [3]:
# Change player ids to int instead of float

home_fields = [f'home_player_{i}' for i in range(1, 12)]
away_fields = [f'away_player_{i}' for i in range(1, 12)]

df[home_fields] = df[home_fields].astype(int)
df[away_fields] = df[away_fields].astype(int)

df['date'] = df['date'].apply(pd.to_datetime)

In [4]:
# Make df smaller for faster processing
df = df.iloc[12325:13325]

df.shape

(1000, 40)

In [5]:
# Add match_result

df.loc[df['home_team_goal'] > df['away_team_goal'], 'match_result'] = 1
df.loc[df['home_team_goal'] < df['away_team_goal'], 'match_result'] = 2
df.loc[df['home_team_goal'] == df['away_team_goal'], 'match_result'] = 3
df['match_result'] = df['match_result'].astype(int)

# Get the first year of season
df['season'] = df['season'].apply(lambda x: x[:4])

df.head(5)

Unnamed: 0_level_0,country_id,league_id,season,stage,date,match_api_id,home_team_api_id,away_team_api_id,home_team_goal,away_team_goal,...,away_player_11,goal,shoton,shotoff,foulcommit,card,cross,corner,possession,match_result
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
23607,21518,21518,2013,26,2014-03-02,1506071,8370,10267,1,0,...,241825,<goal><value><comment>n</comment><stats><goals...,<shoton />,<shotoff />,<foulcommit />,<card><value><comment>y</comment><stats><ycard...,<cross />,<corner />,<possession />,1
23608,21518,21518,2013,27,2014-03-09,1506072,10267,8315,1,1,...,33028,<goal><value><comment>n</comment><stats><goals...,<shoton />,<shotoff />,<foulcommit />,<card><value><comment>y</comment><stats><ycard...,<cross />,<corner />,<possession />,3
23609,21518,21518,2013,27,2014-03-08,1506073,7878,10205,2,0,...,150644,<goal><value><comment>p</comment><stats><penal...,<shoton />,<shotoff />,<foulcommit />,<card><value><comment>y</comment><stats><ycard...,<cross />,<corner />,<possession />,1
23610,21518,21518,2013,27,2014-03-08,1506074,8603,8305,2,0,...,38573,<goal><value><comment>n</comment><stats><goals...,<shoton />,<shotoff />,<foulcommit />,<card><value><comment>y</comment><stats><ycard...,<cross />,<corner />,<possession />,1
23611,21518,21518,2013,27,2014-03-09,1506075,8558,10268,3,1,...,500791,<goal><value><comment>n</comment><stats><goals...,<shoton />,<shotoff />,<foulcommit />,<card><value><comment>y</comment><stats><ycard...,<cross />,<corner />,<possession />,1


In [6]:
## Calculate lineup average age

# Load birth year data
birth_day = pd.read_sql_query("""
        SELECT player_api_id as id, birthday
        FROM player
""", conn, params=())

birth_day.head(5)

Unnamed: 0,id,birthday
0,505942,1992-02-29 00:00:00
1,155782,1989-12-15 00:00:00
2,162549,1991-05-13 00:00:00
3,30572,1982-05-08 00:00:00
4,23780,1979-11-08 00:00:00


In [7]:
# Get average age of a line up for home players

dates = df[home_fields].astype(int).replace(to_replace=birth_day['id'].tolist(), value=birth_day['birthday'].tolist())
dates['date'] = df['date']

dates.head(5)

Unnamed: 0_level_0,home_player_1,home_player_2,home_player_3,home_player_4,home_player_5,home_player_6,home_player_7,home_player_8,home_player_9,home_player_10,home_player_11,date
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
23607,1984-06-22 00:00:00,1985-07-11 00:00:00,1989-06-06 00:00:00,1983-01-13 00:00:00,1981-05-26 00:00:00,1981-02-28 00:00:00,1994-11-21 00:00:00,1991-03-23 00:00:00,1988-03-20 00:00:00,1990-01-04 00:00:00,1984-08-20 00:00:00,2014-03-02
23608,1985-06-24 00:00:00,1984-02-25 00:00:00,1985-02-14 00:00:00,1983-10-29 00:00:00,1993-03-01 00:00:00,1989-04-16 00:00:00,1980-01-16 00:00:00,1989-12-26 00:00:00,1984-04-01 00:00:00,1993-01-20 00:00:00,1993-08-30 00:00:00,2014-03-09
23609,1979-01-25 00:00:00,1988-05-10 00:00:00,1992-02-20 00:00:00,1992-05-27 00:00:00,1989-11-02 00:00:00,1987-08-03 00:00:00,1984-06-23 00:00:00,1991-01-11 00:00:00,1980-08-11 00:00:00,1987-02-03 00:00:00,1990-02-08 00:00:00,2014-03-08
23610,1987-05-13 00:00:00,1988-09-11 00:00:00,1984-04-10 00:00:00,1987-05-16 00:00:00,1989-06-09 00:00:00,1990-03-06 00:00:00,1991-06-13 00:00:00,1984-03-18 00:00:00,1992-08-26 00:00:00,1981-06-27 00:00:00,1992-03-08 00:00:00,2014-03-08
23611,1986-10-02 00:00:00,1986-01-21 00:00:00,1981-03-10 00:00:00,1988-01-17 00:00:00,1990-01-05 00:00:00,1987-09-08 00:00:00,1989-10-09 00:00:00,1986-10-12 00:00:00,1989-10-06 00:00:00,1979-10-31 00:00:00,1983-06-09 00:00:00,2014-03-09


In [8]:
# Calculate average difference from player ages
dates = dates.apply(pd.to_datetime)

df['avg_home_ages'] = dates[home_fields].apply(lambda s: dates['date']-s).mean(axis=1).astype('timedelta64[D]').apply(lambda s: s/365).round(1)

df['avg_home_ages'].head(5)

id
23607    27.5
23608    26.7
23609    26.7
23610    26.1
23611    27.9
Name: avg_home_ages, dtype: float64

In [9]:
# Get average age of a line up for away players

dates = df[away_fields].astype(int).replace(to_replace=birth_day['id'].tolist(), value=birth_day['birthday'].tolist())
dates['date'] = df['date']

dates.head(5)

Unnamed: 0_level_0,away_player_1,away_player_2,away_player_3,away_player_4,away_player_5,away_player_6,away_player_7,away_player_8,away_player_9,away_player_10,away_player_11,date
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
23607,1985-06-24 00:00:00,1987-06-12 00:00:00,1985-02-14 00:00:00,1983-10-29 00:00:00,1993-03-01 00:00:00,1992-05-21 00:00:00,1980-01-16 00:00:00,1989-12-26 00:00:00,1989-11-20 00:00:00,1988-07-29 00:00:00,1993-08-30 00:00:00,2014-03-02
23608,1981-03-06 00:00:00,1989-04-14 00:00:00,1980-08-19 00:00:00,1994-05-27 00:00:00,1988-02-29 00:00:00,1989-03-08 00:00:00,1987-12-14 00:00:00,1989-08-14 00:00:00,1984-11-04 00:00:00,1992-12-19 00:00:00,1981-02-11 00:00:00,2014-03-09
23609,1989-06-28 00:00:00,1990-11-24 00:00:00,1990-11-22 00:00:00,1984-06-12 00:00:00,1988-03-18 00:00:00,1994-06-23 00:00:00,1991-10-17 00:00:00,1987-10-14 00:00:00,1994-11-10 00:00:00,1984-01-05 00:00:00,1987-05-12 00:00:00,2014-03-08
23610,1982-04-27 00:00:00,1984-12-21 00:00:00,1989-09-01 00:00:00,1985-04-09 00:00:00,1985-08-30 00:00:00,1982-04-01 00:00:00,1984-03-15 00:00:00,1986-11-24 00:00:00,1992-05-11 00:00:00,1982-07-02 00:00:00,1984-08-07 00:00:00,2014-03-08
23611,1981-09-29 00:00:00,1988-04-27 00:00:00,1989-01-27 00:00:00,1987-06-05 00:00:00,1982-02-09 00:00:00,1986-05-11 00:00:00,1986-02-06 00:00:00,1983-01-05 00:00:00,1992-11-02 00:00:00,1984-02-25 00:00:00,1991-03-01 00:00:00,2014-03-09


In [10]:
# Calculate average difference from player ages
dates = dates.apply(pd.to_datetime)

df['avg_away_ages'] = dates[away_fields].apply(lambda s: dates['date']-s).mean(axis=1).astype('timedelta64[D]').apply(lambda s: s/365).round(1)

df['avg_away_ages'].head(5)

id
23607    26.0
23608    27.0
23609    24.7
23610    28.7
23611    27.6
Name: avg_away_ages, dtype: float64

## Task 1A: Parse XML fields

In [11]:
# Create a helper function that counts team nodes
from collections import Counter
import xml.etree.ElementTree as ET

def count_teams(xml_string):
    c = Counter()
    root = ET.fromstring(xml_string)
    for t in root.iter('team'):
        c.update([t.text])
    return c

In [12]:
# Add common XML fields to the main dataframe
common_xml_fields = ['shoton', 'shotoff', 'foulcommit', 'corner']

for field in common_xml_fields:
    df[f'home_{field}'] = df.apply(lambda s: count_teams(s[field])[str(s['home_team_api_id'])], axis=1)
    df[f'away_{field}'] = df.apply(lambda s: count_teams(s[field])[str(s['away_team_api_id'])], axis=1)

df.head(5)

Unnamed: 0_level_0,country_id,league_id,season,stage,date,match_api_id,home_team_api_id,away_team_api_id,home_team_goal,away_team_goal,...,avg_home_ages,avg_away_ages,home_shoton,away_shoton,home_shotoff,away_shotoff,home_foulcommit,away_foulcommit,home_corner,away_corner
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
23607,21518,21518,2013,26,2014-03-02,1506071,8370,10267,1,0,...,27.5,26.0,0,0,0,0,0,0,0,0
23608,21518,21518,2013,27,2014-03-09,1506072,10267,8315,1,1,...,26.7,27.0,0,0,0,0,0,0,0,0
23609,21518,21518,2013,27,2014-03-08,1506073,7878,10205,2,0,...,26.7,24.7,0,0,0,0,0,0,0,0
23610,21518,21518,2013,27,2014-03-08,1506074,8603,8305,2,0,...,26.1,28.7,0,0,0,0,0,0,0,0
23611,21518,21518,2013,27,2014-03-09,1506075,8558,10268,3,1,...,27.9,27.6,0,0,0,0,0,0,0,0


In [13]:
# Get card details

def get_cards(xml_string, card_type):
    c = Counter()
    for val in ET.fromstring(xml_string).iter('value'):
        if val.find('comment').text == card_type:
            try:
                c.update([val.find('team').text])
            except AttributeError as e:
                pass
    return c

df['home_ycard'] = df.apply(lambda s: get_cards(s['card'], 'y')[str(s['home_team_api_id'])], axis=1)
df['home_rcard'] = df.apply(lambda s: get_cards(s['card'], 'r')[str(s['home_team_api_id'])], axis=1)
df['away_ycard'] = df.apply(lambda s: get_cards(s['card'], 'y')[str(s['away_team_api_id'])], axis=1)
df['away_rcard'] = df.apply(lambda s: get_cards(s['card'], 'r')[str(s['away_team_api_id'])], axis=1)

df[['home_ycard', 'home_rcard', 'away_ycard', 'away_rcard']].head(5)

Unnamed: 0_level_0,home_ycard,home_rcard,away_ycard,away_rcard
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
23607,3,0,2,0
23608,3,0,2,0
23609,4,0,1,0
23610,3,0,3,0
23611,1,0,3,0


In [14]:
# Get possession details for each team

def get_pos(xml_string, filter):
    try:
        val = ET.fromstring(xml_string).findall(filter)[-1].text
        return val
    except IndexError as e:
        return 50

df['away_pos'] = df['possession'].apply(lambda s: get_pos(s, './value/awaypos'))
df['home_pos'] = df['possession'].apply(lambda s: get_pos(s, './value/homepos'))

df[['away_pos', 'home_pos']].head(5)

Unnamed: 0_level_0,away_pos,home_pos
id,Unnamed: 1_level_1,Unnamed: 2_level_1
23607,50,50
23608,50,50
23609,50,50
23610,50,50
23611,50,50


## Task 1B: Parsing player attributes

In [15]:
# Load Player attributes
player_attrib = pd.read_sql_query("""
        SELECT  player_api_id, date, overall_rating, crossing, finishing, heading_accuracy,
                short_passing, volleys, dribbling, curve, free_kick_accuracy,
                long_passing, ball_control, acceleration, sprint_speed, agility,
                reactions, balance, shot_power, jumping, stamina, strength,
                long_shots, aggression, interceptions, positioning, vision,
                penalties, marking, standing_tackle, sliding_tackle
        FROM player_attributes
""", conn, params=())

player_attrib['date'] = player_attrib['date'].apply(pd.to_datetime)

player_attrib.head(5)

Unnamed: 0,player_api_id,date,overall_rating,crossing,finishing,heading_accuracy,short_passing,volleys,dribbling,curve,...,strength,long_shots,aggression,interceptions,positioning,vision,penalties,marking,standing_tackle,sliding_tackle
0,505942,2016-02-18,67.0,49.0,44.0,71.0,61.0,44.0,51.0,45.0,...,76.0,35.0,71.0,70.0,45.0,54.0,48.0,65.0,69.0,69.0
1,505942,2015-11-19,67.0,49.0,44.0,71.0,61.0,44.0,51.0,45.0,...,76.0,35.0,71.0,70.0,45.0,54.0,48.0,65.0,69.0,69.0
2,505942,2015-09-21,62.0,49.0,44.0,71.0,61.0,44.0,51.0,45.0,...,76.0,35.0,63.0,41.0,45.0,54.0,48.0,65.0,66.0,69.0
3,505942,2015-03-20,61.0,48.0,43.0,70.0,60.0,43.0,50.0,44.0,...,76.0,34.0,62.0,40.0,44.0,53.0,47.0,62.0,63.0,66.0
4,505942,2007-02-22,61.0,48.0,43.0,70.0,60.0,43.0,50.0,44.0,...,76.0,34.0,62.0,40.0,44.0,53.0,47.0,62.0,63.0,66.0


In [16]:
# Prepare helper functions and variables

def get_latest_attribute_idx(player, col_name):
    ''' Takes a player and a field and returns the id of the most recent attribute '''
    earlier_dates = player_attrib.loc[(player_attrib['player_api_id'] == player[col_name]) 
                                      & (player_attrib['date'] < pd.to_datetime(player['date']))]
    return earlier_dates[earlier_dates['date'] == earlier_dates['date'].max()].index[0]

player_attribs = ['overall_rating', 'crossing', 'finishing', 'heading_accuracy',
                    'short_passing', 'volleys', 'dribbling', 'curve', 'free_kick_accuracy',
                    'long_passing', 'ball_control', 'acceleration', 'sprint_speed', 'agility',
                    'reactions', 'balance', 'shot_power', 'jumping', 'stamina', 'strength',
                    'long_shots', 'aggression', 'interceptions', 'positioning', 'vision',
                    'penalties', 'marking', 'standing_tackle', 'sliding_tackle'
                 ]

index_list = player_attrib.index.tolist()


In [17]:
# Get the attributes for each player closest to match point
home_attribs = df[['date'] + home_fields[1:]].copy()


# Map the ids to the corresponding attribute ids
for col in home_fields[1:]:
    home_attribs[col] = home_attribs.apply(lambda s: get_latest_attribute_idx(s, col), axis=1)

# Show mapped attribute ids
home_attribs[home_fields[1:]].head(5)

Unnamed: 0_level_0,home_player_2,home_player_3,home_player_4,home_player_5,home_player_6,home_player_7,home_player_8,home_player_9,home_player_10,home_player_11
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
23607,172420,5563,183374,147353,151244,158190,154521,4783,71464,83159
23608,82943,143411,80604,90119,35850,161654,164219,85500,54549,58329
23609,9065,8121,79991,23117,57627,108930,89001,74339,182543,180964
23610,90915,34309,87294,43370,8676,102873,156303,100587,154193,27007
23611,78275,43795,69299,90654,177068,39836,33074,144323,162799,161137


In [18]:
# Get the desired home player attributes

for attrib in player_attribs:
    df[f'home_{attrib}'] = home_attribs[home_fields[1:]].replace(to_replace=index_list, value=player_attrib[attrib].tolist()) \
                            .mean(axis=1).round(1)

df[[f'home_{attrib}' for attrib in player_attribs]].head(5)

Unnamed: 0_level_0,home_overall_rating,home_crossing,home_finishing,home_heading_accuracy,home_short_passing,home_volleys,home_dribbling,home_curve,home_free_kick_accuracy,home_long_passing,...,home_strength,home_long_shots,home_aggression,home_interceptions,home_positioning,home_vision,home_penalties,home_marking,home_standing_tackle,home_sliding_tackle
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
23607,72.5,57.2,54.6,61.4,68.7,54.0,62.9,60.0,58.6,66.8,...,69.3,57.5,65.4,56.5,59.9,59.4,57.0,52.8,59.3,56.9
23608,75.8,65.4,59.7,60.1,72.4,59.3,69.7,68.9,62.5,66.0,...,65.5,66.4,67.0,60.8,68.4,66.7,62.2,55.2,60.8,58.0
23609,72.1,61.3,48.2,60.1,67.3,54.0,65.8,61.8,50.6,63.7,...,71.1,57.9,62.8,62.7,57.0,62.2,50.5,53.2,62.0,60.6
23610,72.5,60.7,52.7,61.7,67.9,47.4,66.6,59.2,48.9,62.7,...,70.5,57.1,62.2,59.0,65.0,63.6,50.2,49.6,57.2,50.6
23611,73.2,63.6,55.5,61.2,67.2,53.5,66.5,63.3,56.9,65.6,...,67.9,58.3,70.1,56.1,59.8,64.7,62.4,50.5,56.3,54.0


In [19]:
# Get the average attributes for the away players
away_attribs = df[['date'] + away_fields[1:]].copy()


# Map the ids to the corresponding attribute ids
for col in away_fields[1:]:
    away_attribs[col] = away_attribs.apply(lambda s: get_latest_attribute_idx(s, col), axis=1)

# Get the desired away player attributes

for attrib in player_attribs:
    df[f'away_{attrib}'] = away_attribs[away_fields[1:]].replace(to_replace=index_list, value=player_attrib[attrib].tolist()) \
                            .mean(axis=1).round(1)

df[[f'away_{attrib}' for attrib in player_attribs]].head(5)

Unnamed: 0_level_0,away_overall_rating,away_crossing,away_finishing,away_heading_accuracy,away_short_passing,away_volleys,away_dribbling,away_curve,away_free_kick_accuracy,away_long_passing,...,away_strength,away_long_shots,away_aggression,away_interceptions,away_positioning,away_vision,away_penalties,away_marking,away_standing_tackle,away_sliding_tackle
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
23607,73.2,64.9,58.4,60.7,69.3,58.2,65.3,63.9,59.8,66.1,...,66.4,65.0,61.9,59.8,67.3,63.8,59.7,55.5,59.8,57.0
23608,76.4,61.4,60.1,60.9,74.7,54.7,68.9,67.3,57.1,69.2,...,69.4,59.7,72.9,65.8,68.0,70.1,53.0,52.9,61.4,57.2
23609,72.3,61.3,52.0,57.7,70.6,49.8,71.3,62.3,57.5,65.9,...,60.9,60.7,61.7,59.8,63.3,66.9,57.7,46.1,56.4,52.1
23610,73.7,66.2,54.5,62.1,70.8,54.5,68.3,66.1,59.7,67.6,...,68.1,60.9,66.6,59.9,61.3,66.1,57.5,49.6,57.0,56.4
23611,71.4,59.6,60.5,58.7,67.5,52.1,64.8,66.4,61.2,66.9,...,70.6,64.0,62.2,55.2,62.3,59.5,59.2,49.4,55.0,53.3


In [20]:
# Load GK Attributes
gk_attrib = pd.read_sql_query("""
        SELECT  player_api_id, date, gk_diving, overall_rating as gk_rating,
                gk_handling, gk_kicking, gk_positioning, gk_reflexes
        FROM player_attributes
""", conn)

gk_attrib['date'] = gk_attrib['date'].apply(pd.to_datetime)

gk_attrib.head(5)

Unnamed: 0,player_api_id,date,gk_diving,gk_rating,gk_handling,gk_kicking,gk_positioning,gk_reflexes
0,505942,2016-02-18,6.0,67.0,11.0,10.0,8.0,8.0
1,505942,2015-11-19,6.0,67.0,11.0,10.0,8.0,8.0
2,505942,2015-09-21,6.0,62.0,11.0,10.0,8.0,8.0
3,505942,2015-03-20,5.0,61.0,10.0,9.0,7.0,7.0
4,505942,2007-02-22,5.0,61.0,10.0,9.0,7.0,7.0


In [21]:
# Load GK attributes for home team
def get_latest_attribute_idx(gk, col_name):
    earlier_dates = gk_attrib.loc[(gk_attrib['player_api_id'] == gk[col_name]) 
                                      & (gk_attrib['date'] < pd.to_datetime(gk['date']))]
    return earlier_dates[earlier_dates['date'] == earlier_dates['date'].max()].index[0]

gk_attributes = ['gk_diving', 'gk_handling', 'gk_kicking', 'gk_positioning', 'gk_reflexes', 'gk_rating']

for attrib in gk_attributes:
    df[f'home_{attrib}'] = df[['date', 'home_player_1']].apply(lambda s: get_latest_attribute_idx(s, 'home_player_1'), axis=1) \
                             .replace(to_replace=index_list, value=gk_attrib[attrib].tolist())

df[[f'home_{attrib}' for attrib in gk_attributes]].head(5)

Unnamed: 0_level_0,home_gk_diving,home_gk_handling,home_gk_kicking,home_gk_positioning,home_gk_reflexes,home_gk_rating
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
23607,78.0,66.0,63.0,65.0,81.0,72.0
23608,90.0,72.0,74.0,73.0,90.0,81.0
23609,77.0,68.0,72.0,69.0,80.0,73.0
23610,70.0,76.0,67.0,64.0,78.0,71.0
23611,78.0,79.0,62.0,77.0,75.0,76.0


In [22]:
# Load GK attributes for away team

for attrib in gk_attributes:
    df[f'away_{attrib}'] = df[['date', 'away_player_1']].apply(lambda s: get_latest_attribute_idx(s, 'away_player_1'), axis=1) \
                             .replace(to_replace=index_list, value=gk_attrib[attrib].tolist())

df[[f'away_{attrib}' for attrib in gk_attributes]].head(5)

Unnamed: 0_level_0,away_gk_diving,away_gk_handling,away_gk_kicking,away_gk_positioning,away_gk_reflexes,away_gk_rating
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
23607,90.0,72.0,74.0,73.0,90.0,81.0
23608,67.0,79.0,75.0,80.0,68.0,74.0
23609,81.0,74.0,68.0,69.0,74.0,74.0
23610,77.0,67.0,73.0,65.0,76.0,71.0
23611,73.0,72.0,66.0,75.0,79.0,74.0


In [23]:
# Calculate total_overall_rating

df['home_total_overall_rating'] = (( 10 * df['home_overall_rating'] + df['home_gk_rating'] ) / 11).round(1)
df['away_total_overall_rating'] = (( 10 * df['away_overall_rating'] + df['away_gk_rating'] ) / 11).round(1)

df[['home_total_overall_rating', 'away_total_overall_rating']].head()

Unnamed: 0_level_0,home_total_overall_rating,away_total_overall_rating
id,Unnamed: 1_level_1,Unnamed: 2_level_1
23607,72.5,73.9
23608,76.3,76.2
23609,72.2,72.5
23610,72.4,73.5
23611,73.5,71.6


## Task 1C: Team attributes

In [24]:
# Load Player attributes
team_attrib = pd.read_sql_query("""
        SELECT  team_api_id, date, buildUpPlaySpeed, buildUpPlaySpeedClass, buildUpPlayDribblingClass,
                buildUpPlayPassing, buildUpPlayPassingClass, buildUpPlayPositioningClass, chanceCreationPassing,
                chanceCreationPassingClass, chanceCreationCrossing, chanceCreationCrossingClass, chanceCreationShooting,
                chanceCreationShootingClass, chanceCreationPositioningClass, defencePressure, defencePressureClass,
                defenceAggression, defenceAggressionClass, defenceTeamWidth, defenceTeamWidthClass, defenceDefenderLineClass
        FROM team_attributes
""", conn)

team_attrib['date'] = team_attrib['date'].apply(pd.to_datetime)

team_attrib.head(5)

Unnamed: 0,team_api_id,date,buildUpPlaySpeed,buildUpPlaySpeedClass,buildUpPlayDribblingClass,buildUpPlayPassing,buildUpPlayPassingClass,buildUpPlayPositioningClass,chanceCreationPassing,chanceCreationPassingClass,...,chanceCreationShooting,chanceCreationShootingClass,chanceCreationPositioningClass,defencePressure,defencePressureClass,defenceAggression,defenceAggressionClass,defenceTeamWidth,defenceTeamWidthClass,defenceDefenderLineClass
0,9930,2010-02-22,60,Balanced,Little,50,Mixed,Organised,60,Normal,...,55,Normal,Organised,50,Medium,55,Press,45,Normal,Cover
1,9930,2014-09-19,52,Balanced,Normal,56,Mixed,Organised,54,Normal,...,64,Normal,Organised,47,Medium,44,Press,54,Normal,Cover
2,9930,2015-09-10,47,Balanced,Normal,54,Mixed,Organised,54,Normal,...,64,Normal,Organised,47,Medium,44,Press,54,Normal,Cover
3,8485,2010-02-22,70,Fast,Little,70,Long,Organised,70,Risky,...,70,Lots,Organised,60,Medium,70,Double,70,Wide,Cover
4,8485,2011-02-22,47,Balanced,Little,52,Mixed,Organised,53,Normal,...,52,Normal,Organised,47,Medium,47,Press,52,Normal,Cover


In [25]:
# Change all dates before the min date in team attributes to the minimum

df_copy = df[['date', 'home_team_api_id', 'away_team_api_id']].copy()

df_copy.loc[df_copy['date'] < team_attrib['date'].min(), 'date'] = team_attrib['date'].min()

df_copy.head(5)

Unnamed: 0_level_0,date,home_team_api_id,away_team_api_id
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
23607,2014-03-02,8370,10267
23608,2014-03-09,10267,8315
23609,2014-03-08,7878,10205
23610,2014-03-08,8603,8305
23611,2014-03-09,8558,10268


In [26]:
# set up helper functions and variables

def get_latest_attribute_idx(team, col_name):
    try:
        earlier_dates = team_attrib.loc[(team_attrib['team_api_id'] == team[col_name])
                                    & (team_attrib['date'] <= pd.to_datetime(team['date']))]
        index = earlier_dates[earlier_dates['date'] == earlier_dates['date'].max()].index[0]
    except IndexError as e:
        # No avilable team attributes before the match date, so we take the earliest available date
        earlier_dates = team_attrib.loc[team_attrib['team_api_id'] == team[col_name]]
        try:
            index = earlier_dates[earlier_dates['date'] == earlier_dates['date'].min()].index[0]
        except IndexError as e:
            index = 9930
    return index


index_list = team_attrib.index.tolist()

team_attribs = ['buildUpPlaySpeed', 'buildUpPlaySpeedClass', 'buildUpPlayDribblingClass',
                'buildUpPlayPassing', 'buildUpPlayPassingClass', 'buildUpPlayPositioningClass', 'chanceCreationPassing',
                'chanceCreationPassingClass', 'chanceCreationCrossing', 'chanceCreationCrossingClass', 'chanceCreationShooting',
                'chanceCreationShootingClass', 'chanceCreationPositioningClass', 'defencePressure',
                'defencePressureClass', 'defenceAggression', 'defenceAggressionClass', 'defenceTeamWidth',
                'defenceTeamWidthClass', 'defenceDefenderLineClass']

In [27]:
# get home team attributes

for attrib in team_attribs:
    df[f'home_{attrib}'] = df_copy[['date', 'home_team_api_id']] \
                                     .apply(lambda s: get_latest_attribute_idx(s, 'home_team_api_id'), axis=1) \
                                     .replace(to_replace=index_list, value=team_attrib[attrib].tolist())

df[[f'home_{attrib}' for attrib in team_attribs]].head(5)

Unnamed: 0_level_0,home_buildUpPlaySpeed,home_buildUpPlaySpeedClass,home_buildUpPlayDribblingClass,home_buildUpPlayPassing,home_buildUpPlayPassingClass,home_buildUpPlayPositioningClass,home_chanceCreationPassing,home_chanceCreationPassingClass,home_chanceCreationCrossing,home_chanceCreationCrossingClass,home_chanceCreationShooting,home_chanceCreationShootingClass,home_chanceCreationPositioningClass,home_defencePressure,home_defencePressureClass,home_defenceAggression,home_defenceAggressionClass,home_defenceTeamWidth,home_defenceTeamWidthClass,home_defenceDefenderLineClass
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
23607,45,Balanced,Little,44,Mixed,Free Form,44,Normal,44,Normal,47,Normal,Organised,63,Medium,51,Press,59,Normal,Cover
23608,20,Slow,Little,35,Mixed,Organised,38,Normal,56,Normal,55,Normal,Organised,51,Medium,38,Press,59,Normal,Cover
23609,52,Balanced,Little,37,Mixed,Organised,72,Risky,76,Lots,52,Normal,Organised,49,Medium,45,Press,68,Wide,Cover
23610,59,Balanced,Little,37,Mixed,Organised,59,Normal,43,Normal,53,Normal,Organised,58,Medium,49,Press,58,Normal,Cover
23611,50,Balanced,Little,44,Mixed,Organised,52,Normal,60,Normal,40,Normal,Free Form,37,Medium,40,Press,45,Normal,Cover


In [28]:
# get away team attributes
for attrib in team_attribs:
    df[f'away_{attrib}'] = df_copy[['date', 'away_team_api_id']] \
                                     .apply(lambda s: get_latest_attribute_idx(s, 'away_team_api_id'), axis=1) \
                                     .replace(to_replace=index_list, value=team_attrib[attrib].tolist())

df[[f'away_{attrib}' for attrib in team_attribs]].head(5)

Unnamed: 0_level_0,away_buildUpPlaySpeed,away_buildUpPlaySpeedClass,away_buildUpPlayDribblingClass,away_buildUpPlayPassing,away_buildUpPlayPassingClass,away_buildUpPlayPositioningClass,away_chanceCreationPassing,away_chanceCreationPassingClass,away_chanceCreationCrossing,away_chanceCreationCrossingClass,away_chanceCreationShooting,away_chanceCreationShootingClass,away_chanceCreationPositioningClass,away_defencePressure,away_defencePressureClass,away_defenceAggression,away_defenceAggressionClass,away_defenceTeamWidth,away_defenceTeamWidthClass,away_defenceDefenderLineClass
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
23607,20,Slow,Little,35,Mixed,Organised,38,Normal,56,Normal,55,Normal,Organised,51,Medium,38,Press,59,Normal,Cover
23608,46,Balanced,Little,38,Mixed,Organised,52,Normal,73,Lots,37,Normal,Free Form,37,Medium,60,Press,68,Wide,Cover
23609,24,Slow,Little,26,Short,Organised,43,Normal,57,Normal,58,Normal,Organised,51,Medium,51,Press,56,Normal,Cover
23610,29,Slow,Little,40,Mixed,Organised,58,Normal,42,Normal,52,Normal,Organised,49,Medium,45,Press,47,Normal,Cover
23611,55,Balanced,Little,73,Long,Organised,53,Normal,54,Normal,57,Normal,Organised,49,Medium,51,Press,58,Normal,Cover


## Task 1D: Stadium Information

In [29]:
# Load Stadium Info
stadiums = pd.read_sql_query("""
        SELECT  team_api_id, capacity
        FROM stadiums
""", conn)

stadiums.head(5)

Unnamed: 0,team_api_id,capacity
0,8722,50000
1,8350,49780
2,8165,50000
3,9905,34000
4,8485,22199


In [30]:
# Add stadium info
import numpy as np

stadiums = stadiums.replace(r'', np.nan).dropna().astype(int)

stadiums.tail()

Unnamed: 0,team_api_id,capacity
119,8654,35303
120,8528,25133
121,8602,27828
122,4087,15660
123,8576,10660


In [31]:
df = df.merge(stadiums, left_on='home_team_api_id', right_on='team_api_id', how='left')

df.drop('team_api_id', axis=1, inplace=True)

df['capacity'].head(5)

0    15489.0
1    55000.0
2    22524.0
3    52500.0
4    40500.0
Name: capacity, dtype: float64

In [32]:
# Save dataframe to csv

df.drop(['shoton', 'card', 'shotoff', 'goal', 'foulcommit', 'cross', 'corner', 'possession'], axis=1, inplace=True)

df.to_csv('result.csv')

In [33]:
end = time()

In [34]:
print("Total code run tie in {:.1f} minutes".format((end-start)/60) + '\n')

Total code run tie in 13.4 minutes

