# Stage 1: Build a database

## Basic match dataframe setup

In [1]:
import sqlite3
import pandas as pd

# Open the SQLite file and read the match columns this notebook works with.
conn = sqlite3.connect("football_database.sqlite")
df = pd.read_sql_query("""
        SELECT id, country_id, league_id, season, stage, date, 
            match_api_id, home_team_api_id, away_team_api_id, 
            home_team_goal, away_team_goal, home_player_1, home_player_2, 
            home_player_3, home_player_4, home_player_5, 
            home_player_6, home_player_7, home_player_8, 
            home_player_9, home_player_10, home_player_11, 
            away_player_1, away_player_2, away_player_3, 
            away_player_4, away_player_5, away_player_6, 
            away_player_7, away_player_8, away_player_9, 
            away_player_10, away_player_11, goal, shoton, 
            shotoff, foulcommit, card, cross, corner, possession
        FROM match
        """, conn)

# Use the match table's `id` column as the row index (the column is dropped).
df = df.set_index('id')

In [2]:
# Drop, in place, every match row that has a null in any of the selected
# columns, then show the remaining (rows, columns).
df.dropna(inplace=True)

df.shape

(13325, 40)

In [3]:
# Change player ids to int instead of float

# Column names of the eleven lineup slots per side.
home_fields = [f'home_player_{i}' for i in range(1, 12)]
away_fields = [f'away_player_{i}' for i in range(1, 12)]

df[home_fields] = df[home_fields].astype(int)
df[away_fields] = df[away_fields].astype(int)

# Parse the whole column in one vectorised call instead of once per row.
df['date'] = pd.to_datetime(df['date'])

In [4]:
# Make df smaller for faster processing.
# NOTE: every later cell only sees these 5 matches. `.copy()` makes the
# sample an independent frame, so the column assignments that follow do not
# write into a slice of the full frame (SettingWithCopyWarning).
df = df.head(5).copy()

In [5]:
# Add match_result: 1 = home scored more, 2 = away scored more, 3 = equal goals
outcome_masks = {
    1: df['home_team_goal'] > df['away_team_goal'],
    2: df['home_team_goal'] < df['away_team_goal'],
    3: df['home_team_goal'] == df['away_team_goal'],
}
for outcome_code, outcome_mask in outcome_masks.items():
    df.loc[outcome_mask, 'match_result'] = outcome_code
df['match_result'] = df['match_result'].astype(int)

# Keep only the first four characters of the season label (its starting year)
df['season'] = df['season'].apply(lambda season_label: season_label[:4])

df.head(5)

Unnamed: 0_level_0,country_id,league_id,season,stage,date,match_api_id,home_team_api_id,away_team_api_id,home_team_goal,away_team_goal,...,away_player_11,goal,shoton,shotoff,foulcommit,card,cross,corner,possession,match_result
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1729,1729,1729,2008,1,2008-08-17,489042,10260,10261,1,1,...,37799,<goal><value><comment>n</comment><stats><goals...,<shoton><value><stats><blocked>1</blocked></st...,<shotoff><value><stats><shotoff>1</shotoff></s...,<foulcommit><value><stats><foulscommitted>1</f...,<card><value><comment>y</comment><stats><ycard...,<cross><value><stats><crosses>1</crosses></sta...,<corner><value><stats><corners>1</corners></st...,<possession><value><comment>56</comment><event...,3
1730,1729,1729,2008,1,2008-08-16,489043,9825,8659,1,0,...,27267,<goal><value><comment>n</comment><stats><goals...,<shoton><value><stats><blocked>1</blocked></st...,<shotoff><value><stats><shotoff>1</shotoff></s...,<foulcommit><value><stats><foulscommitted>1</f...,<card />,<cross><value><stats><crosses>1</crosses></sta...,<corner><value><stats><corners>1</corners></st...,<possession><value><comment>65</comment><event...,1
1731,1729,1729,2008,1,2008-08-16,489044,8472,8650,0,1,...,30853,<goal><value><comment>n</comment><stats><goals...,<shoton><value><stats><blocked>1</blocked></st...,<shotoff><value><stats><shotoff>1</shotoff></s...,<foulcommit><value><stats><foulscommitted>1</f...,<card><value><comment>y</comment><stats><ycard...,<cross><value><stats><crosses>1</crosses></sta...,<corner><value><stats><corners>1</corners></st...,<possession><value><comment>45</comment><event...,2
1732,1729,1729,2008,1,2008-08-16,489045,8654,8528,2,1,...,34466,<goal><value><comment>n</comment><stats><goals...,<shoton><value><stats><shoton>1</shoton></stat...,<shotoff><value><stats><shotoff>1</shotoff></s...,<foulcommit><value><stats><foulscommitted>1</f...,<card><value><comment>y</comment><stats><ycard...,<cross><value><stats><crosses>1</crosses></sta...,<corner><value><stats><corners>1</corners></st...,<possession><value><comment>50</comment><event...,1
1734,1729,1729,2008,1,2008-08-16,489047,8668,8655,2,3,...,30646,<goal><value><comment>n</comment><stats><goals...,<shoton><value><stats><shoton>1</shoton></stat...,<shotoff><value><stats><shotoff>1</shotoff></s...,<foulcommit><value><stats><foulscommitted>1</f...,<card><value><comment>y</comment><stats><ycard...,<cross><value><stats><crosses>1</crosses></sta...,<corner><value><stats><corners>1</corners></st...,<possession><value><comment>46</comment><event...,2


In [6]:
## Calculate lineup average age

# Load every player's birthday; player_api_id is exposed as `id`
birth_day = pd.read_sql_query(
    sql="""
        SELECT player_api_id as id, birthday
        FROM player
""",
    con=conn,
    params=(),
)

birth_day.head(5)

Unnamed: 0,id,birthday
0,505942,1992-02-29 00:00:00
1,155782,1989-12-15 00:00:00
2,162549,1991-05-13 00:00:00
3,30572,1982-05-08 00:00:00
4,23780,1979-11-08 00:00:00


In [7]:
# Show the home lineup player ids for every match currently in df
df.loc[:, home_fields]

Unnamed: 0_level_0,home_player_1,home_player_2,home_player_3,home_player_4,home_player_5,home_player_6,home_player_7,home_player_8,home_player_9,home_player_10,home_player_11
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1729,30726,30362,30620,30865,32569,24148,34944,30373,24154,24157,30829
1730,23686,26111,38835,30986,31291,31013,30935,39297,26181,30960,36410
1731,32562,38836,24446,24408,36786,38802,24655,17866,30352,23927,24410
1732,36374,30966,23818,37277,30687,36394,37169,24223,24773,34543,23139
1734,31465,30371,24004,33086,30857,24011,109058,23268,24846,24006,24160


In [8]:
# Look up the birthday of every home player (input for the average lineup age)

# Map player id -> birthday through a dict. Unlike a list-to-list `replace`,
# an id that is missing from the player table becomes NaN instead of staying
# an integer, which the later `pd.to_datetime` call would silently read as
# an offset from the 1970 epoch. It also avoids scanning the full player
# list for every replacement.
birthday_by_id = dict(zip(birth_day['id'], birth_day['birthday']))
dates = df[home_fields].astype(int).apply(lambda ids: ids.map(birthday_by_id))
dates['date'] = df['date']

dates.head(5)

Unnamed: 0_level_0,home_player_1,home_player_2,home_player_3,home_player_4,home_player_5,home_player_6,home_player_7,home_player_8,home_player_9,home_player_10,home_player_11,date
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1729,1970-10-29 00:00:00,1979-10-13 00:00:00,1978-11-07 00:00:00,1981-10-21 00:00:00,1981-05-15 00:00:00,1984-02-01 00:00:00,1981-07-28 00:00:00,1974-11-16 00:00:00,1973-11-29 00:00:00,1987-09-13 00:00:00,1985-10-24 00:00:00,2008-08-17
1730,1977-05-19 00:00:00,1983-02-14 00:00:00,1977-08-17 00:00:00,1987-01-18 00:00:00,1985-07-26 00:00:00,1989-03-16 00:00:00,1983-06-04 00:00:00,1988-02-16 00:00:00,1987-06-26 00:00:00,1984-02-26 00:00:00,1988-01-16 00:00:00,2008-08-16
1731,1982-12-31 00:00:00,1979-02-21 00:00:00,1980-10-11 00:00:00,1980-08-06 00:00:00,1985-06-28 00:00:00,1980-01-06 00:00:00,1979-11-27 00:00:00,1982-07-29 00:00:00,1984-10-21 00:00:00,1981-01-15 00:00:00,1983-03-15 00:00:00,2008-08-16
1732,1980-01-18 00:00:00,1985-04-19 00:00:00,1979-04-18 00:00:00,1983-01-01 00:00:00,1978-03-09 00:00:00,1983-08-01 00:00:00,1987-05-08 00:00:00,1980-10-13 00:00:00,1981-08-14 00:00:00,1983-11-12 00:00:00,1983-11-24 00:00:00,2008-08-16
1734,1979-03-06 00:00:00,1977-01-21 00:00:00,1980-09-06 00:00:00,1982-08-16 00:00:00,1974-09-12 00:00:00,1982-03-26 00:00:00,1991-03-11 00:00:00,1982-08-17 00:00:00,1984-12-11 00:00:00,1981-05-17 00:00:00,1982-11-22 00:00:00,2008-08-16


In [9]:
# Average age, in years, of the home lineup on match day
dates = dates.apply(pd.to_datetime)

# Age of each player on match day as a timedelta (match date - birthday).
home_age_deltas = dates[home_fields].apply(lambda birthdays: dates['date'] - birthdays)

# `.dt.days` yields whole days on every pandas version, whereas
# `.astype('timedelta64[D]')` raises on pandas 2.x.
# The divisor stays at 365 days per year as before (leap days are ignored).
df['avg_home_ages'] = (home_age_deltas.mean(axis=1).dt.days / 365).round(1)

df['avg_home_ages'].head(5)

id
1729    28.6
1730    24.0
1731    26.8
1732    26.2
1734    26.9
Name: avg_home_ages, dtype: float64

In [10]:
# Look up the birthday of every away player (input for the average lineup age)

# Map player id -> birthday through a dict. Unlike a list-to-list `replace`,
# an id that is missing from the player table becomes NaN instead of staying
# an integer, which the later `pd.to_datetime` call would silently read as
# an offset from the 1970 epoch. It also avoids scanning the full player
# list for every replacement.
birthday_by_id = dict(zip(birth_day['id'], birth_day['birthday']))
dates = df[away_fields].astype(int).apply(lambda ids: ids.map(birthday_by_id))
dates['date'] = df['date']

dates.head(5)

Unnamed: 0_level_0,away_player_1,away_player_2,away_player_3,away_player_4,away_player_5,away_player_6,away_player_7,away_player_8,away_player_9,away_player_10,away_player_11,date
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1729,1976-04-20 00:00:00,1977-10-19 00:00:00,1986-01-23 00:00:00,1982-01-22 00:00:00,1986-05-28 00:00:00,1986-01-04 00:00:00,1987-04-18 00:00:00,1975-01-21 00:00:00,1983-07-05 00:00:00,1979-03-02 00:00:00,1984-10-28 00:00:00,2008-08-17
1730,1985-09-03 00:00:00,1978-10-06 00:00:00,1985-11-30 00:00:00,1980-10-06 00:00:00,1978-12-14 00:00:00,1979-01-02 00:00:00,1982-07-14 00:00:00,1984-12-14 00:00:00,1986-05-25 00:00:00,1983-01-26 00:00:00,1987-03-05 00:00:00,2008-08-16
1731,1982-08-31 00:00:00,1983-01-17 00:00:00,1978-01-28 00:00:00,1973-10-07 00:00:00,1981-09-11 00:00:00,1980-07-22 00:00:00,1980-05-30 00:00:00,1988-03-05 00:00:00,1980-05-05 00:00:00,1980-07-08 00:00:00,1984-03-20 00:00:00,2008-08-16
1732,1981-05-02 00:00:00,1976-11-04 00:00:00,1979-09-24 00:00:00,1983-05-02 00:00:00,1980-03-11 00:00:00,1985-08-04 00:00:00,1988-03-21 00:00:00,1984-07-29 00:00:00,1980-09-27 00:00:00,1983-04-01 00:00:00,1978-01-11 00:00:00,2008-08-16
1734,1979-10-15 00:00:00,1974-07-11 00:00:00,1984-03-28 00:00:00,1977-10-18 00:00:00,1981-12-12 00:00:00,1979-12-27 00:00:00,1980-11-25 00:00:00,1981-09-08 00:00:00,1981-03-10 00:00:00,1978-01-25 00:00:00,1981-08-16 00:00:00,2008-08-16


In [11]:
# Average age, in years, of the away lineup on match day
dates = dates.apply(pd.to_datetime)

# Age of each player on match day as a timedelta (match date - birthday).
away_age_deltas = dates[away_fields].apply(lambda birthdays: dates['date'] - birthdays)

# `.dt.days` yields whole days on every pandas version, whereas
# `.astype('timedelta64[D]')` raises on pandas 2.x.
# The divisor stays at 365 days per year as before (leap days are ignored).
df['avg_away_ages'] = (away_age_deltas.mean(axis=1).dt.days / 365).round(1)

df['avg_away_ages'].head(5)

id
1729    26.4
1730    25.6
1731    27.4
1732    26.7
1734    28.5
Name: avg_away_ages, dtype: float64

## Task 1A: Parse XML fields

In [12]:
# Create a helper function that counts team nodes
from collections import Counter
import xml.etree.ElementTree as ET

def count_teams(xml_string):
    """Count how often each team id appears in <team> nodes of an XML string.

    Returns a Counter keyed by the text of every <team> element found
    anywhere in the parsed document.
    """
    team_nodes = ET.fromstring(xml_string).iter('team')
    return Counter(node.text for node in team_nodes)

In [13]:
# Add per-team event counts for the XML fields that share the <team> layout
common_xml_fields = ['shoton', 'shotoff', 'foulcommit', 'corner']

# count_teams keys its result on node text, so team ids are compared as strings.
home_ids = df['home_team_api_id'].astype(str)
away_ids = df['away_team_api_id'].astype(str)

for field in common_xml_fields:
    # Parse each match's XML once and reuse the counts for both teams
    # (previously every string was parsed twice per row).
    team_counts = [count_teams(xml_string) for xml_string in df[field]]
    df[f'home_{field}'] = [counts[team] for counts, team in zip(team_counts, home_ids)]
    df[f'away_{field}'] = [counts[team] for counts, team in zip(team_counts, away_ids)]

df.head(5)

Unnamed: 0_level_0,country_id,league_id,season,stage,date,match_api_id,home_team_api_id,away_team_api_id,home_team_goal,away_team_goal,...,avg_home_ages,avg_away_ages,home_shoton,away_shoton,home_shotoff,away_shotoff,home_foulcommit,away_foulcommit,home_corner,away_corner
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1729,1729,1729,2008,1,2008-08-17,489042,10260,10261,1,1,...,28.6,26.4,11,1,10,9,16,11,6,6
1730,1729,1729,2008,1,2008-08-16,489043,9825,8659,1,0,...,24.0,25.6,12,2,13,3,11,9,7,5
1731,1729,1729,2008,1,2008-08-16,489044,8472,8650,0,1,...,26.8,27.4,4,11,3,5,13,12,1,8
1732,1729,1729,2008,1,2008-08-16,489045,8654,8528,2,1,...,26.2,26.7,5,7,7,15,14,13,6,10
1734,1729,1729,2008,1,2008-08-16,489047,8668,8655,2,3,...,26.9,28.5,2,8,7,8,11,11,3,4


In [14]:
# Get card details
def get_cards(xml_string, card_type):
    """Count cards of one type per team in a match's card XML.

    Args:
        xml_string: XML document whose <value> elements describe card events.
        card_type: Text compared against each event's <card_type> child
            (this notebook passes 'y' and 'r').

    Returns:
        Counter mapping the <team> text to the number of matching events.
        Events that lack a <card_type> or <team> child are skipped instead
        of raising AttributeError.
    """
    c = Counter()
    for val in ET.fromstring(xml_string).iter('value'):
        kind = val.find('card_type')
        team = val.find('team')
        # Some events may omit one of the two children; ignore them.
        if kind is None or team is None:
            continue
        if kind.text == card_type:
            c.update([team.text])
    return c

# Parse each match's card XML once per card type and reuse the counts for
# both teams (previously the same string was parsed four times per row).
yellow_counts = [get_cards(xml_string, 'y') for xml_string in df['card']]
red_counts = [get_cards(xml_string, 'r') for xml_string in df['card']]

# get_cards keys its result on node text, so team ids are compared as strings.
home_ids = df['home_team_api_id'].astype(str)
away_ids = df['away_team_api_id'].astype(str)

df['home_ycard'] = [counts[team] for counts, team in zip(yellow_counts, home_ids)]
df['home_rcard'] = [counts[team] for counts, team in zip(red_counts, home_ids)]
df['away_ycard'] = [counts[team] for counts, team in zip(yellow_counts, away_ids)]
df['away_rcard'] = [counts[team] for counts, team in zip(red_counts, away_ids)]

df[['home_ycard', 'home_rcard', 'away_ycard', 'away_rcard']].head(5)

Unnamed: 0_level_0,home_ycard,home_rcard,away_ycard,away_rcard
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1729,3,0,0,0
1730,0,0,0,0
1731,0,0,2,0
1732,2,0,1,0
1734,2,0,2,0


In [15]:
# Get possession details for each team

def _last_possession(xml_string, tag):
    """Return the last ./value/<tag> reading of a possession XML as an int.

    Returns NaN when the document has no such node (or the node is empty),
    instead of raising IndexError on the `[-1]` lookup.
    NOTE(review): the last reading is presumably the final figure for the
    match - confirm against the data source.
    """
    nodes = ET.fromstring(xml_string).findall(f'./value/{tag}')
    if not nodes or nodes[-1].text is None:
        return float('nan')
    return int(nodes[-1].text)

# Values are stored as numbers (not XML text) so they can be used in arithmetic.
df['away_pos'] = df['possession'].apply(lambda s: _last_possession(s, 'awaypos'))
df['home_pos'] = df['possession'].apply(lambda s: _last_possession(s, 'homepos'))

df[['away_pos', 'home_pos']].head(5)

Unnamed: 0_level_0,away_pos,home_pos
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1729,45,55
1730,34,66
1731,54,46
1732,48,52
1734,49,51


## Task 1B: Parsing player attributes

In [16]:
# Load the dated attribute records of every player
player_attrib = pd.read_sql_query("""
        SELECT  player_api_id, date, overall_rating, crossing, finishing, heading_accuracy,
                short_passing, volleys, dribbling, curve, free_kick_accuracy,
                long_passing, ball_control, acceleration, sprint_speed, agility,
                reactions, balance, shot_power, jumping, stamina, strength,
                long_shots, aggression, interceptions, positioning, vision,
                penalties, marking, standing_tackle, sliding_tackle
        FROM player_attributes
""", conn)

# Parse the whole column in one vectorised call; a per-row
# `apply(pd.to_datetime)` is needlessly slow on a table this size.
player_attrib['date'] = pd.to_datetime(player_attrib['date'])

player_attrib.head(5)

Unnamed: 0,player_api_id,date,overall_rating,crossing,finishing,heading_accuracy,short_passing,volleys,dribbling,curve,...,strength,long_shots,aggression,interceptions,positioning,vision,penalties,marking,standing_tackle,sliding_tackle
0,505942,2016-02-18,67.0,49.0,44.0,71.0,61.0,44.0,51.0,45.0,...,76.0,35.0,71.0,70.0,45.0,54.0,48.0,65.0,69.0,69.0
1,505942,2015-11-19,67.0,49.0,44.0,71.0,61.0,44.0,51.0,45.0,...,76.0,35.0,71.0,70.0,45.0,54.0,48.0,65.0,69.0,69.0
2,505942,2015-09-21,62.0,49.0,44.0,71.0,61.0,44.0,51.0,45.0,...,76.0,35.0,63.0,41.0,45.0,54.0,48.0,65.0,66.0,69.0
3,505942,2015-03-20,61.0,48.0,43.0,70.0,60.0,43.0,50.0,44.0,...,76.0,34.0,62.0,40.0,44.0,53.0,47.0,62.0,63.0,66.0
4,505942,2007-02-22,61.0,48.0,43.0,70.0,60.0,43.0,50.0,44.0,...,76.0,34.0,62.0,40.0,44.0,53.0,47.0,62.0,63.0,66.0


In [17]:
# Prepare helper functions and variables

def get_latest_attribute_idx(player, col_name):
    '''Return the player_attrib row label of a player's latest record before a match.

    Args:
        player: Row-like object with a 'date' entry (the match date) and a
            player_api_id stored under `col_name`.
        col_name: Key in `player` that holds the player_api_id to look up.

    Returns:
        Index label in `player_attrib` of the record with the most recent
        date strictly before the match date (the first one if dates tie).

    Raises:
        IndexError: If the player has no attribute record before that date.
    '''
    match_date = pd.to_datetime(player['date'])
    earlier_dates = player_attrib.loc[(player_attrib['player_api_id'] == player[col_name])
                                      & (player_attrib['date'] < match_date)]
    if earlier_dates.empty:
        # Fail with the offending id and date rather than an opaque `index[0]` error.
        raise IndexError(
            f"no player_attrib record for player {player[col_name]} before {match_date}"
        )
    return earlier_dates[earlier_dates['date'] == earlier_dates['date'].max()].index[0]

# Attribute columns averaged per lineup; the list order is the order in which
# the derived columns are added to df.
player_attribs = [
    'overall_rating', 'crossing', 'finishing', 'heading_accuracy', 'short_passing',
    'volleys', 'dribbling', 'curve', 'free_kick_accuracy', 'long_passing',
    'ball_control', 'acceleration', 'sprint_speed', 'agility', 'reactions',
    'balance', 'shot_power', 'jumping', 'stamina', 'strength',
    'long_shots', 'aggression', 'interceptions', 'positioning', 'vision',
    'penalties', 'marking', 'standing_tackle', 'sliding_tackle',
]

# Row labels of player_attrib, in frame order
index_list = player_attrib.index.tolist()


In [19]:
# For every home lineup slot except the first, find the attribute record
# closest before the match date
home_lookup_cols = home_fields[1:]
home_attribs = df[['date', *home_lookup_cols]].copy()

# Swap each player id for the row label of its matching player_attrib record
for col in home_lookup_cols:
    home_attribs[col] = home_attribs.apply(get_latest_attribute_idx, axis=1, col_name=col)

# Show mapped attribute ids
home_attribs[home_lookup_cols].head(5)

Unnamed: 0_level_0,home_player_2,home_player_3,home_player_4,home_player_5,home_player_6,home_player_7,home_player_8,home_player_9,home_player_10,home_player_11
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1729,179124,149861,131150,139238,38352,122758,140510,155486,57592,178974
1730,18518,179774,84075,60709,169567,50140,42388,156892,50031,131822
1731,139070,134972,37260,143004,164746,169311,13139,96785,48526,38671
1732,176073,119962,25074,104657,91768,114713,158629,119660,26441,41653
1734,143112,89388,85064,134889,126085,74982,143029,100505,100989,3977


In [21]:
# Average each attribute over the mapped home players

for attrib in player_attribs:
    # Look the row labels up directly in the attribute column. This replaces a
    # list-to-list `replace` that compared every cell against every row label
    # of player_attrib for each attribute.
    attrib_values = player_attrib[attrib]
    df[f'home_{attrib}'] = home_attribs[home_fields[1:]] \
                            .apply(lambda labels: labels.map(attrib_values)) \
                            .mean(axis=1).round(1)

df[[f'home_{attrib}' for attrib in player_attribs]].head(5)

Unnamed: 0_level_0,home_overall_rating,home_crossing,home_finishing,home_heading_accuracy,home_short_passing,home_volleys,home_dribbling,home_curve,home_free_kick_accuracy,home_long_passing,...,home_strength,home_long_shots,home_aggression,home_interceptions,home_positioning,home_vision,home_penalties,home_marking,home_standing_tackle,home_sliding_tackle
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1729,82.2,66.3,61.0,76.7,77.4,63.7,67.4,61.9,56.0,71.1,...,76.5,60.5,75.8,77.3,82.5,76.3,80.2,63.5,65.8,65.6
1730,78.3,64.4,54.4,69.2,69.8,64.1,71.2,65.0,41.7,58.4,...,72.6,53.4,65.8,70.9,74.2,72.7,70.8,56.3,58.1,64.9
1731,73.3,67.1,57.2,61.5,66.4,56.4,62.3,63.5,54.2,61.0,...,67.8,56.8,71.2,71.6,71.5,66.4,67.6,54.7,58.2,55.9
1732,76.8,61.6,57.9,69.6,66.2,61.0,66.1,67.5,49.6,55.0,...,73.1,60.2,73.2,75.1,76.3,73.0,71.8,57.3,62.9,61.9
1734,76.7,64.2,45.7,72.8,74.2,66.6,60.1,61.6,51.6,64.9,...,72.9,66.3,73.2,74.4,76.7,74.0,70.6,67.9,69.6,69.1


In [None]:
# Get the average attributes for the away players
away_attribs = df[['date'] + away_fields[1:]].copy()


# Map the ids to the corresponding attribute ids
# (the first lineup slot is left out, as for the home side)
for col in away_fields[1:]:
    away_attribs[col] = away_attribs.apply(lambda s: get_latest_attribute_idx(s, col), axis=1)

# Average each attribute over the mapped away players

for attrib in player_attribs:
    # Look the row labels up directly in the attribute column. This replaces a
    # list-to-list `replace` that compared every cell against every row label
    # of player_attrib for each attribute.
    attrib_values = player_attrib[attrib]
    df[f'away_{attrib}'] = away_attribs[away_fields[1:]] \
                            .apply(lambda labels: labels.map(attrib_values)) \
                            .mean(axis=1).round(1)

df[[f'away_{attrib}' for attrib in player_attribs]].head(5)

## Task 1C: Team attributes

In [None]:
# Load team attributes
# buildUpPlaySpeed and buildUpPlaySpeedClass were listed twice in the SELECT,
# which produced duplicate column labels in the frame; each column is now
# selected once.
team_attrib = pd.read_sql_query("""
        SELECT  team_api_id, date, buildUpPlaySpeed, buildUpPlaySpeedClass, buildUpPlayDribblingClass,
                buildUpPlayPassing, buildUpPlayPassingClass, buildUpPlayPositioningClass, chanceCreationPassing,
                chanceCreationPassingClass, chanceCreationCrossing, chanceCreationCrossingClass, chanceCreationShooting,
                chanceCreationShootingClass, chanceCreationPositioningClass, defencePressure, defencePressureClass,
                defenceAggression, defenceAggressionClass, defenceTeamWidth, defenceTeamWidthClass, defenceDefenderLineClass
        FROM team_attributes
""", conn)

# Parse the whole column in one vectorised call instead of once per row.
team_attrib['date'] = pd.to_datetime(team_attrib['date'])

team_attrib.head(5)

In [None]:
# get matching team_id

def get_latest_attribute_idx(team, col_name):
    '''Return the team_attrib row label of a team's latest record before a match.

    NOTE: this redefines the player-based function of the same name from
    earlier in the notebook. After this cell has run, re-running the player
    cells will call this team lookup instead.

    Args:
        team: Row-like object with a 'date' entry (the match date) and a
            team_api_id stored under `col_name`.
        col_name: Key in `team` that holds the team_api_id to look up.

    Returns:
        Index label in `team_attrib` of the record with the most recent
        date strictly before the match date (the first one if dates tie).

    Raises:
        IndexError: If the team has no attribute record before that date.
    '''
    match_date = pd.to_datetime(team['date'])
    earlier_dates = team_attrib.loc[(team_attrib['team_api_id'] == team[col_name])
                                      & (team_attrib['date'] < match_date)]
    if earlier_dates.empty:
        # Fail with the offending id and date rather than an opaque `index[0]` error.
        raise IndexError(
            f"no team_attrib record for team {team[col_name]} before {match_date}"
        )
    return earlier_dates[earlier_dates['date'] == earlier_dates['date'].max()].index[0]

# Scratch work (disabled): earlier attempts at the team lookup, kept for reference.
#df[['home_team_api_id', 'away_team_api_id']].apply(lambda s: get_latest_attribute_idx(s, 'buildUpPlaySpeed'), axis=1)

#team_attrib.loc[(team_attrib['team_api_id'] == df['home_team_api_id'].iloc[0])
#                & (team_attrib['date'] < pd.to_datetime(df['date']).iloc[0])]
#team_attrib['date'] > 
# Earliest team-attribute date. A match on or before this date has no strictly
# earlier record, so get_latest_attribute_idx cannot find a row for it.
team_attrib['date'].min()