# Stage 1: Build a database

## Basic match dataframe setup

In [1]:
import sqlite3
import pandas as pd

# Open the SQLite database file; `conn` is reused by the later cells that
# query the player tables.
conn = sqlite3.connect("football_database.sqlite")
# Load one row per match: identifiers, season/stage/date, final score, the
# eleven home and away player ids, and the event columns (goal, shoton,
# shotoff, foulcommit, card, cross, corner, possession) that are parsed as
# XML further down in this notebook.
df = pd.read_sql_query("""
        SELECT id, country_id, league_id, season, stage, date, 
            match_api_id, home_team_api_id, away_team_api_id, 
            home_team_goal, away_team_goal, home_player_1, home_player_2, 
            home_player_3, home_player_4, home_player_5, 
            home_player_6, home_player_7, home_player_8, 
            home_player_9, home_player_10, home_player_11, 
            away_player_1, away_player_2, away_player_3, 
            away_player_4, away_player_5, away_player_6, 
            away_player_7, away_player_8, away_player_9, 
            away_player_10, away_player_11, goal, shoton, 
            shotoff, foulcommit, card, cross, corner, possession
        FROM match
        """, conn)


In [2]:
# drop null values
# Removes, in place, every match row that has a missing value in any of the
# selected columns, so only matches with complete line-ups and event data
# remain. NOTE: this mutates `df`; re-load it from the database to get the
# unfiltered rows back.
df.dropna(inplace=True)

# Show (rows, columns) remaining after the filter.
df.shape

(13325, 41)

In [3]:
# Change player ids to int instead of float

# Column names of the eleven home / away line-up slots; reused by later cells.
home_fields = [f'home_player_{i}' for i in range(1, 12)]
away_fields = [f'away_player_{i}' for i in range(1, 12)]

df[home_fields] = df[home_fields].astype(int)
df[away_fields] = df[away_fields].astype(int)

# Parse the whole column in one vectorised call instead of invoking
# pd.to_datetime once per row through .apply().
df['date'] = pd.to_datetime(df['date'])

In [4]:
# Make df smaller for faster processing
# .copy() detaches the sample from the full frame: head() returns a slice, and
# the column assignments in the following cells would otherwise write to that
# slice and trigger SettingWithCopyWarning.
df = df.head(5).copy()

In [5]:
# Encode the outcome of every match in `match_result`:
#   1 = home win, 2 = away win, 3 = draw

home_goals = df['home_team_goal']
away_goals = df['away_team_goal']
outcome_masks = {
    1: home_goals > away_goals,
    2: home_goals < away_goals,
    3: home_goals == away_goals,
}
for outcome_code, outcome_mask in outcome_masks.items():
    df.loc[outcome_mask, 'match_result'] = outcome_code
# The column is created with gaps while it is being filled, so cast it to
# integers once every row has a code.
df['match_result'] = df['match_result'].astype(int)

# Keep only the first four characters of the season label (its starting year)
df['season'] = df['season'].str[:4]

df.head(5)

Unnamed: 0,id,country_id,league_id,season,stage,date,match_api_id,home_team_api_id,away_team_api_id,home_team_goal,...,away_player_11,goal,shoton,shotoff,foulcommit,card,cross,corner,possession,match_result
1728,1729,1729,1729,2008,1,2008-08-17,489042,10260,10261,1,...,37799,<goal><value><comment>n</comment><stats><goals...,<shoton><value><stats><blocked>1</blocked></st...,<shotoff><value><stats><shotoff>1</shotoff></s...,<foulcommit><value><stats><foulscommitted>1</f...,<card><value><comment>y</comment><stats><ycard...,<cross><value><stats><crosses>1</crosses></sta...,<corner><value><stats><corners>1</corners></st...,<possession><value><comment>56</comment><event...,3
1729,1730,1729,1729,2008,1,2008-08-16,489043,9825,8659,1,...,27267,<goal><value><comment>n</comment><stats><goals...,<shoton><value><stats><blocked>1</blocked></st...,<shotoff><value><stats><shotoff>1</shotoff></s...,<foulcommit><value><stats><foulscommitted>1</f...,<card />,<cross><value><stats><crosses>1</crosses></sta...,<corner><value><stats><corners>1</corners></st...,<possession><value><comment>65</comment><event...,1
1730,1731,1729,1729,2008,1,2008-08-16,489044,8472,8650,0,...,30853,<goal><value><comment>n</comment><stats><goals...,<shoton><value><stats><blocked>1</blocked></st...,<shotoff><value><stats><shotoff>1</shotoff></s...,<foulcommit><value><stats><foulscommitted>1</f...,<card><value><comment>y</comment><stats><ycard...,<cross><value><stats><crosses>1</crosses></sta...,<corner><value><stats><corners>1</corners></st...,<possession><value><comment>45</comment><event...,2
1731,1732,1729,1729,2008,1,2008-08-16,489045,8654,8528,2,...,34466,<goal><value><comment>n</comment><stats><goals...,<shoton><value><stats><shoton>1</shoton></stat...,<shotoff><value><stats><shotoff>1</shotoff></s...,<foulcommit><value><stats><foulscommitted>1</f...,<card><value><comment>y</comment><stats><ycard...,<cross><value><stats><crosses>1</crosses></sta...,<corner><value><stats><corners>1</corners></st...,<possession><value><comment>50</comment><event...,1
1733,1734,1729,1729,2008,1,2008-08-16,489047,8668,8655,2,...,30646,<goal><value><comment>n</comment><stats><goals...,<shoton><value><stats><shoton>1</shoton></stat...,<shotoff><value><stats><shotoff>1</shotoff></s...,<foulcommit><value><stats><foulscommitted>1</f...,<card><value><comment>y</comment><stats><ycard...,<cross><value><stats><crosses>1</crosses></sta...,<corner><value><stats><corners>1</corners></st...,<possession><value><comment>46</comment><event...,2


In [6]:
## Calculate lineup average age

# Load every player's api id (as `id`) together with their birthday
birth_day = pd.read_sql_query(
    sql="""
        SELECT player_api_id as id, birthday
        FROM player
""",
    con=conn,
)

birth_day.head(5)

Unnamed: 0,id,birthday
0,505942,1992-02-29 00:00:00
1,155782,1989-12-15 00:00:00
2,162549,1991-05-13 00:00:00
3,30572,1982-05-08 00:00:00
4,23780,1979-11-08 00:00:00


In [None]:
# Get average age of a line up for home players

# Look each player id up in an id -> birthday Series. Compared with
# DataFrame.replace() on two long lists this is a single hash lookup per cell,
# and an id that is missing from `birth_day` becomes NaN (NaT once parsed)
# instead of staying an integer that would later be read as a 1970 timestamp.
birthday_by_id = birth_day.drop_duplicates('id').set_index('id')['birthday']
dates = df[home_fields].astype(int).apply(lambda ids: ids.map(birthday_by_id))
# Keep the match date next to the birthdays so ages can be computed per row.
dates['date'] = df['date']

dates.head(5)

In [None]:
# Calculate average difference from player ages
dates = dates.apply(pd.to_datetime)

# Age of each home player on match day, in completed years.
# - Only the player columns are aged: including the `date` column itself would
#   add a zero-year "player" to every row and pull the mean down.
# - Years are derived from whole days because pandas >= 2.0 no longer accepts
#   astype('timedelta64[Y]'); 365.2425 is the mean Gregorian year length.
DAYS_PER_YEAR = 365.2425
home_player_ages = dates[home_fields].apply(
    lambda born: (dates['date'] - born).dt.days // DAYS_PER_YEAR
)
dates['avg_home_ages'] = home_player_ages.mean(axis=1).astype(int)

# Attach the result to the match frame; the surrogate `id` column is dropped here.
df = df.join(dates['avg_home_ages']).drop('id', axis=1)

df.head(5)

In [None]:
# Get average age of a line up for away players

# Look each player id up in an id -> birthday Series. Compared with
# DataFrame.replace() on two long lists this is a single hash lookup per cell,
# and an id that is missing from `birth_day` becomes NaN (NaT once parsed)
# instead of staying an integer that would later be read as a 1970 timestamp.
birthday_by_id = birth_day.drop_duplicates('id').set_index('id')['birthday']
dates = df[away_fields].astype(int).apply(lambda ids: ids.map(birthday_by_id))
# Keep the match date next to the birthdays so ages can be computed per row.
dates['date'] = df['date']

dates.head(5)

In [None]:
# Calculate average difference from player ages
dates = dates.apply(pd.to_datetime)

# Age of each away player on match day, in completed years.
# - Only the player columns are aged: including the `date` column itself would
#   add a zero-year "player" to every row and pull the mean down.
# - Years are derived from whole days because pandas >= 2.0 no longer accepts
#   astype('timedelta64[Y]'); 365.2425 is the mean Gregorian year length.
DAYS_PER_YEAR = 365.2425
away_player_ages = dates[away_fields].apply(
    lambda born: (dates['date'] - born).dt.days // DAYS_PER_YEAR
)
dates['avg_away_ages'] = away_player_ages.mean(axis=1).astype(int)

# Attach the result to the match frame.
df = df.join(dates['avg_away_ages'])

df.head(5)

## Task 1A: Parse XML fields

In [None]:
# Create a helper function that counts team nodes
from collections import Counter
import xml.etree.ElementTree as ET

def count_teams(xml_string):
    """Tally the text of every <team> element found in an XML document.

    Parameters
    ----------
    xml_string : str
        XML text to parse.

    Returns
    -------
    collections.Counter
        Maps each <team> text value (as a string) to the number of <team>
        elements carrying it, at any depth below the root. Looking up a team
        that never occurs yields 0.
    """
    document_root = ET.fromstring(xml_string)
    return Counter(team_node.text for team_node in document_root.iter('team'))

In [None]:
# Add extracted XML info to the dataframe

# Team ids as strings, because count_teams() keys its tallies by <team> text.
home_ids = df['home_team_api_id'].astype(str)
away_ids = df['away_team_api_id'].astype(str)

# Each event column is parsed once per match; the resulting per-team tally is
# then read for both the home and the away side.
for event in ('shoton', 'shotoff', 'foulcommit', 'card', 'corner'):
    tallies = [count_teams(xml) for xml in df[event]]
    df[f'home_{event}'] = [tally[team] for tally, team in zip(tallies, home_ids)]
    df[f'away_{event}'] = [tally[team] for tally, team in zip(tallies, away_ids)]


def _final_possession(xml_string):
    """Return the (away, home) possession text of the last entry in the XML.

    Each side is read from the last `./value/awaypos` / `./value/homepos`
    element. A side with no such element yields None instead of raising
    IndexError, so matches without possession data do not abort the cell.
    The values are returned as the raw element text (strings), not numbers.
    """
    root = ET.fromstring(xml_string)

    def last_text(tag):
        nodes = root.findall(f'./value/{tag}')
        return nodes[-1].text if nodes else None

    return last_text('awaypos'), last_text('homepos')


final_possession = [_final_possession(xml) for xml in df['possession']]
df['away_pos'] = [pair[0] for pair in final_possession]
df['home_pos'] = [pair[1] for pair in final_possession]

df.head(5)

## Task 1B: Parse player attributes

In [None]:
# Load Player attributes
# One row per (player, rating date); `id` is the player's api id.
player_attrib = pd.read_sql_query("""
        SELECT player_api_id as id, date, overall_rating, crossing, finishing, heading_accuracy,
                short_passing, volleys, dribbling, curve, free_kick_accuracy,
                long_passing, ball_control, acceleration, sprint_speed, agility,
                reactions, balance, shot_power, jumping, stamina, strength,
                long_shots, aggression, interceptions, positioning, vision,
                penalties, marking, standing_tackle, sliding_tackle
        FROM player_attributes
""", conn)

# Parse the whole column in one vectorised call; .apply(pd.to_datetime) would
# call the parser once per row of the attributes table.
player_attrib['date'] = pd.to_datetime(player_attrib['date'])

player_attrib.head(5)

In [None]:
# For each match, find the attribute record of home_player_1 that is closest
# to (and before) the match date. Work on an independent copy of the two
# columns needed for the lookup.
target = df.loc[:, ['date', 'home_player_1']].copy()

def get_most_recent_attributes(player, player_field='home_player_1', attributes=None):
    """Find the latest attribute record of a player dated before a match.

    Parameters
    ----------
    player : mapping or pandas.Series
        A match row providing 'date' (the match date) and `player_field`
        (the player's id).
    player_field : str, default 'home_player_1'
        Name of the entry in `player` that holds the player id, so the same
        lookup can be reused for any line-up slot.
    attributes : pandas.DataFrame, optional
        Table with an 'id' column and a datetime 'date' column. Defaults to
        the notebook-level `player_attrib` frame.

    Returns
    -------
    Index label of the row in `attributes` that belongs to the player and has
    the greatest date strictly earlier than the match date (the first such
    row on ties), or None when the player has no earlier record.
    """
    if attributes is None:
        attributes = player_attrib

    match_date = pd.to_datetime(player['date'])
    is_earlier_record = (
        (attributes['id'] == player[player_field])
        & (attributes['date'] < match_date)
    )
    earlier_dates = attributes.loc[is_earlier_record, 'date']
    if earlier_dates.empty:
        # No record precedes the match: report "missing" instead of failing
        # with an IndexError on an empty index.
        return None
    # idxmax returns the label of the first occurrence of the latest date.
    return earlier_dates.idxmax()

# Row-wise lookup: store, per match, the index label of the matching
# attribute record in a new `attrib_id` column.
target = target.assign(
    attrib_id=target.apply(get_most_recent_attributes, axis=1)
)
target.head(5)

In [None]:
# Resolve each attrib_id to the full attribute row it points at: a left join
# of the matches against the index of `player_attrib`. The result is only
# displayed, not stored.

pd.merge(target, player_attrib, how='left', left_on='attrib_id', right_index=True)