# Match statistics with player attributes
This generates the match statistics, but with player attributes and their positions marked. The field is separated into a 9x18 grid (columns `x0`..`x8` by `y0`..`y17`, as built in the "Build Map" section below). To keep things simple, goalkeepers are excluded from the data. Home players occupy the even y positions, and away players occupy the odd y positions.

If we imagine the field as a picture, the player attributes can be treated as its channels.

In [1]:
import pandas as pd
import numpy as np
import sqlite3

In [2]:
# Grab dataframes for Match and Player_Attributes from the SQLite database.
conn = sqlite3.connect("database.sqlite")
try:
    df_match = pd.read_sql_query("select * from Match;", conn)
    df_player = pd.read_sql_query("select * from Player_Attributes;", conn)
finally:
    # Release the database handle even when one of the queries raises.
    conn.close()

In [3]:
# Columns to keep: every per-player column plus the match-level fields listed here
match_level_cols = ['stage', 'date', 'match_api_id', 'home_team_api_id', 'away_team_api_id', 'home_team_goal', 'away_team_goal']
keep_cols = [name for name in df_match.columns.values if "_player_" in name] + match_level_cols

# Anything that is not kept gets dropped (original column order is preserved)
drop_cols = [name for name in df_match.columns if name not in keep_cols]
# Goalkeeper info (player slot 1: api id and X/Y columns) is dropped as well
drop_cols.extend(['home_player_1', 'away_player_1', 'home_player_X1', 'home_player_Y1', 'away_player_X1', 'away_player_Y1'])

print(drop_cols)

['id', 'country_id', 'league_id', 'season', 'goal', 'shoton', 'shotoff', 'foulcommit', 'card', 'cross', 'corner', 'possession', 'B365H', 'B365D', 'B365A', 'BWH', 'BWD', 'BWA', 'IWH', 'IWD', 'IWA', 'LBH', 'LBD', 'LBA', 'PSH', 'PSD', 'PSA', 'WHH', 'WHD', 'WHA', 'SJH', 'SJD', 'SJA', 'VCH', 'VCD', 'VCA', 'GBH', 'GBD', 'GBA', 'BSH', 'BSD', 'BSA', 'home_player_1', 'away_player_1', 'home_player_X1', 'home_player_Y1', 'away_player_X1', 'away_player_Y1']


In [4]:
# Remove the unwanted columns first, so the NaN filter only inspects columns we keep
df_match = df_match.drop(drop_cols, axis=1)

# Discard every row that still contains a NaN, then renumber the index from zero
df_match = df_match.dropna().reset_index(drop=True)
df_match.head()

Unnamed: 0,stage,date,match_api_id,home_team_api_id,away_team_api_id,home_team_goal,away_team_goal,home_player_X2,home_player_X3,home_player_X4,...,away_player_2,away_player_3,away_player_4,away_player_5,away_player_6,away_player_7,away_player_8,away_player_9,away_player_10,away_player_11
0,24,2009-02-27 00:00:00,493017,8203,9987,2,1,2.0,4.0,6.0,...,38293.0,148313.0,104411.0,148314.0,37202.0,43158.0,9307.0,42153.0,32690.0,38782.0
1,24,2009-03-01 00:00:00,493019,9985,9984,4,0,2.0,4.0,6.0,...,37047.0,37021.0,38186.0,27110.0,32863.0,37957.0,37909.0,104386.0,38251.0,37065.0
2,25,2009-03-08 00:00:00,493025,9984,8342,1,3,2.0,4.0,6.0,...,21812.0,11736.0,37858.0,38366.0,37983.0,39578.0,38336.0,52280.0,27423.0,38440.0
3,25,2009-03-07 00:00:00,493027,8635,10000,2,0,2.0,4.0,6.0,...,37886.0,37903.0,37889.0,94030.0,37893.0,37981.0,131531.0,130027.0,38231.0,131530.0
4,26,2009-03-13 00:00:00,493034,8203,8635,2,1,2.0,4.0,6.0,...,38388.0,38389.0,31316.0,164694.0,30949.0,38378.0,38383.0,38393.0,38253.0,37069.0


In [5]:
# Columns removed from the player table: the fifa api id, goalkeeper stats,
# and set-piece stats (too situational to be useful)
drop_cols = [
    u'player_fifa_api_id',
    u'free_kick_accuracy',
    u'penalties',
    u'gk_diving',
    u'gk_handling',
    u'gk_kicking',
    u'gk_positioning',
    u'gk_reflexes',
]

# Drop those columns before the NaN filter, then discard incomplete rows and
# renumber the index from zero
df_player = df_player.drop(drop_cols, axis=1)
df_player = df_player.dropna().reset_index(drop=True)
df_player.head()

Unnamed: 0,id,player_api_id,date,overall_rating,potential,preferred_foot,attacking_work_rate,defensive_work_rate,crossing,finishing,...,stamina,strength,long_shots,aggression,interceptions,positioning,vision,marking,standing_tackle,sliding_tackle
0,1,505942,2016-02-18 00:00:00,67.0,71.0,right,medium,medium,49.0,44.0,...,54.0,76.0,35.0,71.0,70.0,45.0,54.0,65.0,69.0,69.0
1,2,505942,2015-11-19 00:00:00,67.0,71.0,right,medium,medium,49.0,44.0,...,54.0,76.0,35.0,71.0,70.0,45.0,54.0,65.0,69.0,69.0
2,3,505942,2015-09-21 00:00:00,62.0,66.0,right,medium,medium,49.0,44.0,...,54.0,76.0,35.0,63.0,41.0,45.0,54.0,65.0,66.0,69.0
3,4,505942,2015-03-20 00:00:00,61.0,65.0,right,medium,medium,48.0,43.0,...,54.0,76.0,34.0,62.0,40.0,44.0,53.0,62.0,63.0,66.0
4,5,505942,2007-02-22 00:00:00,61.0,65.0,right,medium,medium,48.0,43.0,...,54.0,76.0,34.0,62.0,40.0,44.0,53.0,62.0,63.0,66.0


## Check dataset validity
Now that we have grabbed and cleaned our datasets, let's ensure that the api ids are consistent: every player api id used in df_match should also exist in df_player. If one does not, we drop that specific match row.

In [6]:
# Per-player columns that hold an api id, i.e. "_player_" columns that are not X/Y coordinates
player_api_cols = [
    name
    for name in df_match.columns
    if "_player_" in name and not ("_X" in name or "_Y" in name)
]
print(player_api_cols)

['home_player_2', 'home_player_3', 'home_player_4', 'home_player_5', 'home_player_6', 'home_player_7', 'home_player_8', 'home_player_9', 'home_player_10', 'home_player_11', 'away_player_2', 'away_player_3', 'away_player_4', 'away_player_5', 'away_player_6', 'away_player_7', 'away_player_8', 'away_player_9', 'away_player_10', 'away_player_11']


In [7]:
# (rows, columns) of the match table before the api-id consistency filter
print(tuple(df_match.shape))

(21490, 67)


In [8]:
# Ids that have at least one attribute record; df_player is not modified inside the loop
known_player_ids = df_player['player_api_id']

for api_col in player_api_cols:
    # True where this slot's player id has no entry in the player table
    is_unknown = ~np.isin(df_match[api_col], known_player_ids)
    if not is_unknown.any():
        continue
    stale_rows = df_match.index[is_unknown]
    print("Dropping %d" % len(stale_rows))
    df_match.drop(stale_rows, axis=0, inplace=True)

Dropping 432
Dropping 322
Dropping 253
Dropping 204
Dropping 159
Dropping 217
Dropping 130
Dropping 130
Dropping 128
Dropping 101
Dropping 239
Dropping 162
Dropping 149
Dropping 118
Dropping 89
Dropping 110
Dropping 79
Dropping 85
Dropping 66
Dropping 66


In [9]:
# (rows, columns) of the match table after the api-id consistency filter
print(tuple(df_match.shape))

(18251, 67)


## Build Map
We have the data available; now we build our field map, i.e. the 9x18 grid (columns `x0,y0` .. `x8,y17`), in which each cell holds the player_api_id of the player occupying that position, or 0 if no one is there.

In [10]:
# Field grid dimensions: x indices 0..8 and y indices 0..17.
GRID_X, GRID_Y = 9, 18

# One column per grid cell, named "x<col>,y<row>", in x-major order.
# `range` is used instead of `xrange` so the cell runs on Python 2 and Python 3.
field_cols = ["x%d,y%d" % (x, y) for x in range(GRID_X) for y in range(GRID_Y)]

# Keep the guarantee insert(..., allow_duplicates=False) gave: refuse to add a
# grid column that already exists (e.g. when this cell is run twice).
clashes = [name for name in field_cols if name in df_match.columns]
if clashes:
    raise ValueError("cannot insert %s, already exists" % clashes[0])

# Append all zero-filled grid columns in one step; adding them one at a time
# leaves the frame internally fragmented.
empty_grid = pd.DataFrame(0, index=df_match.index, columns=field_cols)
df_match = pd.concat([df_match, empty_grid], axis=1)

Inserting column x0,y0
Inserting column x0,y1
Inserting column x0,y2
Inserting column x0,y3
Inserting column x0,y4
Inserting column x0,y5
Inserting column x0,y6
Inserting column x0,y7
Inserting column x0,y8
Inserting column x0,y9
Inserting column x0,y10
Inserting column x0,y11
Inserting column x0,y12
Inserting column x0,y13
Inserting column x0,y14
Inserting column x0,y15
Inserting column x0,y16
Inserting column x0,y17
Inserting column x1,y0
Inserting column x1,y1
Inserting column x1,y2
Inserting column x1,y3
Inserting column x1,y4
Inserting column x1,y5
Inserting column x1,y6
Inserting column x1,y7
Inserting column x1,y8
Inserting column x1,y9
Inserting column x1,y10
Inserting column x1,y11
Inserting column x1,y12
Inserting column x1,y13
Inserting column x1,y14
Inserting column x1,y15
Inserting column x1,y16
Inserting column x1,y17
Inserting column x2,y0
Inserting column x2,y1
Inserting column x2,y2
Inserting column x2,y3
Inserting column x2,y4
Inserting column x2,y5
Inserting column x

In [11]:
# Write every outfield player's api id into the grid cell for his formation position.
#
# For player slot `rnum` of a side, the cell is computed from that slot's X/Y columns:
#   x index = X - 1
#   y index = 2 * (Y - 3)        for the home side (even y)
#   y index = 17 - 2 * (Y - 3)   for the away side (mirrored onto odd y)
# The work is done column-wise rather than row by row. Slots are processed in
# ascending order, home before away, so if two slots of one match map to the
# same cell the later slot wins, exactly as in a per-row loop.
LAST_Y = 17  # highest y index of the grid, used to mirror the away side

# NOTE: Skipping 1, since that is for the goalkeeper
for rnum in range(2, 12):
    for side in ("home", "away"):
        player_ids = df_match["%s_player_%d" % (side, rnum)]
        x_pos = df_match["%s_player_X%d" % (side, rnum)].astype(int) - 1
        y_pos = 2 * (df_match["%s_player_Y%d" % (side, rnum)].astype(int) - 3)
        if side == "away":
            # Since we are away, we need to invert the position
            y_pos = LAST_Y - y_pos

        # Name of the destination grid column for each match row
        field_col = "x" + x_pos.astype(str) + ",y" + y_pos.astype(str)

        # Assign all rows that share a destination column in one go
        for col_name in field_col.unique():
            rows = field_col.index[field_col == col_name]
            df_match.loc[rows, col_name] = player_ids.loc[rows]

In [12]:
# The grid now carries the player ids, so the per-player columns and the
# match/team identifiers are no longer needed
drop_cols = [name for name in df_match.columns if "_player_" in name] + [
    'stage',
    'match_api_id',
    'home_team_api_id',
    'away_team_api_id',
]
df_match = df_match.drop(drop_cols, axis=1)

## Done with match-field player api ids
We have now stored all the api ids. Just write to pickle so we can read from it later.

In [13]:
# Parse the date strings into datetimes. The column is replaced by direct
# assignment: `df.loc[:, 'date'] = ...` writes into the existing object-dtype
# column on recent pandas versions, which would leave the dtype as object.
df_match['date'] = pd.to_datetime(df_match['date'])

In [14]:
# Preview the first five rows of the finished match grid
df_match.head(5)

Unnamed: 0,date,home_team_goal,away_team_goal,"x0,y0","x0,y1","x0,y2","x0,y3","x0,y4","x0,y5","x0,y6",...,"x8,y8","x8,y9","x8,y10","x8,y11","x8,y12","x8,y13","x8,y14","x8,y15","x8,y16","x8,y17"
3,2009-03-07,2,0,0.0,0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0.0
4,2009-03-13,2,1,0.0,0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0.0
9,2009-04-12,1,0,0.0,0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0.0
13,2009-05-02,0,0,0.0,0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0.0
20,2009-05-16,0,1,0.0,0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0.0


In [15]:
# Renumber the rows from zero and persist the match grid for later notebooks
df_match = df_match.reset_index(drop=True)
df_match.to_pickle("match_field_plapi.pkl")

## Format the player attributes correctly

In [16]:
# Preview the first five rows of the player attribute table
df_player.head(5)

Unnamed: 0,id,player_api_id,date,overall_rating,potential,preferred_foot,attacking_work_rate,defensive_work_rate,crossing,finishing,...,stamina,strength,long_shots,aggression,interceptions,positioning,vision,marking,standing_tackle,sliding_tackle
0,1,505942,2016-02-18 00:00:00,67.0,71.0,right,medium,medium,49.0,44.0,...,54.0,76.0,35.0,71.0,70.0,45.0,54.0,65.0,69.0,69.0
1,2,505942,2015-11-19 00:00:00,67.0,71.0,right,medium,medium,49.0,44.0,...,54.0,76.0,35.0,71.0,70.0,45.0,54.0,65.0,69.0,69.0
2,3,505942,2015-09-21 00:00:00,62.0,66.0,right,medium,medium,49.0,44.0,...,54.0,76.0,35.0,63.0,41.0,45.0,54.0,65.0,66.0,69.0
3,4,505942,2015-03-20 00:00:00,61.0,65.0,right,medium,medium,48.0,43.0,...,54.0,76.0,34.0,62.0,40.0,44.0,53.0,62.0,63.0,66.0
4,5,505942,2007-02-22 00:00:00,61.0,65.0,right,medium,medium,48.0,43.0,...,54.0,76.0,34.0,62.0,40.0,44.0,53.0,62.0,63.0,66.0


In [17]:
# Names of the player attribute columns, in the order they are listed here
attrib_cols = u"""
    crossing finishing heading_accuracy short_passing
    volleys dribbling curve long_passing ball_control
    acceleration sprint_speed agility reactions balance
    shot_power jumping stamina strength long_shots
    aggression interceptions positioning vision marking
    standing_tackle sliding_tackle
""".split()
attrib_cols

[u'crossing',
 u'finishing',
 u'heading_accuracy',
 u'short_passing',
 u'volleys',
 u'dribbling',
 u'curve',
 u'long_passing',
 u'ball_control',
 u'acceleration',
 u'sprint_speed',
 u'agility',
 u'reactions',
 u'balance',
 u'shot_power',
 u'jumping',
 u'stamina',
 u'strength',
 u'long_shots',
 u'aggression',
 u'interceptions',
 u'positioning',
 u'vision',
 u'marking',
 u'standing_tackle',
 u'sliding_tackle']

In [18]:
# Normalize attributes by dividing every attribute column by 100.
# Re-running this cell divides the values again.
normalized_attribs = df_player.loc[:, attrib_cols].div(100.0)
df_player.loc[:, attrib_cols] = normalized_attribs

In [19]:
# Preview the first five rows after normalizing the attribute columns
df_player.head(5)

Unnamed: 0,id,player_api_id,date,overall_rating,potential,preferred_foot,attacking_work_rate,defensive_work_rate,crossing,finishing,...,stamina,strength,long_shots,aggression,interceptions,positioning,vision,marking,standing_tackle,sliding_tackle
0,1,505942,2016-02-18 00:00:00,67.0,71.0,right,medium,medium,0.49,0.44,...,0.54,0.76,0.35,0.71,0.7,0.45,0.54,0.65,0.69,0.69
1,2,505942,2015-11-19 00:00:00,67.0,71.0,right,medium,medium,0.49,0.44,...,0.54,0.76,0.35,0.71,0.7,0.45,0.54,0.65,0.69,0.69
2,3,505942,2015-09-21 00:00:00,62.0,66.0,right,medium,medium,0.49,0.44,...,0.54,0.76,0.35,0.63,0.41,0.45,0.54,0.65,0.66,0.69
3,4,505942,2015-03-20 00:00:00,61.0,65.0,right,medium,medium,0.48,0.43,...,0.54,0.76,0.34,0.62,0.4,0.44,0.53,0.62,0.63,0.66
4,5,505942,2007-02-22 00:00:00,61.0,65.0,right,medium,medium,0.48,0.43,...,0.54,0.76,0.34,0.62,0.4,0.44,0.53,0.62,0.63,0.66


In [20]:
# Scale overall_rating and potential by the same factor (1/100) as the attributes.
# Re-running this cell divides the values again.
cols = ['overall_rating', 'potential']
scaled_ratings = df_player.loc[:, cols].div(100.0)
df_player.loc[:, cols] = scaled_ratings
df_player.head(5)

Unnamed: 0,id,player_api_id,date,overall_rating,potential,preferred_foot,attacking_work_rate,defensive_work_rate,crossing,finishing,...,stamina,strength,long_shots,aggression,interceptions,positioning,vision,marking,standing_tackle,sliding_tackle
0,1,505942,2016-02-18 00:00:00,0.67,0.71,right,medium,medium,0.49,0.44,...,0.54,0.76,0.35,0.71,0.7,0.45,0.54,0.65,0.69,0.69
1,2,505942,2015-11-19 00:00:00,0.67,0.71,right,medium,medium,0.49,0.44,...,0.54,0.76,0.35,0.71,0.7,0.45,0.54,0.65,0.69,0.69
2,3,505942,2015-09-21 00:00:00,0.62,0.66,right,medium,medium,0.49,0.44,...,0.54,0.76,0.35,0.63,0.41,0.45,0.54,0.65,0.66,0.69
3,4,505942,2015-03-20 00:00:00,0.61,0.65,right,medium,medium,0.48,0.43,...,0.54,0.76,0.34,0.62,0.4,0.44,0.53,0.62,0.63,0.66
4,5,505942,2007-02-22 00:00:00,0.61,0.65,right,medium,medium,0.48,0.43,...,0.54,0.76,0.34,0.62,0.4,0.44,0.53,0.62,0.63,0.66


In [21]:
# The work-rate columns contain dirty values in some rows, so remove them entirely
drop_cols = ['attacking_work_rate', 'defensive_work_rate']
df_player = df_player.drop(columns=drop_cols)

In [22]:
# Remove the 'id' column; players are identified by player_api_id from here on
df_player = df_player.drop(columns='id')
df_player.head(5)

Unnamed: 0,player_api_id,date,overall_rating,potential,preferred_foot,crossing,finishing,heading_accuracy,short_passing,volleys,...,stamina,strength,long_shots,aggression,interceptions,positioning,vision,marking,standing_tackle,sliding_tackle
0,505942,2016-02-18 00:00:00,0.67,0.71,right,0.49,0.44,0.71,0.61,0.44,...,0.54,0.76,0.35,0.71,0.7,0.45,0.54,0.65,0.69,0.69
1,505942,2015-11-19 00:00:00,0.67,0.71,right,0.49,0.44,0.71,0.61,0.44,...,0.54,0.76,0.35,0.71,0.7,0.45,0.54,0.65,0.69,0.69
2,505942,2015-09-21 00:00:00,0.62,0.66,right,0.49,0.44,0.71,0.61,0.44,...,0.54,0.76,0.35,0.63,0.41,0.45,0.54,0.65,0.66,0.69
3,505942,2015-03-20 00:00:00,0.61,0.65,right,0.48,0.43,0.7,0.6,0.43,...,0.54,0.76,0.34,0.62,0.4,0.44,0.53,0.62,0.63,0.66
4,505942,2007-02-22 00:00:00,0.61,0.65,right,0.48,0.43,0.7,0.6,0.43,...,0.54,0.76,0.34,0.62,0.4,0.44,0.53,0.62,0.63,0.66


In [23]:
# Encode right-footed players as 1.0
is_right_footed = df_player['preferred_foot'].eq('right')
df_player.loc[is_right_footed, 'preferred_foot'] = 1.0
df_player.head(5)

Unnamed: 0,player_api_id,date,overall_rating,potential,preferred_foot,crossing,finishing,heading_accuracy,short_passing,volleys,...,stamina,strength,long_shots,aggression,interceptions,positioning,vision,marking,standing_tackle,sliding_tackle
0,505942,2016-02-18 00:00:00,0.67,0.71,1,0.49,0.44,0.71,0.61,0.44,...,0.54,0.76,0.35,0.71,0.7,0.45,0.54,0.65,0.69,0.69
1,505942,2015-11-19 00:00:00,0.67,0.71,1,0.49,0.44,0.71,0.61,0.44,...,0.54,0.76,0.35,0.71,0.7,0.45,0.54,0.65,0.69,0.69
2,505942,2015-09-21 00:00:00,0.62,0.66,1,0.49,0.44,0.71,0.61,0.44,...,0.54,0.76,0.35,0.63,0.41,0.45,0.54,0.65,0.66,0.69
3,505942,2015-03-20 00:00:00,0.61,0.65,1,0.48,0.43,0.7,0.6,0.43,...,0.54,0.76,0.34,0.62,0.4,0.44,0.53,0.62,0.63,0.66
4,505942,2007-02-22 00:00:00,0.61,0.65,1,0.48,0.43,0.7,0.6,0.43,...,0.54,0.76,0.34,0.62,0.4,0.44,0.53,0.62,0.63,0.66


In [24]:
# Encode left-footed players as 0.0
is_left_footed = df_player['preferred_foot'].eq('left')
df_player.loc[is_left_footed, 'preferred_foot'] = 0.0
df_player.head(5)

Unnamed: 0,player_api_id,date,overall_rating,potential,preferred_foot,crossing,finishing,heading_accuracy,short_passing,volleys,...,stamina,strength,long_shots,aggression,interceptions,positioning,vision,marking,standing_tackle,sliding_tackle
0,505942,2016-02-18 00:00:00,0.67,0.71,1,0.49,0.44,0.71,0.61,0.44,...,0.54,0.76,0.35,0.71,0.7,0.45,0.54,0.65,0.69,0.69
1,505942,2015-11-19 00:00:00,0.67,0.71,1,0.49,0.44,0.71,0.61,0.44,...,0.54,0.76,0.35,0.71,0.7,0.45,0.54,0.65,0.69,0.69
2,505942,2015-09-21 00:00:00,0.62,0.66,1,0.49,0.44,0.71,0.61,0.44,...,0.54,0.76,0.35,0.63,0.41,0.45,0.54,0.65,0.66,0.69
3,505942,2015-03-20 00:00:00,0.61,0.65,1,0.48,0.43,0.7,0.6,0.43,...,0.54,0.76,0.34,0.62,0.4,0.44,0.53,0.62,0.63,0.66
4,505942,2007-02-22 00:00:00,0.61,0.65,1,0.48,0.43,0.7,0.6,0.43,...,0.54,0.76,0.34,0.62,0.4,0.44,0.53,0.62,0.63,0.66


In [25]:
# Parse the date strings into datetimes. The column is replaced by direct
# assignment: `df.loc[:, 'date'] = ...` writes into the existing object-dtype
# column on recent pandas versions, which would leave the dtype as object.
df_player['date'] = pd.to_datetime(df_player['date'])

In [26]:
# Renumber the rows from zero and persist the cleaned player attributes
df_player = df_player.reset_index(drop=True)
df_player.to_pickle("player_attribs.pkl")