# Build the training and test dataset
The training data is split between two dataframes: the match records and the player attributes. We combine them to build the training features, and write the features and the labels to separate files.

In [1]:
import pandas as pd
import numpy as np
import torch

In [2]:
# Load the two pickled dataframes: the match records first, then the player
# attribute snapshots.
# NOTE(review): unpickling can execute arbitrary code; only load files from a
# trusted source.
df_match, df_player = (
    pd.read_pickle(pickle_path)
    for pickle_path in ("match_field_plapi.pkl", "player_attribs.pkl")
)

In [3]:
# Preview the first rows of the match dataframe.
df_match.head()

Unnamed: 0,date,home_team_goal,away_team_goal,"x0,y0","x0,y1","x0,y2","x0,y3","x0,y4","x0,y5","x0,y6",...,"x8,y8","x8,y9","x8,y10","x8,y11","x8,y12","x8,y13","x8,y14","x8,y15","x8,y16","x8,y17"
0,2009-03-07,2,0,0.0,0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0.0
1,2009-03-13,2,1,0.0,0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0.0
2,2009-04-12,1,0,0.0,0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0.0
3,2009-05-02,0,0,0.0,0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0.0
4,2009-05-16,0,1,0.0,0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0.0


In [4]:
# Preview the first rows of the player attribute dataframe.
df_player.head()

Unnamed: 0,player_api_id,date,overall_rating,potential,preferred_foot,crossing,finishing,heading_accuracy,short_passing,volleys,...,stamina,strength,long_shots,aggression,interceptions,positioning,vision,marking,standing_tackle,sliding_tackle
0,505942,2016-02-18,0.67,0.71,1,0.49,0.44,0.71,0.61,0.44,...,0.54,0.76,0.35,0.71,0.7,0.45,0.54,0.65,0.69,0.69
1,505942,2015-11-19,0.67,0.71,1,0.49,0.44,0.71,0.61,0.44,...,0.54,0.76,0.35,0.71,0.7,0.45,0.54,0.65,0.69,0.69
2,505942,2015-09-21,0.62,0.66,1,0.49,0.44,0.71,0.61,0.44,...,0.54,0.76,0.35,0.63,0.41,0.45,0.54,0.65,0.66,0.69
3,505942,2015-03-20,0.61,0.65,1,0.48,0.43,0.7,0.6,0.43,...,0.54,0.76,0.34,0.62,0.4,0.44,0.53,0.62,0.63,0.66
4,505942,2007-02-22,0.61,0.65,1,0.48,0.43,0.7,0.6,0.43,...,0.54,0.76,0.34,0.62,0.4,0.44,0.53,0.62,0.63,0.66


In [5]:
# Confirm no NaN: print True for each dataframe that has no missing value
# (match dataframe first, then player dataframe).
for frame in (df_match, df_player):
    print(not frame.isna().values.any())

True
True


In [6]:
# Build the list of columns that store the api ids.
# In df_match these are the field-cell columns named "x<i>,y<j>"; each cell
# holds the player_api_id placed at that spot, or 0 when the spot is empty.
# (No column contains "_player_", so filtering on that substring selects
# nothing and the check below would pass without looking at any data.)
player_api_cols = [col for col in df_match.columns
                   if col.startswith("x") and ",y" in col]

# Guard against a vacuous check: with no columns the loop never runs.
assert player_api_cols, "No field-position columns found in df_match"

# Confirm every api id within match is in player
known_player_ids = df_player['player_api_id'].unique()
for col_name in player_api_cols:
    cell_ids = df_match[col_name].values
    # 0 marks an empty field cell, not a player, so leave it out of the check.
    occupied_ids = cell_ids[cell_ids != 0]
    comp_mask = np.isin(occupied_ids, known_player_ids, invert=True)
    if comp_mask.any():
        print("API for Match is not a subset of Player!!")
        break
print("Done.")

Done.


In [7]:
# Show the column names, then the (rows, columns) shape, of the player dataframe.
for summary in (df_player.columns, df_player.shape):
    print(summary)

Index([u'player_api_id', u'date', u'overall_rating', u'potential',
       u'preferred_foot', u'crossing', u'finishing', u'heading_accuracy',
       u'short_passing', u'volleys', u'dribbling', u'curve', u'long_passing',
       u'ball_control', u'acceleration', u'sprint_speed', u'agility',
       u'reactions', u'balance', u'shot_power', u'jumping', u'stamina',
       u'strength', u'long_shots', u'aggression', u'interceptions',
       u'positioning', u'vision', u'marking', u'standing_tackle',
       u'sliding_tackle'],
      dtype='object')
(180354, 31)


In [8]:
# Show the column names, then the (rows, columns) shape, of the match dataframe.
for summary in (df_match.columns, df_match.shape):
    print(summary)

Index([u'date', u'home_team_goal', u'away_team_goal', u'x0,y0', u'x0,y1',
       u'x0,y2', u'x0,y3', u'x0,y4', u'x0,y5', u'x0,y6',
       ...
       u'x8,y8', u'x8,y9', u'x8,y10', u'x8,y11', u'x8,y12', u'x8,y13',
       u'x8,y14', u'x8,y15', u'x8,y16', u'x8,y17'],
      dtype='object', length=165)
(18251, 165)


The code below is a really inefficient way to insert the player attributes, but I could not find a vectorized implementation that does this, mainly because:

1. For each match, we need to find the player attributes closest to when the match took place, hence we need to find the min of the difference in dates.
2. Properly insert the player attributes in the correct location on the field

In [10]:
from __future__ import print_function

samples = df_match.shape[0]
input_features = torch.zeros(samples, 29, 9, 18)

for index, row in df_match.iterrows():
    print(index, end='\r')
    
    field_cols = df_match.columns[3:]
    for loc_col_name, p_api_id in row[field_cols].iteritems():
        if p_api_id == 0.0:
            continue
        
        pa = loc_col_name.split(',')
        x = int(pa[0][1:])
        y = int(pa[1][1:])
        
        
        match_date = row['date']
        min_diff = np.timedelta64(365 * 100, 'D')
        min_index = None
        
        for row_index, row in df_player.loc[df_player['player_api_id'] == p_api_id, :].iterrows():
            player_diff = abs(row['date'] - match_date)
            if player_diff < min_diff:
                min_diff = player_diff
                min_index = row_index
        
        # Knowing this min_index, store the player attributes from this index
        input_features[index, :, x, y] = torch.FloatTensor(df_player.iloc[min_index].values[2:].astype(np.double))     

18250

In [11]:
# Encode the outcome of each match as a class label derived from the goal
# difference (home goals minus away goals):
#   0 = draw, 1 = home team win, 2 = away team win.
goal_margin = df_match['home_team_goal'] - df_match['away_team_goal']
df_match['match_result'] = (
    goal_margin
    .mask(goal_margin > 0, 1)   # home team scored more
    .mask(goal_margin < 0, 2)   # away team scored more; draws stay at 0
)

In [12]:
# Preview the match dataframe again, now with the match_result column added.
df_match.head()

Unnamed: 0,date,home_team_goal,away_team_goal,"x0,y0","x0,y1","x0,y2","x0,y3","x0,y4","x0,y5","x0,y6",...,"x8,y9","x8,y10","x8,y11","x8,y12","x8,y13","x8,y14","x8,y15","x8,y16","x8,y17",match_result
0,2009-03-07,2,0,0.0,0,0,0,0,0,0,...,0.0,0.0,0,0,0,0,0,0,0.0,1
1,2009-03-13,2,1,0.0,0,0,0,0,0,0,...,0.0,0.0,0,0,0,0,0,0,0.0,1
2,2009-04-12,1,0,0.0,0,0,0,0,0,0,...,0.0,0.0,0,0,0,0,0,0,0.0,1
3,2009-05-02,0,0,0.0,0,0,0,0,0,0,...,0.0,0.0,0,0,0,0,0,0,0.0,0
4,2009-05-16,0,1,0.0,0,0,0,0,0,0,...,0.0,0.0,0,0,0,0,0,0,0.0,2


In [13]:
# Pack the per-match class labels (match_result column) into a LongTensor.
labels = torch.LongTensor(df_match['match_result'].values)

In [14]:
# Show the shape of the label tensor.
labels.shape

torch.Size([18251])

In [15]:
# Spot-check a single label: the entry at position 4.
labels[4]

tensor(2)

In [16]:
# Write the features and labels to file for future use
for tensor, out_path in (
    (input_features, "match_wplayer_attrib_features.pkl"),
    (labels, "match_wplayer_attrib_labels.pkl"),
):
    torch.save(tensor, out_path)