In [1]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm
tqdm.pandas()
pd.options.display.max_columns = None

In [2]:
# Match-level data: one row per game with both clubs and the final score.
games = pd.read_csv(
    '../data/games/games.csv',
    usecols=['game_id', 'date', 'home_club_id', 'away_club_id',
             'home_club_goals', 'away_club_goals'],
    dtype={
        'game_id': np.int32,
        'home_club_id': np.int32,
        'away_club_id': np.int32,
        'home_club_goals': np.int8,
        'away_club_goals': np.int8,
    },
)

# Player-level data: one row per player appearance with the cards received.
# game_id is read as a float because the column may hold missing values, which
# an integer dtype cannot store. float64 is used on purpose: float32 represents
# integers exactly only up to 2**24 (16,777,216), so larger ids would be
# silently rounded and would stop matching games.game_id.
appearance = pd.read_csv(
    '../data/games/appearance.csv',
    usecols=['game_id', 'player_club_id', 'yellow_cards', 'red_cards'],
    dtype={
        'game_id': np.float64,
        'player_club_id': np.int32,
        'yellow_cards': np.int8,
        'red_cards': np.int8,
    },
)


In [4]:
# Base training table; the first CSV column is used as the row index.
train_df = pd.read_csv("data/training_data/train_final.csv", index_col=0)

# Downcast every column except the first and the last one to float16
# (presumably to reduce memory use -- note that float16 is lossy).
float16_columns = train_df.columns[1:-1]
train_df = train_df.astype({column: np.float16 for column in float16_columns})


In [5]:
# Convert the date column to datetime values so matches can be compared and
# sorted chronologically.
games['date'] = pd.to_datetime(games['date'])

In [6]:
# Appearances without a game_id cannot be linked to a match: keep only the
# rows that have one and renumber the index from zero.
appearance = appearance[appearance['game_id'].notnull()].reset_index(drop=True)
# With the missing values gone, the ids can be stored as integers, the same
# dtype games.game_id is loaded with.
appearance = appearance.astype({'game_id': np.int32})

In [7]:
def _home_side_result(game):
    """Return the outcome of one match as seen by the HOME club: 'W', 'L' or 'D'."""
    if game['home_club_goals'] > game['away_club_goals']:
        return 'W'
    if game['home_club_goals'] < game['away_club_goals']:
        return 'L'
    return 'D'


# One label per match, computed row by row with a tqdm progress bar.
games['results'] = games.progress_apply(_home_side_result, axis=1)

100%|██████████| 42592/42592 [00:01<00:00, 33831.66it/s]


In [8]:
def get_home_history(game_id, n_games=10):
    """Summarise the recent form of the home club of a match.

    The match ``game_id`` is looked up in the module-level ``games`` frame.
    The home club's ``n_games`` most recent matches played strictly before
    that match's date (at home or away) are then aggregated, together with
    the cards recorded for the club in the module-level ``appearance`` frame.

    Wins, draws and losses are counted from the club's own point of view.
    They are derived from the goals rather than from ``games['results']``,
    because that column describes the outcome for the home side of each
    match and would be inverted for the matches the club played away.

    Parameters
    ----------
    game_id : int
        Identifier of the target match; it must be present in ``games``.
    n_games : int, default 10
        How many previous matches of the club to take into account.

    Returns
    -------
    pandas.Series
        ``H_wins``, ``H_draws``, ``H_loss``, ``H_goal_scored``,
        ``H_goals_conceded``, ``H_yellow_cards`` and ``H_red_cards``.
        Every value is 0 when the club has no earlier match in ``games``.

    Raises
    ------
    IndexError
        If ``game_id`` does not occur in ``games``.
    """
    target_game = games.loc[games.game_id == game_id, :]
    club_id = target_game['home_club_id'].values[0]
    match_date = target_game['date'].values[0]

    # Most recent matches of the club, newest first, excluding the target
    # match itself and anything played on the same date or later.
    involves_club = (games.home_club_id == club_id) | (games.away_club_id == club_id)
    recent_games = (
        games.loc[involves_club & (games.date < match_date), :]
        .sort_values('date', ascending=False)[:n_games]
    )

    # Per-match goals for and against, picking the right column depending on
    # whether the club was the home or the away side in that match.
    played_at_home = recent_games.home_club_id == club_id
    scored = recent_games.home_club_goals.where(played_at_home, recent_games.away_club_goals)
    conceded = recent_games.away_club_goals.where(played_at_home, recent_games.home_club_goals)

    # Cards shown to the club's own players in those matches.
    cards = appearance.loc[
        appearance.game_id.isin(recent_games.game_id.values)
        & (appearance.player_club_id == club_id),
        ['yellow_cards', 'red_cards'],
    ].sum()

    return pd.Series({
        'H_wins': int((scored > conceded).sum()),
        'H_draws': int((scored == conceded).sum()),
        'H_loss': int((scored < conceded).sum()),
        'H_goal_scored': scored.sum(),
        'H_goals_conceded': conceded.sum(),
        'H_yellow_cards': cards['yellow_cards'],
        'H_red_cards': cards['red_cards'],
    })
    
    


In [9]:
def get_away_history(game_id, n_games=10):
    """Summarise the recent form of the away club of a match.

    The match ``game_id`` is looked up in the module-level ``games`` frame.
    The away club's ``n_games`` most recent matches played strictly before
    that match's date (at home or away) are then aggregated, together with
    the cards recorded for the club in the module-level ``appearance`` frame.

    Wins, draws and losses are counted from the club's own point of view.
    They are derived from the goals rather than from ``games['results']``,
    because that column describes the outcome for the home side of each
    match and would be inverted for the matches the club played away.

    Parameters
    ----------
    game_id : int
        Identifier of the target match; it must be present in ``games``.
    n_games : int, default 10
        How many previous matches of the club to take into account.

    Returns
    -------
    pandas.Series
        ``A_wins``, ``A_draws``, ``A_loss``, ``A_goal_scored``,
        ``A_goals_conceded``, ``A_yellow_cards`` and ``A_red_cards``.
        Every value is 0 when the club has no earlier match in ``games``.

    Raises
    ------
    IndexError
        If ``game_id`` does not occur in ``games``.
    """
    target_game = games.loc[games.game_id == game_id, :]
    target_club = target_game['away_club_id'].values[0]
    match_date = target_game['date'].values[0]

    # Most recent matches of the club, newest first, excluding the target
    # match itself and anything played on the same date or later.
    involves_club = (games.home_club_id == target_club) | (games.away_club_id == target_club)
    recent_games = (
        games.loc[involves_club & (games.date < match_date), :]
        .sort_values('date', ascending=False)[:n_games]
    )

    # Per-match goals for and against, picking the right column depending on
    # whether the club was the home or the away side in that match.
    played_at_home = recent_games.home_club_id == target_club
    scored = recent_games.home_club_goals.where(played_at_home, recent_games.away_club_goals)
    conceded = recent_games.away_club_goals.where(played_at_home, recent_games.home_club_goals)

    # Cards shown to the club's own players in those matches.
    cards = appearance.loc[
        appearance.game_id.isin(recent_games.game_id.values)
        & (appearance.player_club_id == target_club),
        ['yellow_cards', 'red_cards'],
    ].sum()

    return pd.Series({
        'A_wins': int((scored > conceded).sum()),
        'A_draws': int((scored == conceded).sum()),
        'A_loss': int((scored < conceded).sum()),
        'A_goal_scored': scored.sum(),
        'A_goals_conceded': conceded.sum(),
        'A_yellow_cards': cards['yellow_cards'],
        'A_red_cards': cards['red_cards'],
    })
    
    


In [10]:
# Integer ids of the training matches, taken as an independent Series that
# keeps train_df's index.
game_id = train_df['game_id'].astype(np.int32).copy()

In [11]:
# Build the H_* history features: get_home_history is called once per training
# game_id (with a tqdm progress bar) and the returned Series become the rows of
# a DataFrame. Slow -- the recorded run took about 10 minutes for 29,545 ids.
home_features = game_id.progress_apply(get_home_history)

100%|██████████| 29545/29545 [10:14<00:00, 48.06it/s]


In [12]:
# Build the A_* history features: get_away_history is called once per training
# game_id (with a tqdm progress bar) and the returned Series become the rows of
# a DataFrame. Slow -- the recorded run took about 10 minutes for 29,545 ids.
away_features = game_id.progress_apply(get_away_history)

100%|██████████| 29545/29545 [10:15<00:00, 47.98it/s]


In [13]:
# Store every history feature as int8. This assumes each count stays within
# the int8 range (at most 127), which holds for totals over ten matches unless
# the data is unusual -- TODO confirm if the history window is enlarged.
home_feature_names = ['H_wins', 'H_draws', 'H_loss', 'H_goal_scored', 'H_goals_conceded',
                      'H_yellow_cards', 'H_red_cards']
away_feature_names = ['A_wins', 'A_draws', 'A_loss', 'A_goal_scored', 'A_goals_conceded',
                      'A_yellow_cards', 'A_red_cards']

home_features = home_features.astype(dict.fromkeys(home_feature_names, np.int8))
away_features = away_features.astype(dict.fromkeys(away_feature_names, np.int8))

In [14]:
# Sanity check before concatenating: the feature frame and the training table
# should have the same number of rows.
away_features.shape, train_df.shape

((29545, 7), (29545, 200))

In [15]:
# Put the home and away history features in front of the existing training
# columns. Column-wise concat aligns rows on the index; both feature frames
# were built from train_df's game_id column, so they carry train_df's index.
train_more_features = pd.concat([home_features,away_features,train_df],axis =1)

In [16]:
# Persist the enriched training table; the index is written as the first CSV
# column (to_csv default).
train_more_features.to_csv('data/training_data/train_more_features.csv')