In [53]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm
tqdm.pandas()
pd.options.display.max_columns = None

In [4]:
# Load the raw match tables. Integer id/score columns are downcast at read
# time to keep the frames small.
clubs = pd.read_csv('../data/games/clubs.csv', index_col=0)
leagues = pd.read_csv('../data/games/leagues.csv', index_col=0)
games = pd.read_csv(
    '../data/games/games.csv',
    index_col=0,
    dtype={
        'game_id': np.int32,
        'season': np.int16,
        'home_club_id': np.int32,
        # This key was previously a second 'home_club_id'; a dict literal
        # silently keeps only the last duplicate, so the away id was never
        # downcast.
        'away_club_id': np.int32,
        'home_club_goals': np.int8,
        'away_club_goals': np.int8,
    },
)

players = pd.read_csv(
    '../data/games/players.csv',
    index_col=0,
    dtype={'player_id': np.int32, 'player_club_id': np.int16},
)

appearance = pd.read_csv('../data/games/appearance.csv', index_col=0)


In [5]:
# Read the prepared training matrix, then store every column except the first
# and the last one as float16.
train_df = pd.read_csv("../data/training_data/train_final.csv", index_col=0)
train_df = train_df.astype(dict.fromkeys(train_df.columns[1:-1], np.float16))



In [13]:
# Parse the date column so that games can be compared and sorted chronologically.
games['date'] = pd.to_datetime(games['date'])

In [15]:
# Preview the first training row.
train_df.iloc[:1]

Unnamed: 0,game_id,H_GK,H_GK_overall,H_GK_potential,H_GK_attacking_crossing,H_GK_attacking_heading_accuracy,H_GK_skill_curve,H_GK_skill_fk_accuracy,H_GK_movement_agility,H_GK_movement_reactions,...,A_MF_movement_reactions,A_MF_movement_balance,A_MF_power_shot_power,A_MF_power_jumping,A_MF_power_stamina,A_MF_mentality_aggression,A_MF_mentality_positioning,A_MF_mentality_penalties,A_MF_mentality_composure,Home_result
0,2457642,1.0,70.0,70.0,25.0,25.0,25.0,25.0,36.0,65.0,...,75.25,73.5,73.5625,63.375,78.8125,59.28125,74.75,67.125,0.0,1


In [29]:
# Discard the columns that the history features below never read.
games = games.drop(
    columns=[
        'competition_code',
        'season',
        'round',
        'home_club_position',
        'away_club_position',
        'stadium',
        'attendance',
        'referee',
        'url',
    ]
)

appearance = appearance.drop(
    columns=[
        'player_id',
        'appearance_id',
        'competition_id',
        'goals',
        'assists',
        'minutes_played',
    ]
)

In [91]:
# Downcast the club id and card counters to compact integer types.
appearance = appearance.astype(
    {
        'player_club_id': np.int32,
        'yellow_cards': np.int8,
        'red_cards': np.int8,
    }
)

In [84]:
def get_home_history(game_id, n_games=10):
    """Summarise the recent form of the home club of ``game_id``.

    The game is looked up in the module-level ``games`` frame. The home
    club's most recent ``n_games`` matches played strictly before that game's
    date (as home or away side) are then aggregated, together with the cards
    recorded for that club in the module-level ``appearance`` frame.

    Wins, draws and losses are derived from the score line from the club's
    own point of view (an away win counts as a win). A single per-game result
    label cannot describe the outcome for both clubs, so no precomputed
    result column is used.

    Parameters
    ----------
    game_id : int
        Identifier of the game whose home club is analysed.
    n_games : int, default 10
        Maximum number of previous matches to aggregate.

    Returns
    -------
    pandas.Series
        Keys ``H_wins``, ``H_draws``, ``H_loss``, ``H_goal_scored``,
        ``H_goals_conceded``, ``H_yellow_cards`` and ``H_red_cards``. All
        values are 0 when the club has no earlier match.

    Raises
    ------
    IndexError
        If ``game_id`` is not present in ``games``.
    """
    target_game = games.loc[games.game_id == game_id]
    if target_game.empty:
        raise IndexError(f"game_id {game_id!r} not found in games")
    target_club = target_game['home_club_id'].iloc[0]
    kickoff = target_game['date'].iloc[0]

    # Most recent matches of the club before the target game, newest first.
    involved = (games.home_club_id == target_club) | (games.away_club_id == target_club)
    recent = (
        games.loc[involved & (games.date < kickoff)]
        .sort_values('date', ascending=False)
        .head(n_games)
    )

    # Re-express each score line as goals for / against the target club.
    played_at_home = recent.home_club_id == target_club
    goals_for = recent['home_club_goals'].where(played_at_home, recent['away_club_goals'])
    goals_against = recent['away_club_goals'].where(played_at_home, recent['home_club_goals'])

    # Only the club's own players count towards its card totals.
    cards = appearance.loc[
        appearance.game_id.isin(recent.game_id.values)
        & (appearance.player_club_id == target_club),
        ['yellow_cards', 'red_cards'],
    ].sum()

    res = {
        'H_wins': int((goals_for > goals_against).sum()),
        'H_draws': int((goals_for == goals_against).sum()),
        'H_loss': int((goals_for < goals_against).sum()),
        'H_goal_scored': goals_for.sum(),
        'H_goals_conceded': goals_against.sum(),
        'H_yellow_cards': cards['yellow_cards'],
        'H_red_cards': cards['red_cards'],
    }

    return pd.Series(res)
    
    


In [102]:
def get_away_history(game_id, n_games=10):
    """Summarise the recent form of the away club of ``game_id``.

    The game is looked up in the module-level ``games`` frame. The away
    club's most recent ``n_games`` matches played strictly before that game's
    date (as home or away side) are then aggregated, together with the cards
    recorded for that club in the module-level ``appearance`` frame.

    Wins, draws and losses are derived from the score line from the club's
    own point of view (an away win counts as a win). A single per-game result
    label cannot describe the outcome for both clubs, so no precomputed
    result column is used.

    Parameters
    ----------
    game_id : int
        Identifier of the game whose away club is analysed.
    n_games : int, default 10
        Maximum number of previous matches to aggregate.

    Returns
    -------
    pandas.Series
        Keys ``A_wins``, ``A_draws``, ``A_loss``, ``A_goal_scored``,
        ``A_goals_conceded``, ``A_yellow_cards`` and ``A_red_cards``. All
        values are 0 when the club has no earlier match.

    Raises
    ------
    IndexError
        If ``game_id`` is not present in ``games``.
    """
    target_game = games.loc[games.game_id == game_id]
    if target_game.empty:
        raise IndexError(f"game_id {game_id!r} not found in games")
    target_club = target_game['away_club_id'].iloc[0]
    kickoff = target_game['date'].iloc[0]

    # Most recent matches of the club before the target game, newest first.
    involved = (games.home_club_id == target_club) | (games.away_club_id == target_club)
    recent = (
        games.loc[involved & (games.date < kickoff)]
        .sort_values('date', ascending=False)
        .head(n_games)
    )

    # Re-express each score line as goals for / against the target club.
    played_at_home = recent.home_club_id == target_club
    goals_for = recent['home_club_goals'].where(played_at_home, recent['away_club_goals'])
    goals_against = recent['away_club_goals'].where(played_at_home, recent['home_club_goals'])

    # Only the club's own players count towards its card totals.
    cards = appearance.loc[
        appearance.game_id.isin(recent.game_id.values)
        & (appearance.player_club_id == target_club),
        ['yellow_cards', 'red_cards'],
    ].sum()

    res = {
        'A_wins': int((goals_for > goals_against).sum()),
        'A_draws': int((goals_for == goals_against).sum()),
        'A_loss': int((goals_for < goals_against).sum()),
        'A_goal_scored': goals_for.sum(),
        'A_goals_conceded': goals_against.sum(),
        'A_yellow_cards': cards['yellow_cards'],
        'A_red_cards': cards['red_cards'],
    }

    return pd.Series(res)
    
    


In [92]:
# Compute the home-club history features for every training game, with a
# progress bar.
home_features = train_df['game_id'].progress_apply(get_home_history)

100%|██████████| 29545/29545 [13:17<00:00, 37.06it/s]


In [103]:
# Compute the away-club history features for every training game, with a
# progress bar.
away_features = train_df['game_id'].progress_apply(get_away_history)

100%|██████████| 29545/29545 [13:22<00:00, 36.84it/s]


In [109]:
# Store the history counters as int8.
home_features = home_features.astype(
    dict.fromkeys(
        ['H_wins', 'H_draws', 'H_loss', 'H_goal_scored', 'H_goals_conceded',
         'H_yellow_cards', 'H_red_cards'],
        np.int8,
    )
)
away_features = away_features.astype(
    dict.fromkeys(
        ['A_wins', 'A_draws', 'A_loss', 'A_goal_scored', 'A_goals_conceded',
         'A_yellow_cards', 'A_red_cards'],
        np.int8,
    )
)

In [110]:
# Show both shapes side by side; the row counts should match before concatenating.
(away_features.shape, train_df.shape)

((29545, 7), (29545, 176))

In [112]:
# Place the history features in front of the original training columns,
# aligned on the shared row index.
train_more_features = pd.concat(
    (home_features, away_features, train_df),
    axis='columns',
)

In [113]:
# Write the enriched training set to disk.
train_more_features.to_csv(path_or_buf='../data/training_data/train_more_features.csv')