In [1]:
import os
import pandas as pd

In [2]:
# Move into the shared data directory first so the table can be read by its bare file name.
os.chdir('../data')
table_name = 'temptable.csv'
df = pd.read_csv(table_name)

In [3]:
# Preview five randomly chosen rows of the loaded table (no seed is set, so rows differ per run).
df.sample(5)

Unnamed: 0,player_stats_id,season_id,team_id,player_id,games,points,goals,assists,penalty,p_m,...,player_height,player_weight,player_site_id,player_age,player_name,player_unicode_name,player_khl_id,player_nhl_id,season_year,season_type
18697,19920,71,88,3661,19,16,5,11,4,7,...,185.0,91.0,3665,37.0,Marián Gáborík,marian gaborik,,8468483,2013,regular
23010,24478,80,62,8884,81,23,6,17,40,3,...,,194.0,8889,35.0,Matt Stajan,matt stajan,,8470162,2016,regular
2855,3075,41,1479,8989,10,4,1,3,4,-1,...,187.0,92.0,8993,43.0,Mike Leclerc,mike leclerc,,8462084,2003,regular
20070,21366,50,72,8995,75,12,4,8,36,1,...,185.0,91.0,9000,43.0,Brad Lukowich,brad lukowich,,8460580,2006,regular
9316,9994,5,86,54460,46,17,4,13,85,-32,...,188.0,95.0,54408,50.0,Jayson More,jayson more,,8449694,1991,regular


In [4]:
class DataFrameTransformer:
    """Applies an ordered list of single-argument transforms to a stored frame.

    Each transform is a callable that takes the current frame and returns the
    next one. Transforms run in the order they were registered.
    """

    def __init__(self, df):
        # The frame being transformed; overwritten by every step of fit().
        self.df = df
        # Registered callables, in application order.
        self.transforms = []

    def add_transform(self, transform):
        """Register one transform at the end of the pipeline."""
        self.transforms.append(transform)

    def add_transforms(self, transforms):
        """Register every transform from an iterable, keeping its order."""
        for step in transforms:
            self.add_transform(step)

    def fit(self):
        """Run all registered transforms on the stored frame and return the result.

        The stored frame is replaced after each step, so calling fit() again
        re-applies the whole pipeline to the already transformed frame.
        """
        for step in self.transforms:
            self.df = step(self.df)
        return self.df

In [5]:
# Example: add a column with each player's total points

def add_total_points(df):
    """Add a ``total_points`` column holding each player's summed ``points``.

    Every row receives the sum of ``points`` over all rows sharing its
    ``player_id``. The column is written onto the passed frame itself
    (the input is modified in place) and that same frame is returned.
    """
    per_player_total = df.groupby('player_id')['points'].transform('sum')
    df['total_points'] = per_player_total
    return df

# ...

In [6]:
# To extend the transformer, write a transform function and register it on the transformer below.

transformer = DataFrameTransformer(df)
transformer.add_transforms([add_total_points])
ext_df = transformer.fit()

In [7]:
# Spot-check the new total_points column on five randomly chosen rows.
ext_df[['player_id', 'total_points']].sample(5)

Unnamed: 0,player_id,total_points
5806,13396,3
15019,50167,29
5577,9026,369
15013,12772,7
8377,52537,110


## Features for teams

In [8]:
# Make sure the working directory is the data directory, then read the team statistics by file name.
os.chdir('../data')
table_name = 'team_stats.csv'
team_stats_df = pd.read_csv(table_name)

In [9]:
# Preview five randomly chosen rows of the team statistics table.
team_stats_df.sample(5)

Unnamed: 0,id,team_id,season_id,games,points,goals_scored,goals_missed,position_in_championship,position_in_conference,playoff_fact,position_in_division,nhl_id
16793,17995,216,92,38,42.0,106,102,12,7.0,False,3.0,
4898,5244,649,91,37,,3,69,23,,True,,
3567,3815,724,91,36,,3,94,10,,True,,
849,897,6116,91,37,,2,103,5,,True,,
25196,27439,216,92,38,42.0,106,102,12,7.0,False,3.0,


In [10]:
import numpy as np

# transform methods for seasons
def add_avg_games_for_season(df, team_stats_df):
    """Attach the per-season mean of ``games`` as ``feature_mean_games_count_season``.

    The mean is computed over the rows of ``team_stats_df`` for each
    ``season_id`` and left-joined onto ``df`` by its ``season_id`` column.
    A new frame is returned; ``df`` itself is not modified.
    """
    season_mean_games = team_stats_df.groupby('season_id')['games'].mean()
    # rsuffix keeps the join from failing when df already has a 'games' column.
    enriched = df.join(season_mean_games, on='season_id', rsuffix='_')
    # The joined aggregate is the last column; relabel only that one.
    kept_labels = np.array(enriched.columns[:-1])
    enriched.columns = np.append(kept_labels, np.array(['feature_mean_games_count_season']))
    return enriched

def add_avg_points_for_season(df, team_stats_df):
    """Attach the per-season mean of ``points`` as ``feature_mean_points_season``.

    The mean is taken over the rows of ``team_stats_df`` for each
    ``season_id`` and left-joined onto ``df`` by ``season_id``.
    Returns a new frame.
    """
    season_mean_points = team_stats_df.groupby('season_id')['points'].mean()
    # rsuffix avoids a name clash if df already carries a 'points' column.
    enriched = df.join(season_mean_points, on='season_id', rsuffix='_')
    # Rename the freshly joined (last) column to the feature name.
    kept_labels = np.array(enriched.columns[:-1])
    enriched.columns = np.append(kept_labels, np.array(['feature_mean_points_season']))
    return enriched

def add_sum_points_for_season(df, team_stats_df):
    """Attach the per-season sum of ``points`` as ``feature_sum_points_season``.

    The sum is taken over the rows of ``team_stats_df`` for each
    ``season_id`` and left-joined onto ``df`` by ``season_id``.
    Returns a new frame.
    """
    season_total_points = team_stats_df.groupby('season_id')['points'].sum()
    # rsuffix avoids a name clash if df already carries a 'points' column.
    enriched = df.join(season_total_points, on='season_id', rsuffix='_')
    # Rename the freshly joined (last) column to the feature name.
    kept_labels = np.array(enriched.columns[:-1])
    enriched.columns = np.append(kept_labels, np.array(['feature_sum_points_season']))
    return enriched

def add_team_count_for_season(df, team_stats_df):
    """Attach the number of distinct teams per season as ``feature_team_count_season``.

    Distinct ``team_id`` values are counted per ``season_id`` in
    ``team_stats_df`` (a missing ``team_id`` counts as one value, matching
    ``len(unique())``) and merged onto ``df`` by ``season_id``.

    NOTE: unlike the join-based helpers in this notebook, this uses
    ``DataFrame.merge`` with its default inner join. Rows of ``df`` whose
    ``season_id`` does not occur in ``team_stats_df`` are dropped, and the
    result gets a fresh 0..n-1 index.

    Returns a new frame; ``df`` is not modified.
    """
    # Count directly on the team_id column instead of applying a Python
    # lambda to every group sub-frame.
    team_counts = (
        team_stats_df.groupby('season_id')['team_id']
        .nunique(dropna=False)
        .rename('feature_team_count_season')
    )
    return df.merge(team_counts, on='season_id')

# transform methods for teams
def add_avg_games_for_team(df, team_stats_df):
    """Attach the per-team mean of ``games`` as ``feature_mean_games_count_team``.

    The mean is taken over the rows of ``team_stats_df`` for each
    ``team_id`` and left-joined onto ``df`` by ``team_id``.
    Returns a new frame.
    """
    team_mean_games = team_stats_df.groupby('team_id')['games'].mean()
    # rsuffix avoids a name clash if df already carries a 'games' column.
    enriched = df.join(team_mean_games, on='team_id', rsuffix='_')
    # Rename the freshly joined (last) column to the feature name.
    kept_labels = np.array(enriched.columns[:-1])
    enriched.columns = np.append(kept_labels, np.array(['feature_mean_games_count_team']))
    return enriched

def add_avg_points_for_team(df, team_stats_df):
    """Attach the per-team mean of ``points`` as ``feature_mean_points_team``.

    The mean is taken over the rows of ``team_stats_df`` for each
    ``team_id`` and left-joined onto ``df`` by ``team_id``.
    Returns a new frame.
    """
    team_mean_points = team_stats_df.groupby('team_id')['points'].mean()
    # rsuffix avoids a name clash if df already carries a 'points' column.
    enriched = df.join(team_mean_points, on='team_id', rsuffix='_')
    # Rename the freshly joined (last) column to the feature name.
    kept_labels = np.array(enriched.columns[:-1])
    enriched.columns = np.append(kept_labels, np.array(['feature_mean_points_team']))
    return enriched

def add_sum_points_for_team(df, team_stats_df):
    """Attach the per-team sum of ``points`` as ``feature_sum_points_team``.

    The sum is taken over the rows of ``team_stats_df`` for each
    ``team_id`` and left-joined onto ``df`` by ``team_id``.
    Returns a new frame.
    """
    team_total_points = team_stats_df.groupby('team_id')['points'].sum()
    # rsuffix avoids a name clash if df already carries a 'points' column.
    enriched = df.join(team_total_points, on='team_id', rsuffix='_')
    # Rename the freshly joined (last) column to the feature name.
    kept_labels = np.array(enriched.columns[:-1])
    enriched.columns = np.append(kept_labels, np.array(['feature_sum_points_team']))
    return enriched


In [11]:
class DataFrameTransformer:
    """Applies an ordered list of two-argument transforms to a stored frame.

    Each transform is called as ``transform(df, team_stats_df)`` and must
    return the next version of the frame. Transforms run in the order they
    were registered.
    """

    def __init__(self, df, team_stats_df):
        # The frame being enriched; overwritten by every step of fit().
        self.df = df
        # Auxiliary table handed unchanged to every transform.
        self.team_stats_df = team_stats_df
        # Registered callables, in application order.
        self.transforms = []

    def add_transform(self, transform):
        """Register one transform at the end of the pipeline."""
        self.transforms.append(transform)

    def add_transforms(self, transforms):
        """Register every transform from an iterable, keeping its order."""
        for transform in transforms:
            self.add_transform(transform)

    def fit(self):
        """Run all registered transforms on the stored frame and return the result.

        The stored frame is replaced after each step, so calling fit() again
        re-applies the whole pipeline to the already transformed frame.
        """
        # Iterate the instance's own list; the previous code read a global
        # named `transforms`, so it ignored what was registered on the
        # instance and failed whenever that global was absent.
        for transform in self.transforms:
            self.df = transform(self.df, self.team_stats_df)
        return self.df

In [12]:
# Season-level features first, then team-level features, in application order.
transforms = [
    add_avg_games_for_season,
    add_avg_points_for_season,
    add_sum_points_for_season,
    add_team_count_for_season,
    add_avg_games_for_team,
    add_avg_points_for_team,
    add_sum_points_for_team,
]
transformer = DataFrameTransformer(df, team_stats_df)
transformer.add_transforms(transforms)
ext_df = transformer.fit()

In [13]:
# Show only the first rows instead of rendering the whole frame.
ext_df.head(10)

Unnamed: 0,player_stats_id,season_id,team_id,player_id,games,points,goals,assists,penalty,p_m,...,season_year,season_type,total_points,feature_mean_games_count_season,feature_mean_points_season,feature_sum_points_season,feature_team_count_season,feature_mean_games_count_team,feature_mean_points_team,feature_sum_points_team
0,0,41,1479,8517,21,6,0,6,12,1,...,2003,regular,619,82.0,87.370370,2359.0,27,79.360000,86.280000,2157.0
1,18,41,78,9665,20,4,2,2,4,-9,...,2003,regular,11,82.0,87.370370,2359.0,27,79.571429,91.142857,2552.0
2,38,41,82,10673,2,0,0,0,0,0,...,2003,regular,5,82.0,87.370370,2359.0,27,80.111111,78.833333,1419.0
3,48,41,86,8658,69,19,2,17,12,8,...,2003,regular,132,82.0,87.370370,2359.0,27,79.555556,85.629630,2312.0
4,54,41,77,8833,1,0,0,0,0,0,...,2003,regular,71,82.0,87.370370,2359.0,27,79.571429,89.892857,2517.0
5,76,41,82,42376,58,26,6,20,26,-24,...,2003,regular,380,82.0,87.370370,2359.0,27,80.111111,78.833333,1419.0
6,78,41,74,8939,58,6,2,4,16,-10,...,2003,regular,53,82.0,87.370370,2359.0,27,79.571429,86.607143,2425.0
7,100,41,66,3652,81,31,11,20,40,-6,...,2003,regular,974,82.0,87.370370,2359.0,27,80.380952,83.476190,1753.0
8,116,41,68,3655,55,22,14,8,57,-5,...,2003,regular,421,82.0,87.370370,2359.0,27,79.360000,78.840000,1971.0
9,126,41,82,8615,57,34,13,21,54,-11,...,2003,regular,261,82.0,87.370370,2359.0,27,80.111111,78.833333,1419.0


## Features for seasons

In [75]:
import pandas as pd
import os
import numpy as np

### Let's initialize all of the tables

In [102]:
# Read the per-player statistics and the season table from the data directory.
player_stats_df = pd.read_csv('../data/player_stats.csv')
season_df = pd.read_csv('../data/season.csv')

In [103]:
# Start the season features from the identifier columns only.
# NOTE: this rebinds `df`, replacing the full table loaded at the top of the notebook.
temptable = pd.read_csv('../data/temptable.csv')
df = temptable.loc[:, ['player_stats_id', 'season_id', 'team_id', 'player_id']].copy()
df.sample(5)

Unnamed: 0,player_stats_id,season_id,team_id,player_id
8631,9286,8,81,14033
5039,5425,47,82,9153
8770,9425,71,75,5367
2605,2808,56,64,11585
377,400,53,66,9140


In [104]:
class DataFrameTransformer_season:
    """Applies an ordered list of two-argument transforms to a stored frame.

    Each transform is called as ``transform(df, player_stats_df)`` and must
    return the next version of the frame. Transforms run in the order they
    were registered.
    """

    def __init__(self, df, player_stats_df):
        # The frame being enriched; overwritten by every step of fit().
        self.df = df
        # Auxiliary table handed unchanged to every transform.
        self.player_stats_df = player_stats_df
        # Registered callables, in application order.
        self.transforms = []

    def add_transform(self, transform):
        """Register one transform at the end of the pipeline."""
        self.transforms.append(transform)

    def add_transforms(self, transforms):
        """Register every transform from an iterable, keeping its order."""
        for transform in transforms:
            self.add_transform(transform)

    def fit(self):
        """Run all registered transforms on the stored frame and return the result.

        The stored frame is replaced after each step, so calling fit() again
        re-applies the whole pipeline to the already transformed frame.
        """
        # Iterate the instance's own list; the previous code read a global
        # named `transforms`, so it could silently run a stale pipeline
        # left over from another cell.
        for transform in self.transforms:
            self.df = transform(self.df, self.player_stats_df)
        return self.df

In [105]:
# Average points per season
def avg_points_per_season(df, player_stats_df):
    """Attach the per-season mean of ``points`` as ``feature_mean_points_season``.

    The mean is taken over the rows of ``player_stats_df`` for each
    ``season_id`` and left-joined onto ``df`` by ``season_id``.
    Returns a new frame.
    """
    season_mean_points = player_stats_df.groupby('season_id')['points'].mean()
    # rsuffix avoids a name clash if df already carries a 'points' column.
    enriched = df.join(season_mean_points, on='season_id', rsuffix='_')
    # Rename the freshly joined (last) column to the feature name.
    kept_labels = np.array(enriched.columns[:-1])
    enriched.columns = np.append(kept_labels, np.array(['feature_mean_points_season']))
    return enriched

# Median points per season
def med_points_per_season(df, player_stats_df):
    """Attach the per-season median of ``points`` as ``feature_median_points_season``.

    The median is taken over the rows of ``player_stats_df`` for each
    ``season_id`` and left-joined onto ``df`` by ``season_id``.
    Returns a new frame; ``df`` is not modified.
    """
    season_median_points = player_stats_df.groupby('season_id')['points'].median()
    # rsuffix avoids a name clash if df already carries a 'points' column.
    enriched = df.join(season_median_points, on='season_id', rsuffix='_')
    # Rename the freshly joined (last) column. The label was previously
    # misspelled 'featrue_...', which broke the shared 'feature_' prefix.
    kept_labels = np.array(enriched.columns[:-1])
    enriched.columns = np.append(kept_labels, np.array(['feature_median_points_season']))
    return enriched

# Average goals per season
def avg_goals_per_season(df, player_stats_df):
    """Attach the per-season mean of ``goals`` as ``feature_mean_goals_season``.

    The mean is taken over the rows of ``player_stats_df`` for each
    ``season_id`` and left-joined onto ``df`` by ``season_id``.
    Returns a new frame.
    """
    season_mean_goals = player_stats_df.groupby('season_id')['goals'].mean()
    # rsuffix avoids a name clash if df already carries a 'goals' column.
    enriched = df.join(season_mean_goals, on='season_id', rsuffix='_')
    # Rename the freshly joined (last) column to the feature name.
    kept_labels = np.array(enriched.columns[:-1])
    enriched.columns = np.append(kept_labels, np.array(['feature_mean_goals_season']))
    return enriched

# Sum of squares of goals per season
def sum_sq_goals_per_season(df, player_stats_df):
    """Attach the per-season sum of squared ``goals`` as ``feature_sumofsquares_goals_season``.

    Each ``goals`` value in ``player_stats_df`` is squared, the squares are
    summed per ``season_id``, and the result is left-joined onto ``df`` by
    ``season_id``. Returns a new frame; ``df`` is not modified.
    """
    # Square first, then group and sum. The previous code summed *after*
    # the grouped apply, collapsing everything to one scalar that cannot
    # be joined by season.
    squared_goals = player_stats_df['goals'] ** 2
    season_sum_sq = squared_goals.groupby(player_stats_df['season_id']).sum()
    # rsuffix avoids a name clash if df already carries a 'goals' column.
    enriched = df.join(season_sum_sq, on='season_id', rsuffix='_')
    # Rename the freshly joined (last) column to the feature name.
    kept_labels = np.array(enriched.columns[:-1])
    enriched.columns = np.append(kept_labels, np.array(['feature_sumofsquares_goals_season']))
    return enriched

# Average assists per season
def avg_assists_per_season(df, player_stats_df):
    """Attach the per-season mean of ``assists`` as ``feature_mean_assists_season``.

    The mean is taken over the rows of ``player_stats_df`` for each
    ``season_id`` and left-joined onto ``df`` by ``season_id``.
    Returns a new frame.
    """
    season_mean_assists = player_stats_df.groupby('season_id')['assists'].mean()
    # rsuffix avoids a name clash if df already carries an 'assists' column.
    enriched = df.join(season_mean_assists, on='season_id', rsuffix='_')
    # Rename the freshly joined (last) column to the feature name.
    kept_labels = np.array(enriched.columns[:-1])
    enriched.columns = np.append(kept_labels, np.array(['feature_mean_assists_season']))
    return enriched

# Sum of squares assists per season
def sum_sq_assists_per_season(df, player_stats_df):
    """Attach the per-season sum of squared ``assists`` as ``feature_sumofsquares_assists_season``.

    Each ``assists`` value in ``player_stats_df`` is squared, the squares
    are summed per ``season_id``, and the result is left-joined onto ``df``
    by ``season_id``. Returns a new frame; ``df`` is not modified.
    """
    # Square first, then group and sum. The previous code summed *after*
    # the grouped apply, collapsing everything to one scalar that cannot
    # be joined by season.
    squared_assists = player_stats_df['assists'] ** 2
    season_sum_sq = squared_assists.groupby(player_stats_df['season_id']).sum()
    # rsuffix avoids a name clash if df already carries an 'assists' column.
    enriched = df.join(season_sum_sq, on='season_id', rsuffix='_')
    # Rename the freshly joined (last) column to the feature name.
    kept_labels = np.array(enriched.columns[:-1])
    enriched.columns = np.append(kept_labels, np.array(['feature_sumofsquares_assists_season']))
    return enriched

# Sum of penalty per season
def sum_penalty_per_season(df, player_stats_df):
    """Attach the per-season sum of ``penalty`` as ``feature_sum_penalty_season``.

    The sum is taken over the rows of ``player_stats_df`` for each
    ``season_id`` and left-joined onto ``df`` by ``season_id``.
    Returns a new frame.
    """
    season_total_penalty = player_stats_df.groupby('season_id')['penalty'].sum()
    # rsuffix avoids a name clash if df already carries a 'penalty' column.
    enriched = df.join(season_total_penalty, on='season_id', rsuffix='_')
    # Rename the freshly joined (last) column to the feature name.
    kept_labels = np.array(enriched.columns[:-1])
    enriched.columns = np.append(kept_labels, np.array(['feature_sum_penalty_season']))
    return enriched