# 데이터 전처리 방안 수정
- `lag_period`를 1,2,3으로 수정 
- `add_trend_feature` 함수를 수정
    - 생성 피쳐의 이름 변경 : `delta_item_cnt_month_lag_1` -> `delta_1_item_cnt_month`
    - period : [1] -> [1,2]
- trend feature를 추가 : `transaction_cnt`에 대해 생성
- 파생변수 생성시 `date_block_num` 사용시 "month" 사용 자제 -> `date`로 표기

# Module

In [1]:
import pandas as pd
import numpy as np
from itertools import product
import joblib
from sklearn.preprocessing import LabelEncoder

# Data

In [2]:
data_path = "./data/"

sales_train = pd.read_csv(data_path + 'sales_train.csv')
items = pd.read_csv(data_path + 'items.csv')
item_categories = pd.read_csv(data_path + 'item_categories.csv')
shops = pd.read_csv(data_path + 'shops.csv')

test = pd.read_csv(data_path + 'test.csv')
sub = pd.read_csv(data_path + 'sample_submission.csv')

print('Before Filter ShopID:', len(sales_train))
unique_test_shop_id = test['shop_id'].unique()
sales_train = sales_train[sales_train['shop_id'].isin(unique_test_shop_id)]
print('After Filter ShopID :', len(sales_train))

Before Filter ShopID: 2935849
After Filter ShopID : 2413246


In [3]:
def downcast(df, verbose=True):
    start_memory = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        dtype_name = df[col].dtype.name
        if dtype_name == 'object':
            pass
        elif dtype_name == 'bool':
            df[col] = df[col].astype('int8')
        elif dtype_name.startswith('int') or (df[col]%1==0).all():
            df[col] = pd.to_numeric(df[col], downcast='integer')
        elif dtype_name.startswith('float'):
            df[col] = pd.to_numeric(df[col], downcast='float')
    end_memory = df.memory_usage().sum() / 1024**2
    
    if verbose:
        print(f"Memory usage reduced from {start_memory:.2f} MB to {end_memory:.2f} MB")
        
    return df


data_files = [sales_train, items, shops, item_categories, test]
for file in data_files:
    file = downcast(file)

Memory usage reduced from 128.88 MB to 69.04 MB
Memory usage reduced from 0.51 MB to 0.23 MB
Memory usage reduced from 0.00 MB to 0.00 MB
Memory usage reduced from 0.00 MB to 0.00 MB
Memory usage reduced from 4.90 MB to 1.43 MB


In [4]:
train = []

for i in sales_train['date_block_num'].unique():
    all_shop = sales_train.loc[sales_train['date_block_num'] == i, 'shop_id'].unique()
    all_item = sales_train.loc[sales_train['date_block_num'] == i, 'item_id'].unique()
    train.append(np.array(list(product([i], all_shop, all_item))))

# idx features        
idx_features = ['date_block_num', 'shop_id', 'item_id']
train = pd.DataFrame(np.vstack(train), columns=idx_features)

test['date_block_num'] = 34

# Feature Engineering

## Outliers

In [5]:
print("Before : ",len(sales_train))
sales_train = sales_train[(sales_train['item_cnt_day'] > 0)&(sales_train['item_cnt_day'] < 1000)]
print("After item_cnt_day : ",len(sales_train))
sales_train = sales_train[(sales_train['item_price'] > 0)&(sales_train['item_price'] < 50000)]
print("After item_price : ",len(sales_train))

Before :  2413246
After item_cnt_day :  2407079
After item_price :  2407076


## Feature Generation

In [6]:
def add_mean_features(df: pd.DataFrame, groupby_features: list, mean_feature_list: list = None):
    col_name = [col for col in groupby_features if col != 'date_block_num']
    base_name = '_'.join([col.replace('_id', '') for col in col_name])
    feature_name = f'{base_name}_avg_sales_date'
    
    agg_rules = {
        feature_name: ('item_cnt_month', 'mean')
    }
            
    group = df.groupby(groupby_features).agg(**agg_rules).reset_index()
    
    df = df.merge(group, on=groupby_features, how='left')
    df = downcast(df, verbose=False)
    del group
    
    if mean_feature_list is not None:
        mean_feature_list.append(feature_name)
    
    return df


def add_lag_features(df: pd.DataFrame, key_features: list, lag_features_info: dict, lag_period: list):
    """
    lag_features_info: {'feature_name_1': clip_flag_1, 'feature_name_2': clip_flag_2, ...}
    """
    
    group_keys = [key for key in key_features if key != 'date_block_num']
    grouped = df.groupby(group_keys)
    
    all_lag_cols = []
    features_to_clip = []

    for lag_feature, clip_flag in lag_features_info.items():
        for i in lag_period:
            lag_feature_name = f'{lag_feature}_lag_{i}'
            df[lag_feature_name] = grouped[lag_feature].shift(i)
            
            all_lag_cols.append(lag_feature_name)
            if clip_flag:
                features_to_clip.append(lag_feature_name)

    df[all_lag_cols] = df[all_lag_cols].fillna(0)
    
    return df, features_to_clip

### shops

In [7]:
# 기존 처리 방식
sales_train.loc[sales_train['shop_id']==0, 'shop_id'] = 57
sales_train.loc[sales_train['shop_id']==1, 'shop_id'] = 58
sales_train.loc[sales_train['shop_id']==10, 'shop_id'] = 11
sales_train.loc[sales_train['shop_id']==39, 'shop_id'] = 40

test.loc[test['shop_id']==0, 'shop_id'] = 57
test.loc[test['shop_id']==1, 'shop_id'] = 58
test.loc[test['shop_id']==10, 'shop_id'] = 11
test.loc[test['shop_id']==39, 'shop_id'] = 40

shops['city'] = shops['shop_name'].str.split(' ').str[0]
shops.loc[shops['city'] == '!Якутск', 'city'] = 'Якутск'
shops['city'] = LabelEncoder().fit_transform(shops['city'])

shops = shops.drop(columns=["shop_name"])

### items

In [8]:
items = items.drop(columns=["item_name"])
items['first_sale_month'] = sales_train.groupby('item_id').agg({'date_block_num': 'min'})['date_block_num']
items = items.fillna(34)

### item_categories

In [9]:
def split_categories(df):
    def split_func(name):
        if '-' in name:
            return name.split('-', 1)
        else:
            return [name, name]
    
    split_result = df['item_category_name'].apply(lambda x: split_func(x))
    df['subtype'] = split_result.apply(lambda x: x[1].strip())
    return df


item_categories = split_categories(item_categories)
item_categories['subtype'] = LabelEncoder().fit_transform(item_categories['subtype'])
item_categories = item_categories.drop('item_category_name', axis=1)

### set up matrix

In [10]:
group = sales_train.groupby(idx_features).agg(
    item_cnt_month = ('item_cnt_day', 'sum'),
    transaction_cnt = ('item_cnt_day', 'count'),
    avg_item_price = ('item_price', 'mean'),
).reset_index()

train = train.merge(group, on=idx_features, how='left')
all_data = pd.concat([train, test.drop('ID', axis=1)], ignore_index=True, keys=idx_features)
all_data = all_data.fillna(0)

all_data = all_data.merge(shops, on='shop_id', how='left')
all_data = all_data.merge(items, on='item_id', how='left')
all_data = all_data.merge(item_categories, on='item_category_id', how='left')
all_data.head()

  all_data = pd.concat([train, test.drop('ID', axis=1)], ignore_index=True, keys=idx_features)


Unnamed: 0,date_block_num,shop_id,item_id,item_cnt_month,transaction_cnt,avg_item_price,city,item_category_id,first_sale_month,subtype
0,0,59,22154,1.0,1.0,999.0,30,37,0.0,1
1,0,59,2552,0.0,0.0,0.0,30,58,0.0,29
2,0,59,2554,0.0,0.0,0.0,30,58,0.0,29
3,0,59,2555,0.0,0.0,0.0,30,56,0.0,5
4,0,59,2564,0.0,0.0,0.0,30,59,0.0,42


In [11]:
del shops, items, item_categories, group

all_data = downcast(all_data)

Memory usage reduced from 512.64 MB to 159.68 MB


### mean features

In [12]:
clip_list = []

all_data = add_mean_features(all_data, ['date_block_num','shop_id'])
all_data = add_mean_features(all_data, ['date_block_num','shop_id','item_category_id'])

all_data = add_mean_features(all_data, ['date_block_num','item_id'])
all_data = add_mean_features(all_data, ['date_block_num','item_id', 'city'])

all_data = add_mean_features(all_data, ['date_block_num', 'item_category_id'])
all_data = add_mean_features(all_data, ['date_block_num', 'item_category_id', 'city'])

In [13]:
mean_feature_list = [col for col in all_data.columns if '_avg_sales_date' in col]
mean_feature_list

['shop_avg_sales_date',
 'shop_item_category_avg_sales_date',
 'item_avg_sales_date',
 'item_city_avg_sales_date',
 'item_category_avg_sales_date',
 'item_category_city_avg_sales_date']

### lag features

In [14]:
%%time
lag_period = [1,2,3]

lag_features_to_process = {
    'item_cnt_month': True,
    'transaction_cnt': False,
    'avg_item_price': False
}

all_data, clip_list = add_lag_features(all_data, 
                                        key_features=idx_features,
                                        lag_features_info=lag_features_to_process,
                                        lag_period=lag_period)

all_data = downcast(all_data)

features_to_drop = []
features_to_drop.extend(['transaction_cnt', 'avg_item_price'])

Memory usage reduced from 966.46 MB to 638.70 MB
CPU times: user 5.08 s, sys: 1.6 s, total: 6.68 s
Wall time: 6.7 s


In [15]:
%%time
# Adding lag features for mean-based aggregations
for mean_feat in mean_feature_list:
    lag_features_to_process[mean_feat] = False
    
all_data, clip_list = add_lag_features(all_data, 
                                        key_features=idx_features,
                                        lag_features_info=lag_features_to_process,
                                        lag_period=lag_period)

all_data = downcast(all_data)

Memory usage reduced from 1571.55 MB to 1243.79 MB
CPU times: user 11 s, sys: 4.19 s, total: 15.2 s
Wall time: 15.2 s


In [16]:
# add average of lagged sales
sales_lag_cols = [col for col in all_data.columns if 'item_cnt_month_lag_' in col]
all_data['avg_item_cnt_month_for_lag'] = all_data[sales_lag_cols].mean(axis=1)
all_data[clip_list + ['item_cnt_month', 'avg_item_cnt_month_for_lag']] = all_data[clip_list + ['item_cnt_month', 'avg_item_cnt_month_for_lag']].clip(0, 20)

# add ratio of each lagged sale to average
for col in sales_lag_cols:
    all_data[f'ratio_{col}_to_avg'] = all_data[col] / all_data['avg_item_cnt_month_for_lag']

# add average of lagged prices
price_lag_cols = [col for col in all_data.columns if 'avg_item_price_lag_' in col]
all_data['avg_item_price_for_lag'] = all_data[price_lag_cols].mean(axis=1)

# add ratio of each lagged price to average
for col in price_lag_cols:
    all_data[f'ratio_{col}_to_avg'] = all_data[col] / all_data['avg_item_price_for_lag']

In [17]:
# add average of lagged mean features
for mean_feat in mean_feature_list:
    lag_cols = [col for col in all_data.columns if f'{mean_feat}_lag' in col]
    feat_name = f'avg_{mean_feat}_lag'
    all_data[feat_name] = all_data[lag_cols].mean(axis=1)

all_data = downcast(all_data)
features_to_drop.extend(mean_feature_list)

Memory usage reduced from 1983.35 MB to 1714.42 MB


### price trend and sales trend

In [18]:
def add_trend_feature(df: pd.DataFrame, base_col: str, lags: list):
    for i in lags:
        col_lag_curr = f'{base_col}_lag_{i}'      
        col_lag_prev = f'{base_col}_lag_{i+1}'    
        col_trend = f'delta_{i}_{base_col}'   
        
        if col_lag_prev not in df.columns or col_lag_curr not in df.columns:
            print(f'[Skip] {col_lag_curr} 또는 {col_lag_prev} 컬럼이 없어 작업을 종료합니다.')
            continue
            
        df[col_trend] = (df[col_lag_curr] - df[col_lag_prev]) / df[col_lag_prev]
        
        df[col_trend] = df[col_trend].replace([np.inf, -np.inf], np.nan)
        df[col_trend] = df[col_trend].fillna(0)
        
    df = downcast(df)

    return df

In [19]:
all_data = add_trend_feature(all_data, base_col='avg_item_price', lags=[1,2])
all_data = add_trend_feature(all_data, base_col='item_cnt_month', lags=[1,2])
all_data = add_trend_feature(all_data, base_col='transaction_cnt', lags=[1,2])

Memory usage reduced from 1848.88 MB to 1781.65 MB
Memory usage reduced from 1916.11 MB to 1848.88 MB
Memory usage reduced from 1983.35 MB to 1916.11 MB


### new item

In [20]:
all_data['new_item'] = all_data['first_sale_month'] == all_data['date_block_num']
all_data['item_age'] = all_data['date_block_num'] - all_data['first_sale_month']
features_to_drop.append('first_sale_month')

### last_sale

In [21]:
all_data['temp_last_sale'] = np.nan
all_data.loc[all_data['item_cnt_month']>0, 'temp_last_sale'] = all_data['date_block_num']

last_sale_record = all_data.groupby(['item_id', 'shop_id'])['temp_last_sale'].shift(1).ffill()

all_data['last_sale'] = (all_data['date_block_num'] - last_sale_record).fillna(-1)
all_data = all_data.drop(columns='temp_last_sale')

### days & month

In [22]:
all_data['month'] = all_data['date_block_num'] % 12
days = pd.Series([31,28,31,30,31,30,31,31,30,31,30,31])
all_data['days'] = all_data['month'].map(days)

In [23]:
# seasonal average features
all_data['month_avg_sales'] = np.nan
all_data['month_avg_price'] = np.nan

for month_t in all_data['date_block_num'].unique():
    mask = all_data[all_data['date_block_num'] < month_t]
    if not mask.empty:
        group = mask.groupby(['month', 'item_category_id']).agg(
            month_avg_sales=('item_cnt_month', 'mean'),
            month_avg_price=('avg_item_price', 'mean')
        ).reset_index()
        
        group['key'] = group['month'].astype(str) + '_' + group['item_category_id'].astype(str)
        sales_map_dict = dict(zip(group['key'], group['month_avg_sales']))
        price_map_dict = dict(zip(group['key'], group['month_avg_price']))
        
        current_month = (all_data['date_block_num'] == month_t)
        current_month_keys = all_data.loc[current_month, 'month'].astype(str) + '_' + all_data.loc[current_month, 'item_category_id'].astype(str)
        
        all_data.loc[current_month, 'month_avg_sales'] = current_month_keys.map(sales_map_dict)
        all_data.loc[current_month, 'month_avg_price'] = current_month_keys.map(price_map_dict)
    
all_data['month_avg_price'] = all_data['month_avg_price'].fillna(all_data['month_avg_price'].mean())
all_data['month_avg_sales'] = all_data['month_avg_sales'].fillna(all_data['month_avg_sales'].mean())

all_data = downcast(all_data)

Memory usage reduced from 2210.25 MB to 2058.98 MB


### removing columns

In [24]:
features_to_drop

['transaction_cnt',
 'avg_item_price',
 'shop_avg_sales_date',
 'shop_item_category_avg_sales_date',
 'item_avg_sales_date',
 'item_city_avg_sales_date',
 'item_category_avg_sales_date',
 'item_category_city_avg_sales_date',
 'first_sale_month']

In [25]:
all_data = all_data.drop(columns=features_to_drop)
all_data.columns

Index(['date_block_num', 'shop_id', 'item_id', 'item_cnt_month', 'city',
       'item_category_id', 'subtype', 'item_cnt_month_lag_1',
       'item_cnt_month_lag_2', 'item_cnt_month_lag_3', 'transaction_cnt_lag_1',
       'transaction_cnt_lag_2', 'transaction_cnt_lag_3',
       'avg_item_price_lag_1', 'avg_item_price_lag_2', 'avg_item_price_lag_3',
       'shop_avg_sales_date_lag_1', 'shop_avg_sales_date_lag_2',
       'shop_avg_sales_date_lag_3', 'shop_item_category_avg_sales_date_lag_1',
       'shop_item_category_avg_sales_date_lag_2',
       'shop_item_category_avg_sales_date_lag_3', 'item_avg_sales_date_lag_1',
       'item_avg_sales_date_lag_2', 'item_avg_sales_date_lag_3',
       'item_city_avg_sales_date_lag_1', 'item_city_avg_sales_date_lag_2',
       'item_city_avg_sales_date_lag_3', 'item_category_avg_sales_date_lag_1',
       'item_category_avg_sales_date_lag_2',
       'item_category_avg_sales_date_lag_3',
       'item_category_city_avg_sales_date_lag_1',
       'item_cate

In [26]:
all_data.shape

(8812244, 61)

In [27]:
joblib.dump(all_data, data_path + 'all_data2.joblib')

['./data/all_data2.joblib']