# 데이터 전처리 방안 수정
- TF-IDF 제거. item_name 피처 생성 : platform_type, meta_type

# Module

In [1]:
import gc
import calendar
import re
from itertools import product
import joblib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

%matplotlib inline
plt.style.use("seaborn-v0_8-white")

# Data

In [2]:
data_path = "./data/"

sales_train = pd.read_csv(data_path + 'sales_train.csv')
items = pd.read_csv(data_path + 'items.csv')
item_categories = pd.read_csv(data_path + 'item_categories.csv')
shops = pd.read_csv(data_path + 'shops.csv')

test = pd.read_csv(data_path + 'test.csv')
sub = pd.read_csv(data_path + 'sample_submission.csv')

print('Before Filter ShopID:', len(sales_train))
unique_test_shop_id = test['shop_id'].unique()
sales_train = sales_train[sales_train['shop_id'].isin(unique_test_shop_id)]
print('After Filter ShopID :', len(sales_train))

Before Filter ShopID: 2935849
After Filter ShopID : 2413246


In [3]:
def downcast(df, verbose=True):
    start_memory = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        dtype_name = df[col].dtype.name
        if dtype_name == 'object':
            pass
        elif dtype_name == 'bool':
            df[col] = df[col].astype('int8')
        elif dtype_name.startswith('int') or (df[col]%1==0).all():
            df[col] = pd.to_numeric(df[col], downcast='integer')
        elif dtype_name.startswith('float'):
            df[col] = pd.to_numeric(df[col], downcast='float')
    end_memory = df.memory_usage().sum() / 1024**2
    
    if verbose:
        print(f"Memory usage reduced from {start_memory:.2f} MB to {end_memory:.2f} MB")
        
    return df


data_files = [sales_train, items, shops, item_categories, test]
for file in data_files:
    file = downcast(file)

Memory usage reduced from 128.88 MB to 69.04 MB
Memory usage reduced from 0.51 MB to 0.23 MB
Memory usage reduced from 0.00 MB to 0.00 MB
Memory usage reduced from 0.00 MB to 0.00 MB
Memory usage reduced from 4.90 MB to 1.43 MB


In [4]:
train = []

for i in sales_train['date_block_num'].unique():
    all_shop = sales_train.loc[sales_train['date_block_num'] == i, 'shop_id'].unique()
    all_item = sales_train.loc[sales_train['date_block_num'] == i, 'item_id'].unique()
    train.append(np.array(list(product([i], all_shop, all_item))))

# idx features        
idx_features = ['date_block_num', 'shop_id', 'item_id']
train = pd.DataFrame(np.vstack(train), columns=idx_features)

test['date_block_num'] = 34
train.head()

Unnamed: 0,date_block_num,shop_id,item_id
0,0,59,22154
1,0,59,2552
2,0,59,2554
3,0,59,2555
4,0,59,2564


# Feature Engineering

## Outliers
- `-1` 같은 음수값은 반품을 의미할 수 있음. 제거하지 않고 `item_cnt_month`로 집계

In [5]:
print("Before : ",len(sales_train))
sales_train = sales_train[sales_train['item_cnt_day'] < 1000]
print("After item_cnt_day : ",len(sales_train))
sales_train = sales_train[sales_train['item_price'] < 50000]
print("After item_price : ",len(sales_train))

Before :  2413246
After item_cnt_day :  2413244
After item_price :  2413241


## Define function

In [None]:
def add_mean_features(df: pd.DataFrame, groupby_features: list, mean_feature_list: list = None):
    col_name = [col for col in groupby_features if col != 'date_block_num']
    base_name = '_'.join([col.replace('_id', '') for col in col_name])
    feature_name = f'{base_name}_avg_date_sales'
    
    agg_rules = {
        feature_name: ('item_cnt_month', 'mean')
    }
    
    group = df.groupby(groupby_features).agg(**agg_rules).reset_index()
    
    df = df.merge(group, on=groupby_features, how='left')
    df = downcast(df, verbose=False)
    del group
    
    if mean_feature_list is not None:
        mean_feature_list.append(feature_name)
    
    return df


def add_lag_features(df: pd.DataFrame, key_features: list, lag_feature_cols: list, lag_period: list):
    df_result = df.copy()

    for i in lag_period:
        df_lag = df[key_features + lag_feature_cols].copy()
        
        df_lag['date_block_num'] += i
        
        lag_col_names = {col: f'{col}_lag_{i}' for col in lag_feature_cols}
        df_lag = df_lag.rename(columns=lag_col_names)
        
        df_result = pd.merge(df_result, df_lag, on=key_features, how='left')

    all_lag_cols = [f'{col}_lag_{i}' for col in lag_feature_cols for i in lag_period]
    for col in all_lag_cols:
        if 'cnt' in col or 'sales' in col:
            df_result[col] = df_result[col].fillna(0)

    return df_result


def fill_price_nans(df):
    """
    1. delta 피처 -> 0으로 대체
    2. price(절대값) 피처 -> 해당 아이템의 전체 평균 가격 -> 카테고리 평균 가격 순으로 대체
    """
    # delta feature
    delta_cols = [col for col in df.columns if 'delta_price' in col]
    for col in delta_cols:
        df[col] = df[col].fillna(0)

    # price lag features
    price_cols = [col for col in df.columns if 'price' in col and 'delta' not in col]
    if len(price_cols) > 0:
        item_mean_price = df.groupby('item_id')['date_item_avg_price'].mean()
        cat_mean_price = df.groupby('item_category_id')['date_item_avg_price'].mean()

        for col in price_cols:
            nan_mask = df[col].isnull()
            if nan_mask.sum() > 0:
                df.loc[nan_mask, col] = df.loc[nan_mask, 'item_id'].map(item_mean_price)
                    
            nan_mask = df[col].isnull()
            if nan_mask.sum() > 0:
                df.loc[nan_mask, col] = df.loc[nan_mask, 'item_category_id'].map(cat_mean_price)
        
    return df

## shops

In [7]:
# 데이터 수정
sales_train.loc[sales_train['shop_id']==0, 'shop_id'] = 57
sales_train.loc[sales_train['shop_id']==1, 'shop_id'] = 58
sales_train.loc[sales_train['shop_id']==10, 'shop_id'] = 11
sales_train.loc[sales_train['shop_id']==39, 'shop_id'] = 40

test.loc[test['shop_id']==0, 'shop_id'] = 57
test.loc[test['shop_id']==1, 'shop_id'] = 58
test.loc[test['shop_id']==10, 'shop_id'] = 11
test.loc[test['shop_id']==39, 'shop_id'] = 40

shops['city'] = shops['shop_name'].str.split(' ').str[0]
shops.loc[shops['city'] == '!Якутск', 'city'] = 'Якутск'

# 특수 상점(온라인/이동식)을 별도 도시 'Special'로 분류
shops.loc[shops['city'].isin(['Выездная', 'Интернет-магазин']), 'city'] = 'Special'

# 약어 및 오표기 수정
shops.loc[shops['city'] == 'СПб', 'city'] = 'Санкт-Петербург'
shops.loc[shops['city'] == 'Н.Новгород', 'city'] = 'НижнийНовгород'
shops.loc[shops['city'] == 'РостовНаДону', 'city'] = 'Ростов-на-Дону'

# 모스크바 위성 도시 통합
moscow_satellite_cities = ['Жуковский', 'Мытищи', 'Химки', 'Чехов', 'Балашиха', 'Сергиев']
shops.loc[shops['city'].isin(moscow_satellite_cities), 'city'] = 'МоскваОбласть'

# 쇼핑몰/센터를 나타내는 러시아어 약어로 상점 구분
mall_keywords = ['ТЦ', 'ТРК', 'ТРЦ', 'MALL', 'Молл']
is_mall = shops['shop_name'].apply(lambda x: any(keyword in x for keyword in mall_keywords))

shops['shop_type'] = np.where(is_mall, 'Mall', 'Standalone')

for col in ['city', 'shop_type']:
    encoder = LabelEncoder()
    shops[col] = encoder.fit_transform(shops[col])

shops = shops.drop(columns=["shop_name"])

## items

In [8]:
first_sale_mon = sales_train.groupby('item_id').agg({'date_block_num': 'min'})['date_block_num']
items['first_sale_month'] = items['item_id'].map(first_sale_mon).fillna(34)

##### 월별 아이템 평균가격

In [9]:
date_item_avg_price = sales_train.groupby(['date_block_num', 'item_id']).agg(date_item_avg_price=('item_price', 'mean')).reset_index()

### platform, meta_type

In [10]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z0-9а-яА-Я\s]', ' ', text) 
    text = re.sub(r'\s+', ' ', text).strip()
    return text


def get_platform(name):
    if 'pc' in name or 'пк' in name: return 'PC'
    if 'ps3' in name: return 'PS3'
    if 'ps4' in name: return 'PS4'
    if 'xbox' in name or 'x360' in name: return 'Xbox' # 360도 Xbox로 통합
    if 'psp' in name: return 'PSP'
    if 'vita' in name or 'psv' in name: return 'PSVita'
    if 'wii' in name: return 'Wii'
    if 'mac' in name: return 'Mac'
    if 'android' in name: return 'Android'
    return 'Etc'


def get_meta_type(name):
    # 디지털/버전 정보
    if 'цифровая' in name or 'digital' in name: return 'Digital'
    if 'версия' in name: return 'Version' # Version 보통 일반판
    
    # 실물 미디어
    if 'bd' in name or 'blu-ray' in name: return 'BluRay'
    if 'dvd' in name: return 'DVD'
    if 'cd' in name: return 'CD'
    if 'lp' in name: return 'Vinyl'
    
    # 패키지 형태
    if 'jewel' in name: return 'Jewel' # 저가판
    if 'region' in name or 'регион' in name: return 'Region' # 현지화/지역한정
    if 'edition' in name or 'издание' in name: return 'Edition' # 특별판 등
    if 'box' in name: return 'Box'
    
    # 4. 굿즈
    if 'фигурка' in name: return 'Figure'
    if 'футболка' in name: return 'TShirt'
    if 'игрушка' in name: return 'Toy'
    if 'арт' in name: return 'Art'
    
    return 'Normal'

items['cleaned_item_name'] = items['item_name'].apply(clean_text)
items['platform_type'] = items['cleaned_item_name'].apply(get_platform)
items['meta_type'] = items['cleaned_item_name'].apply(get_meta_type)

for col in ['platform_type', 'meta_type']:
    items[col] = LabelEncoder().fit_transform(items[col])

items = items.drop(columns=['cleaned_item_name', 'item_name'])
gc.collect()

18

## item_categories

In [11]:
def split_categories(df):
    def split_func(name):
        if '-' in name:
            return name.split('-', 1)
        else:
            return [name, name]
    
    split_result = df['item_category_name'].apply(lambda x: split_func(x))
    df['type'] = split_result.apply(lambda x: x[0].strip())
    df['subtype'] = split_result.apply(lambda x: x[1].strip())
    return df


item_categories = split_categories(item_categories)
etc_types = item_categories['type'].value_counts()[item_categories['type'].value_counts()<5].index.tolist()
item_categories.loc[item_categories['type'].isin(etc_types), 'type'] = 'etc'
item_categories['type'] = LabelEncoder().fit_transform(item_categories['type'])
item_categories['subtype'] = LabelEncoder().fit_transform(item_categories['subtype'])
item_categories = item_categories.drop('item_category_name', axis=1)

## revenue

In [12]:
sales_train['revenue'] = sales_train['item_cnt_day'] * sales_train['item_price']
sales_train.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day,revenue
0,02.01.2013,0,59,22154,999.0,1,999.0
1,03.01.2013,0,25,2552,899.0,1,899.0
2,05.01.2013,0,25,2552,899.0,-1,-899.0
3,06.01.2013,0,25,2554,1709.05,1,1709.05
4,15.01.2013,0,25,2555,1099.0,1,1099.0


## set up matrix

In [13]:
group = sales_train.groupby(idx_features).agg(
    item_cnt_month = ('item_cnt_day', 'sum'),
    transaction_cnt = ('item_cnt_day', 'count'),
    date_revenue = ('revenue', 'sum')
).reset_index()

group['item_cnt_month'] = group['item_cnt_month'].clip(0, 20) # 대회 규칙 상 타겟값을 0 ~ 20으로 제한

train = train.merge(group, on=idx_features, how='left')

all_data = pd.concat([train, test.drop('ID', axis=1)], ignore_index=True, keys=idx_features)
all_data = all_data.merge(shops, on='shop_id', how='left')
all_data = all_data.merge(items, on='item_id', how='left')
all_data = all_data.merge(date_item_avg_price, on=['date_block_num', 'item_id'], how='left')
all_data = all_data.merge(item_categories, on='item_category_id', how='left')
all_data = all_data.sort_values(by=['date_block_num', 'shop_id', 'item_id']).reset_index(drop=True)
all_data.head()

  all_data = pd.concat([train, test.drop('ID', axis=1)], ignore_index=True, keys=idx_features)


Unnamed: 0,date_block_num,shop_id,item_id,item_cnt_month,transaction_cnt,date_revenue,city,shop_type,item_category_id,first_sale_month,platform_type,meta_type,date_item_avg_price,type,subtype
0,0,2,19,,,,1,0,40,0.0,1,9,28.0,4,6
1,0,2,27,1.0,1.0,2499.0,1,0,19,0.0,4,13,2397.5,3,12
2,0,2,28,,,,1,0,30,0.0,1,13,549.0,0,57
3,0,2,29,,,,1,0,23,0.0,8,13,2499.0,3,18
4,0,2,32,,,,1,0,40,0.0,1,9,348.889199,4,6


In [14]:
all_data.isna().sum()

date_block_num               0
shop_id                      0
item_id                      0
item_cnt_month         7513150
transaction_cnt        7513150
date_revenue           7513150
city                         0
shop_type                    0
item_category_id             0
first_sale_month             0
platform_type                0
meta_type                    0
date_item_avg_price     214277
type                         0
subtype                      0
dtype: int64

> 결측치 처리 방향

- `item_cnt_month`, `transaction_cnt`, `date_revenue`는 판매가 없어 nan으로 표시된 걸로 추정 -> 결측치를 0으로 대체
- `date_item_avg_price` 는 월별 아이템카테고리 평균가격으로 대체

In [15]:
fill_zero_cols = ['item_cnt_month', 'transaction_cnt', 'date_revenue']
all_data[fill_zero_cols] = all_data[fill_zero_cols].fillna(0)

cat_date_price = all_data.groupby(['date_block_num', 'item_category_id'])['date_item_avg_price'].mean().reset_index()
cat_date_price.columns = ['date_block_num', 'item_category_id', 'date_cat_avg_price']
all_data = all_data.merge(cat_date_price, on=['date_block_num', 'item_category_id'], how='left')
all_data['date_item_avg_price'] = all_data['date_item_avg_price'].fillna(all_data['date_cat_avg_price'])

all_data = all_data.drop(columns=['date_cat_avg_price'])

In [16]:
need_lag_list = []
long_lag_list = []
features_to_drop = []

need_lag_list.append('transaction_cnt')
long_lag_list.extend(['item_cnt_month', 'date_item_avg_price'])

In [17]:
del shops, items, item_categories, group, cat_date_price 

all_data = downcast(all_data)
gc.collect()

Memory usage reduced from 848.81 MB to 252.12 MB


0

## month, days

In [18]:
all_data['month'] = all_data['date_block_num'] % 12
days = pd.Series([31,28,31,30,31,30,31,31,30,31,30,31])
all_data['days'] = all_data['month'].map(days)
all_data.groupby('month')['item_cnt_month'].mean().reset_index()

Unnamed: 0,month,item_cnt_month
0,0,0.334109
1,1,0.31388
2,2,0.32043
3,3,0.271081
4,4,0.268585
5,5,0.284257
6,6,0.272341
7,7,0.309271
8,8,0.291132
9,9,0.281022


- 12월의 평균 판매량이 특히 높게 나타남. 1월도 비교적 높음
- 3월과 9월의 평균 판매량이 상대적으로 높음(신학기 영향 예상)
- 테스트 데이터에 해당하는 11월은 평균 판매량이 1년 중 가장 낮음

In [19]:
# 0: Low (11월 포함), 1: Normal, 2: High, 3: Peak
season_dict = {
    11: 3, # 연말
    0: 2, 1:2, 2: 2, 7: 2,
    3: 1, 4:1, 5: 1, 6: 1, 8: 1, 9: 1,
    10: 0 # 
}

all_data['season_type'] = all_data['month'].map(season_dict).astype('int8')

In [20]:
def count_weekends(date_block_num):
    year = 2013 + date_block_num // 12
    month = 1 + date_block_num % 12
    
    _, last_day = calendar.monthrange(year, month)
    dates = pd.date_range(start=f'{year}-{month}-01', end=f'{year}-{month}-{last_day}')
    weekend_count = dates.weekday.isin([5, 6]).sum()
    
    return weekend_count

weekend_map = {i: count_weekends(i) for i in range(35)}
all_data['num_weekends'] = all_data['date_block_num'].map(weekend_map).astype('int8')
all_data[['date_block_num', 'month', 'num_weekends']].head()

Unnamed: 0,date_block_num,month,num_weekends
0,0,0,8
1,0,0,8
2,0,0,8
3,0,0,8
4,0,0,8


In [21]:
all_data['num_weekends'].value_counts()

num_weekends
8     4500289
9     2556125
10    1755830
Name: count, dtype: int64

## mean features

In [None]:
# new
all_data = add_mean_features(all_data, ['date_block_num', 'shop_id'])
all_data = add_mean_features(all_data, ['date_block_num', 'shop_id', 'item_category_id'])
all_data = add_mean_features(all_data, ['date_block_num', 'shop_id', 'platform_type'])
all_data = add_mean_features(all_data, ['date_block_num', 'shop_id', 'meta_type'])

all_data = add_mean_features(all_data, ['date_block_num', 'platform_type'])
all_data = add_mean_features(all_data, ['date_block_num', 'meta_type'])

all_data = add_mean_features(all_data, ['date_block_num', 'item_id'])
all_data = add_mean_features(all_data, ['date_block_num', 'city', 'item_id'])

all_data = add_mean_features(all_data, ['date_block_num', 'item_category_id'])
all_data = add_mean_features(all_data, ['date_block_num', 'item_category_id', 'platform_type'])
all_data = add_mean_features(all_data, ['date_block_num', 'item_category_id', 'meta_type'])

In [23]:
# # origin
# all_data = add_mean_features(all_data, ['date_block_num', 'shop_id'])
# all_data = add_mean_features(all_data, ['date_block_num', 'shop_id', 'item_category_id'])
# all_data = add_mean_features(all_data, ['date_block_num', 'shop_id', 'platform_type'])
# all_data = add_mean_features(all_data, ['date_block_num', 'shop_id', 'meta_type'])

# all_data = add_mean_features(all_data, ['date_block_num', 'item_id'])

# all_data = add_mean_features(all_data, ['date_block_num', 'item_category_id'])
# all_data = add_mean_features(all_data, ['date_block_num', 'item_category_id', 'platform_type'])
# all_data = add_mean_features(all_data, ['date_block_num', 'item_category_id', 'meta_type'])

In [24]:
mean_feature_list = [col for col in all_data.columns if '_avg_date_sales' in col and 'item_avg_date_sales' not in col]
long_lag_list.append('item_avg_date_sales')
mean_feature_list.append('city_item_avg_date_sales') # 
mean_feature_list

['shop_avg_date_sales',
 'shop_item_category_avg_date_sales',
 'shop_platform_type_avg_date_sales',
 'shop_meta_type_avg_date_sales',
 'item_category_avg_date_sales',
 'item_category_platform_type_avg_date_sales',
 'item_category_meta_type_avg_date_sales',
 'city_item_avg_date_sales']

## shop revenue share

In [25]:
total_rev = all_data.groupby('date_block_num')['date_revenue'].sum().reset_index()
total_rev.columns = ['date_block_num', 'total_revenue']

shop_rev = all_data.groupby(['date_block_num', 'shop_id'])['date_revenue'].sum().reset_index()
shop_rev.columns = ['date_block_num', 'shop_id', 'shop_revenue_month']

shop_rev = pd.merge(shop_rev, total_rev, on='date_block_num', how='left')
shop_rev['shop_revenue_share'] = shop_rev['shop_revenue_month'] / shop_rev['total_revenue']
shop_rev['shop_revenue_share'] = shop_rev['shop_revenue_share'].fillna(0)

all_data = pd.merge(all_data, shop_rev[['date_block_num', 'shop_id', 'shop_revenue_share']], on=['date_block_num', 'shop_id'], how='left')

need_lag_list.append('shop_revenue_share')

## lag features

In [26]:
need_lag_list.extend(mean_feature_list)
need_lag_list

['transaction_cnt',
 'shop_revenue_share',
 'shop_avg_date_sales',
 'shop_item_category_avg_date_sales',
 'shop_platform_type_avg_date_sales',
 'shop_meta_type_avg_date_sales',
 'item_category_avg_date_sales',
 'item_category_platform_type_avg_date_sales',
 'item_category_meta_type_avg_date_sales',
 'city_item_avg_date_sales']

In [27]:
long_lag_list

['item_cnt_month', 'date_item_avg_price', 'item_avg_date_sales']

In [28]:
%%time
lag_period = [1, 2, 3]
long_lag_period = [1,2,3,4,5,6]

all_data = all_data.sort_values(by=['date_block_num', 'shop_id', 'item_id']).reset_index(drop=True)
all_data = add_lag_features(all_data, 
                            key_features=idx_features,
                            lag_feature_cols=need_lag_list,
                            lag_period=lag_period)

all_data = add_lag_features(all_data, 
                            key_features=idx_features,
                            lag_feature_cols=long_lag_list,
                            lag_period=long_lag_period)

all_data['shop_revenue_share_lag_1'].fillna(0, inplace=True)

CPU times: user 29 s, sys: 31.1 s, total: 1min
Wall time: 1min 2s


In [29]:
all_data = fill_price_nans(all_data)
print(all_data[[col for col in all_data.columns if 'price' in col]].isnull().sum())

Filled 0 delta columns with 0.
Filled 7 price columns with item/category means.
date_item_avg_price          0
date_item_avg_price_lag_1    0
date_item_avg_price_lag_2    0
date_item_avg_price_lag_3    0
date_item_avg_price_lag_4    0
date_item_avg_price_lag_5    0
date_item_avg_price_lag_6    0
dtype: int64


In [30]:
all_data = downcast(all_data)

features_to_drop.extend(need_lag_list)
features_to_drop.extend(long_lag_list)
features_to_drop.remove('item_cnt_month')
features_to_drop.remove('date_item_avg_price')

all_data = all_data.drop(columns=features_to_drop)
gc.collect()

all_data = all_data.copy()
features_to_drop = []

Memory usage reduced from 2874.17 MB to 2210.25 MB


## price trend

In [31]:
# price trend
all_data = all_data.sort_values(by=['item_id', 'date_block_num'])
df_temp = all_data[['item_id', 'date_item_avg_price']].copy()

grp = df_temp.groupby('item_id')['date_item_avg_price']
cumsum = grp.cumsum()
cumcount = grp.cumcount() + 1
df_temp['expanding_price_mean'] = cumsum / cumcount
df_temp['expanding_price_mean'] = df_temp['expanding_price_mean'].shift(1)

mask = df_temp['item_id'] != df_temp['item_id'].shift(1)
df_temp.loc[mask, 'expanding_price_mean'] = 0
df_temp['expanding_price_mean'] = df_temp['expanding_price_mean']

all_data['item_avg_price_expanding'] = df_temp['expanding_price_mean']

del df_temp, grp, cumsum, cumcount, mask
gc.collect()

all_data['delta_price_lag'] = (all_data['date_item_avg_price_lag_1'] - all_data['item_avg_price_expanding']) / all_data['item_avg_price_expanding']
all_data['delta_price_lag'] = all_data['delta_price_lag'].replace([np.inf, -np.inf], np.nan)
all_data = fill_price_nans(all_data)
all_data[['item_id', 'date_block_num', 'delta_price_lag']].head()

Filled 1 delta columns with 0.
Filled 8 price columns with item/category means.


Unnamed: 0,item_id,date_block_num,delta_price_lag
4136592,1,15,0.0
4142959,1,15,0.0
4149326,1,15,0.0
4155693,1,15,0.0
4162060,1,15,0.0


## shop revenue trend

In [32]:
all_data = all_data.sort_values(by=['shop_id', 'item_id', 'date_block_num'])
shop_monthly = all_data.groupby(['shop_id', 'date_block_num'])['date_revenue'].sum().reset_index().rename(columns={'date_revenue':'shop_revenue'})

shop_monthly = shop_monthly.sort_values(by=['shop_id', 'date_block_num'])
grp = shop_monthly.groupby('shop_id')['shop_revenue']
cumsum = grp.cumsum()
cumcount = grp.cumcount() + 1
shop_monthly['shop_expanding_mean'] = cumsum / cumcount

shop_monthly['shop_expanding_mean'] = shop_monthly['shop_expanding_mean'].shift(1)
mask = shop_monthly['shop_id'] != shop_monthly['shop_id'].shift(1)
shop_monthly.loc[mask, 'shop_expanding_mean'] = 0
shop_monthly['shop_expanding_mean'] = shop_monthly['shop_expanding_mean'].fillna(0)

shop_monthly['shop_revenue_lag_1'] = grp.shift(1).fillna(0)
shop_monthly['delta_shop_revenue_lag'] = (shop_monthly['shop_revenue_lag_1'] - shop_monthly['shop_expanding_mean']) / shop_monthly['shop_expanding_mean']
shop_monthly['delta_shop_revenue_lag'] = shop_monthly['delta_shop_revenue_lag'].replace([np.inf, -np.inf], np.nan).fillna(0)

all_data = pd.merge(all_data, shop_monthly[['shop_id', 'date_block_num', 'delta_shop_revenue_lag']], 
                    on=['shop_id', 'date_block_num'], how='left')

del shop_monthly, grp, cumsum, cumcount, mask
features_to_drop.append('date_revenue')
gc.collect()
all_data[['shop_id', 'date_block_num', 'delta_shop_revenue_lag']].head()

Unnamed: 0,shop_id,date_block_num,delta_shop_revenue_lag
0,2,15,0.588033
1,2,18,0.054119
2,2,19,-0.076263
3,2,20,0.000553
4,2,21,0.220505


## item_age

In [33]:
all_data['item_age'] = all_data['date_block_num'] - all_data['first_sale_month']
all_data.loc[all_data['item_age'] < 0, 'item_age'] = -1 # 출시전 기간을 모두 -1으로 처리 -> 출시(혹은 판매)까지 남은 기간을 모델이 알 수 없도록 함
all_data['new_item'] = (all_data['item_age'] == 0).astype('int8')
features_to_drop.append('first_sale_month')

## since last sale

In [34]:
temp_df = all_data[['date_block_num', 'shop_id', 'item_id', 'item_cnt_month']].copy()
temp_df = temp_df.sort_values(by=['shop_id', 'item_id', 'date_block_num']).reset_index(drop=True)

temp_df['temp_last_sale'] = np.nan
temp_df.loc[temp_df['item_cnt_month'] > 0, 'temp_last_sale'] = temp_df['date_block_num']
last_sale_record = temp_df.groupby(['item_id', 'shop_id'])['temp_last_sale'].shift(1).ffill()
temp_df['since_last_sale'] = (temp_df['date_block_num'] - last_sale_record).fillna(-999)
all_data = pd.merge(
    all_data, 
    temp_df[['date_block_num', 'shop_id', 'item_id', 'since_last_sale']],
    on=['date_block_num', 'shop_id', 'item_id'],
    how='left'
)

del temp_df, last_sale_record
all_data = downcast(all_data, verbose=False)
gc.collect()

0

## rolling mean/std & lag_1 and lag_12

In [35]:
# 최근 3/6개월 평균 판매량 & 표준편차 item_avg_date_sales
all_data['rolling_3m_cnt_mean'] = all_data[['item_cnt_month_lag_1', 'item_cnt_month_lag_2', 'item_cnt_month_lag_3']].mean(axis=1)
all_data['rolling_3m_cnt_std'] = all_data[['item_cnt_month_lag_1', 'item_cnt_month_lag_2', 'item_cnt_month_lag_3']].std(axis=1)
all_data['rolling_6m_cnt_mean'] = all_data[[col for col in all_data.columns if 'item_cnt_month_lag_' in col]].mean(axis=1)
all_data['rolling_6m_cnt_std'] = all_data[[col for col in all_data.columns if 'item_cnt_month_lag_' in col]].std(axis=1)

# 최근 3/6개월 item_avg_date_sales
all_data['rolling_3m_item_mean'] = all_data[['item_avg_date_sales_lag_1', 'item_avg_date_sales_lag_2', 'item_avg_date_sales_lag_3']].mean(axis=1)
all_data['rolling_3m_item_std'] = all_data[['item_avg_date_sales_lag_1', 'item_avg_date_sales_lag_2', 'item_avg_date_sales_lag_3']].std(axis=1)
all_data['rolling_6m_item_mean'] = all_data[[col for col in all_data.columns if 'item_avg_date_sales_lag_' in col]].mean(axis=1)
all_data['rolling_6m_item_std'] = all_data[[col for col in all_data.columns if 'item_avg_date_sales_lag_' in col]].std(axis=1)

# 최근 3개월 월별 아이템 가격 평균 & 표준편차
all_data['rolling_3m_price_mean'] = all_data[['date_item_avg_price_lag_1', 'date_item_avg_price_lag_2', 'date_item_avg_price_lag_3']].mean(axis=1)
all_data['rolling_3m_price_std'] = all_data[['date_item_avg_price_lag_1', 'date_item_avg_price_lag_2', 'date_item_avg_price_lag_3']].std(axis=1)
all_data['rolling_6m_price_mean'] = all_data[[col for col in all_data.columns if 'date_item_avg_price_lag_' in col]].mean(axis=1)
all_data['rolling_6m_price_std'] = all_data[[col for col in all_data.columns if 'date_item_avg_price_lag_' in col]].std(axis=1)

# 12개월 전 판매량/아이템 가격
all_data = all_data.sort_values(by=['shop_id', 'item_id', 'date_block_num']).reset_index(drop=True) 
group = all_data.groupby(['shop_id', 'item_id'])
all_data['item_cnt_month_lag_12'] = group['item_cnt_month'].shift(12).fillna(0)
all_data['date_item_avg_price_lag_12'] = group['date_item_avg_price'].shift(12) # 가격의 결측치는 0으로 채우지 않음
all_data = fill_price_nans(all_data)

features_to_drop.extend(['date_item_avg_price'])
all_data = downcast(all_data)

Filled 1 delta columns with 0.
Filled 13 price columns with item/category means.
Memory usage reduced from 2840.56 MB to 2580.03 MB


- 작년 동월 판매량 및 평균 가격에 대해 lag_12는 중요한 특징으로 보임
- 다만, 초기 데이터 11개월에 대해선 결측치가 많아 피처의 질이 떨어질 수 있음
- 따라서 초기 11개월의 데이터를 삭제하기로 함

## items per transaction

In [36]:
sales_col = ['item_cnt_month_lag_1', 'item_cnt_month_lag_2', 'item_cnt_month_lag_3']
transaction_col = ['transaction_cnt_lag_1', 'transaction_cnt_lag_2', 'transaction_cnt_lag_3']

for i in range(3):
    col_name = f'items_per_transaction_lag_{i+1}'
    all_data[col_name] = all_data[sales_col[i]] / all_data[transaction_col[i]]
    all_data[col_name] = all_data[col_name].replace([np.inf, -np.inf], np.nan).fillna(0)

## item_shop_first_sale

In [None]:
all_data['item_shop_first_sale'] = -1
all_data['item_shop_age'] = -1

for month in range(1, 35):
    current_month_mask = all_data['date_block_num'] == month

    sales_until_previous_month = all_data[(all_data['date_block_num'] < month) & (all_data['item_cnt_month'] > 0)]
    first_appearance = sales_until_previous_month.groupby(['shop_id', 'item_id'])['date_block_num'].min().reset_index()
    first_appearance.columns = ['shop_id', 'item_id', 'first_sale']
    
    temp_df = all_data.loc[current_month_mask].copy()
    temp_df = pd.merge(temp_df.drop(columns=['item_shop_first_sale']), 
                       first_appearance, 
                       on=['shop_id', 'item_id'], 
                       how='left')
    
    temp_df = temp_df.rename(columns={'first_sale': 'item_shop_first_sale'})
    temp_df['item_shop_first_sale'] = temp_df['item_shop_first_sale'].fillna(-1)
    all_data.loc[current_month_mask, 'item_shop_first_sale'] = temp_df['item_shop_first_sale'].values

all_data['item_shop_age'] = all_data['date_block_num'] - all_data['item_shop_first_sale']
all_data.loc[all_data['item_shop_first_sale']==-1, 'item_shop_age'] = -1
all_data['is_new_on_shelf'] = (all_data['item_shop_age'] == 0).astype('int8')

all_data = downcast(all_data)

Memory usage reduced from 2924.60 MB to 2706.09 MB


## removing columns

In [38]:
features_to_drop

['date_revenue', 'first_sale_month', 'date_item_avg_price']

In [39]:
print(sorted([col for col in all_data.columns if '_lag_2' in col or '_lag_3' in col]))

['city_item_avg_date_sales_lag_2', 'city_item_avg_date_sales_lag_3', 'date_item_avg_price_lag_2', 'date_item_avg_price_lag_3', 'item_avg_date_sales_lag_2', 'item_avg_date_sales_lag_3', 'item_category_avg_date_sales_lag_2', 'item_category_avg_date_sales_lag_3', 'item_category_meta_type_avg_date_sales_lag_2', 'item_category_meta_type_avg_date_sales_lag_3', 'item_category_platform_type_avg_date_sales_lag_2', 'item_category_platform_type_avg_date_sales_lag_3', 'item_cnt_month_lag_2', 'item_cnt_month_lag_3', 'items_per_transaction_lag_2', 'items_per_transaction_lag_3', 'shop_avg_date_sales_lag_2', 'shop_avg_date_sales_lag_3', 'shop_item_category_avg_date_sales_lag_2', 'shop_item_category_avg_date_sales_lag_3', 'shop_meta_type_avg_date_sales_lag_2', 'shop_meta_type_avg_date_sales_lag_3', 'shop_platform_type_avg_date_sales_lag_2', 'shop_platform_type_avg_date_sales_lag_3', 'shop_revenue_share_lag_2', 'shop_revenue_share_lag_3', 'transaction_cnt_lag_2', 'transaction_cnt_lag_3']


In [40]:
all_data = all_data.drop(columns=features_to_drop)
features_to_drop = []

target_cols = [
    col for col in all_data.columns 
    if re.search(r'_lag_[2-6]', col)
]
# 오래된 시차 변수들 제거
features_to_drop.extend(target_cols)
features_to_drop.remove('item_avg_date_sales_lag_2')
features_to_drop.remove('item_cnt_month_lag_2')
features_to_drop.remove('item_cnt_month_lag_3')
features_to_drop.remove('transaction_cnt_lag_2')
features_to_drop.remove('date_item_avg_price_lag_2')
features_to_drop.remove('date_item_avg_price_lag_3')
all_data = all_data.drop(columns=features_to_drop)

print("Shape after dropping lag features:", all_data.shape)

Shape after dropping lag features: (8812244, 58)


## `date_block_num` >= 12

In [41]:
temp_all_data = all_data.copy()
all_data = all_data[all_data['date_block_num']>=12]
print(f"Shape after filtering: {all_data.shape}")

Shape after filtering: (5459310, 58)


In [42]:
sorted([col for col in all_data.columns if 'item_name_svd_' not in col])

['city',
 'city_item_avg_date_sales_lag_1',
 'date_block_num',
 'date_item_avg_price_lag_1',
 'date_item_avg_price_lag_12',
 'date_item_avg_price_lag_2',
 'date_item_avg_price_lag_3',
 'days',
 'delta_price_lag',
 'delta_shop_revenue_lag',
 'is_new_on_shelf',
 'item_age',
 'item_avg_date_sales_lag_1',
 'item_avg_date_sales_lag_2',
 'item_avg_price_expanding',
 'item_category_avg_date_sales_lag_1',
 'item_category_id',
 'item_category_meta_type_avg_date_sales_lag_1',
 'item_category_platform_type_avg_date_sales_lag_1',
 'item_cnt_month',
 'item_cnt_month_lag_1',
 'item_cnt_month_lag_12',
 'item_cnt_month_lag_2',
 'item_cnt_month_lag_3',
 'item_id',
 'item_shop_age',
 'item_shop_first_sale',
 'items_per_transaction_lag_1',
 'meta_type',
 'month',
 'new_item',
 'num_weekends',
 'platform_type',
 'rolling_3m_cnt_mean',
 'rolling_3m_cnt_std',
 'rolling_3m_item_mean',
 'rolling_3m_item_std',
 'rolling_3m_price_mean',
 'rolling_3m_price_std',
 'rolling_6m_cnt_mean',
 'rolling_6m_cnt_std',
 'r

# save

In [43]:
joblib.dump(all_data, data_path + 'all_data2.joblib')

['./data/all_data2.joblib']

In [44]:
all_data.loc[(all_data['item_cnt_month_lag_1'] == all_data['item_cnt_month'])]['item_cnt_month'].value_counts()

item_cnt_month
0     4268529
1      109427
2       18004
3        5524
4        2231
20       2215
5        1099
6         628
7         406
8         224
9         138
10        103
11         72
12         49
13         41
14         24
15         21
19         14
18         12
17         11
16          9
Name: count, dtype: int64