# Module

In [1]:
import gc
from collections import Counter
import re
from itertools import product
import joblib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder

%matplotlib inline
plt.style.use("seaborn-v0_8-white")

# Data

In [2]:
data_path = "./data/"

sales_train = pd.read_csv(data_path + 'sales_train.csv')
items = pd.read_csv(data_path + 'items.csv')
item_categories = pd.read_csv(data_path + 'item_categories.csv')
shops = pd.read_csv(data_path + 'shops.csv')
test = pd.read_csv(data_path + 'test.csv')

In [None]:
# 커널 유지를 위한 메모리 사용량 줄이기
def downcast(df, verbose=True):
    start_memory = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        dtype_name = df[col].dtype.name
        if dtype_name == 'object':
            pass
        elif dtype_name == 'bool':
            df[col] = df[col].astype('int8')
        elif dtype_name.startswith('int') or (df[col]%1==0).all():
            df[col] = pd.to_numeric(df[col], downcast='integer')
        elif dtype_name.startswith('float'):
            df[col] = pd.to_numeric(df[col], downcast='float')
    end_memory = df.memory_usage().sum() / 1024**2
    
    if verbose:
        print(f"Memory usage reduced from {start_memory:.2f} MB to {end_memory:.2f} MB")
        
    return df

## Filter

In [None]:
# ShopID 필터링
print('Before Filter ShopID:', len(sales_train))
unique_test_shop_id = test['shop_id'].unique()
sales_train = sales_train[sales_train['shop_id'].isin(unique_test_shop_id)]
print('After Filter ShopID :', len(sales_train))

Before Filter ShopID: 2935849
After Filter ShopID : 2413246


In [None]:
# 아웃라이어 제거
print("Before : ",len(sales_train))
sales_train = sales_train[sales_train['item_cnt_day'] < 1000]
print("After item_cnt_day : ",len(sales_train))
sales_train = sales_train[sales_train['item_price'] < 50000]
print("After item_price : ",len(sales_train))

Before :  2413246
After item_cnt_day :  2413244
After item_price :  2413241


In [6]:
shops.head()

Unnamed: 0,shop_name,shop_id
0,"!Якутск Орджоникидзе, 56 фран",0
1,"!Якутск ТЦ ""Центральный"" фран",1
2,"Адыгея ТЦ ""Мега""",2
3,"Балашиха ТРК ""Октябрь-Киномир""",3
4,"Волжский ТЦ ""Волга Молл""",4


In [None]:
# ShopID 수정
def fix_shop_id(df):
    df.loc[df['shop_id'] == 0, 'shop_id'] = 57
    df.loc[df['shop_id'] == 1, 'shop_id'] = 58
    df.loc[df['shop_id'] == 10, 'shop_id'] = 11
    df.loc[df['shop_id'] == 39, 'shop_id'] = 40
    return df

sales_train = fix_shop_id(sales_train)
test = fix_shop_id(test)

## Grid

In [None]:
# 데이터프레임 그리드 생성
def create_grid(sales, test_df):
    idx_features = ['shop_id', 'item_id', 'date_block_num']
    grid = []

    for block_num in sales['date_block_num'].unique():
        cur_shops = sales.loc[sales['date_block_num'] == block_num, 'shop_id'].unique()
        cur_items = sales.loc[sales['date_block_num'] == block_num, 'item_id'].unique()
        grid.append(np.array(list(product(*[cur_shops, cur_items, [block_num]])), dtype='int32'))

    grid = pd.DataFrame(np.vstack(grid), columns = idx_features, dtype=np.int32)
    
    # Test 데이터 붙이기 (Month 34)
    test_grid = test_df[['shop_id', 'item_id']].copy()
    test_grid['date_block_num'] = 34
    
    all_grid = pd.concat([grid, test_grid], ignore_index=True, sort=False, axis=0)
    
    return all_grid

train_matrix = create_grid(sales_train, test)
idx_features = ['date_block_num', 'shop_id', 'item_id']

print('='*10, 'Matrix Table', '='*10)
print(train_matrix.head())
print('='*10, 'Matrix Table Info', '='*10)
print(train_matrix.info())

   shop_id  item_id  date_block_num
0       59    22154               0
1       59     2552               0
2       59     2554               0
3       59     2555               0
4       59     2564               0
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8812167 entries, 0 to 8812166
Data columns (total 3 columns):
 #   Column          Dtype
---  ------          -----
 0   shop_id         int64
 1   item_id         int64
 2   date_block_num  int64
dtypes: int64(3)
memory usage: 201.7 MB
None


In [9]:
print('중복 행 : ', train_matrix.duplicated(subset=idx_features).sum())

중복 행 :  0


# Feature Engineering

## Define function

In [None]:
def add_mean_features(df: pd.DataFrame, groupby_features: list, mean_feature_list: list = None):
    col_name = [col for col in groupby_features if col != 'date_block_num']
    base_name = '_'.join([col.replace('_id', '') for col in col_name])
    feature_name = f'{base_name}_avg_date_sales'
    
    agg_rules = {
        feature_name: ('item_cnt_month', 'mean')
    }
    group = df.groupby(groupby_features).agg(**agg_rules).reset_index()
    df = df.merge(group, on=groupby_features, how='left')
    del group
    
    if mean_feature_list is not None:
        mean_feature_list.append(feature_name)
    return df


def add_lag_features(df: pd.DataFrame, key_features: list, lag_feature_cols: list, lag_period: list):
    df_result = df.copy()

    for i in lag_period:
        df_lag = df[key_features + lag_feature_cols].copy()
        df_lag['date_block_num'] += i
        lag_col_names = {col: f'{col}_lag_{i}' for col in lag_feature_cols}
        df_lag = df_lag.rename(columns=lag_col_names)
        df_result = pd.merge(df_result, df_lag, on=key_features, how='left')

    all_lag_cols = [f'{col}_lag_{i}' for col in lag_feature_cols for i in lag_period]
    for col in all_lag_cols:
        if 'cnt' in col or 'sales' in col:
            df_result[col] = df_result[col].fillna(0)
    return df_result

## shops

In [None]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z0-9а-яА-Я\s]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text


shops['shop_name_clean'] = shops['shop_name'].apply(clean_text)
shops['city'] = shops['shop_name_clean'].str.split(' ').str[0]

# 이동식/온라인
shops.loc[shops['city'].isin(['выездная', 'интернет']), 'city'] = 'special'

# 도시 이름 오타/약어
city_corrections = {
    'спб': 'санкт-петербург',
    'н': 'нижнийновгород', 
    'нижний': 'нижнийновгород', 
    'ростовнадону': 'ростов-на-дону',
    'ростов': 'ростов-на-дону',
}
shops['city'] = shops['city'].replace(city_corrections)

# 모스크바 위성도시
moscow_satellite_cities = [
    'жуковский', 'мытищи', 'химки', 'чехов', 'балашиха', 'сергиев'
]
shops.loc[shops['city'].isin(moscow_satellite_cities), 'city'] = 'москваобласть'

shops['city'] = LabelEncoder().fit_transform(shops['city'])
shops = shops[['shop_id', 'city']]

## items

In [12]:
first_sale_mon = sales_train.groupby('item_id').agg({'date_block_num': 'min'})['date_block_num']
items['first_sale_month'] = items['item_id'].map(first_sale_mon).fillna(34)

##### 월별 아이템 평균가격

In [13]:
date_item_avg_price = sales_train.groupby(['date_block_num', 'item_id']).agg(date_item_avg_price=('item_price', 'mean')).reset_index()

### meta, platform

In [14]:
items['cleaned_item_name'] = items['item_name'].apply(clean_text)
all_words = ' '.join(items['cleaned_item_name'].values).split()
Counter(all_words).most_common(50)

[('версия', 3599),
 ('pc', 2683),
 ('bd', 2322),
 ('цифровая', 2003),
 ('регион', 1849),
 ('2', 1821),
 ('русская', 1452),
 ('jewel', 1370),
 ('и', 1251),
 ('1', 1244),
 ('the', 1185),
 ('3', 1133),
 ('1с', 1120),
 ('cd', 1019),
 ('of', 955),
 ('в', 949),
 ('mp3', 948),
 ('dvd', 899),
 ('фирм', 757),
 ('xbox', 754),
 ('ps3', 735),
 ('edition', 681),
 ('фигурка', 667),
 ('для', 638),
 ('s', 615),
 ('коллекция', 576),
 ('360', 571),
 ('3d', 566),
 ('digipack', 553),
 ('4', 551),
 ('на', 535),
 ('lp', 524),
 ('с', 504),
 ('арт', 465),
 ('2cd', 452),
 ('a', 437),
 ('русские', 420),
 ('игра', 418),
 ('сб', 406),
 ('субтитры', 402),
 ('7', 367),
 ('v', 367),
 ('5', 356),
 ('футболка', 356),
 ('игрушка', 352),
 ('английская', 345),
 ('box', 336),
 ('набор', 334),
 ('8', 332),
 ('издание', 308)]

- 대상이 되는 1C COMPANY는 게임 소프트웨어를 취급하는 러시아 기업
- pc(2위), цифровая(디지털, 4위) -> 디지털 다운로드는 cd/dvd 제품과 판매형태가 전혀 다름. 온라인 위주로 판매될 것
- xbox(19위), ps3는 콘솔게임기고 360(46위)도 xbox360 콘솔을 지칭하는 것으로 추정됨
- jewel(8위, 저가판), dvd(18위), cd(14위), bd(블루레이 디스크, 3위), box(47위), edition(41위) 등은 제품형태를 뜻하는 걸로 보임
    - 상대적으로 저렴한 저가판의 판매가 많고, 고가의 에디션은 판매량이 상대적으로 적을 것으로 추정
- фигурка (피규어, 23위), футболка (티셔츠, 44위), игрушка (장난감, 45위) -> 게임 및 콘솔과 판매 패턴이 다를 것

In [15]:
def get_platform(name):
    if 'pc' in name or 'пк' in name: return 'PC'
    if 'ps3' in name: return 'PS3'
    if 'ps4' in name: return 'PS4'
    if 'xbox' in name or 'x360' in name: return 'Xbox' # 360도 Xbox로 통합
    if 'psp' in name: return 'PSP'
    if 'vita' in name or 'psv' in name: return 'PSVita'
    if 'wii' in name: return 'Wii'
    if 'mac' in name: return 'Mac'
    if 'android' in name: return 'Android'
    return 'Etc'

def get_meta_type(name):
    # 디지털/버전 정보
    if 'цифровая' in name or 'digital' in name: return 'Digital'
    if 'версия' in name: return 'Version' # Version 보통 일반판   
    # 실물 미디어
    if 'bd' in name or 'blu-ray' in name: return 'BluRay'
    if 'dvd' in name: return 'DVD'
    if 'cd' in name: return 'CD'
    if 'lp' in name: return 'Vinyl'
    # 패키지 형태
    if 'jewel' in name: return 'Jewel' # 저가판
    if 'region' in name or 'регион' in name: return 'Region' # 현지화/지역한정
    if 'edition' in name or 'издание' in name: return 'Edition' # 특별판 등
    if 'box' in name: return 'Box'
    # 굿즈
    if 'фигурка' in name: return 'Figure'
    if 'футболка' in name: return 'TShirt'
    if 'игрушка' in name: return 'Toy'
    if 'арт' in name: return 'Art'
    return 'Normal'


items['platform'] = items['cleaned_item_name'].apply(get_platform)
items['meta'] = items['cleaned_item_name'].apply(get_meta_type)
for col in ['platform', 'meta']:
    items[col] = LabelEncoder().fit_transform(items[col])
items = items.drop(columns=['item_name', 'cleaned_item_name'])

## item_categories

In [None]:
def process_category_names(item_cats):
    # ' - ' 기준으로 대분류(type) 추출
    item_cats['split'] = item_cats['item_category_name'].str.split('-')
    item_cats['type'] = item_cats['split'].map(lambda x: x[0].strip())

    # 러시아어 대분류 -> 영어 대분류 매핑
    type_map = {
        'PC': 'PC',
        'Аксессуары': 'Accessories',
        'Билеты (Цифра)': 'Tickets (Digital)',       
        'Доставка товара': 'Delivery',               
        'Игровые консоли': 'Consoles',
        'Игры': 'Games',
        'Игры Android': 'Games Android',
        'Игры MAC': 'Games MAC',
        'Игры PC': 'Games PC',
        'Карты оплаты': 'Payment Cards',             
        'Карты оплаты (Кино, Музыка, Игры)': 'Payment Cards',
        'Кино': 'Movies',
        'Книги': 'Books',
        'Музыка': 'Music',
        'Подарки': 'Gifts',
        'Программы': 'Software',
        'Служебные': 'Service',                      
        'Чистые носители (шпиль)': 'Blank Media',    
        'Чистые носители (штучные)': 'Blank Media',  
        'Элементы питания': 'Batteries'
    }
    
    item_cats['type_code'] = item_cats['type'].map(type_map)
    item_cats['type_code'] = item_cats['type_code'].fillna('Etc')
    item_cats = item_cats[['item_category_id', 'item_category_name', 'type_code']]
    item_cats['type_code'] = LabelEncoder().fit_transform(item_cats['type_code'])
    return item_cats


item_categories = process_category_names(item_categories)
item_categories = item_categories.drop(columns='item_category_name')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  item_cats['type_code'] = LabelEncoder().fit_transform(item_cats['type_code'])


## Grid

In [17]:
data_files = [sales_train, shops, items, item_categories, train_matrix]
for file in data_files:
    file = downcast(file)

Memory usage reduced from 128.88 MB to 69.04 MB
Memory usage reduced from 0.00 MB to 0.00 MB
Memory usage reduced from 0.85 MB to 0.13 MB
Memory usage reduced from 0.00 MB to 0.00 MB
Memory usage reduced from 201.69 MB to 33.62 MB


In [None]:
# 판매량 데이터 생성
group = sales_train.groupby(idx_features).agg(
    item_cnt_month=('item_cnt_day', 'sum'),
    transaction_cnt=('item_cnt_day', 'count')
).reset_index()

train_matrix = train_matrix.merge(group, on=idx_features, how='left')
train_matrix['item_cnt_month'] = train_matrix['item_cnt_month'].fillna(0) # 판매량 결측치
train_matrix['item_cnt_month'] = train_matrix['item_cnt_month'].clip(0, 20) # 클리핑

# 메모리 최적화
train_matrix['date_block_num'] = train_matrix['date_block_num'].astype(np.int8)
train_matrix['shop_id'] = train_matrix['shop_id'].astype(np.int8)
train_matrix['item_id'] = train_matrix['item_id'].astype(np.int16)
train_matrix['item_cnt_month'] = train_matrix['item_cnt_month'].astype(np.float16)

all_data = train_matrix.copy()
all_data.fillna(0, inplace=True)
all_data = all_data.merge(shops, on='shop_id', how='left')
all_data = all_data.merge(items, on='item_id', how='left')
all_data = all_data.merge(date_item_avg_price, on=['date_block_num', 'item_id'], how='left')
all_data = all_data.merge(item_categories, on='item_category_id', how='left')

# 최종 데이터 확인
print("--- 최종 데이터 정보 ---")
all_data.info()
print("\n--- 최종 데이터 샘플 ---")
print(all_data.tail())

--- 최종 데이터 정보 ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8812167 entries, 0 to 8812166
Data columns (total 12 columns):
 #   Column               Dtype  
---  ------               -----  
 0   shop_id              int8   
 1   item_id              int16  
 2   date_block_num       int8   
 3   item_cnt_month       float16
 4   transaction_cnt      float64
 5   city                 int8   
 6   item_category_id     int8   
 7   first_sale_month     int8   
 8   platform             int8   
 9   meta                 int8   
 10  date_item_avg_price  float64
 11  type_code            int8   
dtypes: float16(1), float64(2), int16(1), int8(8)
memory usage: 235.3 MB

--- 최종 데이터 샘플 ---
         shop_id  item_id  date_block_num  item_cnt_month  transaction_cnt  \
8812162       45    18454              34             0.0              0.0   
8812163       45    16188              34             0.0              0.0   
8812164       45    15757              34             0.0          

In [19]:
print("중복:", all_data.duplicated(subset=idx_features).sum())

중복: 0


## NaN Check

In [21]:
temp = all_data.isna().sum()
temp[temp > 0]

date_item_avg_price    214200
dtype: int64

- 34월에 대한 가격정보가 없음
- 예측시점에 알 수 없는 정보. lag 피처로만 활용하고 삭제하는 피처이기 때문에 결측치 처리 X

In [22]:
lag_1_list = []
lag_3_list = []
features_to_drop = []

lag_3_list.extend(['transaction_cnt', 'date_item_avg_price', 'item_cnt_month'])

all_data = downcast(all_data)
del shops, items, item_categories, group

Memory usage reduced from 235.31 MB to 168.08 MB


## Month

In [23]:
all_data['month'] = all_data['date_block_num'] % 12
all_data.groupby('month')['item_cnt_month'].mean().reset_index()

Unnamed: 0,month,item_cnt_month
0,0,0.338165
1,1,0.317268
2,2,0.325494
3,3,0.275147
4,4,0.272879
5,5,0.28821
6,6,0.276233
7,7,0.31377
8,8,0.296012
9,9,0.285601


- 12월의 평균 판매량이 특히 높게 나타남. 1월도 비교적 높음
- 3월과 9월의 평균 판매량이 상대적으로 높음(신학기 영향 예상)
- 테스트 데이터에 해당하는 11월은 평균 판매량이 1년 중 가장 낮음

## mean features
- 시차 피처(lag features) 생성을 위한 중간 변수로 사용
- 시차 피처 생성 후 삭제

In [24]:
mean_feature_groups = [
    ['date_block_num', 'shop_id', 'item_category_id'],
    ['date_block_num', 'item_id'],
    ['date_block_num', 'item_category_id'],
    ['date_block_num', 'city', 'item_id'],
]
for group in mean_feature_groups:
    all_data = add_mean_features(all_data, group)

In [25]:
mean_feature_list = ['shop_item_category_avg_date_sales', 'city_item_avg_date_sales']
lag_3_list.extend(['item_avg_date_sales', 'item_category_avg_date_sales'])
print('Mean Features:', len(mean_feature_list), '개')
print(mean_feature_list)

Mean Features: 2 개
['shop_item_category_avg_date_sales', 'city_item_avg_date_sales']


## Lag Features

In [26]:
lag_1_list.extend(mean_feature_list)
print('Lag 1 Features:', len(lag_1_list), lag_1_list)
print('Lag 3 Features:', len(lag_3_list), lag_3_list)

Lag 1 Features: 2 ['shop_item_category_avg_date_sales', 'city_item_avg_date_sales']
Lag 3 Features: 5 ['transaction_cnt', 'date_item_avg_price', 'item_cnt_month', 'item_avg_date_sales', 'item_category_avg_date_sales']


In [27]:
%%time
all_data = downcast(all_data)

Memory usage reduced from 445.41 MB to 310.95 MB
CPU times: user 640 ms, sys: 300 ms, total: 940 ms
Wall time: 938 ms


In [28]:
%%time
all_data = all_data.sort_values(by=['date_block_num', 'shop_id', 'item_id']).reset_index(drop=True)

all_data = add_lag_features(all_data, 
                            key_features=idx_features,
                            lag_feature_cols=lag_1_list,
                            lag_period=[1])

CPU times: user 5.4 s, sys: 2.01 s, total: 7.41 s
Wall time: 7.45 s


In [29]:
%%time
all_data = all_data.sort_values(by=['date_block_num', 'shop_id', 'item_id']).reset_index(drop=True)
all_data = add_lag_features(all_data, 
                            key_features=idx_features,
                            lag_feature_cols=lag_3_list,
                            lag_period=[1, 2, 3])

all_data = downcast(all_data)

Memory usage reduced from 1184.96 MB to 831.99 MB
CPU times: user 10.9 s, sys: 7.14 s, total: 18.1 s
Wall time: 18.1 s


In [30]:
features_to_drop.extend(lag_1_list)
features_to_drop.extend(lag_3_list)
features_to_drop.remove('item_cnt_month')
features_to_drop.remove('date_item_avg_price')

all_data = all_data.drop(columns=features_to_drop)
gc.collect()

all_data = all_data.copy()
features_to_drop = []

## price trend

In [31]:
temp = all_data.isna().sum()
temp[temp > 0]

date_item_avg_price           214200
date_item_avg_price_lag_1    2027634
date_item_avg_price_lag_2    2514857
date_item_avg_price_lag_3    2979637
dtype: int64

In [None]:
# 역대 가격 대비 변동률
item_price_table = all_data.groupby(['date_block_num', 'item_id'])['date_item_avg_price'].first().reset_index()
item_price_table = item_price_table.sort_values(by=['item_id', 'date_block_num'])
item_price_grp = item_price_table.groupby('item_id')['date_item_avg_price']
item_price_table['item_avg_price_expanding'] = item_price_grp.expanding(min_periods=1).mean().values
item_price_table['item_avg_price_expanding'] = item_price_table.groupby('item_id')['item_avg_price_expanding'].shift(1)

all_data = all_data.merge(item_price_table[['date_block_num', 'item_id', 'item_avg_price_expanding']], 
                          on=['date_block_num', 'item_id'], 
                          how='left')
all_data['item_avg_price_expanding'] = all_data.groupby('item_id')['item_avg_price_expanding'].ffill()

temp_lag_1 = all_data['date_item_avg_price_lag_1'].fillna(all_data['item_avg_price_expanding'])
all_data['delta_price_lag'] = (temp_lag_1 - all_data['item_avg_price_expanding']) / all_data['item_avg_price_expanding']

all_data['delta_price_lag'] = all_data['delta_price_lag'].replace([np.inf, -np.inf], np.nan).fillna(0)

del item_price_table, item_price_grp, temp_lag_1, all_data['item_avg_price_expanding']
print(all_data['delta_price_lag'].isna().sum())
print(len(all_data[all_data['delta_price_lag']==0]))
gc.collect()

0
4055433


0

## Item Age

In [None]:
# 상품의 최초 판매 이후 경과 기간
all_data['item_age'] = all_data['date_block_num'] - all_data['first_sale_month']
all_data.loc[all_data['item_age']>=12, 'item_age'] = 12 # 누수 방지를 위해 12개월을 최대값으로 제한

In [None]:
# 해당 카테고리 신상품의 판매량 정도(대박 신규 상품 판매량에 영향을 미칠 가능성이 높음)
new_item_df = all_data[all_data['item_age']==0]
new_item_cat_mean = new_item_df.groupby('item_category_id')['item_cnt_month'].mean().reset_index()
new_item_cat_mean.columns = ['item_category_id', 'new_item_cat_avg_cnt']
all_data = all_data.merge(new_item_cat_mean, on='item_category_id', how='left')
all_data['new_item_effect'] = np.where(all_data['item_age']==0, all_data['new_item_cat_avg_cnt'], 0)
all_data = all_data.drop(columns='new_item_cat_avg_cnt')
del new_item_df, new_item_cat_mean
gc.collect()

0

In [None]:
# 상점별 아이템의 최초 판매 이후 경과 기간
shop_first_sale = sales_train.groupby(['shop_id', 'item_id'])['date_block_num'].min()
all_data['item_shop_first_sale'] = all_data.set_index(['shop_id', 'item_id']).index.map(shop_first_sale)
all_data['item_shop_first_sale'] = all_data['item_shop_first_sale'].fillna(34)
all_data['item_shop_age'] = all_data['date_block_num'] - all_data['item_shop_first_sale']

all_data.loc[all_data['item_shop_age'] < 0, 'item_shop_age'] = 0
all_data.loc[all_data['item_shop_age']>=12, 'item_shop_age'] = 12

features_to_drop.extend(['first_sale_month', 'item_shop_first_sale'])

## Last Sale

In [None]:
# 상점별로 아이템의 마지막 판매 이후 경과 기간
temp_df = all_data[['date_block_num', 'shop_id', 'item_id', 'item_cnt_month']].copy()
temp_df = temp_df.sort_values(by=['shop_id', 'item_id', 'date_block_num']).reset_index(drop=True)

# 판매기록이 있는 월을 기록
temp_df['item_shop_last_sale'] = np.nan
temp_df.loc[temp_df['item_cnt_month'] > 0, 'item_shop_last_sale'] = temp_df['date_block_num']

# 직전판매월 기록을 가져옴
last_sale_record = temp_df.groupby(['item_id', 'shop_id'])['item_shop_last_sale'].shift(1).ffill()
temp_df['item_shop_last_sale'] = (temp_df['date_block_num'] - last_sale_record)

all_data = pd.merge(
    all_data, 
    temp_df[['date_block_num', 'shop_id', 'item_id', 'item_shop_last_sale']],
    on=['date_block_num', 'shop_id', 'item_id'],
    how='left'
)

all_data.loc[all_data['item_shop_last_sale']>=12, 'item_shop_last_sale'] = 12
all_data.loc[all_data['item_shop_last_sale']<0, 'item_shop_last_sale'] = 12
all_data['item_shop_last_sale'] = all_data['item_shop_last_sale'].fillna(12)

del temp_df, last_sale_record
gc.collect()

0

## rolling mean/std & lag_1 and lag_12

In [37]:
all_data = all_data.drop(columns=features_to_drop)
features_to_drop = []
all_data = downcast(all_data)

Memory usage reduced from 958.05 MB to 773.16 MB


In [38]:
# 최근 3 평균 판매량 & 표준편차 item_cnt_month_lag
all_data['rolling_3m_cnt_mean'] = all_data[['item_cnt_month_lag_1', 'item_cnt_month_lag_2', 'item_cnt_month_lag_3']].mean(axis=1)
all_data['rolling_3m_cnt_std'] = all_data[['item_cnt_month_lag_1', 'item_cnt_month_lag_2', 'item_cnt_month_lag_3']].std(axis=1)

# 최근 3개월 평균 item_avg_date_sales
all_data['rolling_3m_item_mean'] = all_data[['item_avg_date_sales_lag_1', 'item_avg_date_sales_lag_2', 'item_avg_date_sales_lag_3']].mean(axis=1)
all_data['rolling_3m_item_std'] = all_data[['item_avg_date_sales_lag_1', 'item_avg_date_sales_lag_2', 'item_avg_date_sales_lag_3']].std(axis=1)

# 최근 3개월 평균 item_avg_date_sales
all_data['rolling_3m_item_cat_mean'] = all_data[['item_category_avg_date_sales_lag_1', 'item_category_avg_date_sales_lag_2', 'item_category_avg_date_sales_lag_3']].mean(axis=1)

# 최근 3개월 월별 아이템 가격 평균 & 표준편차
all_data['rolling_3m_price_mean'] = all_data[['date_item_avg_price_lag_1', 'date_item_avg_price_lag_2', 'date_item_avg_price_lag_3']].mean(axis=1)
features_to_drop.extend(['date_item_avg_price'])

all_data = downcast(all_data)

Memory usage reduced from 1075.70 MB to 1008.47 MB


## Difference

In [40]:
def add_diff_features(df, lag_cols):
    df_result = df.copy()
    for i in range(len(lag_cols) - 1):
        base_name = lag_cols[0].replace('_lag_1', '')
        col_name = f'{base_name}_diff_{i+1}'
        diff_series = df_result[lag_cols[i]] - df_result[lag_cols[i+1]]
        df_result[col_name] = diff_series
    return df_result

item_cnt_lags = ['item_cnt_month_lag_1', 'item_cnt_month_lag_2', 'item_cnt_month_lag_3']
item_avg_sales_lags = ['item_avg_date_sales_lag_1', 'item_avg_date_sales_lag_2', 'item_avg_date_sales_lag_3']

all_data = add_diff_features(all_data, item_cnt_lags)
all_data = add_diff_features(all_data, item_avg_sales_lags)

diff_cols = [col for col in all_data.columns if '_diff_' in col]
print(diff_cols)

del item_cnt_lags, item_avg_sales_lags, diff_cols

['item_cnt_month_diff_1', 'item_cnt_month_diff_2', 'item_avg_date_sales_diff_1', 'item_avg_date_sales_diff_2']


## removing columns

In [41]:
features_to_drop

['date_item_avg_price']

In [42]:
print(sorted([col for col in all_data.columns if '_lag_2' in col or '_lag_3' in col]))

['date_item_avg_price_lag_2', 'date_item_avg_price_lag_3', 'item_avg_date_sales_lag_2', 'item_avg_date_sales_lag_3', 'item_category_avg_date_sales_lag_2', 'item_category_avg_date_sales_lag_3', 'item_cnt_month_lag_2', 'item_cnt_month_lag_3', 'transaction_cnt_lag_2', 'transaction_cnt_lag_3']


In [43]:
all_data = all_data.drop(columns=features_to_drop)
features_to_drop = []

target_cols = [
    col for col in all_data.columns 
    if re.search(r'_lag_[2-3]', col)
]
# 오래된 시차 변수들 제거
features_to_drop.extend(target_cols)
features_to_drop.remove('item_avg_date_sales_lag_2')
features_to_drop.remove('item_category_avg_date_sales_lag_2')
features_to_drop.remove('item_cnt_month_lag_2')
features_to_drop.remove('item_cnt_month_lag_3')
features_to_drop.remove('transaction_cnt_lag_2')
all_data = all_data.drop(columns=features_to_drop)

print("Shape after dropping lag features:", all_data.shape)

Shape after dropping lag features: (8812167, 37)


## Category-Month Ratio

In [44]:
train_subset = all_data[all_data['date_block_num'] <= 32]
group_cat = train_subset.groupby('item_category_id')['item_cnt_month'].mean()
nov_sales = train_subset[train_subset['month'] == 10]
group_cat_nov = nov_sales.groupby('item_category_id')['item_cnt_month'].mean()

# 값이 1.0보다 크면 11월 성수기, 작으면 비수기
cat_nov_ratio = group_cat_nov / group_cat
cat_nov_ratio = cat_nov_ratio.fillna(1.0) # 결측치는 1.0(평범)으로 채움

cat_ratio_df = cat_nov_ratio.reset_index()
cat_ratio_df.columns = ['item_category_id', 'category_nov_ratio']
all_data = pd.merge(all_data, cat_ratio_df, on='item_category_id', how='left')
all_data['category_nov_ratio'] = all_data['category_nov_ratio'].fillna(1.0)

## Clip
- 데이터에 대한 설명에 따라 상점별 아이템 월간 판매량에 대한 피처들을 (0, 20)으로 클립
- 모델이 극단적인 값에 대해 학습하지 않도록 방지함

In [45]:
cols_to_clip =['item_cnt_month', 'item_cnt_month_lag_1', 'item_cnt_month_lag_2', 'item_cnt_month_lag_3', 'rolling_3m_cnt_mean']

for col in cols_to_clip:
    all_data[col] = all_data[col].clip(0, 20)

## Check

In [46]:
sorted([col for col in all_data.columns])

['category_nov_ratio',
 'city',
 'city_item_avg_date_sales_lag_1',
 'date_block_num',
 'date_item_avg_price_lag_1',
 'delta_price_lag',
 'item_age',
 'item_avg_date_sales_diff_1',
 'item_avg_date_sales_diff_2',
 'item_avg_date_sales_lag_1',
 'item_avg_date_sales_lag_2',
 'item_category_avg_date_sales_lag_1',
 'item_category_avg_date_sales_lag_2',
 'item_category_id',
 'item_cnt_month',
 'item_cnt_month_diff_1',
 'item_cnt_month_diff_2',
 'item_cnt_month_lag_1',
 'item_cnt_month_lag_2',
 'item_cnt_month_lag_3',
 'item_id',
 'item_shop_age',
 'item_shop_last_sale',
 'meta',
 'month',
 'new_item_effect',
 'platform',
 'rolling_3m_cnt_mean',
 'rolling_3m_cnt_std',
 'rolling_3m_item_cat_mean',
 'rolling_3m_item_mean',
 'rolling_3m_item_std',
 'rolling_3m_price_mean',
 'shop_id',
 'shop_item_category_avg_date_sales_lag_1',
 'transaction_cnt_lag_1',
 'transaction_cnt_lag_2',
 'type_code']

# Save

In [47]:
all_data.shape

(8812167, 38)

In [48]:
temp = all_data.isna().sum()
temp[temp > 0]

date_item_avg_price_lag_1    2027634
rolling_3m_price_mean        1194831
dtype: int64

In [50]:
all_data.duplicated(subset=idx_features).sum()

0

In [51]:
all_data = downcast(all_data)
joblib.dump(all_data, data_path + 'all_data_result.joblib')

Memory usage reduced from 882.41 MB to 848.80 MB


['./data/all_data_result.joblib']