# 데이터 전처리 방안 수정
- `04_08_FE_.ipynb`의 내용을 기본적으로 유지
- `item_name`에 대해 TF-IDF를 적용하여 피처를 추가
- lag_2, lag_3을 제거하고, 이동편균/표준편차를 추가 + lag_12 추가
- 중복된 가격관련 변수들 제거
- `transaction_cnt` 변수 제거. `item_cnt_month`와 의미가 중복됨

# Module

In [1]:
import calendar
import re
from itertools import product
import joblib
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA

# Data

In [2]:
data_path = "./data/"

sales_train = pd.read_csv(data_path + 'sales_train.csv')
items = pd.read_csv(data_path + 'items.csv')
item_categories = pd.read_csv(data_path + 'item_categories.csv')
shops = pd.read_csv(data_path + 'shops.csv')

test = pd.read_csv(data_path + 'test.csv')
sub = pd.read_csv(data_path + 'sample_submission.csv')

print('Before Filter ShopID:', len(sales_train))
unique_test_shop_id = test['shop_id'].unique()
sales_train = sales_train[sales_train['shop_id'].isin(unique_test_shop_id)]
print('After Filter ShopID :', len(sales_train))

Before Filter ShopID: 2935849
After Filter ShopID : 2413246


In [3]:
def downcast(df, verbose=True):
    start_memory = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        dtype_name = df[col].dtype.name
        if dtype_name == 'object':
            pass
        elif dtype_name == 'bool':
            df[col] = df[col].astype('int8')
        elif dtype_name.startswith('int') or (df[col]%1==0).all():
            df[col] = pd.to_numeric(df[col], downcast='integer')
        elif dtype_name.startswith('float'):
            df[col] = pd.to_numeric(df[col], downcast='float')
    end_memory = df.memory_usage().sum() / 1024**2
    
    if verbose:
        print(f"Memory usage reduced from {start_memory:.2f} MB to {end_memory:.2f} MB")
        
    return df


data_files = [sales_train, items, shops, item_categories, test]
for file in data_files:
    file = downcast(file)

Memory usage reduced from 128.88 MB to 69.04 MB
Memory usage reduced from 0.51 MB to 0.23 MB
Memory usage reduced from 0.00 MB to 0.00 MB
Memory usage reduced from 0.00 MB to 0.00 MB
Memory usage reduced from 4.90 MB to 1.43 MB


In [4]:
train = []

for i in sales_train['date_block_num'].unique():
    all_shop = sales_train.loc[sales_train['date_block_num'] == i, 'shop_id'].unique()
    all_item = sales_train.loc[sales_train['date_block_num'] == i, 'item_id'].unique()
    train.append(np.array(list(product([i], all_shop, all_item))))

# idx features        
idx_features = ['date_block_num', 'shop_id', 'item_id']
train = pd.DataFrame(np.vstack(train), columns=idx_features)

test['date_block_num'] = 34
train.head()

Unnamed: 0,date_block_num,shop_id,item_id
0,0,59,22154
1,0,59,2552
2,0,59,2554
3,0,59,2555
4,0,59,2564


# Feature Engineering

## Outliers
- `-1` 같은 음수값은 반품을 의미할 수 있음. 제거하지 않고 `item_cnt_month`로 집계

In [5]:
print("Before : ",len(sales_train))
sales_train = sales_train[sales_train['item_cnt_day'] < 1000]
print("After item_cnt_day : ",len(sales_train))
sales_train = sales_train[sales_train['item_price'] < 50000]
print("After item_price : ",len(sales_train))

Before :  2413246
After item_cnt_day :  2413244
After item_price :  2413241


## Feature Generation

In [6]:
def add_mean_features(df: pd.DataFrame, groupby_features: list, mean_feature_list: list = None):
    col_name = [col for col in groupby_features if col != 'date_block_num']
    base_name = '_'.join([col.replace('_id', '') for col in col_name])
    feature_name = f'{base_name}_avg_date_sales'
    
    agg_rules = {
        feature_name: ('item_cnt_month', 'mean')
    }
            
    group = df.groupby(groupby_features).agg(**agg_rules).reset_index()
    
    df = df.merge(group, on=groupby_features, how='left')
    df = downcast(df, verbose=False)
    del group
    
    if mean_feature_list is not None:
        mean_feature_list.append(feature_name)
    
    return df

def add_lag_features(df: pd.DataFrame, key_features: list, lag_features_info: dict, lag_period: list):
    """
    lag_features_info: {'feature_name_1': clip_flag_1, 'feature_name_2': clip_flag_2, ...}
    """
    
    group_keys = [key for key in key_features if key != 'date_block_num']
    grouped = df.groupby(group_keys)
    
    all_lag_cols = []
    features_to_clip = []

    for lag_feature, clip_flag in lag_features_info.items():
        for i in lag_period:
            lag_feature_name = f'{lag_feature}_lag_{i}'
            df[lag_feature_name] = grouped[lag_feature].shift(i).fillna(0)
            
            all_lag_cols.append(lag_feature_name)
            if clip_flag:
                features_to_clip.append(lag_feature_name)
    
    return df, features_to_clip

### shops

In [7]:
# 데이터 수정
sales_train.loc[sales_train['shop_id']==0, 'shop_id'] = 57
sales_train.loc[sales_train['shop_id']==1, 'shop_id'] = 58
sales_train.loc[sales_train['shop_id']==10, 'shop_id'] = 11
sales_train.loc[sales_train['shop_id']==39, 'shop_id'] = 40

test.loc[test['shop_id']==0, 'shop_id'] = 57
test.loc[test['shop_id']==1, 'shop_id'] = 58
test.loc[test['shop_id']==10, 'shop_id'] = 11
test.loc[test['shop_id']==39, 'shop_id'] = 40

# 도시 구분
shops['city'] = shops['shop_name'].str.split(' ').str[0]
shops.loc[shops['city'] == '!Якутск', 'city'] = 'Якутск'

# 쇼핑몰/센터를 나타내는 러시아어 약어로 상점 구분
mall_keywords = ['ТЦ', 'ТРК', 'ТРЦ', 'MALL', 'Молл']
is_mall = shops['shop_name'].apply(lambda x: any(keyword in x for keyword in mall_keywords))

shops['shop_type'] = np.where(is_mall, 'Mall', 'Standalone')

encoder = LabelEncoder()
for col in ['city', 'shop_type']:
    shops[col] = encoder.fit_transform(shops[col])

shops = shops.drop(columns=["shop_name"])

### items

In [8]:
items['first_sale_month'] = sales_train.groupby('item_id').agg({'date_block_num': 'min'})['date_block_num']
items = items.fillna(34)

##### 아이템별 평균가격

In [9]:
# 아이템별 평균 가격
group = sales_train.groupby(['item_id']).agg(item_avg_price=('item_price', 'mean')).reset_index()
items = items.merge(group, on='item_id', how='left')

temp = sales_train.merge(items[['item_id', 'item_category_id']], on='item_id', how='left')
cat_price_map = temp.groupby('item_category_id')['item_price'].mean()
items['item_avg_price'] = items['item_avg_price'].fillna(items['item_category_id'].map(cat_price_map))
item_avg_price = items[['item_id', 'item_avg_price']]

del temp, cat_price_map, items['item_avg_price']

##### 월별 아이템 평균가격

In [10]:
date_item_avg_price = sales_train.groupby(['date_block_num', 'item_id']).agg(date_item_avg_price=('item_price', 'mean')).reset_index()

### TF-IDF

In [11]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    return text

items['clean_item_name'] = items['item_name'].apply(clean_text)

tfidf = TfidfVectorizer(max_features=1000)
item_name_tfidf = tfidf.fit_transform(items['clean_item_name'])

pca = PCA(n_components=25)
item_name_pca = pca.fit_transform(item_name_tfidf.toarray())

pca_cols = [f'item_name_pca_{i}' for i in range(25)]
item_name_pca_df = pd.DataFrame(item_name_pca, columns=pca_cols)
item_name_pca_df['item_id'] = items['item_id']

items = items.merge(item_name_pca_df, on='item_id', how='left')
items = items.drop(columns=['clean_item_name', 'item_name'])
items.head()

Unnamed: 0,item_id,item_category_id,first_sale_month,item_name_pca_0,item_name_pca_1,item_name_pca_2,item_name_pca_3,item_name_pca_4,item_name_pca_5,item_name_pca_6,...,item_name_pca_15,item_name_pca_16,item_name_pca_17,item_name_pca_18,item_name_pca_19,item_name_pca_20,item_name_pca_21,item_name_pca_22,item_name_pca_23,item_name_pca_24
0,0,40,34.0,-0.031618,-0.042669,-0.072524,-0.015254,-0.015637,-0.029837,-0.009403,...,-0.00026,-0.0051,-0.00315,-0.00102,0.017783,-0.01677,-0.00271,0.006224,-0.01114,-0.005941
1,1,76,15.0,-0.070875,-0.13528,0.262862,-0.050056,-0.052989,-0.010333,-0.004627,...,0.03453,-0.023243,-0.02181,-0.019322,0.012253,0.008235,-0.017802,-0.013203,0.009162,0.040275
2,2,40,34.0,-0.031141,-0.04208,-0.071788,-0.016624,-0.015819,-0.028404,-0.008475,...,0.002678,-0.004804,-0.0028,-0.002973,0.017392,-0.017227,-0.002329,0.004851,-0.009087,-0.004364
3,3,40,34.0,-0.031141,-0.04208,-0.071788,-0.016624,-0.015819,-0.028404,-0.008475,...,0.002678,-0.004804,-0.0028,-0.002973,0.017392,-0.017227,-0.002329,0.004851,-0.009087,-0.004364
4,4,40,34.0,-0.031595,-0.042901,-0.07256,-0.017793,-0.015983,-0.030477,-0.009632,...,0.002413,-0.004896,-0.000723,-0.005417,0.018329,-0.01759,-0.002391,0.005069,-0.010132,-0.002561


### item_categories

In [12]:
def split_categories(df):
    def split_func(name):
        if '-' in name:
            return name.split('-', 1)
        else:
            return [name, name]
    
    split_result = df['item_category_name'].apply(lambda x: split_func(x))
    df['type'] = split_result.apply(lambda x: x[0].strip())
    df['subtype'] = split_result.apply(lambda x: x[1].strip())
    return df


item_categories = split_categories(item_categories)
etc_types = item_categories['type'].value_counts()[item_categories['type'].value_counts()<5].index.tolist()
item_categories.loc[item_categories['type'].isin(etc_types), 'type'] = 'etc'
item_categories['type'] = LabelEncoder().fit_transform(item_categories['type'])
item_categories['subtype'] = LabelEncoder().fit_transform(item_categories['subtype'])
item_categories = item_categories.drop('item_category_name', axis=1)

### revenue

In [13]:
sales_train['revenue'] = sales_train['item_cnt_day'] * sales_train['item_price']
sales_train.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day,revenue
0,02.01.2013,0,59,22154,999.0,1,999.0
1,03.01.2013,0,25,2552,899.0,1,899.0
2,05.01.2013,0,25,2552,899.0,-1,-899.0
3,06.01.2013,0,25,2554,1709.05,1,1709.05
4,15.01.2013,0,25,2555,1099.0,1,1099.0


### set up matrix

In [14]:
group = sales_train.groupby(idx_features).agg(
    item_cnt_month = ('item_cnt_day', 'sum'),
    date_revenue = ('revenue', 'sum')
).reset_index()

train = train.merge(group, on=idx_features, how='left')
all_data = pd.concat([train, test.drop('ID', axis=1)], ignore_index=True, keys=idx_features)
all_data = all_data.merge(shops, on='shop_id', how='left')
all_data = all_data.merge(items, on='item_id', how='left')
all_data = all_data.merge(item_avg_price, on='item_id', how='left')
all_data = all_data.merge(date_item_avg_price, on=['date_block_num', 'item_id'], how='left')
all_data = all_data.merge(item_categories, on='item_category_id', how='left')
all_data.head()

  all_data = pd.concat([train, test.drop('ID', axis=1)], ignore_index=True, keys=idx_features)


Unnamed: 0,date_block_num,shop_id,item_id,item_cnt_month,date_revenue,city,shop_type,item_category_id,first_sale_month,item_name_pca_0,...,item_name_pca_19,item_name_pca_20,item_name_pca_21,item_name_pca_22,item_name_pca_23,item_name_pca_24,item_avg_price,date_item_avg_price,type,subtype
0,0,59,22154,1.0,999.0,30,0,37,0.0,0.411114,...,0.006082,-0.009574,-0.000171,-0.016214,0.010373,-0.021502,678.166667,999.0,4,1
1,0,59,2552,,,30,0,58,0.0,-0.044188,...,0.013508,-0.004075,0.092186,-0.011783,0.006912,-0.007019,936.5,899.0,6,29
2,0,59,2554,,,30,0,58,0.0,-0.03603,...,-0.01125,-0.007644,0.018435,-0.011147,0.005858,-0.009884,1709.05,1709.05,6,29
3,0,59,2555,,,30,0,56,0.0,-0.043594,...,0.012146,-0.042528,0.2421,-0.081733,0.053558,-0.062556,1122.076923,1099.0,6,5
4,0,59,2564,,,30,0,59,0.0,-0.03617,...,-0.142096,0.036876,0.028464,-0.029926,0.031458,0.01241,341.307692,349.0,6,42


In [15]:
all_data.isna().sum()

date_block_num               0
shop_id                      0
item_id                      0
item_cnt_month         7513150
date_revenue           7513150
city                         0
shop_type                    0
item_category_id             0
first_sale_month             0
item_name_pca_0              0
item_name_pca_1              0
item_name_pca_2              0
item_name_pca_3              0
item_name_pca_4              0
item_name_pca_5              0
item_name_pca_6              0
item_name_pca_7              0
item_name_pca_8              0
item_name_pca_9              0
item_name_pca_10             0
item_name_pca_11             0
item_name_pca_12             0
item_name_pca_13             0
item_name_pca_14             0
item_name_pca_15             0
item_name_pca_16             0
item_name_pca_17             0
item_name_pca_18             0
item_name_pca_19             0
item_name_pca_20             0
item_name_pca_21             0
item_name_pca_22             0
item_nam

> 결측치 처리 방향

- `item_cnt_month`, `date_revenue`는 판매가 없어 nan으로 표시된 걸로 추정 -> 결측치를 0으로 대체
- `date_item_avg_price` 는 월별 아이템카테고리 평균가격으로 대체

In [16]:
fill_zero_cols = ['item_cnt_month', 'date_revenue']
all_data[fill_zero_cols] = all_data[fill_zero_cols].fillna(0)

cat_date_price = all_data.groupby(['date_block_num', 'item_category_id'])['date_item_avg_price'].mean().reset_index()
cat_date_price.columns = ['date_block_num', 'item_category_id', 'date_cat_avg_price']
all_data = all_data.merge(cat_date_price, on=['date_block_num', 'item_category_id'], how='left')
all_data['date_item_avg_price'] = all_data['date_item_avg_price'].fillna(all_data['date_cat_avg_price'])

all_data = all_data.drop(columns=['date_cat_avg_price'])

In [17]:
need_lag_list = []
features_to_drop = []

need_lag_list.extend(['item_cnt_month', 'date_item_avg_price'])
features_to_drop.append('date_revenue')

In [18]:
del shops, items, item_categories, group, cat_date_price 
all_data = downcast(all_data)

Memory usage reduced from 2395.14 MB to 1142.95 MB


### mean features

In [19]:
all_data = add_mean_features(all_data, ['date_block_num', 'shop_id'])
all_data = add_mean_features(all_data, ['date_block_num', 'shop_id', 'item_category_id'])

all_data = add_mean_features(all_data, ['date_block_num', 'item_id'])

all_data = add_mean_features(all_data, ['date_block_num', 'item_category_id'])
all_data = add_mean_features(all_data, ['date_block_num', 'subtype'])

In [20]:
mean_feature_list = [col for col in all_data.columns if '_avg_date_sales' in col]
mean_feature_list

['shop_avg_date_sales',
 'shop_item_category_avg_date_sales',
 'item_avg_date_sales',
 'item_category_avg_date_sales',
 'subtype_avg_date_sales']

In [21]:
need_lag_list.extend(mean_feature_list)

### lag features

In [22]:
need_lag_list

['item_cnt_month',
 'date_item_avg_price',
 'shop_avg_date_sales',
 'shop_item_category_avg_date_sales',
 'item_avg_date_sales',
 'item_category_avg_date_sales',
 'subtype_avg_date_sales']

In [23]:
%%time
lag_period = [1, 2, 3]
clip_list = []
lag_features_to_process = {}

lag_features_to_process['item_cnt_month'] = True
for lag_feature in need_lag_list:
    if lag_feature == 'item_cnt_month':
        continue
    lag_features_to_process[lag_feature] = False

all_data, clip_list = add_lag_features(all_data, 
                                        key_features=idx_features,
                                        lag_features_info=lag_features_to_process,
                                        lag_period=lag_period)

all_data = downcast(all_data)

features_to_drop.extend(need_lag_list)
features_to_drop.remove('item_cnt_month')

Memory usage reduced from 2218.66 MB to 2067.39 MB
CPU times: user 7.39 s, sys: 1.07 s, total: 8.46 s
Wall time: 8.47 s


### trend features - price, revenue

In [24]:
[col for col in all_data.columns if 'price' in col]

['item_avg_price',
 'date_item_avg_price',
 'date_item_avg_price_lag_1',
 'date_item_avg_price_lag_2',
 'date_item_avg_price_lag_3']

In [25]:
# delta price lag
cols_delta_price = []
for i in lag_period:
    col_name = f'delta_price_lag_{i}'
    all_data[col_name] = (all_data[f'date_item_avg_price_lag_{i}'] - all_data['item_avg_price']) / all_data['item_avg_price']
    cols_delta_price.append(col_name)

for i, col in enumerate(cols_delta_price):
    if i==0:
        all_data['delta_price_lag'] = all_data[col]
    else:
        all_data['delta_price_lag'] = all_data['delta_price_lag'].fillna(all_data[col])
        
all_data['delta_price_lag'] = all_data['delta_price_lag'].fillna(0)
features_to_drop.extend(cols_delta_price)
features_to_drop.append('item_avg_price')
all_data = downcast(all_data)

Memory usage reduced from 2336.32 MB to 2201.85 MB


In [26]:
# add shop_date_avg_revenue
shop_date_rev = all_data.groupby(['date_block_num', 'shop_id'])['date_revenue'].mean().reset_index()
shop_date_rev.columns = ['date_block_num', 'shop_id', 'shop_date_avg_revenue']

all_data = all_data.merge(shop_date_rev, on=['date_block_num', 'shop_id'], how='left')
features_to_drop.append('shop_date_avg_revenue')

# add shop_avg_revenue
shop_avg_revenue = all_data.groupby('shop_id')['shop_date_avg_revenue'].mean().reset_index()
shop_avg_revenue.columns = ['shop_id', 'shop_revenue_mean']
all_data = all_data.merge(shop_avg_revenue, on='shop_id', how='left')

features_to_drop.append('shop_revenue_mean')
del shop_date_rev, shop_avg_revenue

# add lag features for shop_date_avg_revenue
lag_features_to_process = {'shop_date_avg_revenue': False}
all_data, clip_list = add_lag_features(all_data, 
                                        key_features=idx_features,
                                        lag_features_info=lag_features_to_process,
                                        lag_period=lag_period)

all_data = downcast(all_data)

Memory usage reduced from 2538.01 MB to 2369.93 MB


In [27]:
# delta revenue
cols_delta_revenue = []
for i in lag_period:
    col_name = f'delta_shop_revenue_lag_{i}'
    all_data[f'delta_shop_revenue_lag_{i}'] = (all_data[f'shop_date_avg_revenue_lag_{i}'] - all_data['shop_revenue_mean']) / all_data['shop_revenue_mean']
    cols_delta_revenue.append(col_name)

for i, col in enumerate(cols_delta_revenue):
    if i==0:
        all_data['delta_shop_revenue_lag'] = all_data[col]
    else:
        all_data['delta_shop_revenue_lag'] = all_data[f'delta_shop_revenue_lag'].fillna(all_data[col])
        
all_data['delta_shop_revenue_lag'] = all_data['delta_shop_revenue_lag'].fillna(0)

features_to_drop.extend(cols_delta_revenue)
features_to_drop.append('shop_revenue_mean')
all_data = downcast(all_data)
features_to_drop

Memory usage reduced from 2504.40 MB to 2504.40 MB


['date_revenue',
 'date_item_avg_price',
 'shop_avg_date_sales',
 'shop_item_category_avg_date_sales',
 'item_avg_date_sales',
 'item_category_avg_date_sales',
 'subtype_avg_date_sales',
 'delta_price_lag_1',
 'delta_price_lag_2',
 'delta_price_lag_3',
 'item_avg_price',
 'shop_date_avg_revenue',
 'shop_revenue_mean',
 'delta_shop_revenue_lag_1',
 'delta_shop_revenue_lag_2',
 'delta_shop_revenue_lag_3',
 'shop_revenue_mean']

### new item

In [28]:
all_data['new_item'] = all_data['first_sale_month'] == all_data['date_block_num']
all_data['item_age'] = all_data['date_block_num'] - all_data['first_sale_month']
features_to_drop.append('first_sale_month')

### since last sale date

In [29]:
all_data['temp_last_sale'] = np.nan
all_data.loc[all_data['item_cnt_month']>0, 'temp_last_sale'] = all_data['date_block_num']

last_sale_record = all_data.groupby(['item_id', 'shop_id'])['temp_last_sale'].shift(1).ffill()

all_data['since_last_sale'] = (all_data['date_block_num'] - last_sale_record).fillna(-1)
all_data = all_data.drop(columns='temp_last_sale')

### days & month

In [30]:
all_data['month'] = all_data['date_block_num'] % 12
days = pd.Series([31,28,31,30,31,30,31,31,30,31,30,31])
all_data['days'] = all_data['month'].map(days)

In [31]:
all_data.groupby('month')['item_cnt_month'].mean().reset_index()

Unnamed: 0,month,item_cnt_month
0,0,0.361002
1,1,0.334841
2,2,0.35051
3,3,0.305403
4,4,0.301577
5,5,0.314516
6,6,0.294632
7,7,0.335717
8,8,0.357936
9,9,0.323847


- 12월의 평균 판매량이 특히 높게 나타남. 1월도 비교적 높음
- 3월과 9월의 평균 판매량이 상대적으로 높음(신학기 영향 예상)
- 테스트 데이터에 해당하는 11월은 평균 판매량이 1년 중 가장 낮음

In [32]:
# 0: Low (11월 포함), 1: Normal, 2: High, 3: Peak
season_dict = {
    11: 3, # 연말
    0: 2, 2: 2, 8: 2, # 0: 연초 / 2, 8: 신학기 
    7: 2, # 7: 저조한 6대비 판매량 회복. 신학기 준비
    1: 1, 3: 1, 4: 1, 5: 1, 9: 1, # 평범
    6: 0, 10: 0  # 6: 비수기, 휴가철 / 10: pre holiday
}

all_data['season_type'] = all_data['month'].map(season_dict).astype('int8')

In [33]:
def count_weekends(date_block_num):
    year = 2013 + date_block_num // 12
    month = 1 + date_block_num % 12
    
    _, last_day = calendar.monthrange(year, month)
    dates = pd.date_range(start=f'{year}-{month}-01', end=f'{year}-{month}-{last_day}')
    weekend_count = dates.weekday.isin([5, 6]).sum()
    
    return weekend_count

weekend_map = {i: count_weekends(i) for i in range(35)}
all_data['num_weekends'] = all_data['date_block_num'].map(weekend_map).astype('int8')
print(all_data[['date_block_num', 'month', 'num_weekends']].head())

   date_block_num  month  num_weekends
0               0      0             8
1               0      0             8
2               0      0             8
3               0      0             8
4               0      0             8


In [34]:
all_data['num_weekends'].value_counts()

num_weekends
8     4500289
9     2556125
10    1755830
Name: count, dtype: int64

### rolling mean/std & lag_1 and lag_12

In [35]:
# 최근 3개월 평균 판매량 & 표준편차
all_data['rolling_3m_mean_cnt'] = all_data[[col for col in all_data.columns if 'item_cnt_month_' in col]].mean(axis=1)
all_data['rolling_3m_std_cnt'] = all_data[[col for col in all_data.columns if 'item_cnt_month_' in col]].std(axis=1)

# 최근 3개월 월별 아이템 가격 평균 & 표준편차
all_data['rolling_3m_mean_price'] = all_data[[col for col in all_data.columns if 'date_item_avg_price_' in col]].mean(axis=1)
all_data['rolling_3m_std_price'] = all_data[[col for col in all_data.columns if 'date_item_avg_price_' in col]].std(axis=1)

# 12개월 전 판매량/아이템 가격
lag_features_info = {'item_cnt_month': True, 'date_item_avg_price': False}
all_data, clip_list = add_lag_features(all_data, 
                                        key_features=idx_features,
                                        lag_features_info=lag_features_info,
                                        lag_period=[12])

features_to_drop.extend(['item_cnt_month_lag_2', 'item_cnt_month_lag_3', 'date_item_avg_price_lag_2', 'date_item_avg_price_lag_3'])
all_data = downcast(all_data)

Memory usage reduced from 3084.27 MB to 2848.96 MB


### removing columns

In [36]:
features_to_drop

['date_revenue',
 'date_item_avg_price',
 'shop_avg_date_sales',
 'shop_item_category_avg_date_sales',
 'item_avg_date_sales',
 'item_category_avg_date_sales',
 'subtype_avg_date_sales',
 'delta_price_lag_1',
 'delta_price_lag_2',
 'delta_price_lag_3',
 'item_avg_price',
 'shop_date_avg_revenue',
 'shop_revenue_mean',
 'delta_shop_revenue_lag_1',
 'delta_shop_revenue_lag_2',
 'delta_shop_revenue_lag_3',
 'shop_revenue_mean',
 'first_sale_month',
 'item_cnt_month_lag_2',
 'item_cnt_month_lag_3',
 'date_item_avg_price_lag_2',
 'date_item_avg_price_lag_3']

In [37]:
all_data = all_data.drop(columns=features_to_drop)

features_to_drop = []
features_to_drop.extend([col for col in all_data.columns if '_lag_2' in col or '_lag_3' in col])
all_data = all_data.drop(columns=features_to_drop)

print("Shape after dropping lag features:", all_data.shape)
all_data = downcast(all_data)

Shape after dropping lag features: (8812244, 57)
Memory usage reduced from 1630.38 MB to 1630.38 MB


## drop rows with `date_block_num` < 3

In [42]:
all_data = all_data[all_data['date_block_num']>=3].fillna(0)
print(f"Shape after filtering: {all_data.shape}")
all_data = downcast(all_data)

Shape after filtering: (8026950, 57)
Memory usage reduced from 1546.33 MB to 1546.33 MB


In [43]:
joblib.dump(all_data, data_path + 'all_data2.joblib')

['./data/all_data2.joblib']