# Module

In [None]:
import pandas as pd
import numpy as np
from itertools import product
import joblib
from sklearn.preprocessing import LabelEncoder

# Data

In [2]:
data_path = "./data/"

sales_train = pd.read_csv(data_path + 'sales_train.csv')
items = pd.read_csv(data_path + 'items.csv')
item_categories = pd.read_csv(data_path + 'item_categories.csv')
shops = pd.read_csv(data_path + 'shops.csv')

test = pd.read_csv(data_path + 'test.csv')
sub = pd.read_csv(data_path + 'sample_submission.csv')

In [3]:
print('Before :', len(sales_train))
unique_test_shop_id = test['shop_id'].unique()
sales_train = sales_train[sales_train['shop_id'].isin(unique_test_shop_id)]
print('After :', len(sales_train))

Before : 2935849
After : 2413246


- `test.csv`에 있는 `shop_id`로 데이터 필터링

In [4]:
def downcast(df, verbose=True):
    start_memory = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        dtype_name = df[col].dtype.name
        if dtype_name == 'object':
            pass
        elif dtype_name == 'bool':
            df[col] = df[col].astype('int8')
        elif dtype_name.startswith('int') or (df[col]%1==0).all():
            df[col] = pd.to_numeric(df[col], downcast='integer')
        elif dtype_name.startswith('float'):
            df[col] = pd.to_numeric(df[col], downcast='float')
    end_memory = df.memory_usage().sum() / 1024**2
    
    if verbose:
        print(f"Memory usage reduced from {start_memory:.2f} MB to {end_memory:.2f} MB")
        
    return df

In [5]:
data_files = [sales_train, items, shops, item_categories, test]
for file in data_files:
    file = downcast(file)

Memory usage reduced from 128.88 MB to 69.04 MB
Memory usage reduced from 0.51 MB to 0.23 MB
Memory usage reduced from 0.00 MB to 0.00 MB
Memory usage reduced from 0.00 MB to 0.00 MB
Memory usage reduced from 4.90 MB to 1.43 MB


In [6]:
train = []

for i in sales_train['date_block_num'].unique():
    all_shop = sales_train.loc[sales_train['date_block_num'] == i, 'shop_id'].unique()
    all_item = sales_train.loc[sales_train['date_block_num'] == i, 'item_id'].unique()
    train.append(np.array(list(product([i], all_shop, all_item))))

# idx features        
idx_features = ['date_block_num', 'shop_id', 'item_id']
train = pd.DataFrame(np.vstack(train), columns=idx_features)

test['date_block_num'] = 34

# Feature Engineering

## Outliers

In [7]:
print("Before : ",len(sales_train))
sales_train = sales_train[(sales_train['item_cnt_day'] > 0)&(sales_train['item_cnt_day'] < 1000)]
print("After item_cnt_day : ",len(sales_train))
sales_train = sales_train[(sales_train['item_price'] > 0)&(sales_train['item_price'] < 50000)]
print("After item_price : ",len(sales_train))

Before :  2413246
After item_cnt_day :  2407079


After item_price :  2407076


## Feature Generation

### shops

In [8]:
# 기존 처리 방식
sales_train.loc[sales_train['shop_id']==0, 'shop_id'] = 57
sales_train.loc[sales_train['shop_id']==1, 'shop_id'] = 58
sales_train.loc[sales_train['shop_id']==10, 'shop_id'] = 11
sales_train.loc[sales_train['shop_id']==39, 'shop_id'] = 40

test.loc[test['shop_id']==0, 'shop_id'] = 57
test.loc[test['shop_id']==1, 'shop_id'] = 58
test.loc[test['shop_id']==10, 'shop_id'] = 11
test.loc[test['shop_id']==39, 'shop_id'] = 40

shops['city'] = shops['shop_name'].str.split(' ').str[0]
shops.loc[shops['city'] == '!Якутск', 'city'] = 'Якутск'
shops['city'] = LabelEncoder().fit_transform(shops['city'])
shops = shops.drop(columns=["shop_name"])

### items

In [9]:
items = items.drop(columns=["item_name"])
items['first_sale_month'] = sales_train.groupby('item_id').agg({'date_block_num': 'min'})['date_block_num']
items = items.fillna(34)

### item_categories

In [10]:
def split_categories(df):
    def split_func(name):
        if '-' in name:
            return name.split('-', 1)
        else:
            return [name, name]
    
    split_result = df['item_category_name'].apply(lambda x: split_func(x))
    df['type'] = split_result.apply(lambda x: x[0].strip())
    df['subtype'] = split_result.apply(lambda x: x[1].strip())
    return df


item_categories = split_categories(item_categories)
etc_types = item_categories['type'].value_counts()[item_categories['type'].value_counts()<5].index.tolist()
item_categories.loc[item_categories['type'].isin(etc_types), 'type'] = 'etc'
item_categories['type'] = LabelEncoder().fit_transform(item_categories['type'])
item_categories['subtype'] = LabelEncoder().fit_transform(item_categories['subtype'])
item_categories = item_categories.drop('item_category_name', axis=1)

### derivative features

In [11]:
group = sales_train.groupby(idx_features).agg(
    item_cnt_month = ('item_cnt_day', 'sum'),
    transaction_cnt = ('item_cnt_day', 'count'),
    avg_item_price = ('item_price', 'mean'),
).reset_index()

train = train.merge(group, on=idx_features, how='left')
all_data = pd.concat([train, test.drop('ID', axis=1)], ignore_index=True, keys=idx_features)
all_data = all_data.fillna(0)

all_data = all_data.merge(shops, on='shop_id', how='left')
all_data = all_data.merge(items, on='item_id', how='left')
all_data = all_data.merge(item_categories, on='item_category_id', how='left')
all_data.head()

  all_data = pd.concat([train, test.drop('ID', axis=1)], ignore_index=True, keys=idx_features)


Unnamed: 0,date_block_num,shop_id,item_id,item_cnt_month,transaction_cnt,avg_item_price,city,item_category_id,first_sale_month,type,subtype
0,0,59,22154,1.0,1.0,999.0,30,37,0.0,4,1
1,0,59,2552,0.0,0.0,0.0,30,58,0.0,6,29
2,0,59,2554,0.0,0.0,0.0,30,58,0.0,6,29
3,0,59,2555,0.0,0.0,0.0,30,56,0.0,6,5
4,0,59,2564,0.0,0.0,0.0,30,59,0.0,6,42


In [12]:
del shops, items, item_categories, group

all_data = downcast(all_data)

Memory usage reduced from 579.88 MB to 168.08 MB


### time lag features

In [13]:
def add_mean_features(df, mean_features, groupby_features):
    if len(groupby_features) == 2:
        feature_name = f'avg_sales_month_by_{groupby_features[1]}'
    else:
        feature_name = f'avg_sales_month_by_{groupby_features[1]}_and_{groupby_features[2]}'
    
    agg_rules = {
        feature_name: ('item_cnt_month', 'mean')
    }
            
    group = df.groupby(groupby_features).agg(**agg_rules).reset_index()
    
    df = df.merge(group, on=groupby_features, how='left')
    df = downcast(df, verbose=False)
    mean_features.append(feature_name)
    
    del group
    
    return df, mean_features


item_mean_feature_list = []
all_data, item_mean_feature_list = add_mean_features(all_data, item_mean_feature_list, ['date_block_num', 'item_id'])
all_data, item_mean_feature_list = add_mean_features(all_data, item_mean_feature_list, ['date_block_num', 'item_id', 'city'])

shop_mean_feature_list = []
all_data, shop_mean_feature_list = add_mean_features(all_data, shop_mean_feature_list, ['date_block_num', 'shop_id'])

In [14]:
def add_lag_features(df: pd.DataFrame, features_to_clip: list, key_features: list, lag_feature: str, lag_period: list, clip_flag: bool):
    for i in lag_period:
        df_temp = df[key_features + [lag_feature]].copy()
        df_temp['date_block_num'] += i
        
        lag_feature_name = f'{lag_feature}_lag_{i}'
        df_temp.columns = key_features + [lag_feature_name]
        
        df = df.merge(df_temp.drop_duplicates(), on=key_features, how='left')
        df[lag_feature_name] = df[lag_feature_name].fillna(0)
        
        if clip_flag:
            features_to_clip.append(lag_feature_name)
            
    df = downcast(df, False)
    del df_temp
    
    return df, features_to_clip


lag_features_to_clip = []

all_data, lag_features_to_clip = add_lag_features(all_data, features_to_clip=lag_features_to_clip, 
                                                  key_features=idx_features, lag_feature='item_cnt_month',
                                                  lag_period=[1,2,3,4,6,12], clip_flag=True)

all_data, lag_features_to_clip = add_lag_features(all_data, features_to_clip=lag_features_to_clip, 
                                                  key_features=idx_features, lag_feature='transaction_cnt',
                                                  lag_period=[1,2,3,4,6,12], clip_flag=False)

all_data, lag_features_to_clip = add_lag_features(all_data, features_to_clip=lag_features_to_clip, 
                                                  key_features=idx_features, lag_feature='avg_item_price',
                                                  lag_period=[1,2,3,4,6,12], clip_flag=False)


features_to_drop = []
features_to_drop.extend(['transaction_cnt', 'avg_item_price'])

In [15]:
for feat in item_mean_feature_list:
    all_data, lag_feature_to_clip = add_lag_features(all_data, features_to_clip=lag_features_to_clip, 
                                                  key_features=idx_features, lag_feature=feat,
                                                  lag_period=[1,2,3,6,12], clip_flag=True)


for feat in shop_mean_feature_list:
    all_data, lag_feature_to_clip = add_lag_features(all_data, features_to_clip=lag_features_to_clip, 
                                                key_features=idx_features, lag_feature=feat,
                                                lag_period=[1,2,3,6,12], clip_flag=True)
    

all_data = all_data.drop(columns=item_mean_feature_list)
all_data = all_data.drop(columns=shop_mean_feature_list)

In [16]:
sales_lag_cols = [col for col in all_data.columns if 'item_cnt_month_lag' in col]

all_data['avg_item_cnt_month_lag'] = all_data[sales_lag_cols].mean(axis=1)
all_data[lag_feature_to_clip + ['item_cnt_month', 'avg_item_cnt_month_lag']] = all_data[lag_feature_to_clip + ['item_cnt_month', 'avg_item_cnt_month_lag']].clip(0, 20)

### price trend and sales trend

In [17]:
def add_trend_feature(df: pd.DataFrame, base_col: str, lags: list):
    for i in lags:
        col_lag_curr = f'{base_col}_lag_{i}'      
        col_lag_prev = f'{base_col}_lag_{i+1}'    
        col_trend = f'delta_{base_col}_lag_{i}'   
        
        if col_lag_prev not in df.columns or col_lag_curr not in df.columns:
            print(f'[Skip] {col_lag_curr} 또는 {col_lag_prev} 컬럼이 없어 작업을 종료합니다.')
            continue
            
        df[col_trend] = (df[col_lag_curr] - df[col_lag_prev]) / df[col_lag_prev]
        
        df[col_trend] = df[col_trend].replace([np.inf, -np.inf], np.nan)
        df[col_trend] = df[col_trend].fillna(0)
        
    df = downcast(df)

    return df

In [18]:
all_data = add_trend_feature(all_data, base_col='avg_item_price', lags=[1,2,3])
all_data = add_trend_feature(all_data, base_col='item_cnt_month', lags=[1,2,3])

Memory usage reduced from 1495.91 MB to 1302.62 MB
Memory usage reduced from 1504.32 MB to 1403.47 MB


### new item

In [19]:
all_data['new_item'] = all_data['first_sale_month'] == all_data['date_block_num']
all_data['month_diff'] = all_data['date_block_num'] - all_data['first_sale_month']
features_to_drop.append('first_sale_month')

### days & month

In [20]:
all_data['month'] = all_data['date_block_num'] % 12
days = pd.Series([31,28,31,30,31,30,31,31,30,31,30,31])
all_data['days'] = all_data['month'].map(days)

### removing columns

In [21]:
features_to_drop

['transaction_cnt', 'avg_item_price', 'first_sale_month']

In [22]:
all_data = all_data.drop(columns=features_to_drop)
all_data.columns

Index(['date_block_num', 'shop_id', 'item_id', 'item_cnt_month', 'city',
       'item_category_id', 'type', 'subtype', 'item_cnt_month_lag_1',
       'item_cnt_month_lag_2', 'item_cnt_month_lag_3', 'item_cnt_month_lag_4',
       'item_cnt_month_lag_6', 'item_cnt_month_lag_12',
       'transaction_cnt_lag_1', 'transaction_cnt_lag_2',
       'transaction_cnt_lag_3', 'transaction_cnt_lag_4',
       'transaction_cnt_lag_6', 'transaction_cnt_lag_12',
       'avg_item_price_lag_1', 'avg_item_price_lag_2', 'avg_item_price_lag_3',
       'avg_item_price_lag_4', 'avg_item_price_lag_6', 'avg_item_price_lag_12',
       'avg_sales_month_by_item_id_lag_1', 'avg_sales_month_by_item_id_lag_2',
       'avg_sales_month_by_item_id_lag_3', 'avg_sales_month_by_item_id_lag_6',
       'avg_sales_month_by_item_id_lag_12',
       'avg_sales_month_by_item_id_and_city_lag_1',
       'avg_sales_month_by_item_id_and_city_lag_2',
       'avg_sales_month_by_item_id_and_city_lag_3',
       'avg_sales_month_by_item_i

In [23]:
all_data = downcast(all_data)

Memory usage reduced from 1411.87 MB to 1353.05 MB


In [24]:
joblib.dump(all_data, data_path + 'all_data.joblib')

['./data/all_data.joblib']