# 데이터 전처리 수정방향
1. 
2. 

# Module

In [1]:
import pandas as pd
import numpy as np
from itertools import product
import joblib
from sklearn.preprocessing import LabelEncoder

# Data

In [2]:
data_path = "./data/"

# Raw input tables, all read from `data_path`.
sales_train = pd.read_csv(f'{data_path}sales_train.csv')
items = pd.read_csv(f'{data_path}items.csv')
item_categories = pd.read_csv(f'{data_path}item_categories.csv')
shops = pd.read_csv(f'{data_path}shops.csv')

test = pd.read_csv(f'{data_path}test.csv')
sub = pd.read_csv(f'{data_path}sample_submission.csv')

# Keep only sales from shops that also appear in the test set.
# NOTE(review): this filter runs before the shop-id remap applied further
# down (0->57, 1->58, 10->11, 39->40), so rows recorded under an id that is
# absent from `test` are discarded here before they can be remapped —
# confirm that this ordering is intended.
print('Before Filter ShopID:', len(sales_train))
unique_test_shop_id = test['shop_id'].unique()
in_test_shops = sales_train['shop_id'].isin(unique_test_shop_id)
sales_train = sales_train[in_test_shops]
print('After Filter ShopID :', len(sales_train))

Before Filter ShopID: 2935849
After Filter ShopID : 2413246


In [3]:
def downcast(df, verbose=True):
    """Shrink the numeric columns of ``df`` to the smallest dtype that fits.

    Columns are converted one by one:

    * ``bool`` columns become ``int8``;
    * integer columns, and float columns that only hold whole numbers,
      are downcast to the smallest integer dtype;
    * remaining float columns are downcast to the smallest float dtype;
    * every non-numeric column (object/string, categorical, datetime, ...)
      is left untouched.

    Args:
        df: DataFrame to compact. It is modified in place.
        verbose: When true, print the memory usage (in MB) before and after.

    Returns:
        The same ``df`` object that was passed in.
    """
    start_memory = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        dtype_name = df[col].dtype.name
        if dtype_name == 'bool':
            df[col] = df[col].astype('int8')
        elif not pd.api.types.is_numeric_dtype(df[col]):
            # The whole-number check below uses `% 1`, which raises TypeError
            # on categorical, datetime and string columns; skip them.
            continue
        elif dtype_name.startswith('int') or (df[col] % 1 == 0).all():
            # A float column containing NaN never passes the `% 1 == 0`
            # check, so it falls through to the float branch.
            df[col] = pd.to_numeric(df[col], downcast='integer')
        elif dtype_name.startswith('float'):
            df[col] = pd.to_numeric(df[col], downcast='float')
    end_memory = df.memory_usage().sum() / 1024**2

    if verbose:
        print(f"Memory usage reduced from {start_memory:.2f} MB to {end_memory:.2f} MB")

    return df


# Compact every loaded table. `downcast` modifies each frame in place, so
# its return value does not need to be stored.
data_files = [sales_train, items, shops, item_categories, test]
for frame in data_files:
    downcast(frame)

Memory usage reduced from 128.88 MB to 69.04 MB
Memory usage reduced from 0.51 MB to 0.23 MB
Memory usage reduced from 0.00 MB to 0.00 MB
Memory usage reduced from 0.00 MB to 0.00 MB
Memory usage reduced from 4.90 MB to 1.43 MB


In [4]:
# Index columns shared by the training grid and the test set.
idx_features = ['date_block_num', 'shop_id', 'item_id']

# For every month, enumerate every (shop, item) pair among the shops and
# items that have at least one sales record in that month.
# NOTE(review): the grid is built from `sales_train` before the outlier
# filter and the shop-id remap further down are applied — confirm that the
# grid is meant to reflect the unfiltered, un-remapped ids.
monthly_grids = []
for month in sales_train['date_block_num'].unique():
    month_rows = sales_train[sales_train['date_block_num'] == month]
    shops_in_month = month_rows['shop_id'].unique()
    items_in_month = month_rows['item_id'].unique()
    monthly_grids.append(
        np.array(list(product([month], shops_in_month, items_in_month)))
    )

train = pd.DataFrame(np.vstack(monthly_grids), columns=idx_features)

# The test rows are assigned month index 34.
test['date_block_num'] = 34

# Feature Engineering

## Outliers

In [5]:
# Keep rows with 0 < item_cnt_day < 1000, then rows with 0 < item_price < 50000,
# reporting the row count after each step.
print("Before : ", len(sales_train))
daily_cnt = sales_train['item_cnt_day']
sales_train = sales_train[daily_cnt.gt(0) & daily_cnt.lt(1000)]
print("After item_cnt_day : ", len(sales_train))
price = sales_train['item_price']
sales_train = sales_train[price.gt(0) & price.lt(50000)]
print("After item_price : ", len(sales_train))

Before :  2413246
After item_cnt_day :  2407079
After item_price :  2407076


## Feature Generation

In [6]:
def add_mean_features(df: pd.DataFrame, groupby_features: list, mean_feature_list: list = None):
    """Attach the per-group mean of ``item_cnt_month`` as a new column.

    The new column is named ``<keys>_avg_sales_month``, where ``<keys>`` joins
    the grouping columns with ``_`` after dropping ``date_block_num`` and
    removing ``_id`` from each name.

    Args:
        df: Frame containing ``item_cnt_month`` and all grouping columns.
        groupby_features: Columns to group by.
        mean_feature_list: Optional list; the new column name is appended to it.

    Returns:
        A new frame with the mean column merged in and passed through
        ``downcast``.
    """
    name_parts = [
        key.replace('_id', '') for key in groupby_features if key != 'date_block_num'
    ]
    feature_name = '_'.join(name_parts) + '_avg_sales_month'

    group_mean = (
        df.groupby(groupby_features)
        .agg(**{feature_name: ('item_cnt_month', 'mean')})
        .reset_index()
    )
    df = downcast(df.merge(group_mean, on=groupby_features, how='left'), verbose=False)

    if mean_feature_list is not None:
        mean_feature_list.append(feature_name)

    return df
        

def add_lag_features(df: pd.DataFrame, features_to_clip: list, key_features: list, lag_feature: str, lag_period: list, clip_flag: bool):
    """Add lagged copies of ``lag_feature`` as ``<lag_feature>_lag_<n>`` columns.

    For each lag ``n`` the feature values are shifted forward by ``n`` in
    ``date_block_num`` and merged back on ``key_features``, so a row receives
    the value the same key had ``n`` months earlier. Rows without a match
    get 0.

    Args:
        df: Frame holding ``key_features`` and ``lag_feature``.
        features_to_clip: List that collects lag column names; it is appended
            to in place when ``clip_flag`` is true.
        key_features: Merge keys; must include ``date_block_num``.
        lag_feature: Column to lag.
        lag_period: Lags (in months) to generate. May be empty, in which case
            no column is added.
        clip_flag: Whether to register the new columns in ``features_to_clip``.

    Returns:
        Tuple of the extended, downcast frame and ``features_to_clip``.
    """
    for i in lag_period:
        lagged = df[key_features + [lag_feature]].copy()
        # Moving the month forward by i makes month m carry the value of m - i.
        lagged['date_block_num'] += i

        lag_feature_name = f'{lag_feature}_lag_{i}'
        lagged.columns = key_features + [lag_feature_name]

        df = df.merge(lagged.drop_duplicates(), on=key_features, how='left')
        df[lag_feature_name] = df[lag_feature_name].fillna(0)

        if clip_flag:
            features_to_clip.append(lag_feature_name)

    # The temporary frame is local to the loop, so nothing needs deleting
    # here; the former `del` failed when `lag_period` was empty.
    df = downcast(df, False)

    return df, features_to_clip


### shops

In [7]:
# Existing handling: rewrite four shop ids to another id, identically in the
# training sales and in the test set.
shop_id_fixes = {0: 57, 1: 58, 10: 11, 39: 40}
for frame in (sales_train, test):
    for old_id, new_id in shop_id_fixes.items():
        frame.loc[frame['shop_id'] == old_id, 'shop_id'] = new_id

# The city is the first space-separated token of the shop name; one entry
# carries a stray leading '!' that is removed.
shops['city'] = shops['shop_name'].str.split(' ').str.get(0)
shops['city'] = shops['city'].replace('!Якутск', 'Якутск')

In [8]:
# Classify shops by the Russian abbreviations/words that denote a shopping
# mall or centre.
mall_keywords = ['ТЦ', 'ТРК', 'ТРЦ', 'MALL', 'Молл']


def _has_mall_keyword(shop_name):
    """Return True when any mall keyword occurs in the shop name."""
    return any(keyword in shop_name for keyword in mall_keywords)


is_mall = shops['shop_name'].apply(_has_mall_keyword)
shops['shop_type'] = np.where(is_mall, 'Mall', 'Standalone')

# Classify special categories: names matching the online-store or
# travelling-sales pattern are labelled 'Online', everything else 'Offline'.
online_pattern = 'Интернет-магазин|Выездная Торговля'
conditions = [shops['shop_name'].str.contains(online_pattern, case=False, regex=True)]
choices = ['Online']
shops['shop_category'] = np.select(conditions, choices, default='Offline')

# Replace the three string columns with integer labels; the encoder is
# refit from scratch for every column.
encoder = LabelEncoder()
for col in ('city', 'shop_type', 'shop_category'):
    shops[col] = encoder.fit_transform(shops[col])

In [9]:
# The raw name is no longer needed once city/type/category are derived.
shops = shops.drop("shop_name", axis=1)

### items

In [10]:
items = items.drop(columns=["item_name"])

# First month in which each item has a sales record, keyed by item_id.
first_sale_month = sales_train.groupby('item_id')['date_block_num'].min()

# Look the month up by item_id explicitly. Assigning the grouped Series
# directly would align it against the row index of `items`, which is only
# correct when every row's position equals its item_id.
# Items with no sales record get 34, the month index assigned to the test
# set; only this column is filled so other columns are never touched.
items['first_sale_month'] = items['item_id'].map(first_sale_month).fillna(34)

### item_categories

In [11]:
def split_categories(df):
    """Derive ``type`` and ``subtype`` columns from ``item_category_name``.

    Each name is cut at its first ``-``: the text before it becomes the type
    and the text after it the subtype, both stripped of surrounding
    whitespace. A name without ``-`` is used for both columns.

    Args:
        df: Frame with an ``item_category_name`` column; modified in place.

    Returns:
        The same ``df`` with the two new columns added.
    """
    types = []
    subtypes = []
    for name in df['item_category_name']:
        head, separator, tail = name.partition('-')
        if not separator:
            # No hyphen: the whole name serves as both type and subtype.
            tail = head
        types.append(head.strip())
        subtypes.append(tail.strip())

    df['type'] = types
    df['subtype'] = subtypes
    return df


item_categories = split_categories(item_categories)

# Types that occur fewer than 5 times are merged into a single 'etc' type.
type_counts = item_categories['type'].value_counts()
etc_types = type_counts[type_counts < 5].index.tolist()
is_rare_type = item_categories['type'].isin(etc_types)
item_categories.loc[is_rare_type, 'type'] = 'etc'

# Integer-encode both derived columns, each with its own encoder.
for col in ('type', 'subtype'):
    item_categories[col] = LabelEncoder().fit_transform(item_categories[col])

item_categories = item_categories.drop(columns=['item_category_name'])

### set up matrix

In [12]:
# Aggregate the daily records into one row per (month, shop, item): total
# units, number of records, and mean price.
group = sales_train.groupby(idx_features).agg(
    item_cnt_month=('item_cnt_day', 'sum'),
    transaction_cnt=('item_cnt_day', 'count'),
    avg_item_price=('item_price', 'mean'),
).reset_index()

train = train.merge(group, on=idx_features, how='left')

# Stack the training grid and the test rows under a fresh 0..n-1 index.
# No `keys` argument is passed: with ignore_index=True it has no effect, and
# supplying three keys for two frames triggers a pandas deprecation warning.
all_data = pd.concat([train, test.drop('ID', axis=1)], ignore_index=True)

# Grid rows without sales, and the aggregate columns of the test rows, are
# missing at this point and are set to 0.
all_data = all_data.fillna(0)

# Attach the shop, item and item-category attributes.
all_data = all_data.merge(shops, on='shop_id', how='left')
all_data = all_data.merge(items, on='item_id', how='left')
all_data = all_data.merge(item_categories, on='item_category_id', how='left')
all_data.head()

  all_data = pd.concat([train, test.drop('ID', axis=1)], ignore_index=True, keys=idx_features)


Unnamed: 0,date_block_num,shop_id,item_id,item_cnt_month,transaction_cnt,avg_item_price,city,shop_type,shop_category,item_category_id,first_sale_month,type,subtype
0,0,59,22154,1.0,1.0,999.0,30,0,0,37,0.0,4,1
1,0,59,2552,0.0,0.0,0.0,30,0,0,58,0.0,6,29
2,0,59,2554,0.0,0.0,0.0,30,0,0,58,0.0,6,29
3,0,59,2555,0.0,0.0,0.0,30,0,0,56,0.0,6,5
4,0,59,2564,0.0,0.0,0.0,30,0,0,59,0.0,6,42


In [13]:
# The lookup tables and the monthly aggregate are already merged into
# `all_data`; drop the references before compacting the combined frame.
del group
del item_categories, items, shops

all_data = downcast(all_data, verbose=True)

Memory usage reduced from 714.34 MB to 184.89 MB


### mean features

In [14]:
# Collects the names of lag columns that are clipped later on.
clip_list = []

# One mean-sales column is added per grouping, in this order.
mean_feature_groupings = [
    ['date_block_num', 'shop_id'],
    ['date_block_num', 'shop_id', 'item_category_id'],
    ['date_block_num', 'item_id'],
    ['date_block_num', 'item_id', 'city'],
    ['date_block_num', 'item_id', 'shop_type'],
    ['date_block_num', 'item_category_id'],
    ['date_block_num', 'item_category_id', 'city'],
]
for grouping in mean_feature_groupings:
    all_data = add_mean_features(all_data, grouping)

In [15]:
# Names of all mean-sales columns currently present in `all_data`.
mean_feature_list = list(
    filter(lambda name: '_avg_sales_month' in name, all_data.columns)
)
mean_feature_list

['shop_avg_sales_month',
 'shop_item_category_avg_sales_month',
 'item_avg_sales_month',
 'item_city_avg_sales_month',
 'item_shop_type_avg_sales_month',
 'item_category_avg_sales_month',
 'item_category_city_avg_sales_month']

### lag features

In [16]:
# Lags 1-3 of the monthly target and of the two monthly aggregates. Only the
# target's lag columns are registered for clipping.
base_lag_specs = [
    ('item_cnt_month', True),
    ('transaction_cnt', False),
    ('avg_item_price', False),
]
for base_feature, should_clip in base_lag_specs:
    all_data, clip_list = add_lag_features(
        all_data,
        features_to_clip=clip_list,
        key_features=idx_features,
        lag_feature=base_feature,
        lag_period=[1, 2, 3],
        clip_flag=should_clip,
    )

# Same-month aggregates are dropped at the end; only their lags are kept.
features_to_drop = ['transaction_cnt', 'avg_item_price']

In [17]:
# Lags 1-3 of every mean-sales column; none of them is registered for clipping.
for feat in mean_feature_list:
    all_data, clip_list = add_lag_features(
        all_data,
        features_to_clip=clip_list,
        key_features=idx_features,
        lag_feature=feat,
        lag_period=[1, 2, 3],
        clip_flag=False,
    )

In [18]:
# Row-wise mean of the target's lag columns.
sales_lag_cols = [name for name in all_data.columns if 'item_cnt_month_lag' in name]
all_data['avg_item_cnt_month_lag'] = all_data[sales_lag_cols].mean(axis=1)

# Clip the target, its registered lag columns and their mean to [0, 20].
columns_to_clip = clip_list + ['item_cnt_month', 'avg_item_cnt_month_lag']
all_data[columns_to_clip] = all_data[columns_to_clip].clip(0, 20)

In [19]:
# For every mean-sales column, average its three lag columns row-wise into
# `avg_<name>_lag`.
for mean_feat in mean_feature_list:
    lag_cols = [f'{mean_feat}_lag_{lag}' for lag in (1, 2, 3)]
    all_data[f'avg_{mean_feat}_lag'] = all_data[lag_cols].mean(axis=1)

all_data = downcast(all_data)

# The same-month mean columns themselves are scheduled for removal.
features_to_drop += mean_feature_list

Memory usage reduced from 1706.01 MB to 1638.78 MB


### price trend and sales trend

In [20]:
def add_trend_feature(df: pd.DataFrame, base_col: str, lags: list):
    """Add relative-change columns between consecutive lags of ``base_col``.

    For each lag ``i`` the column ``delta_<base_col>_lag_<i>`` is set to
    ``(lag_i - lag_{i+1}) / lag_{i+1}``. Infinite and missing results (for
    example from a zero denominator) are stored as 0. A lag whose two source
    columns are not both present is reported and skipped.

    Args:
        df: Frame holding the ``<base_col>_lag_<n>`` columns; new columns are
            written to it directly.
        base_col: Base feature name.
        lags: Lags ``i`` for which a delta is computed.

    Returns:
        The frame after ``downcast`` (which prints its memory report).
    """
    for lag in lags:
        newer = f'{base_col}_lag_{lag}'
        older = f'{base_col}_lag_{lag+1}'

        if not {newer, older}.issubset(df.columns):
            print(f'[Skip] {newer} 또는 {older} 컬럼이 없어 작업을 종료합니다.')
            continue

        relative_change = (df[newer] - df[older]) / df[older]
        relative_change = relative_change.replace([np.inf, -np.inf], np.nan)
        df[f'delta_{base_col}_lag_{lag}'] = relative_change.fillna(0)

    df = downcast(df)

    return df

In [21]:
# Relative change between lags 1-2 and lags 2-3, first for the average
# price and then for the monthly sales count.
for trend_base in ('avg_item_price', 'item_cnt_month'):
    all_data = add_trend_feature(all_data, base_col=trend_base, lags=[1, 2])

Memory usage reduced from 1773.25 MB to 1706.01 MB
Memory usage reduced from 1840.48 MB to 1773.25 MB


### new item

In [22]:
# Flag rows whose month is the item's first sale month, and record how many
# months lie between that first sale month and the row's month.
first_sale = all_data['first_sale_month']
all_data['new_item'] = first_sale.eq(all_data['date_block_num'])
all_data['item_age'] = all_data['date_block_num'].sub(first_sale)
features_to_drop.append('first_sale_month')

### last_sale

In [23]:
# Months elapsed since the same (item, shop) pair last had item_cnt_month > 0
# in an earlier month; -1 when there is no such earlier month.
#
# The per-group shift below only means "previous month" when rows are in
# chronological order, so the needed columns are sorted by month explicitly
# (stable sort) instead of relying on the row order of `all_data`.
sale_history = all_data[
    ['item_id', 'shop_id', 'date_block_num', 'item_cnt_month']
].sort_values('date_block_num', kind='stable')

# Month number on rows with a sale, NaN elsewhere.
sale_month = sale_history['date_block_num'].where(sale_history['item_cnt_month'] > 0)
pair_keys = [sale_history['item_id'], sale_history['shop_id']]

# shift(1) leaves the current month out of its own feature; ffill carries
# the most recent earlier sale month forward within each pair.
previous_sale_month = sale_month.groupby(pair_keys).shift(1)
last_sale_record = previous_sale_month.groupby(pair_keys).ffill()

# The subtraction aligns on the row index, undoing the sort.
all_data['last_sale'] = (all_data['date_block_num'] - last_sale_record).fillna(-1)

### days & month

In [24]:
# Position of the month within a 12-month cycle (0-11) and a fixed day
# count for that position; position 1 always maps to 28.
all_data['month'] = all_data['date_block_num'].mod(12)
days = pd.Series([31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31])
month_position = all_data['month']
all_data['days'] = month_position.map(days)

### removing columns

In [25]:
# Display the columns scheduled for removal before dropping them.
features_to_drop

['transaction_cnt',
 'avg_item_price',
 'shop_avg_sales_month',
 'shop_item_category_avg_sales_month',
 'item_avg_sales_month',
 'item_city_avg_sales_month',
 'item_shop_type_avg_sales_month',
 'item_category_avg_sales_month',
 'item_category_city_avg_sales_month',
 'first_sale_month']

In [26]:
# Remove the same-month helper columns, then display what remains.
all_data = all_data.drop(features_to_drop, axis=1)
all_data.columns

Index(['date_block_num', 'shop_id', 'item_id', 'item_cnt_month', 'city',
       'shop_type', 'shop_category', 'item_category_id', 'type', 'subtype',
       'item_cnt_month_lag_1', 'item_cnt_month_lag_2', 'item_cnt_month_lag_3',
       'transaction_cnt_lag_1', 'transaction_cnt_lag_2',
       'transaction_cnt_lag_3', 'avg_item_price_lag_1', 'avg_item_price_lag_2',
       'avg_item_price_lag_3', 'shop_avg_sales_month_lag_1',
       'shop_avg_sales_month_lag_2', 'shop_avg_sales_month_lag_3',
       'shop_item_category_avg_sales_month_lag_1',
       'shop_item_category_avg_sales_month_lag_2',
       'shop_item_category_avg_sales_month_lag_3',
       'item_avg_sales_month_lag_1', 'item_avg_sales_month_lag_2',
       'item_avg_sales_month_lag_3', 'item_city_avg_sales_month_lag_1',
       'item_city_avg_sales_month_lag_2', 'item_city_avg_sales_month_lag_3',
       'item_shop_type_avg_sales_month_lag_1',
       'item_shop_type_avg_sales_month_lag_2',
       'item_shop_type_avg_sales_month_lag_3', ...])  (output truncated)

In [27]:
# Compact the final feature matrix once more before it is saved.
all_data = downcast(all_data)

Memory usage reduced from 1613.57 MB to 1495.91 MB


In [28]:
# Display the (rows, columns) size of the final feature matrix.
all_data.shape

(8812244, 57)

In [29]:
# Persist the finished feature matrix next to the input files.
joblib.dump(all_data, f'{data_path}all_data2.joblib')

['./data/all_data2.joblib']