---
#### Advanced LightGBM modeling (w/Feature engineering)
---

In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
sns.set_theme(style="whitegrid", palette="pastel", 
              rc={"font.size": 9, "axes.titlesize": 9, "axes.labelsize": 9,
              "xtick.labelsize": 9, "ytick.labelsize": 9})  

import warnings
warnings.filterwarnings(action='ignore')  ## 경고문구 생략 
import koreanize_matplotlib               ## 한글사용 

#data loading
data_path = 'kaggle/input/competitive-data-science-predict-future-sales/'

sales_train = pd.read_csv(data_path + 'sales_train.csv')
shops = pd.read_csv(data_path + 'shops.csv')
items = pd.read_csv(data_path + 'items.csv')
item_categories = pd.read_csv(data_path + 'item_categories.csv')
test = pd.read_csv(data_path + 'test.csv')
submission = pd.read_csv(data_path + 'sample_submission.csv')

print(sales_train.shape, shops.shape, items.shape, item_categories.shape, test.shape, submission.shape) 

(2935849, 6) (60, 2) (22170, 3) (84, 2) (214200, 3) (214200, 2)


In [2]:
## 1. 변수명 한글화 
sales_train = sales_train.rename(columns = {'date': '날짜', 
                                            'date_block_num': '월ID', 
                                            'shop_id': '상점ID', 
                                            'item_id': '상품ID', 
                                            'item_price': '판매가', 
                                            'item_cnt_day': '판매량'})

shops = shops.rename(columns = {'shop_name': '상점명', 'shop_id': '상점ID' })
items = items.rename(columns = {'item_name': '상품명', 'item_id':'상품ID', 'item_category_id':'상품분류ID' })
item_categories = item_categories.rename(columns = {'item_category_name': '상품분류명', 'item_category_id': '상품분류ID'})
test = test.rename(columns = {'shop_id': '상점ID', 'item_id': '상품ID' })

In [3]:
## 2. 데이터 down casting 
def downcast(df, verbose=True): 
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns : 
        dtype_name = df[col].dtype.name
        if dtype_name == 'object': 
            pass
        elif dtype_name == 'bool': 
            df[col] = df[col].astype('int8')
        elif dtype_name.startswith('int') or (df[col].round() == df[col]).all():  
            df[col] = pd.to_numeric(df[col], downcast='integer')
        else: 
            df[col] = pd.to_numeric(df[col], downcast='float')
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: 
        print('{:.1f}% compressed'.format(100 * (start_mem - end_mem) / start_mem))

    return df 

all_df = [sales_train, shops, items, item_categories, test]
for df in all_df: 
    df = downcast(df)

54.2% compressed
38.5% compressed
54.2% compressed
39.8% compressed
70.8% compressed


In [4]:
sales_train.head()

Unnamed: 0,날짜,월ID,상점ID,상품ID,판매가,판매량
0,02.01.2013,0,59,22154,999.0,1
1,03.01.2013,0,25,2552,899.0,1
2,05.01.2013,0,25,2552,899.0,-1
3,06.01.2013,0,25,2554,1709.05,1
4,15.01.2013,0,25,2555,1099.0,1


In [5]:
## 3. sales_train : outlier 제거 
# 판매가/판매량이 음수, 판매가 > 50,000 , 판매량 > 1,000 --> 제거 
sales_train = sales_train[sales_train['판매가'] > 0] 
sales_train = sales_train[sales_train['판매량'] > 0] 
sales_train = sales_train[sales_train['판매가'] < 50000] 
sales_train = sales_train[sales_train['판매량'] < 10000] 

In [6]:
shops.head()

Unnamed: 0,상점명,상점ID
0,"!Якутск Орджоникидзе, 56 фран",0
1,"!Якутск ТЦ ""Центральный"" фран",1
2,"Адыгея ТЦ ""Мега""",2
3,"Балашиха ТРК ""Октябрь-Киномир""",3
4,"Волжский ТЦ ""Волга Молл""",4


In [7]:
## 4. 상점명이 유사한것 일치시키기 
print(shops['상점명'][0],  '||', shops['상점명'][57])  
print(shops['상점명'][1],  '||', shops['상점명'][58])  
print(shops['상점명'][10],  '||', shops['상점명'][11])  
print(shops['상점명'][39],  '||', shops['상점명'][40])  

!Якутск Орджоникидзе, 56 фран || Якутск Орджоникидзе, 56
!Якутск ТЦ "Центральный" фран || Якутск ТЦ "Центральный"
Жуковский ул. Чкалова 39м? || Жуковский ул. Чкалова 39м²
РостовНаДону ТРК "Мегацентр Горизонт" || РостовНаДону ТРК "Мегацентр Горизонт" Островной


In [8]:
# 훈련에 사용될 sales_train, test 데이터의 상점ID를 일치시킨다. 
sales_train.loc[sales_train['상점ID'] == 0, '상점ID'] == 57
sales_train.loc[sales_train['상점ID'] == 1, '상점ID'] == 58
sales_train.loc[sales_train['상점ID'] == 10, '상점ID'] == 11
sales_train.loc[sales_train['상점ID'] == 39, '상점ID'] == 40

test.loc[test['상점ID'] == 0, '상점ID'] == 57
test.loc[test['상점ID'] == 1, '상점ID'] == 58
test.loc[test['상점ID'] == 10, '상점ID'] == 11
test.loc[test['상점ID'] == 39, '상점ID'] == 40

204000    False
204001    False
204002    False
204003    False
204004    False
          ...  
209095    False
209096    False
209097    False
209098    False
209099    False
Name: 상점ID, Length: 5100, dtype: bool

In [9]:
## 5. 상점명의 첫부분은 도시를 의미함 --> 변수화 가능
shops['도시'] = shops['상점명'].apply(lambda x: x.split()[0])
shops['도시'].unique()

array(['!Якутск', 'Адыгея', 'Балашиха', 'Волжский', 'Вологда', 'Воронеж',
       'Выездная', 'Жуковский', 'Интернет-магазин', 'Казань', 'Калуга',
       'Коломна', 'Красноярск', 'Курск', 'Москва', 'Мытищи', 'Н.Новгород',
       'Новосибирск', 'Омск', 'РостовНаДону', 'СПб', 'Самара', 'Сергиев',
       'Сургут', 'Томск', 'Тюмень', 'Уфа', 'Химки', 'Цифровой', 'Чехов',
       'Якутск', 'Ярославль'], dtype=object)

In [10]:
# 일부 도시명 앞의 ! 제거 
shops.loc[shops['도시'] == '!Якутск', '도시'] == 'Якутск'

0    False
1    False
Name: 도시, dtype: bool

In [11]:
## 6. "도시" : category feature --> label 
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
shops['도시'] = label_encoder.fit_transform(shops['도시'])

shops = shops.drop('상점명', axis= 1)
shops.head()

Unnamed: 0,상점ID,도시
0,0,0
1,1,0
2,2,1
3,3,2
4,4,3


In [12]:
items.head()

Unnamed: 0,상품명,상품ID,상품분류ID
0,! ВО ВЛАСТИ НАВАЖДЕНИЯ (ПЛАСТ.) D,0,40
1,!ABBYY FineReader 12 Professional Edition Full...,1,76
2,***В ЛУЧАХ СЛАВЫ (UNV) D,2,40
3,***ГОЛУБАЯ ВОЛНА (Univ) D,3,40
4,***КОРОБКА (СТЕКЛО) D,4,40


In [13]:
## 7. 상품명 제거, 첫 판매월 생성&추가
items = items.drop(['상품명'], axis=1)

# 첫판매월 = 판매데이터를 상품ID기준 그룹화 후, 그룹에서 최소 월ID를 구한다. 
# 이때 결측치(판매된 적이 없는 상품)는 test데이터의 기준월인 2015년 11월(월ID=34)로 간주 
items['첫판매월'] = sales_train.groupby('상품ID').agg({'월ID': 'min'})['월ID']
items['첫판매월'] = items['첫판매월'].fillna(34)
items.head()

Unnamed: 0,상품ID,상품분류ID,첫판매월
0,0,40,20.0
1,1,76,15.0
2,2,40,19.0
3,3,40,18.0
4,4,40,20.0


In [14]:
item_categories.head()

Unnamed: 0,상품분류명,상품분류ID
0,PC - Гарнитуры/Наушники,0
1,Аксессуары - PS2,1
2,Аксессуары - PS3,2
3,Аксессуары - PS4,3
4,Аксессуары - PSP,4


In [16]:
## 8. 상품분류명의 앞부분은 대분류에 해당 --> 대분류 변수 만들기 
item_categories['대분류'] = item_categories['상품분류명'].apply(lambda x : x.split()[0])
item_categories['대분류'].value_counts()

대분류
Игры          14
Книги         13
Подарки       12
Игровые        8
Аксессуары     7
Музыка         6
Программы      6
Карты          5
Кино           5
Служебные      2
Чистые         2
PC             1
Билеты         1
Доставка       1
Элементы       1
Name: count, dtype: int64

In [18]:
# 5건 이하는 기타(etc)로 
def make_etc(x) : 
    if len(item_categories[item_categories['대분류'] == x]) >= 5 : 
        return x
    else: 
        return 'etc'
    
item_categories['대분류'] = item_categories['대분류'].apply(make_etc)
item_categories['대분류'].value_counts()

대분류
Игры          14
Книги         13
Подарки       12
etc            8
Игровые        8
Аксессуары     7
Музыка         6
Программы      6
Карты          5
Кино           5
Name: count, dtype: int64

In [20]:
# label encoding 
label_encoder = LabelEncoder()
item_categories['대분류'] = label_encoder.fit_transform(item_categories['대분류'])

item_categories = item_categories.drop('상품분류명', axis = 1)
item_categories.head()

Unnamed: 0,상품분류ID,대분류
0,0,0
1,1,1
2,2,1
3,3,1
4,4,1
