### Data preprocessing

In [1]:
# linear algebra
import numpy as np

#working with data in table structers
import pandas as pd
import dask.dataframe as dd
pd.set_option('display.float_format', lambda x: '%.2f' % x)

# working with files
import sys
import os

# to off warnings
import warnings
warnings.filterwarnings('ignore')

# data preprocessing
from itertools import product
import time
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

###### Load data

Clean data

In [2]:
# add data path to sys.path 
clean_data_path = "C:\\Repository\\DS-Intership-data\\clean_data\\"
sys.path.append(clean_data_path)

# initiate dict for data
to_read_clean_data = {}

# fill to_read
for dir_name, _, files in os.walk(clean_data_path):
    for file in files:
        to_read_clean_data[file] = dir_name + file

In [3]:
# check to_read
to_read_clean_data

{'items.csv': 'C:\\Repository\\DS-Intership-data\\clean_data\\items.csv',
 'item_categories.csv': 'C:\\Repository\\DS-Intership-data\\clean_data\\item_categories.csv',
 'sales_train.csv': 'C:\\Repository\\DS-Intership-data\\clean_data\\sales_train.csv',
 'sample_submission.csv': 'C:\\Repository\\DS-Intership-data\\clean_data\\sample_submission.csv',
 'shops.csv': 'C:\\Repository\\DS-Intership-data\\clean_data\\shops.csv',
 'test.csv': 'C:\\Repository\\DS-Intership-data\\clean_data\\test.csv'}

In [4]:
%%time
data = {}
# read data
for file, path in to_read_clean_data.items():
    data[file.split('.')[0]] = pd.read_csv(os.path.join(os.path.dirname(path), file), index_col=0)

CPU times: total: 1.25 s
Wall time: 1.33 s


Cluster data

In [5]:
# add data path to sys.path 
cluster_data_path = "C:\\Repository\\DS-Intership-data\\cluster_data\\"
sys.path.append(cluster_data_path)

# initiate dict for data
to_read_cluster_data = {}

# fill to_read
for dir_name, _, files in os.walk(cluster_data_path):
    for file in files:
        to_read_cluster_data[file] = dir_name + file

In [6]:
# check to_read
to_read_cluster_data

{'item_category_clusters.csv': 'C:\\Repository\\DS-Intership-data\\cluster_data\\item_category_clusters.csv',
 'item_price_clusters.csv': 'C:\\Repository\\DS-Intership-data\\cluster_data\\item_price_clusters.csv',
 'shop_clusters.csv': 'C:\\Repository\\DS-Intership-data\\cluster_data\\shop_clusters.csv',
 'subtype_clusters.csv': 'C:\\Repository\\DS-Intership-data\\cluster_data\\subtype_clusters.csv',
 'type_code_clusters.csv': 'C:\\Repository\\DS-Intership-data\\cluster_data\\type_code_clusters.csv'}

In [7]:
%%time
cluster_data = {}
# read data
for file, path in to_read_cluster_data.items():
    cluster_data[file.split('.')[0]] = pd.read_csv(os.path.join(os.path.dirname(path), file), index_col=0)

CPU times: total: 375 ms
Wall time: 401 ms


### Data preproccesing

###### Reduce memory usage for big dataset  

In [8]:
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                       df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

###### Build pretrained data

In [9]:
%%time
# combinate month, shop and item in order of increasing month
pretrained_data  = []
cols  = ["date_block_num", "shop_id", "item_id"]
for i in data['sales_train'].date_block_num.unique():
    sales = data['sales_train'][data['sales_train'].date_block_num == i]
    pretrained_data.append(np.array(list(product([i], sales.shop_id.unique(), sales.item_id.unique()))))

pretrained_data = pd.DataFrame(np.vstack(pretrained_data), columns = cols).reset_index()
pretrained_data.sort_values(cols, inplace = True)

CPU times: total: 6.19 s
Wall time: 6.26 s


In [10]:
%%time
# merge datasets for data preprocessing
pretrained_data = pd.merge(pretrained_data, data['shops'], on = ["shop_id"], how = "left" )
pretrained_data = pd.merge(pretrained_data, data['items'], on = ["item_id"], how = "left")
pretrained_data = pd.merge(pretrained_data, data['item_categories'], on = ["item_category_id"], how = "left" )
pretrained_data.shape

CPU times: total: 2.44 s
Wall time: 2.47 s


(10902924, 9)

In [11]:
%%time
# add count of sold items per month
group = data['sales_train'].groupby(["date_block_num", "shop_id", "item_id"]).agg({"item_cnt_day": "sum"})
group.columns = ["item_cnt_month"]
group.reset_index(inplace = True)
pretrained_data = pd.merge(pretrained_data, group, on = cols, how = "left")
pretrained_data["item_cnt_month"] = pretrained_data["item_cnt_month"].fillna(0)
pretrained_data.shape

CPU times: total: 2.77 s
Wall time: 2.88 s


(10902924, 10)

###### Add datafields for predictions

In [12]:
%%time
data['test']['date_block_num'] = 34
pretrained_data = pd.concat([pretrained_data, data['test']], ignore_index=True, sort=False, keys= ['date_block_num','shop_id','item_id'])
pretrained_data.merge(data['shops'], on = ["shop_id"], how = "left")
pretrained_data['city'].fillna(pretrained_data.groupby('shop_id')['city'].transform('first'), inplace=True)
pretrained_data['shop_name'].fillna(pretrained_data.groupby('shop_id')['shop_name'].transform('first'), inplace=True)
pretrained_data['item_name'].fillna(pretrained_data.groupby('item_id')['item_name'].transform('first'), inplace=True)
pretrained_data['item_category_id'].fillna(pretrained_data.groupby('item_id')['item_category_id'].transform('first'), inplace=True)
pretrained_data['item_category_name'].fillna(pretrained_data.groupby('item_category_id')['item_category_name'].transform('first'), inplace=True)
pretrained_data = pretrained_data.drop(columns=['ID', 'index'])
pretrained_data.tail(5)

CPU times: total: 4.45 s
Wall time: 4.61 s


Unnamed: 0,date_block_num,shop_id,item_id,shop_name,city,item_name,item_category_id,item_category_name,item_cnt_month
11117119,34,45,18454,"Самара ТЦ ""ПаркХаус""",Самара,СБ. Союз 55,55.0,Музыка - CD локального производства,
11117120,34,45,16188,"Самара ТЦ ""ПаркХаус""",Самара,Настольная игра Нано Кёрлинг,64.0,Подарки - Настольные игры,
11117121,34,45,15757,"Самара ТЦ ""ПаркХаус""",Самара,НОВИКОВ АЛЕКСАНДР Новая коллекция,55.0,Музыка - CD локального производства,
11117122,34,45,19648,"Самара ТЦ ""ПаркХаус""",Самара,ТЕРЕМ - ТЕРЕМОК сб.м/ф (Регион),40.0,Кино - DVD,
11117123,34,45,969,"Самара ТЦ ""ПаркХаус""",Самара,3 ДНЯ НА УБИЙСТВО (BD),37.0,Кино - Blu-Ray,


###### Substruct some categories from string data 

In [13]:
%%time
# item categories data
pretrained_data["type_code"] = pretrained_data.item_category_name.apply(lambda x: str(x).split(" ")[0]).astype(str)
pretrained_data["split"] = pretrained_data.item_category_name.apply(lambda x: str(x).split("-"))
pretrained_data["subtype"] = pretrained_data.split.apply(lambda x: x[1].strip() if len(x) > 1 else x[0].strip())
pretrained_data["subtype_code"] = LabelEncoder().fit_transform(pretrained_data["subtype"])
pretrained_data['type_code'] = LabelEncoder().fit_transform(pretrained_data['type_code'])
pretrained_data = pretrained_data.drop(columns=['item_category_name', 'split', 'subtype'])
pretrained_data.shape

CPU times: total: 18 s
Wall time: 18.3 s


(11117124, 10)

In [14]:
%%time
# shops data
pretrained_data["shop_city"] = pretrained_data.shop_name.str.split(" ").map(lambda x: x[0])
pretrained_data["shop_category"] = pretrained_data.shop_name.str.split(" ").map(lambda x: x[1])
pretrained_data["shop_category"] = LabelEncoder().fit_transform(pretrained_data.shop_category)
pretrained_data["shop_city"] = LabelEncoder().fit_transform(pretrained_data.shop_city)
pretrained_data = pretrained_data.drop(columns=['shop_name', 'city'])
pretrained_data.shape

CPU times: total: 23.1 s
Wall time: 23.3 s


(11117124, 10)

In [15]:
%%time
# item data
pretrained_data["name1"], pretrained_data["name2"] = pretrained_data.item_name.str.split('[', n=1).str[0], pretrained_data.item_name.str.split('[', n=1).str[1]
pretrained_data["name1"], pretrained_data["name3"] = pretrained_data.item_name.str.split('(', n=1).str[0], pretrained_data.item_name.str.split('(', n=1).str[1]

# replace special characters and turn to lower case
pretrained_data["name2"] = pretrained_data.name2.str.replace('[^A-Za-z0-9А-Яа-я]+', " ").str.lower()
pretrained_data["name3"] = pretrained_data.name3.str.replace('[^A-Za-z0-9А-Яа-я]+', " ").str.lower()

pretrained_data = pretrained_data.fillna('0')

pretrained_data.name2 = LabelEncoder().fit_transform(pretrained_data.name2)
pretrained_data.name3 = LabelEncoder().fit_transform(pretrained_data.name3)

pretrained_data = pretrained_data.drop(columns=['name1', 'item_name']) 
pretrained_data.shape

CPU times: total: 44 s
Wall time: 44.3 s


(11117124, 11)

###### time series feature extration

In [16]:
def extract_lag_feature(df, lags, col):
    tmp = df[['date_block_num', 'shop_id', 'item_id', col]]
    for i in lags:
        shifted = tmp.copy()
        shifted.columns = ['date_block_num','shop_id','item_id', col+'_lag_'+str(i)]
        shifted['date_block_num'] += i
        df = pd.merge(df, shifted, on=['date_block_num','shop_id','item_id'], how='left')
    return df

In [17]:
%%time
# item count per month lag
pretrained_data = extract_lag_feature(pretrained_data, [1, 2, 3, 12], 'item_cnt_month')
pretrained_data.tail(2)

CPU times: total: 17.2 s
Wall time: 17.3 s


Unnamed: 0,date_block_num,shop_id,item_id,item_category_id,item_cnt_month,type_code,subtype_code,shop_city,shop_category,name2,name3,item_cnt_month_lag_1,item_cnt_month_lag_2,item_cnt_month_lag_3,item_cnt_month_lag_12
11117122,34,45,19648,40.0,0,8,4,20,8,5,1377,0.0,0.0,0.0,
11117123,34,45,969,37.0,0,8,1,20,8,5,554,0.0,0.0,0.0,0.0


In [18]:
%%time
# lag of average item count sales per month 

group = pretrained_data.groupby(["date_block_num"] ).agg({"item_cnt_month" : "mean"})
group.columns = ["avg_by_month_item_cnt"]
group.reset_index()


pretrained_data = pd.merge(pretrained_data, group, on = ["date_block_num"], how = "left")
del(group)
pretrained_data = extract_lag_feature(pretrained_data, [1, 2, 3, 12], "avg_by_month_item_cnt")
pretrained_data.drop(["avg_by_month_item_cnt"], axis = 1, inplace = True)
pretrained_data.tail(2)

CPU times: total: 20.9 s
Wall time: 21.1 s


Unnamed: 0,date_block_num,shop_id,item_id,item_category_id,item_cnt_month,type_code,subtype_code,shop_city,shop_category,name2,name3,item_cnt_month_lag_1,item_cnt_month_lag_2,item_cnt_month_lag_3,item_cnt_month_lag_12,avg_by_month_item_cnt_lag_1,avg_by_month_item_cnt_lag_2,avg_by_month_item_cnt_lag_3,avg_by_month_item_cnt_lag_12
11117122,34,45,19648,40.0,0,8,4,20,8,5,1377,0.0,0.0,0.0,,0.28,0.3,0.31,
11117123,34,45,969,37.0,0,8,1,20,8,5,554,0.0,0.0,0.0,0.0,0.28,0.3,0.31,0.35


In [19]:
%%time
# lag of average item sales per month of each item_id

group = pretrained_data.groupby(['date_block_num', 'item_id']).agg({'item_cnt_month': 'mean'})
group.columns = ['avg_by_month_item_id_item_cnt']
group = group.reset_index()


pretrained_data = pd.merge(pretrained_data, group, on=['date_block_num', 'item_id'], how='left')
del(group)
pretrained_data = extract_lag_feature(pretrained_data, [1, 2, 3], "avg_by_month_item_id_item_cnt")
pretrained_data.drop(columns = ['avg_by_month_item_id_item_cnt'], axis = 1, inplace = True)
pretrained_data.tail(2)

CPU times: total: 33 s
Wall time: 33.2 s


Unnamed: 0,date_block_num,shop_id,item_id,item_category_id,item_cnt_month,type_code,subtype_code,shop_city,shop_category,name2,...,item_cnt_month_lag_2,item_cnt_month_lag_3,item_cnt_month_lag_12,avg_by_month_item_cnt_lag_1,avg_by_month_item_cnt_lag_2,avg_by_month_item_cnt_lag_3,avg_by_month_item_cnt_lag_12,avg_by_month_item_id_item_cnt_lag_1,avg_by_month_item_id_item_cnt_lag_2,avg_by_month_item_id_item_cnt_lag_3
11117122,34,45,19648,40.0,0,8,4,20,8,5,...,0.0,0.0,,0.28,0.3,0.31,,0.05,0.07,0.17
11117123,34,45,969,37.0,0,8,1,20,8,5,...,0.0,0.0,0.0,0.28,0.3,0.31,0.35,0.07,0.12,0.02


In [20]:
%%time
# lag of average item sales per month of each shop

group = pretrained_data.groupby(['date_block_num', 'shop_id']).agg({'item_cnt_month': 'mean'})
group.columns = ['avg_by_month_shop_item_cnt']
group = group.reset_index()

pretrained_data = pd.merge(pretrained_data, group, on=['date_block_num', 'shop_id'], how='left')

pretrained_data = extract_lag_feature(pretrained_data, [1, 2, 3], "avg_by_month_shop_item_cnt")
pretrained_data.drop(columns = ['avg_by_month_shop_item_cnt'], axis = 1, inplace = True)
pretrained_data.tail(2)

CPU times: total: 20.5 s
Wall time: 20.9 s


Unnamed: 0,date_block_num,shop_id,item_id,item_category_id,item_cnt_month,type_code,subtype_code,shop_city,shop_category,name2,...,avg_by_month_item_cnt_lag_1,avg_by_month_item_cnt_lag_2,avg_by_month_item_cnt_lag_3,avg_by_month_item_cnt_lag_12,avg_by_month_item_id_item_cnt_lag_1,avg_by_month_item_id_item_cnt_lag_2,avg_by_month_item_id_item_cnt_lag_3,avg_by_month_shop_item_cnt_lag_1,avg_by_month_shop_item_cnt_lag_2,avg_by_month_shop_item_cnt_lag_3
11117122,34,45,19648,40.0,0,8,4,20,8,5,...,0.28,0.3,0.31,,0.05,0.07,0.17,0.13,0.13,0.14
11117123,34,45,969,37.0,0,8,1,20,8,5,...,0.28,0.3,0.31,0.35,0.07,0.12,0.02,0.13,0.13,0.14


In [21]:
%%time
# lag of average item sales per month of each city

group = pretrained_data.groupby(['date_block_num', 'shop_city']).agg({'item_cnt_month': 'mean'})
group.columns = ['avg_by_month_city_item_cnt']
group = group.reset_index()


pretrained_data = pd.merge(pretrained_data, group, on=['date_block_num', 'shop_city'], how='left')
del(group)
pretrained_data = extract_lag_feature( pretrained_data, [1], "avg_by_month_city_item_cnt" )
pretrained_data.drop(columns = ['avg_by_month_city_item_cnt'], axis = 1, inplace = True)
pretrained_data.tail(2)

CPU times: total: 8.59 s
Wall time: 8.97 s


Unnamed: 0,date_block_num,shop_id,item_id,item_category_id,item_cnt_month,type_code,subtype_code,shop_city,shop_category,name2,...,avg_by_month_item_cnt_lag_2,avg_by_month_item_cnt_lag_3,avg_by_month_item_cnt_lag_12,avg_by_month_item_id_item_cnt_lag_1,avg_by_month_item_id_item_cnt_lag_2,avg_by_month_item_id_item_cnt_lag_3,avg_by_month_shop_item_cnt_lag_1,avg_by_month_shop_item_cnt_lag_2,avg_by_month_shop_item_cnt_lag_3,avg_by_month_city_item_cnt_lag_1
11117122,34,45,19648,40.0,0,8,4,20,8,5,...,0.3,0.31,,0.05,0.07,0.17,0.13,0.13,0.14,0.14
11117123,34,45,969,37.0,0,8,1,20,8,5,...,0.3,0.31,0.35,0.07,0.12,0.02,0.13,0.13,0.14,0.14


In [22]:
%%time
# lag of average item sales per month of each category

group = pretrained_data.groupby(['date_block_num', 'item_category_id']).agg({'item_cnt_month': 'mean'})
group.columns = ['avg_by_month_cat_item_cnt']
group = group.reset_index()


pretrained_data = pd.merge(pretrained_data, group, on=['date_block_num', 'item_category_id'], how='left')
del(group)
pretrained_data = extract_lag_feature(pretrained_data, [1], "avg_by_month_cat_item_cnt")
pretrained_data.drop(columns = ['avg_by_month_cat_item_cnt'], axis = 1, inplace = True)
pretrained_data.tail(2)

CPU times: total: 11.9 s
Wall time: 12.2 s


Unnamed: 0,date_block_num,shop_id,item_id,item_category_id,item_cnt_month,type_code,subtype_code,shop_city,shop_category,name2,...,avg_by_month_item_cnt_lag_3,avg_by_month_item_cnt_lag_12,avg_by_month_item_id_item_cnt_lag_1,avg_by_month_item_id_item_cnt_lag_2,avg_by_month_item_id_item_cnt_lag_3,avg_by_month_shop_item_cnt_lag_1,avg_by_month_shop_item_cnt_lag_2,avg_by_month_shop_item_cnt_lag_3,avg_by_month_city_item_cnt_lag_1,avg_by_month_cat_item_cnt_lag_1
11117122,34,45,19648,40.0,0,8,4,20,8,5,...,0.31,,0.05,0.07,0.17,0.13,0.13,0.14,0.14,0.22
11117123,34,45,969,37.0,0,8,1,20,8,5,...,0.31,0.35,0.07,0.12,0.02,0.13,0.13,0.14,0.14,0.25


In [23]:
%%time
# price mean grouped by item_id

group = data['sales_train'].groupby(['item_id']).agg({'item_price': ['mean']})
group.columns = ['avg_item_price']
group.reset_index(inplace=True)

pretrained_data = pd.merge(pretrained_data, group, on=['item_id'], how='left')
del(group)

# add price mean grouped by month and item_id
group = data['sales_train'].groupby(['date_block_num','item_id']).agg({'item_price': ['mean']})
group.columns = ['avg_item_price_month']
group.reset_index(inplace=True)

pretrained_data = pd.merge(pretrained_data, group, on=['date_block_num','item_id'], how='left')
del(group)
pretrained_data.tail(2)

CPU times: total: 3.69 s
Wall time: 3.72 s


Unnamed: 0,date_block_num,shop_id,item_id,item_category_id,item_cnt_month,type_code,subtype_code,shop_city,shop_category,name2,...,avg_by_month_item_id_item_cnt_lag_1,avg_by_month_item_id_item_cnt_lag_2,avg_by_month_item_id_item_cnt_lag_3,avg_by_month_shop_item_cnt_lag_1,avg_by_month_shop_item_cnt_lag_2,avg_by_month_shop_item_cnt_lag_3,avg_by_month_city_item_cnt_lag_1,avg_by_month_cat_item_cnt_lag_1,avg_item_price,avg_item_price_month
11117122,34,45,19648,40.0,0,8,4,20,8,5,...,0.05,0.07,0.17,0.13,0.13,0.14,0.14,0.22,98.1,
11117123,34,45,969,37.0,0,8,1,20,8,5,...,0.07,0.12,0.02,0.13,0.13,0.14,0.14,0.25,502.3,


In [24]:
def select_trends(row) :
    for i in lags:
        if row["delta_price_lag_" + str(i)]:
            return row["delta_price_lag_" + str(i)]
    return 0

In [25]:
%%time
# calculate lag of month column to provide price for test set
lags = [1, 2, 3]
pretrained_data = extract_lag_feature(pretrained_data, lags, "avg_item_price_month")
pretrained_data.tail()

CPU times: total: 18.5 s
Wall time: 20.2 s


Unnamed: 0,date_block_num,shop_id,item_id,item_category_id,item_cnt_month,type_code,subtype_code,shop_city,shop_category,name2,...,avg_by_month_shop_item_cnt_lag_1,avg_by_month_shop_item_cnt_lag_2,avg_by_month_shop_item_cnt_lag_3,avg_by_month_city_item_cnt_lag_1,avg_by_month_cat_item_cnt_lag_1,avg_item_price,avg_item_price_month,avg_item_price_month_lag_1,avg_item_price_month_lag_2,avg_item_price_month_lag_3
11117119,34,45,18454,55.0,0,10,2,20,8,5,...,0.13,0.13,0.14,0.14,0.2,188.68,,99.0,99.0,99.0
11117120,34,45,16188,64.0,0,11,43,20,8,5,...,0.13,0.13,,0.14,0.16,1256.97,,1359.0,1222.97,
11117121,34,45,15757,55.0,0,10,2,20,8,5,...,0.13,0.13,0.14,0.14,0.2,198.11,,229.0,229.0,229.0
11117122,34,45,19648,40.0,0,8,4,20,8,5,...,0.13,0.13,0.14,0.14,0.22,98.1,,89.1,99.0,94.86
11117123,34,45,969,37.0,0,8,1,20,8,5,...,0.13,0.13,0.14,0.14,0.25,502.3,,198.0,198.0,198.0


In [26]:
%%time
# calculation of lag of month column to provide price for test set (change of item price by month)
for i in lags:
    pretrained_data["delta_price_lag_" + str(i) ] = (pretrained_data["avg_item_price_month_lag_" + str(i)]\
                                                     - pretrained_data["avg_item_price"] )\
                                                     / pretrained_data["avg_item_price"]

pretrained_data["delta_price_lag"] = pretrained_data.apply(select_trends, axis = 1)
pretrained_data["delta_price_lag"].fillna(0, inplace = True)

features_to_drop = ["avg_item_price_month", "avg_item_price"]
for i in lags:
    features_to_drop.append("avg_item_price_month_lag_" + str(i))
    features_to_drop.append("delta_price_lag_" + str(i))
pretrained_data.drop(features_to_drop, axis = 1, inplace = True)
pretrained_data.tail(2)

CPU times: total: 1min
Wall time: 1min


Unnamed: 0,date_block_num,shop_id,item_id,item_category_id,item_cnt_month,type_code,subtype_code,shop_city,shop_category,name2,...,avg_by_month_item_cnt_lag_12,avg_by_month_item_id_item_cnt_lag_1,avg_by_month_item_id_item_cnt_lag_2,avg_by_month_item_id_item_cnt_lag_3,avg_by_month_shop_item_cnt_lag_1,avg_by_month_shop_item_cnt_lag_2,avg_by_month_shop_item_cnt_lag_3,avg_by_month_city_item_cnt_lag_1,avg_by_month_cat_item_cnt_lag_1,delta_price_lag
11117122,34,45,19648,40.0,0,8,4,20,8,5,...,,0.05,0.07,0.17,0.13,0.13,0.14,0.14,0.22,-0.09
11117123,34,45,969,37.0,0,8,1,20,8,5,...,0.35,0.07,0.12,0.02,0.13,0.13,0.14,0.14,0.25,-0.61


###### Add clusters features

In [27]:
%%time
# categories clusterization
pretrained_data['item_category_id'] = pretrained_data['item_category_id'].astype(np.int64) 
pretrained_data = pd.merge(pretrained_data, cluster_data['item_category_clusters'], on='item_category_id', how='left')
pretrained_data.shape

CPU times: total: 1.47 s
Wall time: 1.5 s


(11117124, 29)

In [28]:
%%time
# price clusterization
cluster_data['item_price_clusters'].drop_duplicates(subset='item_id', inplace=True)
pretrained_data = pd.merge(pretrained_data, cluster_data['item_price_clusters'], on='item_id', how='right')
pretrained_data.shape

CPU times: total: 12.1 s
Wall time: 12.6 s


(11101374, 30)

In [29]:
%%time
# shop clusterization
pretrained_data = pd.merge(pretrained_data, cluster_data['shop_clusters'], on='shop_id', how='left')

CPU times: total: 2.55 s
Wall time: 2.6 s


###### Other usefull features

In [30]:
# dates of fisrt sale for each shop and for each item
pretrained_data['item_shop_first_sale'] = pretrained_data['date_block_num'] - pretrained_data.groupby(['item_id','shop_id'])['date_block_num'].transform('min')
pretrained_data['item_first_sale'] = pretrained_data['date_block_num'] - pretrained_data.groupby('item_id')['date_block_num'].transform('min')

In [31]:
pretrained_data.tail(2)

Unnamed: 0,date_block_num,shop_id,item_id,item_category_id,item_cnt_month,type_code,subtype_code,shop_city,shop_category,name2,...,avg_by_month_shop_item_cnt_lag_2,avg_by_month_shop_item_cnt_lag_3,avg_by_month_city_item_cnt_lag_1,avg_by_month_cat_item_cnt_lag_1,delta_price_lag,category_cluster_id,item_price_cluster_id,cluster_id,item_shop_first_sale,item_first_sale
11101372,33,58,7640,64,0.0,11,43,29,8,5,...,,,,,0.0,4,3,0,0,0
11101373,33,59,7640,64,0.0,11,43,30,8,5,...,,,,,0.0,4,3,3,0,0


###### Rebuild data for train, test and submission example datasets

In [32]:
train_data = pretrained_data[pretrained_data.date_block_num != 34]
test_data = pretrained_data[pretrained_data.date_block_num == 34].drop(columns='item_cnt_month')
submission_example = data['test'].drop(columns=['date_block_num'])

In [33]:
# reduce memory usage
train_data = reduce_mem_usage(train_data)
test_data = reduce_mem_usage(test_data)
submission_example = reduce_mem_usage(submission_example)

Mem. usage decreased to 1705.25 Mb (33.9% reduction)
Mem. usage decreased to 29.52 Mb (35.0% reduction)
Mem. usage decreased to  3.06 Mb (53.1% reduction)


###### Load train test data

In [34]:
%%time
train_data = {
    'train_data' : train_data,
    'test_data' : test_data,
    'submission_example' : submission_example
}

# add path to load to sys.path 
path_to_load = "C:\\Repository\\DS-Intership-data\\train_test_data\\"
os.makedirs(path_to_load, exist_ok=True)
sys.path.append(path_to_load)

# load data 
for file, df in train_data.items():
    df.to_csv(path_to_load+file+".csv")

CPU times: total: 1min 54s
Wall time: 1min 56s


In [35]:
from os import listdir
from os.path import isfile, join
print(*[f+"\n" for f in listdir(path_to_load) if isfile(join(path_to_load, f))])

submission_example.csv
 test_data.csv
 train_data.csv

