In [51]:
# linear algebra
import numpy as np

#working with data in table structers
import pandas as pd
pd.set_option('display.float_format', lambda x: '%.2f' % x)

# data visualization
#import seaborn as sns
#import matplotlib.pyplot as plt


# working with files
import sys
import os

# to off warnings
import warnings
warnings.filterwarnings('ignore')

# data preprocessing
from itertools import product
import time
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

In [52]:
# # using ray engine for parallel calculation(for oprimization)
# %env MODIN_ENGINE=ray
# import modin.pandas as mpd

###### Load data

Clean data

In [53]:
# add data path to sys.path 
clean_data_path = "C:\\Repository\\DS-Intership-data\\clean_data\\"
sys.path.append(clean_data_path)

# initiate dict for data
to_read_clean_data = {}

# fill to_read
for dir_name, _, files in os.walk(clean_data_path):
    for file in files:
        to_read_clean_data[file] = dir_name + file

In [54]:
# check to_read
to_read_clean_data

{'items.csv': 'C:\\Repository\\DS-Intership-data\\clean_data\\items.csv',
 'item_categories.csv': 'C:\\Repository\\DS-Intership-data\\clean_data\\item_categories.csv',
 'sales_train.csv': 'C:\\Repository\\DS-Intership-data\\clean_data\\sales_train.csv',
 'sample_submission.csv': 'C:\\Repository\\DS-Intership-data\\clean_data\\sample_submission.csv',
 'shops.csv': 'C:\\Repository\\DS-Intership-data\\clean_data\\shops.csv',
 'test.csv': 'C:\\Repository\\DS-Intership-data\\clean_data\\test.csv'}

In [55]:
%%time
data = {}
# read data
for file, path in to_read_clean_data.items():
    data[file.split('.')[0]] = pd.read_csv(os.path.join(os.path.dirname(path), file), index_col=0)

CPU times: total: 1.23 s
Wall time: 1.43 s


Cluster data

In [56]:
# add data path to sys.path 
cluster_data_path = "C:\\Repository\\DS-Intership-data\\cluster_data\\"
sys.path.append(cluster_data_path)

# initiate dict for data
to_read_cluster_data = {}

# fill to_read
for dir_name, _, files in os.walk(cluster_data_path):
    for file in files:
        to_read_cluster_data[file] = dir_name + file

In [57]:
# check to_read
to_read_cluster_data

{'item_category_clusters.csv': 'C:\\Repository\\DS-Intership-data\\cluster_data\\item_category_clusters.csv',
 'item_price_clusters.csv': 'C:\\Repository\\DS-Intership-data\\cluster_data\\item_price_clusters.csv',
 'shop_clusters.csv': 'C:\\Repository\\DS-Intership-data\\cluster_data\\shop_clusters.csv',
 'subtype_clusters.csv': 'C:\\Repository\\DS-Intership-data\\cluster_data\\subtype_clusters.csv',
 'type_code_clusters.csv': 'C:\\Repository\\DS-Intership-data\\cluster_data\\type_code_clusters.csv'}

In [58]:
%%time
cluster_data = {}
# read data
for file, path in to_read_cluster_data.items():
    cluster_data[file.split('.')[0]] = pd.read_csv(os.path.join(os.path.dirname(path), file), index_col=0)

CPU times: total: 359 ms
Wall time: 417 ms


### Data proccesing

###### Reduce memory usage for big dataset  

In [59]:
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                       df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [60]:
for df_name, df in data.items():
    df = pretrained_data = reduce_mem_usage(df, verbose=True)

Mem. usage decreased to  0.40 Mb (40.6% reduction)
Mem. usage decreased to  0.00 Mb (29.2% reduction)
Mem. usage decreased to 89.25 Mb (50.0% reduction)
Mem. usage decreased to  2.86 Mb (41.7% reduction)
Mem. usage decreased to  0.00 Mb (21.9% reduction)
Mem. usage decreased to  3.06 Mb (53.1% reduction)


###### Build pretrained data

In [61]:
%%time
# combinate month, shop and item in order of increasing month
pretrained_data  = []
cols  = ["date_block_num", "shop_id", "item_id"]
for i in range(34):
    sales = data['sales_train'][data['sales_train'].date_block_num == i]
    pretrained_data.append(np.array(list(product([i], sales.shop_id.unique(), sales.item_id.unique()))))

pretrained_data = pd.DataFrame(np.vstack(pretrained_data), columns = cols )
pretrained_data.sort_values(cols, inplace = True)

# merge datasets for data preprocessing
pretrained_data = pd.merge(pretrained_data, data['shops'], on = ["shop_id"], how = "left" )
pretrained_data = pd.merge(pretrained_data, data['items'], on = ["item_id"], how = "left")
pretrained_data = pd.merge(pretrained_data, data['item_categories'], on = ["item_category_id"], how = "left" )

CPU times: total: 7.97 s
Wall time: 9 s


In [62]:
%%time
# add profit from sales
pretrained_data["profit"] = data['sales_train']["item_cnt_day"] * data['sales_train']["item_price"]

CPU times: total: 469 ms
Wall time: 509 ms


In [63]:
%%time
# add count of sold items per month
group = data['sales_train'].groupby(["date_block_num", "shop_id", "item_id"]).agg({"item_cnt_day": ["sum"]})
group.columns = ["item_cnt_month"]
group.reset_index(inplace = True)
pretrained_data = pd.merge(pretrained_data, group, on = cols, how = "left")
pretrained_data["item_cnt_month"] = pretrained_data["item_cnt_month"].fillna(0)
pretrained_data.head(2)

CPU times: total: 2.55 s
Wall time: 2.88 s


Unnamed: 0,date_block_num,shop_id,item_id,shop_name,city,item_name,item_category_id,item_category_name,profit,item_cnt_month
0,0,2,19,"Адыгея ТЦ ""Мега""",Адыгея,/ЗОЛОТАЯ КОЛЛЕКЦИЯ м/ф-72,40,Кино - DVD,999.0,0.0
1,0,2,27,"Адыгея ТЦ ""Мега""",Адыгея,"007 Legends [PS3, русская версия]",19,Игры - PS3,899.0,1.0


###### Substruct some categories from string data 

In [64]:
%%time
# item categories data
pretrained_data["type_code"] = pretrained_data.item_category_name.apply(lambda x: x.split(" ")[0]).astype(str)
pretrained_data.type_code = LabelEncoder().fit_transform(pretrained_data.type_code)
pretrained_data["split"] = pretrained_data.item_category_name.apply(lambda x: x.split("-"))
pretrained_data["subtype"] = pretrained_data.split.apply(lambda x: x[1].strip() if len(x) > 1 else x[0].strip())
pretrained_data["subtype_code"] = LabelEncoder().fit_transform(pretrained_data["subtype"])
pretrained_data = pretrained_data.drop(columns=['item_category_name', 'split', 'subtype'])
pretrained_data.head(2)

CPU times: total: 18.4 s
Wall time: 19.1 s


Unnamed: 0,date_block_num,shop_id,item_id,shop_name,city,item_name,item_category_id,profit,item_cnt_month,type_code,subtype_code
0,0,2,19,"Адыгея ТЦ ""Мега""",Адыгея,/ЗОЛОТАЯ КОЛЛЕКЦИЯ м/ф-72,40,999.0,0.0,7,4
1,0,2,27,"Адыгея ТЦ ""Мега""",Адыгея,"007 Legends [PS3, русская версия]",19,899.0,1.0,5,10


In [65]:
%%time
# shops data
pretrained_data["shop_city"] = pretrained_data.shop_name.str.split(" ").map(lambda x: x[0])
pretrained_data["shop_category"] = pretrained_data.shop_name.str.split(" ").map(lambda x: x[1])
pretrained_data["shop_category"] = LabelEncoder().fit_transform(pretrained_data.shop_category)
pretrained_data["shop_city"] = LabelEncoder().fit_transform(pretrained_data.shop_city)
pretrained_data = pretrained_data.drop(columns=['shop_name', 'city'])
pretrained_data.head(2)

CPU times: total: 23.8 s
Wall time: 24.3 s


Unnamed: 0,date_block_num,shop_id,item_id,item_name,item_category_id,profit,item_cnt_month,type_code,subtype_code,shop_city,shop_category
0,0,2,19,/ЗОЛОТАЯ КОЛЛЕКЦИЯ м/ф-72,40,999.0,0.0,7,4,0,8
1,0,2,27,"007 Legends [PS3, русская версия]",19,899.0,1.0,5,10,0,8


In [66]:
%%time
# item data
pretrained_data["name1"], pretrained_data["name2"] = pretrained_data.item_name.str.split('[', n=1).str[0], pretrained_data.item_name.str.split('[', n=1).str[1]
pretrained_data["name1"], pretrained_data["name3"] = pretrained_data.item_name.str.split('(', n=1).str[0], pretrained_data.item_name.str.split('(', n=1).str[1]

# replace special characters and turn to lower case
pretrained_data["name2"] = pretrained_data.name2.str.replace('[^A-Za-z0-9А-Яа-я]+', " ").str.lower()
pretrained_data["name3"] = pretrained_data.name3.str.replace('[^A-Za-z0-9А-Яа-я]+', " ").str.lower()

pretrained_data = pretrained_data.fillna('0')

pretrained_data.name2 = LabelEncoder().fit_transform(pretrained_data.name2)
pretrained_data.name3 = LabelEncoder().fit_transform(pretrained_data.name3)

pretrained_data = pretrained_data.drop(columns=['name1', 'item_name']) 
pretrained_data.head(2)

CPU times: total: 43.9 s
Wall time: 45.4 s


Unnamed: 0,date_block_num,shop_id,item_id,item_category_id,profit,item_cnt_month,type_code,subtype_code,shop_city,shop_category,name2,name3
0,0,2,19,40,999.0,0.0,7,4,0,8,5,40
1,0,2,27,19,899.0,1.0,5,10,0,8,77,40


In [67]:
pretrained_data = reduce_mem_usage(pretrained_data)

Mem. usage decreased to 239.15 Mb (51.1% reduction)


###### time series features

In [68]:
# def extract_lag_feature(df, lags, col):
#     tmp = df[['date_block_num', 'shop_id', 'item_id', col]]
#     for i in lags:
#         shifted = tmp.copy()
#         shifted.columns = ['date_block_num','shop_id','item_id', col+'_lag_'+str(i)]
#         shifted['date_block_num'] += i
#         df = pd.merge(df, shifted, on=['date_block_num','shop_id','item_id'], how='left')
#     return df

In [69]:
# %%time
# # item count per month lag
# pretrained_data = extract_lag_feature(pretrained_data, [1], 'item_cnt_month')
# pretrained_data.tail(2)

In [70]:
# %%time
# # lag of average item count sales per month 

# group = pretrained_data.groupby(["date_block_num"] ).agg({"item_cnt_month" : "mean"})
# group.columns = ["avg_by_month_item_cnt"]
# group.reset_index()


# pretrained_data = pd.merge(pretrained_data, group, on = ["date_block_num"], how = "left")
# del(group)
# pretrained_data = extract_lag_feature(pretrained_data, [1], "avg_by_month_item_cnt")
# pretrained_data.drop(["avg_by_month_item_cnt"], axis = 1, inplace = True)
# pretrained_data.tail(2)

In [71]:
# %%time
# # lag of average item sales per month of each item_id

# group = pretrained_data.groupby(['date_block_num', 'item_id']).agg({'item_cnt_month': 'mean'})
# group.columns = ['avg_by_month_item_id_item_cnt']
# group = group.reset_index()


# pretrained_data = pd.merge(pretrained_data, group, on=['date_block_num', 'item_id'], how='left')
# del(group)
# pretrained_data = extract_lag_feature(pretrained_data, [1], "avg_by_month_item_id_item_cnt")
# pretrained_data.drop(columns = ['avg_by_month_item_id_item_cnt'], axis = 1, inplace = True)
# pretrained_data.tail(2)

In [72]:
# %%time
# # lag of average item sales per month of each shop

# group = pretrained_data.groupby(['date_block_num', 'shop_id']).agg({'item_cnt_month': 'mean'})
# group.columns = ['avg_by_month_shop_item_cnt']
# group = group.reset_index()

# pretrained_data = pd.merge(pretrained_data, group, on=['date_block_num', 'shop_id'], how='left')

# pretrained_data = extract_lag_feature(pretrained_data, [1], "avg_by_month_shop_item_cnt")
# pretrained_data.drop(columns = ['avg_by_month_shop_item_cnt'], axis = 1, inplace = True)
# pretrained_data.tail(2)

In [73]:
# %%time
# # lag of average item sales per month of each city

# group = pretrained_data.groupby(['date_block_num', 'shop_city']).agg({'item_cnt_month': 'mean'})
# group.columns = ['avg_by_month_city_item_cnt']
# group = group.reset_index()


# pretrained_data = pd.merge(pretrained_data, group, on=['date_block_num', 'shop_city'], how='left')
# del(group)
# pretrained_data = extract_lag_feature( pretrained_data, [1], "avg_by_month_city_item_cnt" )
# pretrained_data.drop(columns = ['avg_by_month_city_item_cnt'], axis = 1, inplace = True)
# pretrained_data.tail(2)

In [74]:
# %%time
# # lag of average item sales per month of each category

# group = pretrained_data.groupby(['date_block_num', 'item_category_id']).agg({'item_cnt_month': 'mean'})
# group.columns = ['avg_by_month_cat_item_cnt']
# group = group.reset_index()


# pretrained_data = pd.merge(pretrained_data, group, on=['date_block_num', 'item_category_id'], how='left')
# del(group)
# pretrained_data = extract_lag_feature(pretrained_data, [1], "avg_by_month_cat_item_cnt")
# pretrained_data.drop(columns = ['avg_by_month_cat_item_cnt'], axis = 1, inplace = True)
# pretrained_data.tail(2)

In [75]:
# %%time
# # price mean grouped by item_id

# group = data['sales_train'].groupby(['item_id']).agg({'item_price': ['mean']})
# group.columns = ['avg_item_price']
# group.reset_index(inplace=True)

# pretrained_data = pd.merge(pretrained_data, group, on=['item_id'], how='left')
# del(group)

# # add price mean grouped by month and item_id
# group = data['sales_train'].groupby(['date_block_num','item_id']).agg({'item_price': ['mean']})
# group.columns = ['avg_item_price_month']
# group.reset_index(inplace=True)

# pretrained_data = pd.merge(pretrained_data, group, on=['date_block_num','item_id'], how='left')
# del(group)
# pretrained_data.tail(2)

In [76]:
# def select_trends(row) :
#     for i in lags:
#         if row["delta_price_lag_" + str(i)]:
#             return row["delta_price_lag_" + str(i)]
#     return 0

In [77]:
# # calculate lag of month column to provide price for test set
# lags = [1]
# pretrained_data = extract_lag_feature(pretrained_data, lags, "avg_item_price_month")
# pretrained_data.tail()

In [78]:
# # calculation of lag of month column to provide price for test set (change of sales of item/price by month)
# for i in lags:
#     pretrained_data["delta_price_lag_" + str(i) ] = (pretrained_data["avg_item_price_month_lag_" + str(i)]\
#                                                      - pretrained_data["avg_item_price"] )\
#                                                      / pretrained_data["avg_item_price"]

# pretrained_data["delta_price_lag"] = pretrained_data.apply(select_trends, axis = 1)
# pretrained_data["delta_price_lag"].fillna(0, inplace = True)

# features_to_drop = ["avg_item_price_month", "avg_item_price"]
# for i in lags:
#     features_to_drop.append("avg_item_price_month_lag_" + str(i))
#     features_to_drop.append("delta_price_lag_" + str(i))
# pretrained_data.drop(features_to_drop, axis = 1, inplace = True)
# pretrained_data.tail(2)

###### Add clusters features

In [79]:
%%time
# categories clusterization
pretrained_data = pd.merge(pretrained_data, cluster_data['item_category_clusters'], on='item_category_id', how='left')

CPU times: total: 531 ms
Wall time: 560 ms


In [83]:
%%time
# price clusterization
pretrained_data = pd.merge(pretrained_data, cluster_data['item_price_clusters'], on='item_id', how='left')

MemoryError: Unable to allocate 21.3 GiB for an array with shape (2857479640,) and data type int64

In [None]:
%%time
# shop clusterization
pretrained_data = pd.merge(pretrained_data, cluster_data['shop_clusters'], on='shop_id', how='left')

###### Other usefull features

In [None]:
# # dates of fisrt sale for each shop and for each item
# pretrained_data['item_shop_first_sale'] = pretrained_data['date_block_num'] - pretrained_data.groupby(['item_id','shop_id'])['date_block_num'].transform('min')
# pretrained_data['item_first_sale'] = pretrained_data['date_block_num'] - pretrained_data.groupby('item_id')['date_block_num'].transform('min')

In [81]:
pretrained_data.tail(2)

Unnamed: 0,date_block_num,shop_id,item_id,item_category_id,profit,item_cnt_month,type_code,subtype_code,shop_city,shop_category,name2,name3,category_cluster_id
10902922,33,59,22166,54,0,0.0,8,61,30,8,176,40,4
10902923,33,59,22167,49,0,0.0,8,39,30,8,5,27,4


In [82]:
pretrained_data = reduce_mem_usage(pretrained_data)

Mem. usage decreased to 249.55 Mb (22.6% reduction)


###### Load train test data

In [84]:
%%time
train_data = {
    'train_data' : pretrained_data,
    'test_data' : data['test'],
    'sample_submission' : data['sample_submission']
}

# add path to load to sys.path 
path_to_load = "C:\\Repository\\DS-Intership-data\\train_test_data\\"
os.makedirs(path_to_load, exist_ok=True)
sys.path.append(path_to_load)

# load data 
for file, df in train_data.items():
    df.to_csv(path_to_load+file+".csv")

CPU times: total: 26.3 s
Wall time: 28.8 s


In [85]:
from os import listdir
from os.path import isfile, join
print(*[f+"\n" for f in listdir(path_to_load) if isfile(join(path_to_load, f))])

sample_submission.csv
 test_data.csv
 train_data.csv

