### Data preprocessing

###### Load data

In [1]:
# linear algebra
import numpy as np

#working with data in table structers
import pandas as pd
import dask.dataframe as dd
pd.set_option('display.float_format', lambda x: '%.2f' % x)

# data visualization
#import seaborn as sns
#import matplotlib.pyplot as plt


# working with files
import sys
import os

# to off warnings
import warnings
warnings.filterwarnings('ignore')

# data preprocessing
from itertools import product
import time
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

In [2]:
# add data path to sys.path 
clean_data_path = "C:\\Repository\\DS-Intership-data\\clean_data\\"
sys.path.append(clean_data_path)

# initiate dict for data
to_read_clean_data = {}

# fill to_read
for dir_name, _, files in os.walk(clean_data_path):
    for file in files:
        to_read_clean_data[file] = dir_name + file

In [3]:
# check to_read
to_read_clean_data

{'items.csv': 'C:\\Repository\\DS-Intership-data\\clean_data\\items.csv',
 'item_categories.csv': 'C:\\Repository\\DS-Intership-data\\clean_data\\item_categories.csv',
 'sales_train.csv': 'C:\\Repository\\DS-Intership-data\\clean_data\\sales_train.csv',
 'sample_submission.csv': 'C:\\Repository\\DS-Intership-data\\clean_data\\sample_submission.csv',
 'shops.csv': 'C:\\Repository\\DS-Intership-data\\clean_data\\shops.csv',
 'test.csv': 'C:\\Repository\\DS-Intership-data\\clean_data\\test.csv'}

In [4]:
%%time
data = {}
# read data
for file, path in to_read_clean_data.items():
    data[file.split('.')[0]] = dd.read_csv(os.path.join(os.path.dirname(path), file)).set_index('Unnamed: 0')

CPU times: total: 969 ms
Wall time: 1.5 s


In [5]:
# add data path to sys.path 
cluster_data_path = "C:\\Repository\\DS-Intership-data\\cluster_data\\"
sys.path.append(cluster_data_path)

# initiate dict for data
to_read_cluster_data = {}

# fill to_read
for dir_name, _, files in os.walk(cluster_data_path):
    for file in files:
        to_read_cluster_data[file] = dir_name + file

In [6]:
# check to_read
to_read_cluster_data

{'item_category_clusters.csv': 'C:\\Repository\\DS-Intership-data\\cluster_data\\item_category_clusters.csv',
 'item_price_clusters.csv': 'C:\\Repository\\DS-Intership-data\\cluster_data\\item_price_clusters.csv',
 'shop_clusters.csv': 'C:\\Repository\\DS-Intership-data\\cluster_data\\shop_clusters.csv',
 'subtype_clusters.csv': 'C:\\Repository\\DS-Intership-data\\cluster_data\\subtype_clusters.csv',
 'type_code_clusters.csv': 'C:\\Repository\\DS-Intership-data\\cluster_data\\type_code_clusters.csv'}

In [7]:
%%time
cluster_data = {}
# read data
for file, path in to_read_cluster_data.items():
    cluster_data[file.split('.')[0]] = dd.read_csv(os.path.join(os.path.dirname(path), file), dtype='object').\
                                                                                            set_index('Unnamed: 0')

CPU times: total: 4.97 s
Wall time: 5.47 s


##### Build dataset

In [8]:
%%time
# combinate month, shop and item in order of increasing month
pretrained_data  = []
cols  = ["date_block_num", "shop_id", "item_id"]
for i in data['sales_train'].date_block_num.unique():
    sales = data['sales_train'][data['sales_train'].date_block_num == i]
    pretrained_data.append(np.array(list(product([i], sales.shop_id.unique(), sales.item_id.unique()))))

pretrained_data = dd.from_pandas(data=pd.DataFrame(np.vstack(pretrained_data), columns = cols), npartitions=100)#.reset_index()
pretrained_data.sort_values(cols, inplace = True)

CPU times: total: 1min 32s
Wall time: 2min 2s


Unnamed: 0_level_0,date_block_num,shop_id,item_id
npartitions=33,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
,int32,int32,int32
,...,...,...
...,...,...,...
,...,...,...
,...,...,...


In [9]:
%%time
pretrained_data.compute().head()

CPU times: total: 31.2 ms
Wall time: 47 ms


Unnamed: 0,date_block_num,shop_id,item_id
0,0,59,22154
1,0,59,2552
2,0,59,2554
3,0,59,2555
4,0,59,2564


In [10]:
%%time
# merge datasets for data preprocessing
pretrained_data = dd.merge(pretrained_data, data['shops'], on = ["shop_id"], how = "left" )
pretrained_data = dd.merge(pretrained_data, data['items'], on = ["item_id"], how = "left")
pretrained_data = dd.merge(pretrained_data, data['item_categories'], on = ["item_category_id"], how = "left" )
pretrained_data.shape

CPU times: total: 0 ns
Wall time: 26 ms


(Delayed('int-11292269-1b32-4274-a36b-c5c487dd07c5'), 8)

In [11]:
%%time
pretrained_data.compute().head()

CPU times: total: 1.33 s
Wall time: 4.78 s


Unnamed: 0,date_block_num,shop_id,item_id,shop_name,city,item_name,item_category_id,item_category_name
0,0,59,22154,"Ярославль ТЦ ""Альтаир""",Ярославль,ЯВЛЕНИЕ 2012 (BD),37,Кино - Blu-Ray
1,0,59,2552,"Ярославль ТЦ ""Альтаир""",Ярославль,DEEP PURPLE The House Of Blue Light LP,58,Музыка - Винил
2,0,59,2554,"Ярославль ТЦ ""Альтаир""",Ярославль,DEEP PURPLE Who Do You Think We Are LP,58,Музыка - Винил
3,0,59,2555,"Ярославль ТЦ ""Альтаир""",Ярославль,DEEP PURPLE 30 Very Best Of 2CD (Фирм.),56,Музыка - CD фирменного производства
4,0,59,2564,"Ярославль ТЦ ""Альтаир""",Ярославль,DEEP PURPLE Perihelion: Live In Concert DVD (К...,59,Музыка - Музыкальное видео


In [12]:
%%time
# add count of sold items per month
group = data['sales_train'].groupby(["date_block_num", "shop_id", "item_id"]).agg({"item_cnt_day": ["sum"]})
group.columns = ["item_cnt_month"]
#group.reset_index(inplace = True)
pretrained_data = dd.merge(pretrained_data, group, on = cols, how = "left")
pretrained_data["item_cnt_month"] = pretrained_data["item_cnt_month"].fillna(0)
pretrained_data.shape

CPU times: total: 15.6 ms
Wall time: 27.1 ms


(Delayed('int-f0a62309-b779-4763-89a9-465595a7844d'), 9)

In [13]:
%%time
pretrained_data.compute().head()

CPU times: total: 59.5 s
Wall time: 15.1 s


Unnamed: 0,date_block_num,shop_id,item_id,shop_name,city,item_name,item_category_id,item_category_name,item_cnt_month
0,0,59,22154,"Ярославль ТЦ ""Альтаир""",Ярославль,ЯВЛЕНИЕ 2012 (BD),37,Кино - Blu-Ray,1.0
1,0,59,2552,"Ярославль ТЦ ""Альтаир""",Ярославль,DEEP PURPLE The House Of Blue Light LP,58,Музыка - Винил,0.0
2,0,59,2554,"Ярославль ТЦ ""Альтаир""",Ярославль,DEEP PURPLE Who Do You Think We Are LP,58,Музыка - Винил,0.0
3,0,59,2555,"Ярославль ТЦ ""Альтаир""",Ярославль,DEEP PURPLE 30 Very Best Of 2CD (Фирм.),56,Музыка - CD фирменного производства,0.0
4,0,59,2564,"Ярославль ТЦ ""Альтаир""",Ярославль,DEEP PURPLE Perihelion: Live In Concert DVD (К...,59,Музыка - Музыкальное видео,0.0


###### Add clusters features

In [14]:
#for df_name, df in cluster_data.items():
#    for col in df.columns:
#        df[col] = dd.to_numeric(df[col])

In [15]:
%%time
# categories clusterization
#pretrained_data = dd.merge(pretrained_data, cluster_data['item_category_clusters'], on='item_category_id', how='left')

CPU times: total: 0 ns
Wall time: 0 ns


In [16]:
%%time
#pretrained_data.compute().head()

CPU times: total: 0 ns
Wall time: 0 ns


In [17]:
%%time
# price clusterization
#pretrained_data = dd.merge(pretrained_data, cluster_data['item_price_clusters'], on='item_id', how='left')

CPU times: total: 0 ns
Wall time: 0 ns


In [18]:
%%time
#pretrained_data.compute().head()

CPU times: total: 0 ns
Wall time: 0 ns


In [19]:
%%time
# price clusterization
#pretrained_data = dd.merge(pretrained_data, cluster_data['item_price_clusters'], on='item_id', how='left')

CPU times: total: 0 ns
Wall time: 0 ns


In [20]:
%%time
#pretrained_data.compute().head()

CPU times: total: 0 ns
Wall time: 0 ns


###### Substruct some categories from string data 

In [21]:
%%time
# item categories data
pretrained_data["type_code"] = pretrained_data.item_category_name.apply(lambda x: x.split(" ")[0]).astype(str)
#pretrained_data.type_code = LabelEncoder().fit_transform(pretrained_data.type_code)
pretrained_data["split"] = pretrained_data.item_category_name.apply(lambda x: x.split("-"))
pretrained_data["subtype"] = pretrained_data.split.apply(lambda x: x[1].strip() if len(x) > 1 else x[0].strip())
#pretrained_data["subtype_code"] = LabelEncoder().fit_transform(pretrained_data["subtype"])
pretrained_data = pretrained_data.drop(columns=['item_category_name', 'split', 'subtype'])
pretrained_data.shape

CPU times: total: 15.6 ms
Wall time: 16 ms


(Delayed('int-237ab908-b7eb-4033-b28b-ff851659784e'), 9)

In [22]:
%%time
pretrained_data.compute().head()

CPU times: total: 1min 8s
Wall time: 33.8 s


Unnamed: 0,date_block_num,shop_id,item_id,shop_name,city,item_name,item_category_id,item_cnt_month,type_code
0,0,59,22154,"Ярославль ТЦ ""Альтаир""",Ярославль,ЯВЛЕНИЕ 2012 (BD),37,1.0,Кино
1,0,59,2552,"Ярославль ТЦ ""Альтаир""",Ярославль,DEEP PURPLE The House Of Blue Light LP,58,0.0,Музыка
2,0,59,2554,"Ярославль ТЦ ""Альтаир""",Ярославль,DEEP PURPLE Who Do You Think We Are LP,58,0.0,Музыка
3,0,59,2555,"Ярославль ТЦ ""Альтаир""",Ярославль,DEEP PURPLE 30 Very Best Of 2CD (Фирм.),56,0.0,Музыка
4,0,59,2564,"Ярославль ТЦ ""Альтаир""",Ярославль,DEEP PURPLE Perihelion: Live In Concert DVD (К...,59,0.0,Музыка


In [23]:
%%time
# shops data
pretrained_data["shop_city"] = pretrained_data.shop_name.str.split(" ").map(lambda x: x[0])
pretrained_data["shop_category"] = pretrained_data.shop_name.str.split(" ").map(lambda x: x[1])
#pretrained_data["shop_category"] = LabelEncoder().fit_transform(pretrained_data.shop_category)
#pretrained_data["shop_city"] = LabelEncoder().fit_transform(pretrained_data.shop_city)
pretrained_data = pretrained_data.drop(columns=['shop_name', 'city'])
pretrained_data.shape

CPU times: total: 0 ns
Wall time: 12 ms


(Delayed('int-f4000768-c54a-42bb-aa10-d3d7690f33b9'), 9)

In [24]:
%%time
pretrained_data.compute().head()

CPU times: total: 1min 28s
Wall time: 1min 4s


Unnamed: 0,date_block_num,shop_id,item_id,item_name,item_category_id,item_cnt_month,type_code,shop_city,shop_category
0,0,59,22154,ЯВЛЕНИЕ 2012 (BD),37,1.0,Кино,Ярославль,ТЦ
1,0,59,2552,DEEP PURPLE The House Of Blue Light LP,58,0.0,Музыка,Ярославль,ТЦ
2,0,59,2554,DEEP PURPLE Who Do You Think We Are LP,58,0.0,Музыка,Ярославль,ТЦ
3,0,59,2555,DEEP PURPLE 30 Very Best Of 2CD (Фирм.),56,0.0,Музыка,Ярославль,ТЦ
4,0,59,2564,DEEP PURPLE Perihelion: Live In Concert DVD (К...,59,0.0,Музыка,Ярославль,ТЦ


In [25]:
%%time
# item data
pretrained_data["name1"], pretrained_data["name2"] = pretrained_data.item_name.str.split('[', n=1).str[0], pretrained_data.item_name.str.split('[', n=1).str[1]
pretrained_data["name1"], pretrained_data["name3"] = pretrained_data.item_name.str.split('(', n=1).str[0], pretrained_data.item_name.str.split('(', n=1).str[1]

# replace special characters and turn to lower case
pretrained_data["name2"] = pretrained_data.name2.str.replace('[^A-Za-z0-9А-Яа-я]+', " ").str.lower()
pretrained_data["name3"] = pretrained_data.name3.str.replace('[^A-Za-z0-9А-Яа-я]+', " ").str.lower()

pretrained_data = pretrained_data.fillna('0')

#pretrained_data.name2 = LabelEncoder().fit_transform(pretrained_data.name2)
#pretrained_data.name3 = LabelEncoder().fit_transform(pretrained_data.name3)

pretrained_data = pretrained_data.drop(columns=['name1', 'item_name']) 
pretrained_data.shape

CPU times: total: 15.6 ms
Wall time: 33 ms


(Delayed('int-7f6ced97-913d-436b-8fea-704b2c6d36db'), 10)

In [26]:
%%time
pretrained_data.compute().head()

CPU times: total: 1min 47s
Wall time: 1min 36s


Unnamed: 0,date_block_num,shop_id,item_id,item_category_id,item_cnt_month,type_code,shop_city,shop_category,name2,name3
0,0,59,22154,37,1.0,Кино,Ярославль,ТЦ,0,bd)
1,0,59,2552,58,0.0,Музыка,Ярославль,ТЦ,0,0
2,0,59,2554,58,0.0,Музыка,Ярославль,ТЦ,0,0
3,0,59,2555,56,0.0,Музыка,Ярославль,ТЦ,0,фирм.)
4,0,59,2564,59,0.0,Музыка,Ярославль,ТЦ,0,кир.)


###### Timeseries feature extration

In [27]:
def extract_lag_feature(df, lags, col):
    tmp = df[['date_block_num', 'shop_id', 'item_id', col]]
    for i in lags:
        shifted = tmp.copy()
        shifted.columns = ['date_block_num','shop_id','item_id', col+'_lag_'+str(i)]
        shifted['date_block_num'] += i
        df = dd.merge(df, shifted, on=['date_block_num','shop_id','item_id'], how='left')
    return df

In [28]:
%%time
# item count per month lag
pretrained_data = extract_lag_feature(pretrained_data, [1], 'item_cnt_month')

CPU times: total: 0 ns
Wall time: 158 ms


In [29]:
%%time
pretrained_data.compute().head()

ignoring exception in ensure_cleanup_on_exception
Traceback (most recent call last):
  File "C:\Users\maxim\anaconda3\Lib\site-packages\dask\dataframe\shuffle.py", line 930, in ensure_cleanup_on_exception
    yield
  File "C:\Users\maxim\anaconda3\Lib\site-packages\dask\dataframe\shuffle.py", line 945, in shuffle_group_3
    p.append(d, fsync=True)
  File "C:\Users\maxim\anaconda3\Lib\site-packages\partd\encode.py", line 23, in append
    data = valmap(self.encode, data)
           ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\maxim\anaconda3\Lib\site-packages\toolz\dicttoolz.py", line 85, in valmap
    rv.update(zip(d.keys(), map(func, d.values())))
  File "C:\Users\maxim\anaconda3\Lib\site-packages\partd\pandas.py", line 180, in serialize
    col_header, col_bytes = index_to_header_bytes(df.columns)
                            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\maxim\anaconda3\Lib\site-packages\partd\pandas.py", line 113, in index_to_header_bytes
    header = (type(ind),

AttributeError: 'Index' object has no attribute '_get_attributes_dict'

In [30]:
%%time
pretrained_data.isna().sum().compute()

AttributeError: 'Index' object has no attribute '_get_attributes_dict'

ignoring exception in ensure_cleanup_on_exception
Traceback (most recent call last):
  File "C:\Users\maxim\anaconda3\Lib\site-packages\dask\dataframe\shuffle.py", line 930, in ensure_cleanup_on_exception
    yield
  File "C:\Users\maxim\anaconda3\Lib\site-packages\dask\dataframe\shuffle.py", line 945, in shuffle_group_3
    p.append(d, fsync=True)
  File "C:\Users\maxim\anaconda3\Lib\site-packages\partd\encode.py", line 23, in append
    data = valmap(self.encode, data)
           ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\maxim\anaconda3\Lib\site-packages\toolz\dicttoolz.py", line 85, in valmap
    rv.update(zip(d.keys(), map(func, d.values())))
  File "C:\Users\maxim\anaconda3\Lib\site-packages\partd\pandas.py", line 180, in serialize
    col_header, col_bytes = index_to_header_bytes(df.columns)
                            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\maxim\anaconda3\Lib\site-packages\partd\pandas.py", line 113, in index_to_header_bytes
    header = (type(ind),