In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
from googletrans import Translator

In [3]:
# Shared googletrans client; the category-name and shop-name translation cells both call it.
translator = Translator()

In [4]:
# Read the three CSV tables from the local ./input directory.
shops = pd.read_csv('./input/shops.csv')
items = pd.read_csv('./input/items.csv')
items_cat = pd.read_csv('./input/item_categories.csv')

## Items Analysis

In [5]:
# Translate each category name, one translator call per row, and keep the
# result in a new `name_eng` column next to the original name.
# NOTE(review): the translation presumably comes from an online service, so
# results may differ between runs — confirm before relying on exact strings.
items_cat['name_eng'] = items_cat['item_category_name'].map(
    lambda name: translator.translate(name).text
)

In [6]:
# Matches a whitespace character followed by a parenthesised group, e.g. " (Digital)".
# Passed to get_code(), which removes every such match from the main category.
p_c = re.compile(r'\s\(([^\)]+)\)')
# Captures the text inside a pair of parentheses.
# Passed to get_subcode(), which uses it when a name has no ' - ' separator.
p_sc = re.compile(r'\(([^)]+)\)')

def get_code(x, reg_pattern):
    """Return the lower-cased main category of a category name.

    The main category is everything before the first ' - ' separator, with
    every match of ``reg_pattern`` removed from it.

    Args:
        x: Category name, e.g. ``'Movie - DVD'``.
        reg_pattern: Regex (string or compiled) whose matches are stripped
            from the main category.

    Returns:
        The cleaned main category in lower case.
    """
    main_part = x.partition(' - ')[0]
    cleaned = re.sub(reg_pattern, '', main_part)
    return cleaned.lower()

def get_subcode(x, reg_pattern):
    """Return the lower-cased sub-category of a category name.

    If the name contains a ' - ' separator, the sub-category is the second
    ' - '-delimited part. Otherwise the first match of ``reg_pattern`` in the
    name is used, and ``'none'`` is returned when there is no match.

    Args:
        x: Category name, e.g. ``'Movie - DVD'`` or ``'Tickets (Digital)'``.
        reg_pattern: Regex (string or compiled) with one capture group, used
            only when the name has no ' - ' separator.

    Returns:
        The sub-category in lower case, or the literal string ``'none'``.
    """
    parts = x.split(' - ')
    if len(parts) > 1:
        return parts[1].lower()

    # No separator: fall back to whatever the pattern captures in the name.
    matches = re.findall(reg_pattern, parts[0])
    if matches:
        return matches[0].lower()
    return 'none'

In [7]:
# Derive the main category and sub-category from the translated name.
# The extra positional argument is forwarded to the helper after each value.
translated_names = items_cat['name_eng']
items_cat['cat_code'] = translated_names.apply(get_code, args=(p_c,))
items_cat['cat_subcode'] = translated_names.apply(get_subcode, args=(p_sc,))

In [8]:
# Number of distinct main category codes.
items_cat['cat_code'].nunique()

20

In [9]:
# Number of distinct sub-category codes.
items_cat['cat_subcode'].nunique()

64

In [10]:
# Attach the category columns to every item; the left join keeps all rows of `items`.
items_data = items.merge(items_cat, how='left', on='item_category_id')

In [11]:
# Display the merged frame to eyeball the derived cat_code / cat_subcode columns.
items_data

Unnamed: 0,item_name,item_id,item_category_id,item_category_name,name_eng,cat_code,cat_subcode
0,! ВО ВЛАСТИ НАВАЖДЕНИЯ (ПЛАСТ.) D,0,40,Кино - DVD,Movie - DVD,movie,dvd
1,!ABBYY FineReader 12 Professional Edition Full...,1,76,Программы - Для дома и офиса (Цифра),Programs - For Home and Office (Digital),programs,for home and office (digital)
2,***В ЛУЧАХ СЛАВЫ (UNV) D,2,40,Кино - DVD,Movie - DVD,movie,dvd
3,***ГОЛУБАЯ ВОЛНА (Univ) D,3,40,Кино - DVD,Movie - DVD,movie,dvd
4,***КОРОБКА (СТЕКЛО) D,4,40,Кино - DVD,Movie - DVD,movie,dvd
...,...,...,...,...,...,...,...
22165,"Ядерный титбит 2 [PC, Цифровая версия]",22165,31,Игры PC - Цифра,PC Games - Number,pc games,number
22166,Язык запросов 1С:Предприятия [Цифровая версия],22166,54,Книги - Цифра,Books - Number,books,number
22167,Язык запросов 1С:Предприятия 8 (+CD). Хрустале...,22167,49,Книги - Методические материалы 1С,Books - Methodical materials 1C,books,methodical materials 1c
22168,Яйцо для Little Inu,22168,62,"Подарки - Гаджеты, роботы, спорт","Gifts - Gadgets, Robots, Sports",gifts,"gadgets, robots, sports"


In [12]:
# Drop the free-text columns so only the ids and the derived codes remain.
text_columns = ['item_name', 'item_category_name', 'name_eng']
items_data = items_data.drop(columns=text_columns)

In [13]:
# Check the remaining columns, their dtypes and non-null counts before saving.
items_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 22170 entries, 0 to 22169
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   item_id           22170 non-null  int64 
 1   item_category_id  22170 non-null  int64 
 2   cat_code          22170 non-null  object
 3   cat_subcode       22170 non-null  object
dtypes: int64(2), object(2)
memory usage: 866.0+ KB


In [14]:
# Write the item features to a Feather file in the ./input directory.
items_data.to_feather('./input/items_data.ftr')

## Shops Analysis

In [15]:
# Overwrite each shop name with its lower-cased translation, one translator call per row.
# NOTE(review): the column is replaced in place, so re-running this cell would
# translate the already-translated text again.
shops['shop_name'] = shops['shop_name'].map(
    lambda name: translator.translate(name).text.lower()
)

In [16]:
# Inspect 10 random rows; no random_state is passed, so the rows shown change between runs.
shops.sample(10)

Unnamed: 0,shop_name,shop_id
52,"ufa tc ""central""",52
56,"chekhov sec ""carnival""",56
2,"adygea shopping center ""mega""",2
42,"spb tc ""nevsky center""",42
16,"kolomna tc ""rio""",16
59,"yaroslavl tc ""altair""",59
29,"moscow tc ""new age"" (novokosino)",29
50,"tyumen tc ""goodwin""",50
7,"voronezh trc ""maksimir""",7
5,"vologda sec ""marmelad""",5


In [17]:
# Character class of punctuation to strip from shop names: ! * ( ) [ ] , .
p_rem_special = re.compile(r'[\!*()\[\],.]')

In [18]:
# Remove every character matched by p_rem_special from the shop names.
shops['shop_name'] = shops['shop_name'].map(
    lambda name: p_rem_special.sub('', name)
)

In [19]:
# Matches a run of word characters and hyphens; the first such run in a shop name is used as its location.
patternlocation = re.compile(r'[\w\-]+')

In [20]:
# The location is the first word/hyphen token of the cleaned shop name.
# Indexing [0] raises IndexError for a name with no such token.
shops['location'] = shops['shop_name'].map(
    lambda name: patternlocation.findall(name)[0]
)

In [21]:
# Spot-check the extracted location on 10 random rows (unseeded, so the rows vary between runs).
shops.sample(10)

Unnamed: 0,shop_name,shop_id,location
38,"omsk tc ""mega""",38,omsk
11,zhukovsky st chkalov 39m²,11,zhukovsky
37,"novosibirsk tc ""mega""",37,novosibirsk
31,"moscow tc ""semenovsky""",31,moscow
30,moscow perlovsky shopping center,30,moscow
19,"kursk tc ""pushkinsky""",19,kursk
2,"adygea shopping center ""mega""",2,adygea
54,"khimki tc ""mega""",54,khimki
22,moscow shop c21,22,moscow
49,"tyumen sec ""crystal""",49,tyumen


In [22]:
# A single word enclosed in double quotes, e.g. "mega"; quoted names with
# spaces (such as "new age") do not match.
p_shop_identifier = re.compile(r'\"([\w]+)\"')


def shop_identifier(x, pat):
    """Return the single quoted identifier found in a shop name.

    Args:
        x: Shop name to search.
        pat: Regex (string or compiled) with one capture group that extracts
            the identifier. Previously this argument was ignored and the
            module-level ``p_shop_identifier`` was always used.

    Returns:
        The captured identifier when ``pat`` matches exactly once in ``x``;
        the literal string ``'none'`` when there is no match or more than one.
    """
    identifier = re.findall(pat, x)
    if len(identifier) == 1:
        return identifier[0]
    return 'none'

In [23]:
# Tag each shop with its quoted identifier; the pattern is forwarded as the helper's second argument.
shops['identifier'] = shops['shop_name'].apply(
    shop_identifier, args=(p_shop_identifier,)
)

In [24]:
# Frequency of each extracted identifier, including the 'none' fallback.
shops['identifier'].value_counts()

none           28
mega            5
central         3
budenovskiy     2
rio             2
behetle         1
parkhouse       1
semenovsky      1
city            1
carnival        1
maksimir        1
areal           1
goodwin         1
altair          1
pushkinsky      1
melody          1
sale            1
parkhaus        1
fantasy         1
marmelad        1
june            1
family          1
atrium          1
sennaya         1
crystal         1
Name: identifier, dtype: int64

In [25]:
# Frequency of each extracted location, to spot spelling variants of the same place.
shops['location'].value_counts()

moscow           13
yakutsk           4
tyumen            3
voronezh          3
krasnoyarsk       2
nnovgorod         2
zhukovsky         2
samara            2
novosibirsk       2
spb               2
kazan             2
ufa               2
khimki            1
rostov            1
balashikha        1
chekhov           1
adygea            1
vologda           1
rostov-on-don     1
mytischi          1
omsk              1
rostovnadonu      1
surgut            1
emergency         1
offsite           1
kaluga            1
digital           1
sergiev           1
kolomna           1
volzhsky          1
kursk             1
tomsk             1
yaroslavl         1
Name: location, dtype: int64

In [26]:
# Fold the un-hyphenated spelling into the canonical 'rostov-on-don' label.
# NOTE(review): the location counts also list a separate 'rostov' entry —
# presumably the same city; confirm and add it to this mapping if so.
shops['location'] = shops['location'].replace({'rostovnadonu': 'rostov-on-don'})

In [27]:
# Re-check the location frequencies after the spelling fix.
shops['location'].value_counts()

moscow           13
yakutsk           4
voronezh          3
tyumen            3
krasnoyarsk       2
nnovgorod         2
rostov-on-don     2
zhukovsky         2
samara            2
novosibirsk       2
kazan             2
ufa               2
spb               2
balashikha        1
rostov            1
adygea            1
khimki            1
vologda           1
mytischi          1
omsk              1
chekhov           1
surgut            1
emergency         1
offsite           1
kaluga            1
digital           1
sergiev           1
kolomna           1
volzhsky          1
kursk             1
tomsk             1
yaroslavl         1
Name: location, dtype: int64

In [28]:
# The identifier column was only inspected above; remove it before saving.
shops = shops.drop(columns=['identifier'])

In [29]:
# Write the cleaned shop table (shop_name, shop_id, location) to a Feather file in ./input.
shops.to_feather('./input/shops_data.ftr')