In [145]:
import pandas as pd
import numpy as np
import re

In [2]:
from googletrans import Translator

In [3]:
# Single googletrans client; the category and shop-name translation cells below both use it.
translator = Translator()

In [4]:
# Load the three raw lookup tables; the order of the paths matches the names on the left.
shops, items, items_cat = map(
    pd.read_csv,
    ('./input/shops.csv', './input/items.csv', './input/item_categories.csv'),
)

## Items Analysis

In [5]:
# Translate every category name, one translate() call per row.
# NOTE(review): googletrans presumably calls an online service, so this cell may be slow
# and its output may differ between runs -- confirm before relying on exact strings.
items_cat['name_eng'] = [
    translator.translate(category_name).text
    for category_name in items_cat['item_category_name']
]

In [121]:
# p_c: one whitespace character followed by a parenthesised group, e.g. " (Digital)".
# It is passed to get_code, which deletes every match from the top-level code.
p_c = re.compile(r'\s\(([^\)]+)\)')
# p_sc: captures the text inside a parenthesised group.
# It is passed to get_subcode, which uses it when the name has no ' - ' separator.
p_sc = re.compile(r'\(([^)]+)\)')

def get_code(x, reg_pattern):
    """Return the lower-cased top-level category code of a category name.

    The code is the text before the first ' - ' separator (the whole string
    when there is no separator), with every match of ``reg_pattern`` removed.
    """
    head, _, _ = x.partition(' - ')
    stripped = re.sub(reg_pattern, '', head)
    return stripped.lower()

def get_subcode(x, reg_pattern):
    """Return the lower-cased sub-category code of a category name.

    When the name contains ' - ', the sub-code is the segment right after the
    first separator. Otherwise it is the first result of
    ``re.findall(reg_pattern, name)``, or the literal 'none' when nothing
    matches.
    """
    parts = x.split(' - ')
    if len(parts) > 1:
        return parts[1].lower()
    # str.split always yields at least one element, so this is the single-segment case.
    matches = re.findall(reg_pattern, parts[0])
    if not matches:
        return 'none'
    return matches[0].lower()

In [122]:
# Derive the top-level code and the sub-code from the English category name.
items_cat['cat_code'] = items_cat['name_eng'].apply(get_code, args=(p_c,))
items_cat['cat_subcode'] = items_cat['name_eng'].apply(get_subcode, args=(p_sc,))

In [128]:
# Number of distinct top-level category codes.
items_cat['cat_code'].nunique()

20

In [129]:
# Number of distinct sub-category codes.
items_cat['cat_subcode'].nunique()

64

In [125]:
# Attach the category columns to every item; the left join keeps all rows of `items`.
items_data = items.merge(items_cat, how='left', on='item_category_id')

In [126]:
# Show the merged frame so the derived codes can be compared with the original names.
items_data

Unnamed: 0,item_name,item_id,item_category_id,item_category_name,name_eng,cat_code,cat_subcode
0,! ВО ВЛАСТИ НАВАЖДЕНИЯ (ПЛАСТ.) D,0,40,Кино - DVD,Movie - DVD,movie,dvd
1,!ABBYY FineReader 12 Professional Edition Full...,1,76,Программы - Для дома и офиса (Цифра),Programs - For Home and Office (Digital),programs,for home and office (digital)
2,***В ЛУЧАХ СЛАВЫ (UNV) D,2,40,Кино - DVD,Movie - DVD,movie,dvd
3,***ГОЛУБАЯ ВОЛНА (Univ) D,3,40,Кино - DVD,Movie - DVD,movie,dvd
4,***КОРОБКА (СТЕКЛО) D,4,40,Кино - DVD,Movie - DVD,movie,dvd
...,...,...,...,...,...,...,...
22165,"Ядерный титбит 2 [PC, Цифровая версия]",22165,31,Игры PC - Цифра,PC Games - Number,pc games,number
22166,Язык запросов 1С:Предприятия [Цифровая версия],22166,54,Книги - Цифра,Books - Number,books,number
22167,Язык запросов 1С:Предприятия 8 (+CD). Хрустале...,22167,49,Книги - Методические материалы 1С,Books - Methodical materials 1C,books,methodical materials 1c
22168,Яйцо для Little Inu,22168,62,"Подарки - Гаджеты, роботы, спорт","Gifts - Gadgets, Robots, Sports",gifts,"gadgets, robots, sports"


In [130]:
# Keep only the ids and the derived codes; the free-text name columns are not exported.
items_data = items_data.drop(columns=['item_name', 'item_category_name', 'name_eng'])

In [131]:
# Check the remaining columns, their dtypes and non-null counts before exporting.
items_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 22170 entries, 0 to 22169
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   item_id           22170 non-null  int64 
 1   item_category_id  22170 non-null  int64 
 2   cat_code          22170 non-null  object
 3   cat_subcode       22170 non-null  object
dtypes: int64(2), object(2)
memory usage: 866.0+ KB


In [132]:
# Write the item -> category-code table next to the raw inputs, without the index column.
items_data.to_csv('./input/items_data.csv', index=False)

## Shops Analysis

In [133]:
# Translate each shop name and lower-case the result.
# NOTE: this overwrites the original `shop_name` column, so re-running the cell
# sends the already-translated text through the translator again.
shops['shop_name'] = [
    translator.translate(shop_name).text.lower()
    for shop_name in shops['shop_name']
]

In [143]:
# Random peek at ten shops; no random_state is passed, so the rows differ between runs.
# NOTE(review): the saved output already contains a `location` column that no earlier
# visible cell creates -- presumably left over from a cell that is no longer in the
# notebook; confirm with Restart & Run All.
shops.sample(10)

Unnamed: 0,shop_name,shop_id,location
17,"krasnoyarsk tc ""vzletka plaza""",17,"tc ""vzletka plaza"""
47,"surgut sec ""city mall""",47,"sec ""city mall"""
29,"moscow shopping center ""new age"" (novokosino)",29,"shopping center ""new age"" (novokosino)"
4,"volzhsky shopping center ""volga mall""",4,"shopping center ""volga mall"""
10,zhukovsky st. chkalov 39m?,10,st. chkalov 39m?
57,"yakutsk ordzhonikidze, 56",57,"ordzhonikidze, 56"
53,"ufa tc ""family"" 2",53,"tc ""family"" 2"
19,"kursk tc ""pushkinsky""",19,"tc ""pushkinsky"""
46,sergiev posad 7ya shopping center,46,posad 7ya shopping center
59,"yaroslavl tc ""altair""",59,"tc ""altair"""


In [147]:
# Characters to strip from shop names: ! * ( ) [ ] , .
# Double quotes are not in the class, so quoted names survive for the identifier step below.
p_rem_special= re.compile(r'[\!*()\[\],.]')

In [151]:
# Delete every character matched by p_rem_special from each shop name.
shops['shop_name'] = shops['shop_name'].map(lambda name: p_rem_special.sub('', name))

In [153]:
# A run of word characters and/or hyphens, e.g. "moscow" or "rostov-on-don".
patternlocation = re.compile(r'[\w\-]+')

In [154]:
# The first token of the cleaned shop name is used as the location.
# Multi-word place names are therefore cut to their first word
# (e.g. "sergiev posad 7ya shopping center" -> "sergiev").
shops['location'] = [patternlocation.findall(name)[0] for name in shops['shop_name']]

In [156]:
# Random peek at the cleaned names and the extracted location (unseeded, varies per run).
shops.sample(10)

Unnamed: 0,shop_name,shop_id,location
10,zhukovsky st chkalov 39m?,10,zhukovsky
29,"moscow shopping center ""new age"" novokosino",29,moscow
53,"ufa tc ""family"" 2",53,ufa
46,sergiev posad 7ya shopping center,46,sergiev
5,"vologda sec ""marmelad""",5,vologda
30,moscow perlovsky shopping center,30,moscow
39,rostov-on-don megacenter horizon,39,rostov-on-don
37,"novosibirsk tc ""mega""",37,novosibirsk
1,"yakutsk tc ""central"" fran",1,yakutsk
57,yakutsk ordzhonikidze 56,57,yakutsk


In [162]:
# A double-quoted run of word characters, e.g. "mega"; the capture group excludes the quotes.
p_shop_identifier = re.compile(r'\"([\w]+)\"')
def shop_identifier(x, pat):
    """Return the unique match of ``pat`` in ``x``, or the literal 'none'.

    Parameters
    ----------
    x : str
        Shop name to scan.
    pat : str or compiled pattern
        Regular expression handed to ``re.findall``.

    Returns
    -------
    The single ``re.findall`` result when ``pat`` matches exactly once in
    ``x``; the string 'none' when there is no match or more than one.
    """
    # Use the pattern that was passed in. The previous version ignored `pat`
    # and always read the module-level p_shop_identifier.
    identifier = re.findall(pat, x)
    if len(identifier) == 1:
        return identifier[0]
    return 'none'

In [164]:
# Extract the quoted one-word shop identifier, or 'none', for every shop.
shops['identifier'] = shops['shop_name'].apply(shop_identifier, args=(p_shop_identifier,))

In [167]:
# How often each extracted identifier occurs.
shops['identifier'].value_counts()

none           28
mega            5
central         3
rio             2
budenovskiy     2
behetle         1
goodwin         1
altair          1
maksimir        1
city            1
crystal         1
areal           1
parkhaus        1
melody          1
atrium          1
marmelad        1
carnival        1
semenovsky      1
fantasy         1
family          1
sennaya         1
june            1
parkhouse       1
pushkinsky      1
sale            1
Name: identifier, dtype: int64

In [168]:
# Shops per extracted location; useful for spotting alternative spellings of one place.
shops['location'].value_counts()

moscow           13
yakutsk           4
voronezh          3
tyumen            3
ufa               2
spb               2
novosibirsk       2
rostov-on-don     2
krasnoyarsk       2
kazan             2
zhukovsky         2
samara            2
nnovgorod         2
kaluga            1
vologda           1
yaroslavl         1
sergiev           1
kursk             1
tomsk             1
chekhov           1
rostovnadonu      1
balashikha        1
volzhsky          1
offsite           1
khimki            1
kolomna           1
adygea            1
mytischi          1
digital           1
emergency         1
omsk              1
surgut            1
Name: location, dtype: int64

In [169]:
# Fold the alternative spelling into the "rostov-on-don" label used by the other rows.
shops['location'] = shops['location'].replace({'rostovnadonu': 'rostov-on-don'})

In [170]:
# Re-check the counts after merging the two Rostov spellings.
shops['location'].value_counts()

moscow           13
yakutsk           4
tyumen            3
rostov-on-don     3
voronezh          3
nnovgorod         2
ufa               2
kazan             2
krasnoyarsk       2
zhukovsky         2
samara            2
novosibirsk       2
spb               2
kaluga            1
chekhov           1
kursk             1
yaroslavl         1
vologda           1
sergiev           1
tomsk             1
digital           1
balashikha        1
mytischi          1
volzhsky          1
offsite           1
omsk              1
khimki            1
emergency         1
kolomna           1
adygea            1
surgut            1
Name: location, dtype: int64

In [171]:
# The identifier column was only explored above; it is left out of the exported table.
shops = shops.drop(columns='identifier')

In [172]:
# Write the cleaned shop table next to the raw inputs, without the index column.
shops.to_csv('./input/shops_data.csv', index=False)