In [1]:
import xml.etree.ElementTree as ET
import requests
import pandas as pd

In [2]:
class XML2DataFrame:
    def __init__(self, xml_data):
        self.root = ET.XML(xml_data)

    def parse_root(self, root):
        return [self.parse_element(child) for child in iter(root)]

    def parse_element(self, element, parsed=None):
        if parsed is None:
            parsed = dict()
        for key in element.keys():
            parsed[key] = element.attrib.get(key)
        if element.text:
            m = {
                'Пол':'sex',
                'Год выпуска':'start_year',
                'Объем':'vol',
                'Тестер':'tester',
                'Семейство':'family',
                'Пробник':'probnik',
                }
            if element.tag == 'param':
                if element.attrib.get('name') == 'Объем':
                    try:
                        ml = ' '+str(element.attrib.get('unit'))
                    except:
                        ml = ''
                    parsed[m[element.attrib.get('name')]] = element.text+ml
                else:
                    parsed[m[element.attrib.get('name')]] = element.text
            else:
                parsed[element.tag] = element.text
        for child in list(element):
            self.parse_element(child, parsed)
        return parsed

    def process_data(self):
        structure_data = self.parse_root(self.root)
        return pd.DataFrame(structure_data)

In [3]:
%%time
xml2df = XML2DataFrame(requests.get('https://pompadoo.ru/catalog/yml/').content)

Wall time: 13.5 s


In [4]:
#получаем поле offers
xml2df.root = xml2df.root[0][5]

In [5]:
xml_df = xml2df.process_data()

In [6]:
xml_df.head(10)

Unnamed: 0,available,categoryId,country_of_origin,cpa,currencyId,delivery,description,family,id,model,...,probnik,sex,start_year,store,tester,unit,url,vendor,vendorCode,vol
0,True,8,Франция,1,RUR,True,,цветочные фруктовые,10148,40,...,,Женский,2015.0,False,,мл,https://pompadoo.ru/product/4725/?sku=10148,Givenchy,4725,40 мл
1,True,8,Франция,1,RUR,True,,цветочные фруктовые,8991,75,...,,Женский,2015.0,False,Да,мл,https://pompadoo.ru/product/4725/?sku=8991,Givenchy,4725,75 мл
2,True,8,Франция,1,RUR,True,,восточные цитрусовые,4992,35,...,,Женский,2012.0,False,,мл,https://pompadoo.ru/product/2801/?sku=4992,Chanel,2801,35 мл
3,True,8,Франция,1,RUR,True,,восточные цитрусовые,8182,50,...,,Женский,2012.0,False,,мл,https://pompadoo.ru/product/2801/?sku=8182,Chanel,2801,50 мл
4,True,8,Япония,1,RUR,True,,,11126,80,...,,Женский,,False,,мл,https://pompadoo.ru/product/5549/?sku=11126,Masaki Matsushima,5549,80 мл
5,True,8,Франция,1,RUR,True,,восточные цветочные,9375,30,...,,Женский,2012.0,False,,мл,https://pompadoo.ru/product/4608/?sku=9375,Yves Saint Laurent,4608,30 мл
6,True,8,Франция,1,RUR,True,,восточные цветочные,11203,50,...,,Женский,2012.0,False,,мл,https://pompadoo.ru/product/4608/?sku=11203,Yves Saint Laurent,4608,50 мл
7,True,8,Франция,1,RUR,True,,восточные цветочные,8650,90,...,,Женский,2012.0,False,Да,мл,https://pompadoo.ru/product/4608/?sku=8650,Yves Saint Laurent,4608,90 мл
8,True,8,Германия,1,RUR,True,,цветочные фруктовые,5648,75,...,,Женский,2010.0,False,Да,мл,https://pompadoo.ru/product/3191/?sku=5648,Hugo Boss,3191,75 мл
9,True,14,Франция,1,RUR,True,,древесные пряные,6166,2,...,Да,Унисекс,2009.0,False,,мл,https://pompadoo.ru/product/3489/?sku=6166,Montale,3489,2 мл


In [7]:
xml_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4760 entries, 0 to 4759
Data columns (total 26 columns):
available            4760 non-null object
categoryId           4760 non-null object
country_of_origin    4478 non-null object
cpa                  4760 non-null object
currencyId           4760 non-null object
delivery             4760 non-null object
description          31 non-null object
family               4118 non-null object
id                   4760 non-null object
model                4760 non-null object
name                 4760 non-null object
offer                4760 non-null object
oldprice             976 non-null object
pickup               4760 non-null object
picture              4760 non-null object
price                4760 non-null object
probnik              71 non-null object
sex                  4760 non-null object
start_year           4143 non-null object
store                4760 non-null object
tester               1159 non-null object
unit            

In [8]:
xml_df.fillna('-', inplace=True)

In [9]:
xml_df.head(10)

Unnamed: 0,available,categoryId,country_of_origin,cpa,currencyId,delivery,description,family,id,model,...,probnik,sex,start_year,store,tester,unit,url,vendor,vendorCode,vol
0,True,8,Франция,1,RUR,True,-,цветочные фруктовые,10148,40,...,-,Женский,2015,False,-,мл,https://pompadoo.ru/product/4725/?sku=10148,Givenchy,4725,40 мл
1,True,8,Франция,1,RUR,True,-,цветочные фруктовые,8991,75,...,-,Женский,2015,False,Да,мл,https://pompadoo.ru/product/4725/?sku=8991,Givenchy,4725,75 мл
2,True,8,Франция,1,RUR,True,-,восточные цитрусовые,4992,35,...,-,Женский,2012,False,-,мл,https://pompadoo.ru/product/2801/?sku=4992,Chanel,2801,35 мл
3,True,8,Франция,1,RUR,True,-,восточные цитрусовые,8182,50,...,-,Женский,2012,False,-,мл,https://pompadoo.ru/product/2801/?sku=8182,Chanel,2801,50 мл
4,True,8,Япония,1,RUR,True,-,-,11126,80,...,-,Женский,-,False,-,мл,https://pompadoo.ru/product/5549/?sku=11126,Masaki Matsushima,5549,80 мл
5,True,8,Франция,1,RUR,True,-,восточные цветочные,9375,30,...,-,Женский,2012,False,-,мл,https://pompadoo.ru/product/4608/?sku=9375,Yves Saint Laurent,4608,30 мл
6,True,8,Франция,1,RUR,True,-,восточные цветочные,11203,50,...,-,Женский,2012,False,-,мл,https://pompadoo.ru/product/4608/?sku=11203,Yves Saint Laurent,4608,50 мл
7,True,8,Франция,1,RUR,True,-,восточные цветочные,8650,90,...,-,Женский,2012,False,Да,мл,https://pompadoo.ru/product/4608/?sku=8650,Yves Saint Laurent,4608,90 мл
8,True,8,Германия,1,RUR,True,-,цветочные фруктовые,5648,75,...,-,Женский,2010,False,Да,мл,https://pompadoo.ru/product/3191/?sku=5648,Hugo Boss,3191,75 мл
9,True,14,Франция,1,RUR,True,-,древесные пряные,6166,2,...,Да,Унисекс,2009,False,-,мл,https://pompadoo.ru/product/3489/?sku=6166,Montale,3489,2 мл


In [10]:
vendors = xml_df.filter(['vendor'])

In [11]:
vendors['lowVendor'] = vendors.apply(lambda vendors: vendors['vendor'].lower(), axis=1)

In [12]:
vendors.head()

Unnamed: 0,vendor,lowVendor
0,Givenchy,givenchy
1,Givenchy,givenchy
2,Chanel,chanel
3,Chanel,chanel
4,Masaki Matsushima,masaki matsushima


In [13]:
vendors.to_csv('VendorsList.csv', index=False, encoding='utf-8')

In [14]:
brands = pd.read_csv('PerfumesList.csv')

In [15]:
brands['lowBrand'] = brands.apply(lambda brands: brands['name'].lower(), axis=1)

In [16]:
brands.head()

Unnamed: 0,letter,name,url,lowBrand
0,A,A Bathing Ape,/designers/A-Bathing-Ape.html,a bathing ape
1,A,A Beautiful Life Brands,/designers/A-Beautiful-Life-Brands.html,a beautiful life brands
2,A,A Dozen Roses,/designers/A-Dozen-Roses.html,a dozen roses
3,A,A La Russe,/designers/A-La-Russe.html,a la russe
4,A,A Lab on Fire,/designers/A-Lab-on-Fire.html,a lab on fire


In [17]:
brands = brands.rename(columns={'name': 'brand'})

In [18]:
brands = brands[['brand', 'url', 'letter', 'lowBrand']]

In [19]:
brands.head()

Unnamed: 0,brand,url,letter,lowBrand
0,A Bathing Ape,/designers/A-Bathing-Ape.html,A,a bathing ape
1,A Beautiful Life Brands,/designers/A-Beautiful-Life-Brands.html,A,a beautiful life brands
2,A Dozen Roses,/designers/A-Dozen-Roses.html,A,a dozen roses
3,A La Russe,/designers/A-La-Russe.html,A,a la russe
4,A Lab on Fire,/designers/A-Lab-on-Fire.html,A,a lab on fire


In [20]:
brands.to_csv('BrandsList.csv', index=False, encoding='utf-8')

In [21]:
df = vendors.copy()

In [22]:
df['brand'] = brands['brand']
df['lowBrand'] = brands['lowBrand']
df['url'] = brands['url']

In [23]:
df.head()

Unnamed: 0,vendor,lowVendor,brand,lowBrand,url
0,Givenchy,givenchy,A Bathing Ape,a bathing ape,/designers/A-Bathing-Ape.html
1,Givenchy,givenchy,A Beautiful Life Brands,a beautiful life brands,/designers/A-Beautiful-Life-Brands.html
2,Chanel,chanel,A Dozen Roses,a dozen roses,/designers/A-Dozen-Roses.html
3,Chanel,chanel,A La Russe,a la russe,/designers/A-La-Russe.html
4,Masaki Matsushima,masaki matsushima,A Lab on Fire,a lab on fire,/designers/A-Lab-on-Fire.html


In [24]:
unmatched = [set(df['lowVendor']) - set(df['lowBrand'])]

In [42]:
unmatched


[{'abercrombie & fitch',
  'acqua di parma',
  'adidas',
  'agent provocateur',
  'ajmal',
  'al hamatt',
  'al haramain',
  'alaia',
  'alexandre j.',
  'alfred dunhill',
  'alla pugacheva',
  'amorino prive',
  'amouage',
  'amouroud',
  'angel schlesser',
  'anna sui',
  'antonio banderas',
  'aquolina',
  'aramis',
  'armand basi',
  'armani',
  'asgharali',
  'atelier cologne',
  'atkinsons',
  'azzaro',
  'baldessarini',
  'baldinini',
  'balenciaga cristobal',
  'balmain',
  'bamotte',
  'banana republic',
  'bebe',
  'belletete',
  'bentley',
  'beyonce',
  'bill blass',
  'blumarine',
  'boadicea the victorious',
  'bottega veneta',
  'boucheron',
  'britney spears',
  'bruno banani',
  'burberry',
  'bvlgari',
  'byc moze',
  'byredo',
  'cacharel',
  'cafe-cafe',
  'caldion',
  'calvin klein',
  'canali',
  'carner barcelona',
  'carolina herrera',
  'cartier',
  'cerruti',
  'chambor',
  'chanel',
  'chevignon',
  'chloe',
  'chopard',
  'christian audigier',
  'christian l

In [52]:
len(s)

1

In [59]:
result_list = []
for x in unmatched:
    for y in list(x):
        result_list.append(y)
print(result_list)

['geoffrey beene', 'jessica mcclintock', 'john galliano', 'asgharali', 'escada', 'ulric de varens', 'david yurman', 'swarovski', 'orchid perfumes', 'myrurgia', 'jean patou', 'emilio pucci', 'bvlgari', 'courvoisier', 'bill blass', 'guerlain', 'donna karan dkny', 'marc jacobs', 'balenciaga cristobal', 'house of sillage', 'caldion', 'karl lagerfeld', 'simone cosac profumi', 'aramis', 'michael kors', 'roberto cavalli', 'burberry', 'bebe', 'atkinsons', 'custo barcelona', 'versace', 'fly falcone', 'tom ford', 'ferragamo salvatore', 'valentino', 'serge lutens', 'esteban', 'escentric molecules', 'amorino prive', 'kilian', 'bentley', 'christian lacroix', 'krizia', 'mexx', 'christina aguilera', 'gian marco venturi', 'ferre', 'chloe', 'carolina herrera', 'jo malone', 'umbro', 'louis feraud', 'jesus del pozo', 'cristiano ronaldo', 'thierry mugler', 'elie saab le parfum', 'yves saint laurent', 'elizabeth arden', 'jean paul gaultier', 'gloria vanderbilt', 'mandarina duck', 'estee lauder', 'giulietta

In [60]:
len(result_list)

234

In [61]:
v = {}
for count, el in enumerate(result_list):
    print(el)
    s = str(input()).lower()
    v[count] = s

geoffrey beene
geoffrey beene
jessica mcclintock
Jessica McClintock
john galliano
John Galliano
asgharali
Asgharali
escada
Escada
ulric de varens
Ulric de Varens
david yurman
David Yurman
swarovski
Swarowski
orchid perfumes
-
myrurgia
Myrurgia
jean patou
Jean PAtou
emilio pucci
Emilio Pucci
bvlgari
Bvlgari
courvoisier
Courvoisier Cognac
bill blass
bill blass
guerlain
guerlain
donna karan dkny
donna karan
marc jacobs
marc jacobs
balenciaga cristobal
balenciaga
house of sillage
house of sillage
caldion
-
karl lagerfeld
karl lagerfeld
simone cosac profumi
simone cosac profumi
aramis
aramis
michael kors
michael kors
roberto cavalli
roberto cavalli
burberry
burberry
bebe
bebe
atkinsons
atkinsons
custo barcelona
custo barcelona
versace
versace
fly falcone
-
tom ford
tom ford
ferragamo salvatore
Salvatore Ferragamo
valentino
valentino
serge lutens
serge lutens
esteban
esteban.
escentric molecules
escentric molecules
amorino prive
-
kilian
by kilian
bentley
bentley
christian lacroix
christian 

In [62]:
v

{0: 'geoffrey beene',
 1: 'jessica mcclintock',
 2: 'john galliano',
 3: 'asgharali',
 4: 'escada',
 5: 'ulric de varens',
 6: 'david yurman',
 7: 'swarowski',
 8: '-',
 9: 'myrurgia',
 10: 'jean patou',
 11: 'emilio pucci',
 12: 'bvlgari',
 13: 'courvoisier cognac',
 14: 'bill blass',
 15: 'guerlain',
 16: 'donna karan',
 17: 'marc jacobs',
 18: 'balenciaga',
 19: 'house of sillage',
 20: '-',
 21: 'karl lagerfeld',
 22: 'simone cosac profumi',
 23: 'aramis',
 24: 'michael kors',
 25: 'roberto cavalli',
 26: 'burberry',
 27: 'bebe',
 28: 'atkinsons',
 29: 'custo barcelona',
 30: 'versace',
 31: '-',
 32: 'tom ford',
 33: 'salvatore ferragamo',
 34: 'valentino',
 35: 'serge lutens',
 36: 'esteban.',
 37: 'escentric molecules',
 38: '-',
 39: 'by kilian',
 40: 'bentley',
 41: 'christian lacroix',
 42: 'krizia',
 43: 'mexx',
 44: 'christina aguilera',
 45: 'gianmarco venturi',
 46: 'gianfranco ferre',
 47: 'chloe',
 48: 'carolina herrera',
 49: 'jo malone london',
 50: 'umbro',
 51: 'lou

In [63]:
v[36] = 'esteban'

In [64]:
v

{0: 'geoffrey beene',
 1: 'jessica mcclintock',
 2: 'john galliano',
 3: 'asgharali',
 4: 'escada',
 5: 'ulric de varens',
 6: 'david yurman',
 7: 'swarowski',
 8: '-',
 9: 'myrurgia',
 10: 'jean patou',
 11: 'emilio pucci',
 12: 'bvlgari',
 13: 'courvoisier cognac',
 14: 'bill blass',
 15: 'guerlain',
 16: 'donna karan',
 17: 'marc jacobs',
 18: 'balenciaga',
 19: 'house of sillage',
 20: '-',
 21: 'karl lagerfeld',
 22: 'simone cosac profumi',
 23: 'aramis',
 24: 'michael kors',
 25: 'roberto cavalli',
 26: 'burberry',
 27: 'bebe',
 28: 'atkinsons',
 29: 'custo barcelona',
 30: 'versace',
 31: '-',
 32: 'tom ford',
 33: 'salvatore ferragamo',
 34: 'valentino',
 35: 'serge lutens',
 36: 'esteban',
 37: 'escentric molecules',
 38: '-',
 39: 'by kilian',
 40: 'bentley',
 41: 'christian lacroix',
 42: 'krizia',
 43: 'mexx',
 44: 'christina aguilera',
 45: 'gianmarco venturi',
 46: 'gianfranco ferre',
 47: 'chloe',
 48: 'carolina herrera',
 49: 'jo malone london',
 50: 'umbro',
 51: 'loui

In [65]:
v1 = v.copy() # несмэтченные

In [84]:
matched = list(set(df['lowVendor'].unique().tolist()) and set(df['lowBrand'].unique().tolist()))

In [85]:
matched

[nan,
 ' peter jacobs parfum  ',
 ' jean d`albret  ',
 ' jivago  ',
 ' juliana paes  ',
 ' mayssa  ',
 ' salvatore ferragamo  ',
 ' jill stuart  ',
 ' beyonce  ',
 ' héloïse de v.  ',
 ' jean reno  ',
 ' la sucrerie de la montagne  ',
 ' tom rebl  ',
 ' lexington  ',
 ' centurion parfums  ',
 ' grassroots  ',
 ' be. colonias  ',
 ' des filles a la vanille  ',
 ' frank govers  ',
 ' bath house  ',
 ' martine marie laurent  ',
 ' mimmina  ',
 ' musc et madame  ',
 ' theo parfums  ',
 ' maori collection  ',
 ' morgane le fay  ',
 ' angel schlesser  ',
 ' emporio body store  ',
 ' chiara boni  ',
 ' lulah  ',
 ' titanium man  ',
 ' olivier durbano  ',
 ' jane booke  ',
 ' cvs essence of beauty  ',
 ' olivina men  ',
 ' stella mccartney  ',
 ' anastacia  ',
 ' dynasty  ',
 ' stilllife  ',
 ' electimuss  ',
 ' massoïa secrets  ',
 ' poiray  ',
 ' arabiyat  ',
 ' cristina ferreira  ',
 ' luce fragrance  ',
 ' arielle shoshana  ',
 ' arrogance  ',
 ' ricki hall and captain fawcett  ',
 ' bobbi

In [89]:
len(matched)

3189

In [93]:
v2 = matched + list(v1) # смэтченные и несмэтченные

In [94]:
len(v2)

3423