Importamos las librerías necesarias

In [2]:
import requests
import re
from bs4 import BeautifulSoup
import pandas as pd
headers = {"user-agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36"}

## Creamos las clases:
- Scapper -> Clase abstracta para scrapping

In [3]:
from abc import ABC,abstractmethod
class Scrapper(ABC):
     def __init__(self,url):
          self.url = url

     def fetch_content(self,url=None):
          if not url:
               url =self.url
          try:
               response = requests.get(url,headers)
               if (response.status_code==200):
                    html_text = response.text
                    self.soup = BeautifulSoup(html_text,'html5lib') #library that makes it easy to scrape information from web pages
               else:
                    self.soup = None
               return self.soup,response.status_code
          except requests.RequestException as e:
               return f"Error haciendo el fetch de {url}: {e}",404
          
     

     def execute_scrapping(self,tag_container,class_container,tag_name) -> list[any]:
          pass

### clase prenda
Descripre lo necesario en que consiste una entidad prenda

In [4]:
class Prenda:
     def __init__(self,tienda:str,category:str,nombre:str,precio,combo:bool=False):
          self.tienda = tienda
          self.category = category
          self.nombre = nombre
          self.precio = precio
          self.combo = combo

     def __eq__(self,other):
          if not isinstance(other,type(self)):
               return NotImplemented
          return (self.nombre==other.nombre and self.precio==other.precio)        
     def __repr__(self) -> str:
          return f"Show(nombre={self.nombre},precio={self.precio})"
     
     def __hash__(self):
          return hash((self.nombre,self.precio))

### Clase TiendaNubeScrapping
- clase que hereda de Scrapper y define un scrapping de tienda nube

In [5]:
class TiendaNubeScrapping(Scrapper):
    def __init__(self, url :str ,store_name: str,tag_sublinks,tag_container,a_class,subtag_name):
        super().__init__(url)
        self.store_name = store_name
        self.tag_sublinks = tag_sublinks
        self.tag_container=tag_container
        self.a_class = a_class
        self.subtag_name = subtag_name

    def create_df(self,data_list):
          return pd.DataFrame([prenda.__dict__ for prenda in data_list])

    def _plural_to_singular(self,word):
    # Caso para palabras terminadas en 'ies' (ej. parties -> party)
        if re.search(r'IES$', word):
            return re.sub(r'IES$', 'y', word)
    # Caso para palabras terminadas en 'es' (excluyendo las terminadas en 'ies', como dishes -> dish)
        elif re.search(r'([^i])ES$', word):
            return re.sub(r'ES$', '', word)
    # Caso general para palabras terminadas en 's' (ej. cats -> cat)
        elif re.search(r'S$', word):
            return re.sub(r'S$', '', word)
    # Retorna la palabra si no cumple con los anteriores casos
        return word

    def _get_category(self,nombre_prenda : str):
          combo = False
          characters_to_replace = ['SET','PACK']
          nombre = ""
          words = nombre_prenda.split() 
          for index,word in enumerate(words):
               if (word not in characters_to_replace):
                    nombre = word
                    if(index!=0):
                        combo = True

                    break
          return self._plural_to_singular(nombre),combo
         

    def execute_scrapping(self) -> list[Prenda]:
        print(f"Obteniendo los datos de: {self.store_name}...")
        prendas = set()     
        soup_maura_ = self.soup.find_all(self.tag_sublinks['tag'],class_=self.tag_sublinks['class'])
        soup_elements = soup_maura_[0].find_all('a',class_=self.a_class, href=True)
        for element in soup_elements:
          
                if element and element.has_attr('href') and element['href'] != '#' and len(element['href'])>15:
                        s2,status = self.fetch_content(element['href'])
                        if (status ==200):
                            print(f"url consultada {element['href']}")
                            soup = s2.find_all(self.tag_container['tag'],class_=self.tag_container['class'])
                            
                            for s in soup:
                                
                                try:
                                    nombre = s.find_all(self.subtag_name,class_=re.compile('item-name'))[0].text.strip()
                                    precio = s.find_all('span',class_=re.compile('js-price-display item-price'))[0].text.strip()
                                    category,combo = self._get_category(nombre.upper())
                                    prenda = Prenda(tienda=self.store_name.upper(),
                                                    category=category,
                                                    nombre=nombre,
                                                    precio=precio,
                                                    combo=combo)
                                    print(prenda)
                                    
                                    prendas.add(prenda)
                                except IndexError as e:    
                                    print(f"Error: {e}. No se pudo obtener los datos de {element['href']} por error de indice")
                                        
                                
                        else:
                             print("No entró")        
        return list(prendas)

# Create the instances and generate the complete dataframe of Tienda Nube
This will include implement the creation of the instances and then putting all together to make the scrapping

In [6]:
def creacion_df_tienda(tienda):
        df_tienda = pd.DataFrame()
        _, status = tienda.fetch_content()
        if(status==200):
                list_tienda =tienda.execute_scrapping()
                df_tienda = tienda.create_df(list_tienda)
        else:
                print("No se pudo cargar la pagina")
        return df_tienda


In [7]:
def anexar_df_complete(tiendas,df_complete):
    for tienda in tiendas:
        df = creacion_df_tienda(tienda)
        df_complete = pd.concat([df_complete,df])
    return df_complete

## Instanciando tiendas

In [8]:
pda = TiendaNubeScrapping(
    url="https://www.pda.com.ar",
    store_name="PDA",
    tag_sublinks ={"tag": "div","class": 'nav-primary'},
    tag_container={"tag": "div","class": 'item-description py-4 px-3'},
    a_class='nav-list-link',
    subtag_name='div')
kazuma = TiendaNubeScrapping(
    url="https://kazuma.com.ar",
    store_name="Kazuma",
    tag_sublinks ={"tag": "div","class": 'nav-primary'},
    tag_container={"tag": "div","class": 'item-description'},
    a_class='nav-list-link',
    subtag_name='div')

In [9]:
torm_facha = TiendaNubeScrapping(url="https://tormentadefacha.mitiendanube.com",
    store_name="Tormenta de facha",
    tag_sublinks={"tag": "div","class": 'desktop-nav-container'},
    tag_container={"tag": "div","class": 'span3 item-container m-bottom-half'},
    a_class='desktop-nav-link',
    subtag_name='a')

In [10]:
lemouton = TiendaNubeScrapping(url="https://lemoutonbebeshop.com.ar",
    store_name="LEMOUTON_BEBE_SHOP",
    tag_sublinks={"tag": "ul","class": "js-desktop-nav desktop-nav hidden-phone font-small"},
    tag_container={"tag": "div","class": 'span3 item-container m-bottom-half'},
    a_class='desktop-nav-link',
    subtag_name='a')

In [11]:
maura = TiendaNubeScrapping(url="https://maura.mitiendanube.com",
    store_name="Maura",
    tag_sublinks={"tag": "div","class": 'row-fluid hidden-phone'},
    tag_container={"tag": "div","class": 'item-info-container m-top-half m-bottom-half'},
    a_class='desktop-nav-link',
    subtag_name='h6')

In [12]:
bodacious = TiendaNubeScrapping(url="https://www.bodaciousclothing.com",
    store_name="Bodacious",
    tag_sublinks={"tag": "div","class": 'modal-with-fixed-footer'},
    tag_container={"tag": "a","class": 'item-link'},
    subtag_name='div',
    a_class='nav-list-link')

In [13]:
discobolojeans = TiendaNubeScrapping(url="https://www.discobolojeans.com",
    store_name="Discobolo",
    tag_sublinks={"tag": "div","class": 'nav-primary'},
    a_class='nav-list-link',
    tag_container={"tag": "div","class": 'item-description'},
    subtag_name='div'
    )

In [44]:
guillerminaregalado = TiendaNubeScrapping(url="https://www.guillerminaregalado.com.ar",
    store_name="guillerminaregalado",
    tag_sublinks={"tag": "div","class": 'nav-primary'},
    a_class='nav-list-link',
    tag_container={"tag": "div","class": 'item-description'},
    subtag_name='div'
    )

rudshoes = TiendaNubeScrapping(url="https://www.rudshoes.com",
    store_name="rudshoes",
    tag_sublinks={"tag": "div","class": 'nav-primary'},
    a_class='nav-list-link',
    tag_container={"tag": "div","class": 'item-description'},
    subtag_name='div'
    )


zannavara = TiendaNubeScrapping(url="https://zannavara.mitiendanube.com",
    store_name="zannavara",
    tag_sublinks={"tag": "div","class": 'nav-primary'},
    a_class='nav-list-link',
    tag_container={"tag": "div","class": 'item-description py-4 px-1'},
    subtag_name='div'
    )

maeintimates = TiendaNubeScrapping(url="https://www.maeintimates.com",
    store_name="maeintimates",
    tag_sublinks={"tag": "div","class": 'nav-primary'},
    a_class='nav-list-link',
    tag_container={"tag": "div","class": 'item-description'},
    subtag_name='div'
    )
ankaras = TiendaNubeScrapping(url="https://ankarabsas.com.ar",
    store_name="ankaras",
    tag_sublinks={"tag": "div","class": 'nav-primary'},
    a_class='nav-list-link',
    tag_container={"tag": "div","class": 'item-description'},
    subtag_name='div'
    )

alabama = TiendaNubeScrapping(url="https://alabamatienda.com.ar",
    store_name="alabama",
    tag_sublinks={"tag": "div","class": 'nav-primary'},
    a_class='nav-list-link',
    tag_container={"tag": "div","class": 'item-description'},
    subtag_name='div'
    )

lepou = TiendaNubeScrapping(url="https://www.lepouaccesorios.com.ar",
    store_name="lepou accesorios",
    tag_sublinks={"tag": "ul","class": 'desktop-nav-list'},
    a_class='desktop-nav-link',
    tag_container={"tag": "div","class": 'item-info-container'},
    subtag_name='a'
    )

rome = TiendaNubeScrapping(url="https://ankarabsas.com.ar",
    store_name="rome indumentaria",
    tag_sublinks={"tag": "div","class": 'nav-primary'},
    a_class='nav-list-link',
    tag_container={"tag": "div","class": 'item-description'},
    subtag_name='div'
    )

conmpania_sombreros = TiendaNubeScrapping(url="https://www.companiadesombreros.com.ar",
    store_name="compania de sombreros",
    tag_sublinks={"tag": "ul","class": 'desktop-nav-list'},
    a_class='desktop-nav-link',
    tag_container={"tag": "div","class": 'item-info-container'},
    subtag_name='a'
    )

tag_sublinks={"tag": "ul","class": 'desktop-nav-list'}
a_class='desktop-nav-link'
tag_container={"tag": "div","class": 'item-info-container'}
subtag_name='a'

In [15]:
tiendas = [guillerminaregalado,rudshoes,discobolojeans,pda,kazuma,
           torm_facha,lemouton,maura,maeintimates,zannavara,lepou,rome, 
           ankaras,alabama,conmpania_sombreros]
df_tiendas_nube = pd.DataFrame()
df_tiendas_nube= anexar_df_complete(tiendas,df_tiendas_nube) 

Obteniendo los datos de: guillerminaregalado...
url consultada https://guillerminaregalado.com.ar/productos/
Show(nombre=SWEATER GRIS,precio=$64.990)
Show(nombre=PALAZO DENIM NEVADO GRIS,precio=$45.990)
Show(nombre=LEGGINGS OXFORD DENIM GRIS,precio=$45.990)
Show(nombre=PREVENTA POLERA DENIM GRIS,precio=$45.990)
Show(nombre=VESTIDO STRAPLESS DENIM GRIS,precio=$47.990)
Show(nombre=PROXIMAMENTE MAXI CAMISA DENIM GRIS,precio=$49.990)
Show(nombre=MAXI CAMISA FILO LINO,precio=$56.990)
Show(nombre=VESTIDO MANGA LARGA DENIM NEVADO,precio=$49.990)
Show(nombre=PROXIMAMENTE CHALECO PUFFER DENIM VISUA,precio=$119.990)
Show(nombre=LEGGINGS OXFORD DENIM VISUA,precio=$45.990)
Show(nombre=PROXIMAMENTE MAXI CAMISA DENIM VISÚA,precio=$49.990)
Show(nombre=PALAZO DENIM VISUA,precio=$45.990)


KeyboardInterrupt: 

In [42]:
''''''
tiendas = []
df_tiendas_nube= anexar_df_complete(tiendas,df_tiendas_nube)

Obteniendo los datos de: lepou accesorios...
url consultada https://www.lepouaccesorios.com.ar/collares/
Show(nombre=Collar Relicario Vintage Botanic,precio=$14.000)
Show(nombre=Collar Secret Heart,precio=$12.710)
Show(nombre=Collar Diffuser,precio=$11.967)
Show(nombre=Collar Petit Fairy,precio=$9.990)
Show(nombre=Collar Fairy Silver,precio=$7.500)
Show(nombre=Collar Positive Intentions,precio=$12.000)
Show(nombre=Collar Hills,precio=$23.837)
Show(nombre=Collar Relicario Romeo & Juliet,precio=$13.500)
Show(nombre=Collar Jaipur,precio=$5.000)
Show(nombre=Collar Aura,precio=$15.125)
Show(nombre=Collar Batik,precio=$23.100)
Show(nombre=Collar Nuit,precio=$23.100)
Show(nombre=Collar Relicario Wallflowers,precio=$15.000)
Show(nombre=Collar Perfumero Muse,precio=$44.770)
Show(nombre=Collar Square Perfumer -fluorita-,precio=$65.582)
Show(nombre=Collar Clementine,precio=$8.855)
Show(nombre=Collar Lotus,precio=$11.110)
Show(nombre=Collar Fairy - de outlet -,precio=$6.100)
Show(nombre=Collar Squ

In [43]:

df_tiendas_nube

Unnamed: 0,tienda,category,nombre,precio,combo
0,LEPOU ACCESORIOS,COLLAR,Collar Batik,$23.100,False
1,LEPOU ACCESORIOS,OPCIÓN,Opción empaque especial,$1.210,False
2,LEPOU ACCESORIOS,ARO,Aros Raw,$25.630,False
3,LEPOU ACCESORIOS,COLLAR,Collar Orbit Perfumer cobre - amatista -,$35.514,False
4,LEPOU ACCESORIOS,ARO,Aros Stone,$10.230,False
...,...,...,...,...,...
97,CONMPANIA DE SOMBREROS,BOINA,Boina Peaky Blinders Gabardina,$24.000,False
98,CONMPANIA DE SOMBREROS,BUFANDA,Bufanda Plisada,$30.000,False
99,CONMPANIA DE SOMBREROS,PILUSO,Piluso Paño Clash reversible,$18.900,False
100,CONMPANIA DE SOMBREROS,PILUSO,Piluso Fisher Vintage,$16.500,False


# Transforming the data

### Analysing categories

In [None]:
df_tiendas_nube['category'].unique().size

178

Removing wrong or non representative categories

In [None]:
category_group = df_tiendas_nube.groupby('category').count()
cat = category_group[category_group['tienda']==1].sort_values('tienda',ascending=True)
category_group[category_group['tienda']<5]

Unnamed: 0_level_0,tienda,nombre,precio,combo
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
3/4,2,2,2,2
ABOTINADO,1,1,1,1
AJUAR,3,3,3,3
ALMOHADÓN,2,2,2,2
AMANDA,1,1,1,1
...,...,...,...,...
VASO,2,2,2,2
VINCHA,1,1,1,1
X,1,1,1,1
X2,3,3,3,3


In [None]:
category_to_filter = category_group[category_group['tienda']>5]
df_tiendas_nube_filter = df_tiendas_nube[~df_tiendas_nube['category'].isin(category_to_filter)]
#df_tiendas_nube_filter['precio'].groupby(df_tiendas_nube_filter['category']).mean()

df_tiendas_nube_filter['precio'] = df_tiendas_nube_filter['precio'].str.replace('$','').str.split(',').str[0].astype(float).round(2) 
df_tiendas_nube_filter
df_tiendas_nube_filter['category'].unique().size
df_tiendas_nube_filter.sort_values('tienda',ascending=False)

Unnamed: 0,tienda,category,nombre,precio,combo
38,ZANNAVARA,PANTALON,Pantalon Cuvie,9.90,False
19,ZANNAVARA,PANTALON,Pantalon Koral,12.00,False
17,ZANNAVARA,BLUSA,Blusa Bella,9.60,False
16,ZANNAVARA,REMERA,Remera Sanir Combinado Con Bolsillo,1.75,False
15,ZANNAVARA,BENGALINA,Bengalina con Lazo,6.90,False
...,...,...,...,...,...
40,DISCOBOLO,REMERA,REMERA M/C SLIM 'TIRA TEJIDA',15.40,False
39,DISCOBOLO,REMERA,REMERA M/C REGULAR 'NATURE',15.90,False
38,DISCOBOLO,CAMISOLA,CAMISOLA ESTAMPADA 'PLANTS',24.00,False
37,DISCOBOLO,CINTURON,CINTURON SERRUCHO BORDE,23.10,False


### Getting the mean of the price grouping by category

In [None]:
grouping_by_category = df_tiendas_nube_filter['precio'].groupby([df_tiendas_nube_filter['category'],df_tiendas_nube_filter['tienda']])

mean_prices =grouping_by_category.mean().reset_index()
mean_prices
remeras_mean = mean_prices[mean_prices['category']=="PANTALON"]
remeras_mean

Unnamed: 0,category,tienda,precio
170,PANTALON,DISCOBOLO,43.0
171,PANTALON,KAZUMA,43.7
172,PANTALON,LEMOUTON_BEBE_SHOP,8.625
173,PANTALON,MAEINTIMATES,22.41
174,PANTALON,PDA,17.99
175,PANTALON,ZANNAVARA,15.354545


### Tests para nuevas paginas

In [39]:
'''
alabama = TiendaNubeScrapping(url="https://alabamatienda.com.ar",
    store_name="alabama",
    tag_sublinks={"tag": "ul","class": 'desktop-nav-list'},
    a_class='desktop-nav-link',
    tag_container={"tag": "div","class": 'item-info-container'},
    subtag_name='a'
    )
'''
testing_url = "https://alabamatienda.com.ar"
store_name="testing"
tag_sublinks={"tag": "ul","class": 'desktop-nav-list'}
a_class='desktop-nav-link'
tag_container={"tag": "div","class": 'item-info-container'}
subtag_name='a'
#testing_url= "https://tormentadefacha.mitiendanube.com"
testing_instance = TiendaNubeScrapping(url=testing_url,
    store_name = store_name,
    tag_sublinks=tag_sublinks,
    tag_container=tag_container,
    subtag_name=subtag_name,
    a_class=a_class)
soup,_ = testing_instance.fetch_content()
soup

<!DOCTYPE html>
<html lang="es" xmlns="http://www.w3.org/1999/xhtml" xmlns:fb="http://www.facebook.com/2008/fbml" xmlns:og="http://opengraphprotocol.org/schema/"><head>
<link href="https://acdn.mitiendanube.com" rel="preconnect"/>
<link href="https://acdn.mitiendanube.com" rel="dns-prefetch"/>
<meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
<meta content="IE=edge,chrome=1" http-equiv="X-UA-Compatible"/>
<meta content="width=device-width, initial-scale=1" name="viewport"/>
<title>Alabama</title>
<meta content="Tienda de ropa plus size" name="description"/>
<link as="style" href="//acdn.mitiendanube.com/stores/001/577/745/themes/idea/style-critical-e49385a65e4d378aa38d6f7a44f7486b.css" rel="preload"/>
<link as="style" href="//acdn.mitiendanube.com/stores/001/577/745/themes/idea/style-colors-0cbd494193ec0a960cde053b7228c262.css" rel="preload"/>
<link as="image" href="//acdn.mitiendanube.com/stores/001/577/745/themes/idea/1-slide-1668169486279-2572957541-0a79079021b520

In [40]:
soup_maura_ = soup.find_all(tag_sublinks['tag'],class_=tag_sublinks['class'])
soup_elements = soup_maura_[0].find_all('a',class_=a_class, href=True)
soup_elements


IndexError: list index out of range

In [35]:
for element in [soup_elements[3]]:

    if element and element.has_attr('href') and element['href'] != '#' and len(element['href'])>15:
            s2,status = testing_instance.fetch_content(element['href'])
            if (status ==200):
                print(f"url consultada {element['href']}")
                #soup = s2.find_all(tag_container['tag'],class_=tag_container['class'])
                soup = s2.find_all(tag_container['tag'],class_=tag_container['class'])[0]
                nombre = s2.find_all(subtag_name,class_=re.compile('item-name'))[0].text.strip()
                precio = s2.find_all('span',class_=re.compile('js-price-display item-price'))[0].text.strip()
                print(nombre)
                print(precio)
                

url consultada https://www.companiadesombreros.com.ar/invierno/pilusos/
Piluso Fisher
$18.500
