Importamos las librerías necesarias

In [315]:
import requests
import re
from bs4 import BeautifulSoup
import pandas as pd
headers = {"user-agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36"}

## Creamos las clases:
- Scapper -> Clase abstracta para scrapping

In [316]:
from abc import ABC,abstractmethod
class Scrapper(ABC):
     def __init__(self,url):
          self.url = url

     def fetch_content(self,url=None):
          if not url:
               url =self.url
          try:
               response = requests.get(url,headers)
               if (response.status_code==200):
                    html_text = response.text
                    self.soup = BeautifulSoup(html_text,'html5lib') #library that makes it easy to scrape information from web pages
               else:
                    self.soup = None
               return self.soup,response.status_code
          except requests.RequestException as e:
               return f"Error haciendo el fetch de {url}: {e}",404
          
     

     def execute_scrapping(self,tag_container,class_container,tag_name) -> list[any]:
          pass

### clase prenda
Descripre lo necesario en que consiste una entidad prenda

In [317]:
class Prenda:
     def __init__(self,tienda:str,category:str,nombre:str,precio,combo:bool=False):
          self.tienda = tienda
          self.category = category
          self.nombre = nombre
          self.precio = precio
          self.combo = combo

     def __eq__(self,other):
          if not isinstance(other,type(self)):
               return NotImplemented
          return (self.nombre==other.nombre and self.precio==other.precio)        
     def __repr__(self) -> str:
          return f"Show(nombre={self.nombre},precio={self.precio})"
     
     def __hash__(self):
          return hash((self.nombre,self.precio))

### Clase TiendaNubeScrapping
- clase que hereda de Scrapper y define un scrapping de tienda nube

In [318]:
class TiendaNubeScrapping(Scrapper):
    def __init__(self, url :str ,store_name: str,tag_sublinks,tag_container,a_class,subtag_name):
        super().__init__(url)
        self.store_name = store_name
        self.tag_sublinks = tag_sublinks
        self.tag_container=tag_container
        self.a_class = a_class
        self.subtag_name = subtag_name

    def create_df(self,data_list):
          return pd.DataFrame([prenda.__dict__ for prenda in data_list])

    def _plural_to_singular(self,word):
    # Caso para palabras terminadas en 'ies' (ej. parties -> party)
        if re.search(r'IES$', word):
            return re.sub(r'IES$', 'y', word)
    # Caso para palabras terminadas en 'es' (excluyendo las terminadas en 'ies', como dishes -> dish)
        elif re.search(r'([^i])ES$', word):
            return re.sub(r'ES$', '', word)
    # Caso general para palabras terminadas en 's' (ej. cats -> cat)
        elif re.search(r'S$', word):
            return re.sub(r'S$', '', word)
    # Retorna la palabra si no cumple con los anteriores casos
        return word

    def _get_category(self,nombre_prenda : str):
          combo = False
          characters_to_replace = ['SET','PACK']
          nombre = ""
          words = nombre_prenda.split() 
          for index,word in enumerate(words):
               if (word not in characters_to_replace):
                    nombre = word
                    if(index!=0):
                        combo = True

                    break
          return self._plural_to_singular(nombre),combo
         

    def execute_scrapping(self) -> list[Prenda]:
        print(f"Obteniendo los datos de: {self.store_name}...")
        prendas = set()     
        soup_maura_ = self.soup.find_all(self.tag_sublinks['tag'],class_=self.tag_sublinks['class'])
        soup_elements = soup_maura_[0].find_all('a',class_=self.a_class, href=True)
        for element in soup_elements:
          
                if element and element.has_attr('href') and element['href'] != '#' and len(element['href'])>15:
                        s2,status = self.fetch_content(element['href'])
                        if (status ==200):
                            print(f"url consultada {element['href']}")
                            soup = s2.find_all(self.tag_container['tag'],class_=self.tag_container['class'])
                            
                            for s in soup:
                                
                                try:
                                    nombre = s.find_all(self.subtag_name,class_=re.compile('item-name'))[0].text.strip()
                                    precio = s.find_all('span',class_=re.compile('js-price-display item-price'))[0].text.strip()
                                    category,combo = self._get_category(nombre.upper())
                                    prenda = Prenda(tienda=self.store_name.upper(),
                                                    category=category,
                                                    nombre=nombre,
                                                    precio=precio,
                                                    combo=combo)
                                    
                                    prendas.add(prenda)
                                except IndexError as e:    
                                    print(f"Error: {e}. No se pudo obtener los datos de {element['href']} por error de indice")
                                        
                                
                        else:
                             print("No entró")        
        return list(prendas)

# Create the instances and generate the complete dataframe of Tienda Nube
This will include implement the creation of the instances and then putting all together to make the scrapping

In [302]:
def creacion_df_tienda(tienda):
        df_tienda = pd.DataFrame()
        _, status = tienda.fetch_content()
        if(status==200):
                list_tienda =tienda.execute_scrapping()
                df_tienda = tienda.create_df(list_tienda)
        else:
                print("No se pudo cargar la pagina")
        return df_tienda


In [303]:
def anexar_df_complete(tiendas,df_complete):
    for tienda in tiendas:
        df = creacion_df_tienda(tienda)
        df_complete = pd.concat([df_complete,df])
    return df_complete

Initialize tiendas_nube

In [304]:
df_tiendas_nube = pd.DataFrame()

In [323]:
pda = TiendaNubeScrapping(
    url="https://www.pda.com.ar",
    store_name="PDA",
    tag_sublinks ={"tag": "div","class": 'nav-primary'},
    tag_container={"tag": "div","class": 'item-description py-4 px-3'},
    a_class='nav-list-link',
    subtag_name='div')
kazuma = TiendaNubeScrapping(
    url="https://kazuma.com.ar",
    store_name="Kazuma",
    tag_sublinks ={"tag": "div","class": 'nav-primary'},
    tag_container={"tag": "div","class": 'item-description'},
    a_class='nav-list-link',
    subtag_name='div')

In [282]:
torm_facha = TiendaNubeScrapping(url="https://tormentadefacha.mitiendanube.com",
    store_name="Tormenta de facha",
    tag_sublinks={"tag": "div","class": 'desktop-nav-container'},
    tag_container={"tag": "div","class": 'span3 item-container m-bottom-half'},
    a_class='desktop-nav-link',
    subtag_name='a')

In [324]:
lemouton = TiendaNubeScrapping(url="https://lemoutonbebeshop.com.ar",
    store_name="LEMOUTON_BEBE_SHOP",
    tag_sublinks={"tag": "ul","class": "js-desktop-nav desktop-nav hidden-phone font-small"},
    tag_container={"tag": "div","class": 'span3 item-container m-bottom-half'},
    a_class='desktop-nav-link',
    subtag_name='a')

In [325]:
maura = TiendaNubeScrapping(url="https://maura.mitiendanube.com",
    store_name="Maura",
    tag_sublinks={"tag": "div","class": 'row-fluid hidden-phone'},
    tag_container={"tag": "div","class": 'item-info-container m-top-half m-bottom-half'},
    a_class='desktop-nav-link',
    subtag_name='h6')

Create tienda nube complete dataframe

In [None]:
df_tiendas_nube = pd.DataFrame()
tiendas = [pda,kazuma,torm_facha,lemouton,maura]
df_tiendas_nube= anexar_df_complete(tiendas,df_tiendas_nube)


In [290]:
bodacious = TiendaNubeScrapping(url="https://www.bodaciousclothing.com",
    store_name="Bodacious",
    tag_sublinks={"tag": "div","class": 'modal-with-fixed-footer'},
    tag_container={"tag": "a","class": 'item-link'},
    subtag_name='div',
    a_class='nav-list-link')

In [None]:
discobolojeans = TiendaNubeScrapping(url="https://www.discobolojeans.com",
    store_name="Discobolo",
    tag_sublinks={"tag": "div","class": 'nav-primary'},
    a_class='nav-list-link',
    tag_container={"tag": "div","class": 'item-description'},
    subtag_name='div'
    )

In [None]:
guillerminaregalado = TiendaNubeScrapping(url="https://www.guillerminaregalado.com.ar",
    store_name="guillerminaregalado",
    tag_sublinks={"tag": "div","class": 'nav-primary'},
    a_class='nav-list-link',
    tag_container={"tag": "div","class": 'item-description'},
    subtag_name='div'
    )

rudshoes = TiendaNubeScrapping(url="https://www.rudshoes.com",
    store_name="rudshoes",
    tag_sublinks={"tag": "div","class": 'nav-primary'},
    a_class='nav-list-link',
    tag_container={"tag": "div","class": 'item-description'},
    subtag_name='div'
    )

In [326]:
tiendas = [guillerminaregalado,rudshoes,discobolojeans,pda,kazuma,torm_facha,lemouton,maura]
df_tiendas_nube = pd.DataFrame()
df_tiendas_nube= anexar_df_complete(tiendas,df_tiendas_nube)

Obteniendo los datos de: guillerminaregalado...
url consultada https://guillerminaregalado.com.ar/productos/
url consultada https://www.guillerminaregalado.com.ar/nuevos-ingresos/
url consultada https://www.guillerminaregalado.com.ar/todo-regalado/
url consultada https://www.guillerminaregalado.com.ar/gr-denim/
url consultada https://www.guillerminaregalado.com.ar/gr-fiesta/
url consultada https://www.guillerminaregalado.com.ar/gr-movimiento/
url consultada https://www.guillerminaregalado.com.ar/2da-seleccion/
url consultada https://www.guillerminaregalado.com.ar/gift-card/
url consultada https://www.guillerminaregalado.com.ar/abrigos/
url consultada https://www.guillerminaregalado.com.ar/pantalones/
url consultada https://www.guillerminaregalado.com.ar/tops-y-remeras/
url consultada https://www.guillerminaregalado.com.ar/faldas-y-shorts/
url consultada https://www.guillerminaregalado.com.ar/camisas/
url consultada https://www.guillerminaregalado.com.ar/sweaters/
url consultada https:/

In [327]:
df_tiendas_nube

Unnamed: 0,tienda,category,nombre,precio,combo
0,GUILLERMINAREGALADO,SHORT,SHORT TEJIDO LIMA,$31.990,False
1,GUILLERMINAREGALADO,VESTIDO,VESTIDO MANGA LARGA DENIM GRIS - FALLA TRES,$39.990,False
2,GUILLERMINAREGALADO,CHALECO,CHALECO VESTIDO LINO TOSTADO - FALLA DOS,$38.990,False
3,GUILLERMINAREGALADO,PALAZO,PALAZO DENIM VISUA,$45.990,False
4,GUILLERMINAREGALADO,CHALECO,CHALECO VESTIDO LINO LIMA,$49.990,False
...,...,...,...,...,...
85,MAURA,VESTIDO,Vestido Pasión lima,$23.940,False
86,MAURA,BLAZER,Blazer Jazmín verde,$48.000,False
87,MAURA,TAPADO,Tapado Jade crudo,$97.750,False
88,MAURA,VESTIDO,Vestido Pasión rosa,$23.940,False


# Transforming the data

### Analysing categories

In [328]:
df_tiendas_nube['category'].unique()

array(['SHORT', 'VESTIDO', 'CHALECO', 'PALAZO', 'PROXIMAMENTE', 'MINI',
       'MONO', 'LEGGING', 'SWEATER', 'TOP', 'ENAGUA', 'BLUSA', 'ARANDELA',
       'VINCHA', 'CINTO', 'PRONTO', 'FALDA', 'REMERA', 'COLLAR', 'POLERA',
       'PREVENTA', 'GIFT', 'BLAZER', 'ARO', 'MAXI', 'CALZA', 'BUZO',
       'SANDALIA', 'BIKINI', 'ZUECO', 'BOTA', 'MOCASÍN', 'VASO', 'MEDIA',
       'ABOTINADO', 'CORDON', 'POLLERA', 'SNEAKER', 'SPRAY', 'CUADERNO',
       'CANGURO', 'CAMPERA', 'CAMISOLA', 'JEAN', 'CAMISACO', 'MORRAL',
       'PANTALON', 'JOGGING', 'GORRO', 'CARDIGAN', 'GORRA', 'ROMPEVIENTO',
       'JOGGER', 'PILUSO', 'CHOMBA', 'ZAPATILLA', 'RIÑONERA', 'CAMISA',
       'MOM', 'CINTURON', 'BILLETERA', 'CARPINTERO', 'BOXER', 'CAMPERON',
       'PARKA', 'DELANTAL', 'BERMUDA', 'X', 'PANTALÓN', 'NECESER',
       'PORTA', 'PERFUME', 'MUSCULOSA', 'ENTERITO', 'CUADRO', 'BODY',
       'SAQUITO', 'BABUCHA', 'BABITA', 'BANDANA', 'AJUAR', 'SONAJERO',
       'JARDINERO', 'RANITA', 'CONJUNTO', 'TARJETA', 'CAMBIADO

Removing wrong or non representative categories

In [336]:
category_group = df_tiendas_nube.groupby('category').count()
cat = category_group[category_group['tienda']==1].sort_values('tienda',ascending=True)
cat['tienda'].count()

46

In [None]:
category_to_filter = ['X','GIFT']
df_tiendas_nube_filter = df_tiendas_nube[~df_tiendas_nube['category'].isin(category_to_filter)]
#df_tiendas_nube_filter['precio'].groupby(df_tiendas_nube_filter['category']).mean()

df_tiendas_nube_filter['precio'] = df_tiendas_nube_filter['precio'].str.replace('$','').str.split(',').str[0].astype(float).round(2) 
df_tiendas_nube_filter.info()


### Getting the mean of the price grouping by category

In [376]:
grouping_by_category = df_tiendas_nube_filter['precio'].groupby([df_tiendas_nube_filter['category'],df_tiendas_nube_filter['tienda']])

mean_prices =grouping_by_category.mean().reset_index()
mean_prices
remeras_mean = mean_prices[mean_prices['category']=="REMERA"]
remeras_mean

Unnamed: 0,category,tienda,precio
137,REMERA,DISCOBOLO,16.033333
138,REMERA,GUILLERMINAREGALADO,19.99
139,REMERA,KAZUMA,15.124
140,REMERA,LEMOUTON_BEBE_SHOP,9.786522
141,REMERA,MAURA,8.4
142,REMERA,PDA,15.350769
143,REMERA,TORMENTA DE FACHA,14.826087


### Tests para nuevas paginas

In [306]:
testing_url = "https://www.guillerminaregalado.com.ar/"
store_name="Discobolo"
tag_sublinks={"tag": "div","class": 'nav-primary'}
a_class='nav-list-link'
tag_container={"tag": "div","class": 'item-description '}
subtag_name='div'
#testing_url= "https://tormentadefacha.mitiendanube.com"
testing_instance = TiendaNubeScrapping(url=testing_url,
    store_name = store_name,
    tag_sublinks=tag_sublinks,
    tag_container=tag_container,
    subtag_name=subtag_name,
    a_class=a_class)
soup,_ = testing_instance.fetch_content()
soup


<!DOCTYPE html>
<html lang="es" xmlns="http://www.w3.org/1999/xhtml" xmlns:fb="http://www.facebook.com/2008/fbml" xmlns:og="http://opengraphprotocol.org/schema/"><head>
<link href="https://acdn.mitiendanube.com" rel="preconnect"/>
<link href="https://acdn.mitiendanube.com" rel="dns-prefetch"/>
<meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
<meta content="IE=edge,chrome=1" http-equiv="X-UA-Compatible"/>
<meta content="width=device-width, initial-scale=1" name="viewport"/>
<title>Discobolo</title>
<meta content="La filosofía de Discobolo es la calidad de las prendas, sin dejar de lado el diseño y el confort. " name="description"/>
<link as="style" href="//acdn.mitiendanube.com/stores/238/782/themes/idea/style-critical-e49385a65e4d378aa38d6f7a44f7486b.css" rel="preload"/>
<link as="style" href="//acdn.mitiendanube.com/stores/238/782/themes/idea/style-colors-0cbd494193ec0a960cde053b7228c262.css" rel="preload"/>
<link as="image" href="//acdn.mitiendanube.com/stores/238

In [312]:
soup_maura_ = soup.find_all(tag_sublinks['tag'],class_=tag_sublinks['class'])
soup_elements = soup_maura_[0].find_all('a',class_=a_class, href=True)
soup_elements

[<a class="nav-list-link" href="/">Inicio</a>,
 <a class="js-toggle-page-accordion nav-list-link" href="#">
 Tienda
 <span class="nav-list-arrow transition-soft">
 <svg><use xlink:href="#long-arrow-down"></use></svg>
 </span>
 </a>,
 <a class="nav-list-link" href="https://www.discobolojeans.com/productos/">
 Ver todos los productos
 </a>,
 <a class="nav-list-link" href="https://www.discobolojeans.com/accesorios/">ACCESORIOS</a>,
 <a class="js-toggle-page-accordion nav-list-link" href="#">
 ABRIGOS
 <span class="nav-list-arrow transition-soft">
 <svg><use xlink:href="#long-arrow-down"></use></svg>
 </span>
 </a>,
 <a class="nav-list-link" href="https://www.discobolojeans.com/abrigos/">
 Ver todo en ABRIGOS
 </a>,
 <a class="nav-list-link" href="https://www.discobolojeans.com/abrigos/camisacos1/">CAMISACOS</a>,
 <a class="nav-list-link" href="https://www.discobolojeans.com/abrigos/camperas1/">CAMPERAS</a>,
 <a class="nav-list-link" href="https://www.discobolojeans.com/abrigos/parkas1/">P