Extração de Dados HTML

Fonte: H&M <https://www2.hm.com/en_us/men/products/jeans.html>

Dados a serem coletados para criação de tabela:

- id
- product_name
- product_type
- product_color
- composition
- price

In [1]:
import requests

import pandas as pd

import numpy as np

import math

from datetime import datetime

from bs4 import BeautifulSoup


In [2]:
url1 = 'https://www2.hm.com/en_us/men/products/jeans.html'

headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}

page = requests.get( url1, headers=headers )

In [3]:
page

<Response [200]>

In [4]:
# Retornando o HTML da página

page.text



In [5]:
# Instanciando objeto Beautiful Soup

soup = BeautifulSoup(page.text, 'html.parser')

In [6]:
soup

<!DOCTYPE HTML>

<html class="no-js en-us" is-in-aem="false" lang="en-US" ng-app="hmApp">
<head>
<link href="https://s1-cdn.hm.com" rel="dns-prefetch"/>
<link crossorigin="" href="https://s1-cdn.hm.com" rel="preconnect"/>
<link href="https://tags.tiqcdn.com" rel="dns-prefetch"/>
<link crossorigin="" href="https://tags.tiqcdn.com" rel="preconnect"/>
<link href="https://lp2.hm.com" rel="dns-prefetch"/>
<link crossorigin="" href="https://lp2.hm.com" rel="preconnect"/>
<link href="https://cdn-pci.optimizely.com" rel="dns-prefetch"/>
<link crossorigin="" href="https://cdn-pci.optimizely.com" rel="preconnect"/>
<link href="https://s.go-mpulse.net" rel="dns-prefetch"/>
<link crossorigin="" href="https://s.go-mpulse.net" rel="preconnect"/>
<script src="https://cdn.cookielaw.org/consent/511a7faf-cc2d-4b69-86fb-a37761829422/OtAutoBlock.js" type="text/javascript"></script>
<script charset="UTF-8" data-document-language="true" data-domain-script="511a7faf-cc2d-4b69-86fb-a37761829422" src="https://

In [7]:
# Paginação: Retornando todas as páginas do site

soup.find_all('h2', class_='load-more-heading')

[<h2 class="load-more-heading" data-items-shown="36" data-total="87">SHOWING 36 of 87 Items</h2>]

In [8]:
# Retornando todos os produtos de todas as páginas do site

total_item = soup.find_all('h2', class_='load-more-heading')[0].get('data-total')
total_item

'87'

In [9]:
# Transformando em inteiro para saber quantas páginas presciso
# Cabem 36 produtos por páginas, com isso presciso de 2.41 páginas

int(total_item)/36

2.4166666666666665

In [10]:
# Arredondando os valores

#page_number = np.round(int(total_item)/36)
#page_number

In [11]:
# Arredondando os valores para saber quantas páginas preciso

page_number = math.ceil(int(total_item)/36)
page_number

3

In [12]:
url02 = url1 + '?pagesize=' + str(int(page_number*36))
url02

'https://www2.hm.com/en_us/men/products/jeans.html?pagesize=108'

In [13]:
# Estrutura HTML onde a vitrine está armazenada:

products = soup.find('ul', class_ = 'products-listing small')

In [14]:
products

<ul class="products-listing small">
<li class="product-item">
<article class="hm-product-item" data-articlecode="1074475001" data-brand="H&amp;M" data-category="men_jeans_loose" data-energy-interval="" data-pre-access-end-date="" data-pre-access-groups="" data-pre-access-start-date="" data-style-with-articlecodes="" onclick="setOsaParameters(utag_data.category_id,'SMALL','1074475001'); setNotificationTicket('Oy9wbHAvcHJvZHVjdC1saXN0LXdpdGgtY291bnQvcHJvZHVjdC1saXN0OyM7cHJvZHVjdF9rZXk7MTA3NDQ3NV9ncm91cF8wMDFfZW5fdXM7MTA3NDQ3NTAwMV9lbl91cztPQkpFQ1RJVkUkO05PTkU6Tk9ORTs3OTs','1074475001');">
<div class="image-container">
<a class="item-link" href="/en_us/productpage.1074475001.html" title="Loose Jeans">
<img alt="Loose JeansModel" class="item-image" data-altimage="//lp2.hm.com/hmgoepprod?set=source[/e6/fe/e6fedc2bef692e5ad2ec9c48d71832c36c498b29.jpg],origin[dam],category[],type[DESCRIPTIVESTILLLIFE],res[m],hmver[2]&amp;call=url[file:/product/style]" data-alttext="Loose Jeans" data-src="//lp

In [15]:
# Estrutura HTML onde todos os produtos estão armazenados:
# Dados a serem coletados: data-articlecode, data-category

product_list = products.find_all('article', class_= 'hm-product-item')

In [16]:
product_list

[<article class="hm-product-item" data-articlecode="1074475001" data-brand="H&amp;M" data-category="men_jeans_loose" data-energy-interval="" data-pre-access-end-date="" data-pre-access-groups="" data-pre-access-start-date="" data-style-with-articlecodes="" onclick="setOsaParameters(utag_data.category_id,'SMALL','1074475001'); setNotificationTicket('Oy9wbHAvcHJvZHVjdC1saXN0LXdpdGgtY291bnQvcHJvZHVjdC1saXN0OyM7cHJvZHVjdF9rZXk7MTA3NDQ3NV9ncm91cF8wMDFfZW5fdXM7MTA3NDQ3NTAwMV9lbl91cztPQkpFQ1RJVkUkO05PTkU6Tk9ORTs3OTs','1074475001');">
 <div class="image-container">
 <a class="item-link" href="/en_us/productpage.1074475001.html" title="Loose Jeans">
 <img alt="Loose JeansModel" class="item-image" data-altimage="//lp2.hm.com/hmgoepprod?set=source[/e6/fe/e6fedc2bef692e5ad2ec9c48d71832c36c498b29.jpg],origin[dam],category[],type[DESCRIPTIVESTILLLIFE],res[m],hmver[2]&amp;call=url[file:/product/style]" data-alttext="Loose Jeans" data-src="//lp2.hm.com/hmgoepprod?set=source[/85/1f/851f99bd6669fa43c0cb

In [17]:
# Coleta primeiro elemento da lista

product_list[1]

<article class="hm-product-item" data-articlecode="1071707001" data-brand="H&amp;M" data-category="men_jeans_relaxed" data-energy-interval="" data-pre-access-end-date="" data-pre-access-groups="" data-pre-access-start-date="" data-style-with-articlecodes="" onclick="setOsaParameters(utag_data.category_id,'SMALL','1071707001'); setNotificationTicket('Oy9wbHAvcHJvZHVjdC1saXN0LXdpdGgtY291bnQvcHJvZHVjdC1saXN0OyM7cHJvZHVjdF9rZXk7MTA3MTcwN19ncm91cF8wMDFfZW5fdXM7MTA3MTcwNzAwMV9lbl91cztPQkpFQ1RJVkUkO05PTkU6Tk9ORTs3OTs','1071707001');">
<div class="image-container">
<a class="item-link" href="/en_us/productpage.1071707001.html" title="Relaxed Jeans">
<img alt="Relaxed JeansModel" class="item-image" data-altimage="//lp2.hm.com/hmgoepprod?set=source[/f0/6b/f06baaff711d89d8ca7b2f79ca544762a4874a31.jpg],origin[dam],category[],type[DESCRIPTIVESTILLLIFE],res[m],hmver[2]&amp;call=url[file:/product/style]" data-alttext="Relaxed Jeans" data-src="//lp2.hm.com/hmgoepprod?set=source[/2a/67/2a67a9b93adf8f07

In [18]:
# Quantidade de artigos na lista

len(product_list)

36

In [19]:
# Coleta primeiro item da lista de dados data-articlecode

product_list[0].get('data-articlecode')

'1074475001'

In [20]:
# Looping para coleta de todos os itens data-articlecode da lista 
# Product Id

product_id = [p.get('data-articlecode') for p in product_list]
product_id

['1074475001',
 '1071707001',
 '1024256001',
 '0690449056',
 '0985159001',
 '1008549006',
 '1024256005',
 '1024256003',
 '1024256002',
 '1008549001',
 '1004199002',
 '1024256008',
 '0985159008',
 '1004199004',
 '0985159007',
 '1024256004',
 '0875105018',
 '1004199003',
 '1008549003',
 '0811993036',
 '0985159004',
 '1024256007',
 '1008110001',
 '0690449022',
 '0811993040',
 '1004199001',
 '0875105016',
 '1008549002',
 '0690449043',
 '0690449051',
 '0985159006',
 '0985159005',
 '1013317010',
 '0875105024',
 '1008549008',
 '1008110002']

In [21]:
# Coleta primeiro item da lista de dados data-category

product_list[0].get('data-category')

'men_jeans_loose'

In [22]:
# Looping para coleta de todos os itens data-category da lista 
# Product Type

product_type = [p.get('data-category') for p in product_list]
product_type

['men_jeans_loose',
 'men_jeans_relaxed',
 'men_jeans_slim',
 'men_jeans_ripped',
 'men_jeans_skinny',
 'men_jeans_regular',
 'men_jeans_slim',
 'men_jeans_slim',
 'men_jeans_slim',
 'men_jeans_regular',
 'men_jeans_skinny',
 'men_jeans_slim',
 'men_jeans_skinny',
 'men_jeans_skinny',
 'men_jeans_skinny',
 'men_jeans_slim',
 'men_jeans_relaxed',
 'men_jeans_skinny',
 'men_jeans_regular',
 'men_jeans_regular',
 'men_jeans_skinny',
 'men_jeans_slim',
 'men_jeans_slim',
 'men_jeans_ripped',
 'men_jeans_regular',
 'men_jeans_skinny',
 'men_jeans_relaxed',
 'men_jeans_regular',
 'men_jeans_ripped',
 'men_jeans_ripped',
 'men_jeans_skinny',
 'men_jeans_skinny',
 'men_jeans_joggers',
 'men_jeans_relaxed',
 'men_jeans_regular',
 'men_jeans_slim']

In [23]:
# Estrutura HTML onde todos nomes dos produtos estão armazenados:
# Dados aserem coletados: product_name

product_list = products.find_all('a', class_='link')
product_list

[<a class="link" href="/en_us/productpage.1074475001.html">Loose Jeans</a>,
 <a class="link" href="/en_us/productpage.1071707001.html">Relaxed Jeans</a>,
 <a class="link" href="/en_us/productpage.1024256001.html">Slim Jeans</a>,
 <a class="link" href="/en_us/productpage.0690449056.html">Skinny Jeans</a>,
 <a class="link" href="/en_us/productpage.0985159001.html">Skinny Jeans</a>,
 <a class="link" href="/en_us/productpage.1008549006.html">Regular Jeans</a>,
 <a class="link" href="/en_us/productpage.1024256005.html">Slim Jeans</a>,
 <a class="link" href="/en_us/productpage.1024256003.html">Slim Jeans</a>,
 <a class="link" href="/en_us/productpage.1024256002.html">Slim Jeans</a>,
 <a class="link" href="/en_us/productpage.1008549001.html">Regular Jeans</a>,
 <a class="link" href="/en_us/productpage.1004199002.html">Skinny Cropped Jeans</a>,
 <a class="link" href="/en_us/productpage.1024256008.html">Slim Jeans</a>,
 <a class="link" href="/en_us/productpage.0985159008.html">Skinny Jeans</a>,

In [24]:
# Coleta primeiro item da lista de dados product_name

product_list[0].get_text()

'Loose Jeans'

In [25]:
# Looping para coleta de todos os itens product_name da lista 
# Product Name

product_name = [p.get_text() for p in product_list]
product_name

['Loose Jeans',
 'Relaxed Jeans',
 'Slim Jeans',
 'Skinny Jeans',
 'Skinny Jeans',
 'Regular Jeans',
 'Slim Jeans',
 'Slim Jeans',
 'Slim Jeans',
 'Regular Jeans',
 'Skinny Cropped Jeans',
 'Slim Jeans',
 'Skinny Jeans',
 'Skinny Cropped Jeans',
 'Skinny Jeans',
 'Slim Jeans',
 'Relaxed Jeans',
 'Skinny Cropped Jeans',
 'Regular Jeans',
 'Regular Jeans',
 'Skinny Jeans',
 'Slim Jeans',
 'Freefit® Slim Jeans',
 'Skinny Jeans',
 'Regular Jeans',
 'Skinny Cropped Jeans',
 'Relaxed Jeans',
 'Regular Jeans',
 'Skinny Jeans',
 'Skinny Jeans',
 'Skinny Jeans',
 'Skinny Jeans',
 'Hybrid Regular Tapered Joggers',
 'Relaxed Jeans',
 'Regular Jeans',
 'Freefit® Slim Jeans']

In [26]:
# Estrutura HTML onde o price está armazenado
# Dados aserem coletados: product_price

product_list = products.find_all('span', class_='price regular')
product_list

[<span class="price regular">$ 39.99</span>,
 <span class="price regular">$ 29.99</span>,
 <span class="price regular">$ 19.99</span>,
 <span class="price regular">$ 39.99</span>,
 <span class="price regular">$ 19.99</span>,
 <span class="price regular">$ 19.99</span>,
 <span class="price regular">$ 19.99</span>,
 <span class="price regular">$ 19.99</span>,
 <span class="price regular">$ 19.99</span>,
 <span class="price regular">$ 19.99</span>,
 <span class="price regular">$ 29.99</span>,
 <span class="price regular">$ 19.99</span>,
 <span class="price regular">$ 19.99</span>,
 <span class="price regular">$ 29.99</span>,
 <span class="price regular">$ 19.99</span>,
 <span class="price regular">$ 19.99</span>,
 <span class="price regular">$ 29.99</span>,
 <span class="price regular">$ 29.99</span>,
 <span class="price regular">$ 19.99</span>,
 <span class="price regular">$ 29.99</span>,
 <span class="price regular">$ 19.99</span>,
 <span class="price regular">$ 19.99</span>,
 <span cla

In [27]:
# Coleta primeiro item da lista de dados product_price

product_list[0].get_text()

'$ 39.99'

In [28]:
# Looping para coleta de todos os itens product_price da lista 
# Product Price

product_price = [p.get_text() for p in product_list]
product_price

['$ 39.99',
 '$ 29.99',
 '$ 19.99',
 '$ 39.99',
 '$ 19.99',
 '$ 19.99',
 '$ 19.99',
 '$ 19.99',
 '$ 19.99',
 '$ 19.99',
 '$ 29.99',
 '$ 19.99',
 '$ 19.99',
 '$ 29.99',
 '$ 19.99',
 '$ 19.99',
 '$ 29.99',
 '$ 29.99',
 '$ 19.99',
 '$ 29.99',
 '$ 19.99',
 '$ 19.99',
 '$ 49.99',
 '$ 39.99',
 '$ 29.99',
 '$ 29.99',
 '$ 29.99',
 '$ 19.99',
 '$ 39.99',
 '$ 39.99',
 '$ 19.99',
 '$ 19.99',
 '$ 39.99',
 '$ 29.99',
 '$ 19.99',
 '$ 49.99']

In [29]:
pd.DataFrame([product_id,
              product_name,
              product_type,
              product_price,]).T

Unnamed: 0,0,1,2,3
0,1074475001,Loose Jeans,men_jeans_loose,$ 39.99
1,1071707001,Relaxed Jeans,men_jeans_relaxed,$ 29.99
2,1024256001,Slim Jeans,men_jeans_slim,$ 19.99
3,690449056,Skinny Jeans,men_jeans_ripped,$ 39.99
4,985159001,Skinny Jeans,men_jeans_skinny,$ 19.99
5,1008549006,Regular Jeans,men_jeans_regular,$ 19.99
6,1024256005,Slim Jeans,men_jeans_slim,$ 19.99
7,1024256003,Slim Jeans,men_jeans_slim,$ 19.99
8,1024256002,Slim Jeans,men_jeans_slim,$ 19.99
9,1008549001,Regular Jeans,men_jeans_regular,$ 19.99


In [30]:
data = pd.DataFrame([product_id,
              product_name,
              product_type,
              product_price,]).T

In [31]:
data.columns

RangeIndex(start=0, stop=4, step=1)

In [32]:
data.columns = ['product_id',
                'product_name',
                'product_type',
                'product_price']

In [33]:
# scrapy datetime

data['scrapy_datetime'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

In [34]:
data.head()

Unnamed: 0,product_id,product_name,product_type,product_price,scrapy_datetime
0,1074475001,Loose Jeans,men_jeans_loose,$ 39.99,2022-05-31 14:09:35
1,1071707001,Relaxed Jeans,men_jeans_relaxed,$ 29.99,2022-05-31 14:09:35
2,1024256001,Slim Jeans,men_jeans_slim,$ 19.99,2022-05-31 14:09:35
3,690449056,Skinny Jeans,men_jeans_ripped,$ 39.99,2022-05-31 14:09:35
4,985159001,Skinny Jeans,men_jeans_skinny,$ 19.99,2022-05-31 14:09:35


In [35]:
# Retornando os detalhes de cada produto

# One product

url = 'https://www2.hm.com/en_us/productpage.1074475001.html'

headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}

page = requests.get( url, headers=headers )

In [36]:
page

<Response [200]>

In [37]:
page.text



In [38]:
# Instanciando objeto Beautiful Soup

soup = BeautifulSoup(page.text, 'html.parser')

In [39]:
soup

<!DOCTYPE HTML>

<html class="no-js en-us" lang="en-US">
<head>
<link href="https://s1-cdn.hm.com" rel="dns-prefetch"/>
<link crossorigin="" href="https://s1-cdn.hm.com" rel="preconnect"/>
<link href="https://tags.tiqcdn.com" rel="dns-prefetch"/>
<link crossorigin="" href="https://tags.tiqcdn.com" rel="preconnect"/>
<link href="https://lp2.hm.com" rel="dns-prefetch"/>
<link crossorigin="" href="https://lp2.hm.com" rel="preconnect"/>
<link href="https://cdn-pci.optimizely.com" rel="dns-prefetch"/>
<link crossorigin="" href="https://cdn-pci.optimizely.com" rel="preconnect"/>
<link href="https://s.go-mpulse.net" rel="dns-prefetch"/>
<link crossorigin="" href="https://s.go-mpulse.net" rel="preconnect"/>
<script src="https://cdn.cookielaw.org/consent/511a7faf-cc2d-4b69-86fb-a37761829422/OtAutoBlock.js" type="text/javascript"></script>
<script charset="UTF-8" data-document-language="true" data-domain-script="511a7faf-cc2d-4b69-86fb-a37761829422" src="https://cdn.cookielaw.org/scripttemplates

In [40]:
type(soup)

bs4.BeautifulSoup

In [44]:
#Color name

soup.find_all('li', class_='list-item')

[<li class="list-item">
 <a class="link" data-remodal-trigger="signin-account" href="/en_us/account">My Account</a>
 </li>,
 <li class="list-item">
 <a class="link" href="/en_us/member/info.html">Loyalty Program Info</a>
 </li>,
 <li class="list-item">
 <a class="link" data-remodal-trigger="join" href="#">Not a Member yet? Join here!</a>
 </li>,
 <li class="list-item">
 <a class="link" href="/en_us/account">My Account</a>
 </li>,
 <li class="list-item">
 <a class="link" href="/en_us/member/info.html">Loyalty Program Info</a>
 </li>,
 <li class="list-item">
 <a class="link" href="/en_us/logout" onclick="trackLogout()">Sign out</a>
 </li>,
 <li class="list-item">
 <a aria-checked="true" class="filter-option miniature active" data-articlecode="1074475001" data-color="Light denim blue" data-sizes="" href="/en_us/productpage.1074475001.html" id="filter-colour-1074475001" role="radio" title="Light denim blue">
 <noscript data-alt="Light denim blue" data-src="//lp2.hm.com/hmgoepprod?set=quali

In [42]:
# Primeiro elemento do list item
soup.find_all('li', class_='list-item')[0]

<li class="list-item">
<a class="link" data-remodal-trigger="signin-account" href="/en_us/account">My Account</a>
</li>

In [56]:
soup.find_all('a', class_='filter-option miniature active')[0]

<a aria-checked="true" class="filter-option miniature active" data-articlecode="1074475001" data-color="Light denim blue" data-sizes="" href="/en_us/productpage.1074475001.html" id="filter-colour-1074475001" role="radio" title="Light denim blue">
<noscript data-alt="Light denim blue" data-src="//lp2.hm.com/hmgoepprod?set=quality%5B79%5D%2Csource%5B%2Fe6%2Ffe%2Fe6fedc2bef692e5ad2ec9c48d71832c36c498b29.jpg%5D%2Corigin%5Bdam%5D%2Ccategory%5B%5D%2Ctype%5BDESCRIPTIVESTILLLIFE%5D%2Cres%5Bm%5D%2Chmver%5B2%5D&amp;call=url[file:/product/miniature]">
<img alt="Light denim blue" src="//lp2.hm.com/hmgoepprod?set=quality%5B79%5D%2Csource%5B%2Fe6%2Ffe%2Fe6fedc2bef692e5ad2ec9c48d71832c36c498b29.jpg%5D%2Corigin%5Bdam%5D%2Ccategory%5B%5D%2Ctype%5BDESCRIPTIVESTILLLIFE%5D%2Cres%5Bm%5D%2Chmver%5B2%5D&amp;call=url[file:/product/miniature]"/>
</noscript>
<span></span>
</a>

In [55]:
soup.find_all('a', class_='filter-option miniature active')[0].get('data-color')

'Light denim blue'

In [63]:
product_list = soup.find_all('a', class_='filter-option miniature active')

In [64]:
color_name = [p.get('data-color') for p in product_list]
color_name

['Light denim blue']

In [65]:
product_list[0]

<a aria-checked="true" class="filter-option miniature active" data-articlecode="1074475001" data-color="Light denim blue" data-sizes="" href="/en_us/productpage.1074475001.html" id="filter-colour-1074475001" role="radio" title="Light denim blue">
<noscript data-alt="Light denim blue" data-src="//lp2.hm.com/hmgoepprod?set=quality%5B79%5D%2Csource%5B%2Fe6%2Ffe%2Fe6fedc2bef692e5ad2ec9c48d71832c36c498b29.jpg%5D%2Corigin%5Bdam%5D%2Ccategory%5B%5D%2Ctype%5BDESCRIPTIVESTILLLIFE%5D%2Cres%5Bm%5D%2Chmver%5B2%5D&amp;call=url[file:/product/miniature]">
<img alt="Light denim blue" src="//lp2.hm.com/hmgoepprod?set=quality%5B79%5D%2Csource%5B%2Fe6%2Ffe%2Fe6fedc2bef692e5ad2ec9c48d71832c36c498b29.jpg%5D%2Corigin%5Bdam%5D%2Ccategory%5B%5D%2Ctype%5BDESCRIPTIVESTILLLIFE%5D%2Cres%5Bm%5D%2Chmver%5B2%5D&amp;call=url[file:/product/miniature]"/>
</noscript>
<span></span>
</a>

In [67]:
product_id_c = [p.get('data-articlecode') for p in product_list]

In [68]:
product_id_c

['1074475001']

In [71]:
df_color = pd.DataFrame([product_id_c, color_name]).T
df_color

Unnamed: 0,0,1
0,1074475001,Light denim blue


In [72]:
df_color.columns = ['product_id', 'color_name']

In [73]:
df_color.head()

Unnamed: 0,product_id,color_name
0,1074475001,Light denim blue
