# Scrapy - Extracción de información de páginas web

In [1]:
import scrapy
from scrapy.crawler import CrawlerProcess

In [2]:
import json

class JsonWriterPipeline(object):
    """Item pipeline that writes every item to ``quoteresult.jl``.

    Each item is serialized as a single JSON object followed by a newline.
    ``open_spider`` opens the output file and ``close_spider`` closes it.
    """

    def open_spider(self, spider):
        """Open the output file for writing; ``spider`` is not used."""
        self.file = open('quoteresult.jl', 'w')

    def process_item(self, item, spider):
        """Write ``item`` as one JSON line and return it unchanged."""
        record = dict(item)
        self.file.write(json.dumps(record) + "\n")
        return item

    def close_spider(self, spider):
        """Close the file opened by ``open_spider``; ``spider`` is not used."""
        self.file.close()

In [3]:
class ChedrauiSpider(scrapy.Spider):
    """Spider that collects product names and prices from the Chedraui home page.

    Every ``div.carousel__item`` element in the response is emitted as a dict
    with a ``name`` and a ``price`` entry.
    """

    name = "chedraui"
    start_urls = ["https://www.chedraui.com.mx/"]

    # Two outputs are configured side by side:
    #   * ITEM_PIPELINES routes items through the pipeline class looked up
    #     under ``__main__``.
    #   * FEED_FORMAT / FEED_URI enable the feed export, written as JSON to
    #     ``chedraui.json``.
    custom_settings = {
        'ITEM_PIPELINES': {'__main__.JsonWriterPipeline': 1},
        'FEED_FORMAT': 'json',
        'FEED_URI': 'chedraui.json',
    }

    def parse(self, response):
        """Yield one ``{"name": ..., "price": ...}`` dict per carousel item."""
        name_css = "a div.carousel__item--name::text"
        price_css = "a div.carousel__item--price::text"
        for item_sel in response.css("div.carousel__item"):
            # Only the first matching text node is kept for each field.
            product_name = item_sel.css(name_css).extract_first()
            product_price = item_sel.css(price_css).extract_first()
            yield {"name": product_name, "price": product_price}

In [4]:
import os

# Scrapy's local-file feed storage appends to an existing file. Re-running
# this cell would therefore leave two concatenated JSON arrays in the feed
# file, which is no longer valid JSON for pd.read_json. Remove any stale
# feed from a previous run before crawling.
feed_path = ChedrauiSpider.custom_settings.get('FEED_URI')
if feed_path and os.path.exists(feed_path):
    os.remove(feed_path)

# Crawler process configured with a browser-like User-Agent header.
process = CrawlerProcess({
    'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
})

# Schedule the spider, then block until the crawl finishes.
# NOTE(review): the Twisted reactor cannot be restarted, so this cell is
# expected to run only once per kernel session -- restart the kernel to re-crawl.
process.crawl(ChedrauiSpider)
process.start()

2018-12-08 11:21:17 [scrapy.utils.log] INFO: Scrapy 1.5.1 started (bot: scrapybot)
2018-12-08 11:21:17 [scrapy.utils.log] INFO: Versions: lxml 4.2.5.0, libxml2 2.9.8, cssselect 1.0.3, parsel 1.5.1, w3lib 1.19.0, Twisted 18.9.0, Python 2.7.15 (default, Aug  7 2018, 10:44:55) - [GCC 4.2.1 Compatible Apple LLVM 9.1.0 (clang-902.0.39.2)], pyOpenSSL 18.0.0 (OpenSSL 1.1.0j  20 Nov 2018), cryptography 2.4.2, Platform Darwin-18.2.0-x86_64-i386-64bit
2018-12-08 11:21:17 [scrapy.crawler] INFO: Overridden settings: {'FEED_FORMAT': 'json', 'FEED_URI': 'chedraui.json', 'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'}
2018-12-08 11:21:17 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.logstats.LogStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.corestats.CoreStats']
2018-12-08 11:21:17 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.do

2018-12-08 11:21:24 [scrapy.core.engine] INFO: Spider closed (finished)


In [5]:
# Shell escape: show the last two lines of the chedraui.json feed file.
!tail -n 2 chedraui.json

{"price": "$999.00", "name": "Set Barbie Cocina y Divi\u00e9rtete FRH73"}
]

In [6]:
import pandas as pd

# Read the JSON feed file into a DataFrame and print it.
df = pd.read_json(path_or_buf="chedraui.json")

print(df)

2018-12-08 11:25:00 [matplotlib] DEBUG: $HOME=/Users/alan
2018-12-08 11:25:00 [matplotlib] DEBUG: matplotlib data path /usr/local/lib/python2.7/site-packages/matplotlib/mpl-data
2018-12-08 11:25:00 [matplotlib] DEBUG: loaded rc file /usr/local/lib/python2.7/site-packages/matplotlib/mpl-data/matplotlibrc
2018-12-08 11:25:00 [matplotlib] DEBUG: matplotlib version 2.2.2
2018-12-08 11:25:00 [matplotlib] DEBUG: interactive is False
2018-12-08 11:25:00 [matplotlib] DEBUG: platform is darwin


                                        name       price
0       Pantalla LG 65" UHD Smart TV 65UJ630  $21,995.00
1        Pantalla LG 55" UHD Smart TV 55UJ63  $12,995.00
2       Pant SANSUI 50 " SMX5019USM 4K Smart   $7,295.00
3                Pant JVC 49 " SI49FSC Curva   $8,295.00
4    Pantalla Samsung 43" Smart TV LH43SEJBG   $6,995.00
5           Pantalla JVC 43" Smart TV SI43FS   $6,295.00
6              Televisor Sansui 32"  SMX32Z1   $2,795.00
7    Consola Xbox One S + Assassin's Creed +   $7,295.00
8              Bafle AIWA 8'' AW908 50 Watts   $1,395.00
9       Autoestereo AIWA AW-3239BT Bluetooth   $1,045.00
10          Lavadora Mabe 19Kg. LMH79104WBAB   $8,990.00
11  Secadora Mabe SMG47N8MSBAB0 22 Kg Blanca  $10,795.00
12        Refrigerador Daewoo 16p3 44520GNDA  $10,295.00
13                 Estufa IEM EI3030BAPN 30"   $4,995.00
14       Asador Home Line de Carbón BB02543U   $3,995.00
15           Mesa para TV Home Line FII0761R   $2,295.00
16   Muñeca Baby Alive Bebé Va 

In [9]:
import re

# Derive a numeric price column from the "$12,995.00"-style price strings.
#   * Thousands separators are stripped first.
#   * The raw-string pattern captures the amount after "$" including an
#     optional decimal part, so cents are preserved instead of truncated.
#   * The vectorized .str accessor leaves missing or unparseable prices as
#     NaN rather than raising from inside a per-row lambda.
df["price_number"] = (
    df["price"]
    .str.replace(",", "")
    .str.extract(r"\$(\d+(?:\.\d+)?)", expand=False)
    .astype(float)
)

print(df)

                                        name       price  price_number
0       Pantalla LG 65" UHD Smart TV 65UJ630  $21,995.00       21995.0
1        Pantalla LG 55" UHD Smart TV 55UJ63  $12,995.00       12995.0
2       Pant SANSUI 50 " SMX5019USM 4K Smart   $7,295.00        7295.0
3                Pant JVC 49 " SI49FSC Curva   $8,295.00        8295.0
4    Pantalla Samsung 43" Smart TV LH43SEJBG   $6,995.00        6995.0
5           Pantalla JVC 43" Smart TV SI43FS   $6,295.00        6295.0
6              Televisor Sansui 32"  SMX32Z1   $2,795.00        2795.0
7    Consola Xbox One S + Assassin's Creed +   $7,295.00        7295.0
8              Bafle AIWA 8'' AW908 50 Watts   $1,395.00        1395.0
9       Autoestereo AIWA AW-3239BT Bluetooth   $1,045.00        1045.0
10          Lavadora Mabe 19Kg. LMH79104WBAB   $8,990.00        8990.0
11  Secadora Mabe SMG47N8MSBAB0 22 Kg Blanca  $10,795.00       10795.0
12        Refrigerador Daewoo 16p3 44520GNDA  $10,295.00       10295.0
13    

In [10]:
# Keep only the rows whose numeric price lies in [10000, 15000], both ends included.
df2 = df[df["price_number"].between(10000, 15000)]

print(df2)

                                        name       price  price_number
1        Pantalla LG 55" UHD Smart TV 55UJ63  $12,995.00       12995.0
11  Secadora Mabe SMG47N8MSBAB0 22 Kg Blanca  $10,795.00       10795.0
12        Refrigerador Daewoo 16p3 44520GNDA  $10,295.00       10295.0
