In [1]:
import logging
import os

import scrapy
from scrapy.crawler import CrawlerProcess

## Set up a simple pipeline that writes the scraped items to a JSON Lines file

* Each line of the output file contains one JSON-encoded item.

In [2]:
import json

class JsonWriterPipeline(object):
    """Item pipeline that writes every scraped item to ``quoteresult.jl``.

    The output file holds one JSON document per line.
    """

    def open_spider(self, spider):
        """Open the output file for writing, truncating any previous content.

        ``spider`` is accepted but not used.
        """
        self.file = open('quoteresult.jl', 'w')

    def close_spider(self, spider):
        """Close the output file opened by ``open_spider``.

        ``spider`` is accepted but not used.
        """
        self.file.close()

    def process_item(self, item, spider):
        """Serialize ``item`` as a single JSON line and hand it back unchanged.

        The item is converted to a plain ``dict`` before being encoded.
        Returns the same ``item`` object that was passed in.
        """
        serialized = json.dumps(dict(item))
        self.file.write(serialized + "\n")
        return item

## Define the Scrapy spider

In [3]:
class QuotesSpider(scrapy.Spider):
    """Spider for quotes.toscrape.com that yields one dict per quote.

    Every yielded item carries the keys ``text``, ``author`` and ``tags``.
    After the quotes of a page have been yielded, the "next" pagination link
    is followed if the page has one.
    """

    name = 'quotes'
    allowed_domains = ['quotes.toscrape.com']
    start_urls = ['http://quotes.toscrape.com/']

    # Per-spider settings: log only warnings and above, route items through
    # the JsonWriterPipeline defined in this notebook (``__main__``), and
    # additionally export the items as a JSON feed to quoteresult.json.
    custom_settings = {
        'LOG_LEVEL': logging.WARNING,
        'ITEM_PIPELINES': {'__main__.JsonWriterPipeline': 1},
        'FEED_FORMAT': 'json',
        'FEED_URI': 'quoteresult.json',
    }

    def parse(self, response):
        """Extract all quotes from ``response`` and queue the next page.

        Yields a dict per ``div.quote`` element, then a ``scrapy.Request``
        for the next page when a "next" link is present. The request is
        created without an explicit callback.
        """
        # XPath is used throughout; each quote block is queried relative to
        # its own node so fields from different quotes cannot mix.
        quote_nodes = response.xpath('//div[@class="quote"]')
        for node in quote_nodes:
            text = node.xpath('./span[@class="text"]/text()').extract_first()
            author = node.xpath('.//small[@class="author"]/text()').extract_first()
            tags = node.xpath('.//div[@class="tags"]/a[@class="tag"]/text()').extract()
            yield {'text': text, 'author': author, 'tags': tags}

        # Stop paginating when the page has no "next" link.
        next_href = response.xpath('//li[@class="next"]/a/@href').extract_first()
        if next_href is None:
            return
        # The href may be relative, so resolve it against the current page URL.
        yield scrapy.Request(response.urljoin(next_href))
            

In [4]:
# Scrapy's local file feed storage appends to an existing FEED_URI target, so
# running the crawl again (e.g. Restart & Run All) would leave
# quoteresult.json holding two concatenated JSON arrays that pd.read_json
# cannot parse. Remove any stale export before crawling.
if os.path.exists('quoteresult.json'):
    os.remove('quoteresult.json')

# Crawler process with a browser-like user agent; the remaining settings come
# from QuotesSpider.custom_settings.
process = CrawlerProcess({
    'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
})

# Schedule the spider. The call returns a Deferred; the crawl itself is run by
# process.start() in the next cell.
process.crawl(QuotesSpider)


2018-10-22 13:27:47 [scrapy.utils.log] INFO: Scrapy 1.5.1 started (bot: scrapybot)
2018-10-22 13:27:47 [scrapy.utils.log] INFO: Versions: lxml 4.1.1.0, libxml2 2.9.5, cssselect 1.0.3, parsel 1.5.0, w3lib 1.19.0, Twisted 18.7.0, Python 3.6.6 |Anaconda custom (64-bit)| (default, Jun 28 2018, 11:27:44) [MSC v.1900 64 bit (AMD64)], pyOpenSSL 18.0.0 (OpenSSL 1.0.2g  1 Mar 2016), cryptography 2.2.2, Platform Windows-10-10.0.16299-SP0
2018-10-22 13:27:47 [scrapy.crawler] INFO: Overridden settings: {'FEED_FORMAT': 'json', 'FEED_URI': 'quoteresult.json', 'LOG_LEVEL': 30, 'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'}


<Deferred at 0x2bd64199940>

In [5]:
# Run the crawl scheduled with process.crawl(QuotesSpider).
# NOTE(review): per the Scrapy docs this call blocks until crawling has
# finished, and the underlying Twisted reactor cannot be restarted, so
# re-running this cell presumably requires a fresh kernel. Confirm.
process.start()

In [6]:
# List the working directory to check that the crawl wrote quoteresult.jl and quoteresult.json.
%ls

 Volume in drive C has no label.
 Volume Serial Number is E6F4-CD26

 Directory of C:\Users\ARADER\GitHub\swissarmy\webscraping

10/22/2018  01:26 PM    <DIR>          .
10/22/2018  01:26 PM    <DIR>          ..
10/22/2018  01:14 PM    <DIR>          .ipynb_checkpoints
10/22/2018  01:27 PM            21,184 quoteresult.jl
10/22/2018  01:27 PM            21,186 quoteresult.json
10/22/2018  01:26 PM             8,257 ScrapyQuotesTest.ipynb
               3 File(s)         50,627 bytes
               3 Dir(s)  96,064,409,600 bytes free


## Load the scraped data

In [7]:
import pandas as pd

# Load the JSON feed exported by the crawl (FEED_URI in QuotesSpider.custom_settings).
df = pd.read_json('quoteresult.json')

# Fail early if the crawl produced nothing or the item schema changed, rather
# than letting the later cells report misleading counts.
assert not df.empty, "quoteresult.json contains no items"
assert {'text', 'author', 'tags'} <= set(df.columns), "unexpected item columns"

df.head()

Unnamed: 0,author,tags,text
0,Albert Einstein,"[change, deep-thoughts, thinking, world]",“The world as we have created it is a process ...
1,J.K. Rowling,"[abilities, choices]","“It is our choices, Harry, that show what we t..."
2,Albert Einstein,"[inspirational, life, live, miracle, miracles]",“There are only two ways to live your life. On...
3,Jane Austen,"[aliteracy, books, classic, humor]","“The person, be it gentleman or lady, who has ..."
4,Marilyn Monroe,"[be-yourself, inspirational]","“Imperfection is beauty, madness is genius and..."


In [8]:
# NOTE(review): the output of this cell is an empty DeferredList, which
# suggests no crawl was still running at this point. The call looks redundant
# after process.start(); confirm before removing.
process.stop()

<DeferredList at 0x2bd643c8780 current result: []>

In [9]:
# Row count of the loaded frame, i.e. the number of scraped quotes.
df.shape[0]

100

In [11]:
# Number of distinct values in the "author" column.
df['author'].nunique()

50