# Scrapy
* An open source and collaborative framework for extracting the data you need from websites.

### 1. Start by making a scrapy project

In [None]:
# we will make a scrapy project with any name 
!scrapy startproject myproject

# "myproject" is the name of the project and "startproject" is the command that starts a new project
# this command will create a new scrapy project (a folder will be created) with all the
# essential configuration files and a "spiders" folder for our spiders/web crawlers

### 2. Write a spider to crawl a website and extract data
* Spider is a class that scrapy uses to extract data from one or multiple websites
* spiders are created in a .py file and stored under the spiders folder (in your project folder)

In [None]:
# code to write in the .py file :-
# to run it, write "scrapy crawl <name of your spider>" in the command line
# (the spider name is the value of the `name` attribute of the class, here "quotes")

import scrapy


class QuotesSpider(scrapy.Spider):
    """Spider that downloads two quote pages and stores their raw HTML on disk."""

    name = "quotes"

    def start_requests(self):
        """Yield one request per page to download; each response goes to ``parse``."""
        for page_url in (
            'http://quotes.toscrape.com/page/1/',
            'http://quotes.toscrape.com/page/2/',
        ):
            yield scrapy.Request(url=page_url, callback=self.parse)

    def parse(self, response):
        """Write the body of ``response`` to ``quotes-<page>.html`` and log the file name."""
        # the URL looks like ".../page/<n>/", so the second-to-last piece is the page number
        url_parts = response.url.split("/")
        page_number = url_parts[-2]
        target = 'quotes-%s.html' % page_number
        with open(target, 'wb') as html_file:
            html_file.write(response.body)
        self.log('Saved file %s' % target)
        
# after we execute it from the command line we get debug messages confirming that the
# respective files were saved successfully

# the saved files contain the whole html of the respective pages.

### 3.1 Using Shell and Selectors
* Open the Scrapy shell from the command line, then use different selectors to extract data

In [None]:
# this command (written in the command line) starts an interactive Scrapy shell and fetches the given page
# once the response is received we can use selectors on it to get specific information
scrapy shell "http://quotes.toscrape.com/page/1/"

response.css('title').getall()
# this returns a list with every <title> element of the page (as HTML strings)
# if we use get() instead of getall() we get only the first item of that list

# to get just the text part and not the html we use :-
response.css('title::text').getall()[0]

# code to get quotes :-
for q in response.css("div.quote"): # each quote is enclosed in a div tag with class name "quote"
    text = q.css("span.text::text").get() # the actual quote is enclosed in a span with class name "text"
    print(text)


### 3.2 Saving Shell response as a JSON file
* We will modify our spider with our knowledge of selectors to create a json out of the response

In [None]:
# New spider file (edited code) :-
import scrapy


class QuotesSpider(scrapy.Spider):
    """Spider that extracts the text, author and tags of every quote on two pages.

    Each quote is yielded as a dictionary, which is what lets the crawl output
    be exported as a JSON file.
    """

    name = "quotes"

    def start_requests(self):
        """Yield one request per page to scrape; each response goes to ``parse``."""
        urls = [
            'http://quotes.toscrape.com/page/1/',
            'http://quotes.toscrape.com/page/2/',
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        """Yield one dictionary (text, author, tags) per quote found in ``response``."""
        # each quote is enclosed in a div tag with class name "quote"
        for q in response.css("div.quote"):
            # the actual quote is enclosed in a span with class name "text"
            text = q.css("span.text::text").get()
            author = q.css("small.author::text").get()
            # a quote can have several tags, so we collect all of them with getall()
            tags = q.css("a.tag::text").getall()

            # we will form a dictionary as json files have dictionaries in them
            yield {
                'text': text,
                'author': author,
                'tags': tags,
            }
        
# To form a json file we will write the following command in command line
scrapy crawl quotes -o quotes.json
# REMEMBER THIS COMMAND IT IS VERY IMPORTANT

### 4. Forming a recursive spider to cover all pages of a website
* Find next page link on a website and use it to recursively crawl all the pages

In [None]:
# In this case the link to the next page is stored inside a list item (li tag with class name "next")
# if we have to extract an attribute we use ::attr(href) instead of ::text

response.css('li.next a::attr(href)').get() 
# used 'a' as the link lies in an anchor tag under the li tag with class name "next"

# Another way of writing this command :-
response.css('li.next a').attrib["href"]

# Both of these commands return '/page/2/'

# Code to be added to the parse method to crawl this site recursively
next_page = response.css('li.next a::attr(href)').get()
if next_page is not None:
    next_page = response.urljoin(next_page)  # join the extracted link with the URL of the current response
    yield scrapy.Request(next_page, callback=self.parse)  # handle the next page with the same parse method
    
# Whole file :-

import scrapy


class QuotesSpider(scrapy.Spider):
    """Spider that scrapes every quote by following the "next page" link of each page.

    Each quote is yielded as a dictionary with its text, author and tags.
    """

    name = "quotes"

    def start_requests(self):
        """Yield the request for the first page; later pages are requested by ``parse``."""
        urls = [
            'http://quotes.toscrape.com/page/1/'
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        """Yield one dictionary per quote, then a request for the next page if one exists."""
        # each quote is enclosed in a div tag with class name "quote"
        for q in response.css("div.quote"):
            # the actual quote is enclosed in a span with class name "text"
            text = q.css("span.text::text").get()
            author = q.css("small.author::text").get()
            # a quote can have several tags, so we collect all of them with getall()
            tags = q.css("a.tag::text").getall()

            # we will form a dictionary as json files have dictionaries in them
            yield {
                'text': text,
                'author': author,
                'tags': tags,
            }

        # get() returns None when the selector matches nothing, i.e. there is no
        # "next" link on this page, and the crawl stops here
        next_page = response.css('li.next a::attr(href)').get()
        if next_page is not None:
            # join the extracted link with the URL of the current response
            next_page = response.urljoin(next_page)
            yield scrapy.Request(next_page, callback=self.parse)



### Steps to run our spider:
- 1) We will copy the above code into a Python script named "quotes.py" and save it in the following directory: `./myproject/myproject/spiders/`
- 2) Then we will open command prompt and navigate to the directory in which we saved our "quotes.py" file
- 3) Once we are done with steps 1 & 2 we just have to paste the following command in command prompt and run it<br>
**Command**: scrapy crawl quotes -o quotes.json