# Scrapy Mini Project/Tutorial

In [1]:
#Install Scrapy
# !pip install Scrapy

In [2]:
import scrapy
import os

This tutorial will walk you through these tasks:

1. Creating a new Scrapy project
2. Writing a spider to crawl a site and extract data
3. Exporting the scraped data using the command line
4. Changing spider to recursively follow links
5. Using spider arguments
6. Load the scraped data into a SQLlite3 database

In [3]:
# !scrapy startproject scrapy_mini_project #only do this first time

In [4]:
os.chdir('./scrapy_mini_project/')

In [5]:
!scrapy crawl quotes

2021-01-28 02:12:44 [scrapy.utils.log] INFO: Scrapy 2.4.1 started (bot: scrapy_mini_project)
2021-01-28 02:12:44 [scrapy.utils.log] INFO: Versions: lxml 4.5.0.0, libxml2 2.9.10, cssselect 1.1.0, parsel 1.6.0, w3lib 1.22.0, Twisted 20.3.0, Python 3.7.9 | packaged by conda-forge | (default, Dec  9 2020, 21:08:20) - [GCC 9.3.0], pyOpenSSL 19.1.0 (OpenSSL 1.1.1d  10 Sep 2019), cryptography 2.8, Platform Linux-4.15.0-72-generic-x86_64-with-debian-stretch-sid
2021-01-28 02:12:44 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.epollreactor.EPollReactor
2021-01-28 02:12:44 [scrapy.crawler] INFO: Overridden settings:
{'BOT_NAME': 'scrapy_mini_project',
 'NEWSPIDER_MODULE': 'scrapy_mini_project.spiders',
 'ROBOTSTXT_OBEY': True,
 'SPIDER_MODULES': ['scrapy_mini_project.spiders']}
2021-01-28 02:12:44 [scrapy.extensions.telnet] INFO: Telnet Password: 6052527b2f6739a3
2021-01-28 02:12:44 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.exten

## Extract Text Using CSS and Xpath

In [None]:
!scrapy shell 'http://quotes.toscrape.com/page/1/' #need to  run this in terminal

2021-01-28 03:43:50 [scrapy.utils.log] INFO: Scrapy 2.4.1 started (bot: scrapy_mini_project)
2021-01-28 03:43:50 [scrapy.utils.log] INFO: Versions: lxml 4.5.0.0, libxml2 2.9.10, cssselect 1.1.0, parsel 1.6.0, w3lib 1.22.0, Twisted 20.3.0, Python 3.7.9 | packaged by conda-forge | (default, Dec  9 2020, 21:08:20) - [GCC 9.3.0], pyOpenSSL 19.1.0 (OpenSSL 1.1.1d  10 Sep 2019), cryptography 2.8, Platform Linux-4.15.0-72-generic-x86_64-with-debian-stretch-sid
2021-01-28 03:43:50 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.epollreactor.EPollReactor
2021-01-28 03:43:50 [scrapy.crawler] INFO: Overridden settings:
{'BOT_NAME': 'scrapy_mini_project',
 'DUPEFILTER_CLASS': 'scrapy.dupefilters.BaseDupeFilter',
 'LOGSTATS_INTERVAL': 0,
 'NEWSPIDER_MODULE': 'scrapy_mini_project.spiders',
 'ROBOTSTXT_OBEY': True,
 'SPIDER_MODULES': ['scrapy_mini_project.spiders']}
2021-01-28 03:43:50 [scrapy.extensions.telnet] INFO: Telnet Password: 22f34dce7bd2314a
2021-01-28 03:43:50 [scrapy.middleware]

In [None]:
# !response.css('title::text').getall() #need to  run this in terminal

In [None]:
# !response.css('title').getall() #need to  run this in terminal

In [9]:
# !response.css('title::text').re(r'Quotes.*') #need to  run this in terminal

In [6]:
# !response.xpath('//title') #need to  run this in terminal

In [7]:
# !response.xpath('//title/text()').get() #need to  run this in terminal

In [4]:
#do this inorder to run in Jupyter (otherwise run this in shell to get response object: scrapy shell 'http://quotes.toscrape.com/page/1/' )

import requests
import scrapy

res = requests.get('http://quotes.toscrape.com/page/1/')
response = scrapy.http.TextResponse(res.url, body=res.text, encoding='utf-8')

In [None]:
response.css("div.quote").getall()

#### Extracting quotes and authors

In [3]:
for quote in response.css("div.quote"):
    text = quote.css("span.text::text").get()
    author = quote.css("small.author::text").get()
    tags = quote.css("div.tags a.tag::text").getall()
    print(dict(text=text, author=author, tags=tags))

{'text': '“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”', 'author': 'Albert Einstein', 'tags': ['change', 'deep-thoughts', 'thinking', 'world']}
{'text': '“It is our choices, Harry, that show what we truly are, far more than our abilities.”', 'author': 'J.K. Rowling', 'tags': ['abilities', 'choices']}
{'text': '“There are only two ways to live your life. One is as though nothing is a miracle. The other is as though everything is a miracle.”', 'author': 'Albert Einstein', 'tags': ['inspirational', 'life', 'live', 'miracle', 'miracles']}
{'text': '“The person, be it gentleman or lady, who has not pleasure in a good novel, must be intolerably stupid.”', 'author': 'Jane Austen', 'tags': ['aliteracy', 'books', 'classic', 'humor']}
{'text': "“Imperfection is beauty, madness is genius and it's better to be absolutely ridiculous than absolutely boring.”", 'author': 'Marilyn Monroe', 'tags': ['be-yourself', 'inspirational']}


## Extracting Data Using Spider

In [12]:
os.chdir('./scrapy_mini_project')

In [16]:
# !scrapy crawl quotes_extract
!scrapy crawl quotes_extract -o 'quotes_extract.json'

2021-01-28 06:06:21 [scrapy.utils.log] INFO: Scrapy 2.4.1 started (bot: scrapy_mini_project)
2021-01-28 06:06:21 [scrapy.utils.log] INFO: Versions: lxml 4.5.0.0, libxml2 2.9.10, cssselect 1.1.0, parsel 1.6.0, w3lib 1.22.0, Twisted 20.3.0, Python 3.7.9 | packaged by conda-forge | (default, Dec  9 2020, 21:08:20) - [GCC 9.3.0], pyOpenSSL 19.1.0 (OpenSSL 1.1.1d  10 Sep 2019), cryptography 2.8, Platform Linux-4.15.0-72-generic-x86_64-with-debian-stretch-sid
2021-01-28 06:06:21 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.epollreactor.EPollReactor
2021-01-28 06:06:21 [scrapy.crawler] INFO: Overridden settings:
{'BOT_NAME': 'scrapy_mini_project',
 'NEWSPIDER_MODULE': 'scrapy_mini_project.spiders',
 'ROBOTSTXT_OBEY': True,
 'SPIDER_MODULES': ['scrapy_mini_project.spiders']}
2021-01-28 06:06:21 [scrapy.extensions.telnet] INFO: Telnet Password: 06f7290fa47719f5
2021-01-28 06:06:21 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.exten

In [None]:
!scrapy crawl quotes_extract_rec_to_scrap_css -o css-scraper-results.json

## Save extracted json data into SQLlite

In [5]:
os.chdir('./scrapy_mini_project')
!pwd

/notebooks/mec-5.5.4-webscraping-project/scrapy_mini_project


In [6]:
import sqlite3
from sqlite3 import Error

In [120]:
database = r"./pythonsqlite.db"

# create a database connection
conn = sqlite3.connect(database) # creates a new db (if it doesn't exist)

#create a cursor object
curr = conn.cursor()

In [121]:
# !sqlite3 ./pythonsqlite.db

In [122]:
#create a quotes table
sql_create_quotes_table = """ CREATE TABLE IF NOT EXISTS quotes (
                                    id integer PRIMARY KEY,
                                    Description text NOT NULL,
                                    Author text NOT NULL,
                                    Tags text
                                ); """
    
curr.execute(sql_create_quotes_table) #to create a table

<sqlite3.Cursor at 0x7fd9377f0a40>

In [123]:
curr.description

In [124]:
# #insert data into the quote table (playground)
# sql_insert_data_quotes = """ INSERT INTO quotes(id, Description, Author, Tags) 
#                             VALUES (1,'description 1','author 1','tag 1')"""
# curr = conn.cursor()
# curr.execute(sql_insert_data_quotes)
# conn.commit() #to make our changes persistent in the database.
# last_rowid = curr.lastrowid #get the number of rows affected

In [None]:
# #query the database
# curr.execute('select * from quotes') #to run a query
# results = curr.fetchall() #to get qury results
# print(results)

In [27]:
# curr.execute('drop table quotes;') #to delete table

<sqlite3.Cursor at 0x7fd936b79180>

In [113]:
# #To delete entries from table
# for i in range(100):
#     sql_delete_query = """DELETE from quotes where id = %d""" %i
#     curr.execute(sql_delete_query)
#     conn.commit()

In [129]:
import json

def copy_data_from_json_to_db():
    #copy data from *.json into the database
    with open('css-scraper-results.json','r') as f:
        data = json.load(f)

    sql_base_cmnd = "INSERT INTO quotes(id, Description, Author, Tags) VALUES"
    for idx, d in enumerate(data):
        description = d['text'].replace('\'', '')
        author = d['author'].replace('\'', '')
        tags = ' '.join(d['tags'])

        sql_cmnd = f"{sql_base_cmnd}({idx}, '{description[1:-1]}', '{author}', '{tags}')"
        #sql_cmnd = f"{sql_base_cmnd}({idx}, {description[1:-1]}, {author}, {tags})" #doesn't work
        #print(sql_cmnd)
        curr.execute(sql_cmnd)
        conn.commit() #to make our changes persistent in the database.

    last_rowid = curr.lastrowid #get the number of rows affected
    return last_word

# copy_data_from_json_to_db()

In [125]:
last_rowid

99

In [None]:
curr.execute('select * from quotes') #to run a query
results = curr.fetchall() #to get qury results
print(results)

#### Close the database

In [127]:
curr.close()
conn.close()