In [8]:
# text cleaning
import html
import re

# scrape webpage
import scrapy
from scrapy.crawler import CrawlerProcess

class QuotesToCsv(scrapy.Spider):
    """Scrape Maynard James Keenan quotes from Wikiquote into ``quotes.csv``.

    Every top-level ``<li>`` directly under ``div.mw-parser-output > ul`` on the
    start page is yielded as raw HTML. The ``ExtractFirstLine`` pipeline
    (looked up as ``__main__.ExtractFirstLine``, so it must be defined in the
    running notebook/script) post-processes each item, and the feed exporter
    writes the results to ``quotes.csv``, overwriting any existing file.
    """
    name = "MJKQuotesToCsv"
    start_urls = [
        'https://en.wikiquote.org/wiki/Maynard_James_Keenan',
    ]
    custom_settings = {
        # Pipeline is referenced by import path; priority 1 runs it first.
        'ITEM_PIPELINES': {
            '__main__.ExtractFirstLine': 1
        },
        # CSV feed export; 'overwrite' replaces the file on every run.
        'FEEDS': {
            'quotes.csv': {
                'format': 'csv',
                'overwrite': True
            }
        }
    }

    def parse(self, response):
        """Yield one ``{'quote': <raw li HTML>}`` item per top-level list entry.

        Args:
            response: the downloaded page for one of ``start_urls``.

        Yields:
            dict: ``{'quote': str}`` holding the serialized ``<li>`` element,
            HTML tags included.
        """
        for quote in response.css('div.mw-parser-output > ul > li'):
            # `.get()` is the current spelling of the legacy `.extract()` on a
            # single Selector; both return the serialized element as a string.
            yield {'quote': quote.get()}

class ExtractFirstLine(object):
    """Item pipeline that reduces a scraped quote to its first line of plain text.

    The incoming item is expected to carry raw HTML under the ``'quote'`` key.
    Only the first line of that HTML is kept; tags are stripped and HTML
    entities (``&amp;``, ``&lt;`` ...) are decoded to their characters.
    """

    # Compiled once at class-definition time instead of on every item.
    _HTML_TAG_RE = re.compile('<.*?>')

    def process_item(self, item, spider):
        """Return a new item holding the cleaned first line of the quote.

        Args:
            item: mapping (or anything ``dict()`` accepts) with a ``'quote'``
                string value.
            spider: the spider that produced the item (unused).

        Returns:
            dict: ``{'quote': <first line without tags, entities decoded>}``.
            An empty quote string yields ``{'quote': ''}``.

        Raises:
            KeyError: if the item has no ``'quote'`` key.
        """
        lines = dict(item)["quote"].splitlines()
        # "".splitlines() is [], so guard against IndexError on empty quotes.
        first_line = self.__remove_html_tags__(lines[0]) if lines else ''

        return {'quote': first_line}

    def __remove_html_tags__(self, text):
        """Strip ``<...>`` tags from ``text`` and decode HTML entities.

        Tags are removed before entities are decoded so that an escaped
        ``&lt;b&gt;`` survives as the literal text ``<b>`` instead of being
        mistaken for markup.
        """
        without_tags = self._HTML_TAG_RE.sub('', text)
        return html.unescape(without_tags)


In [9]:
# Create a crawler process (no settings passed here; the spider supplies its own
# via QuotesToCsv.custom_settings), schedule the spider, then start the crawl.
# NOTE(review): the recorded output of this cell contains `ReactorAlreadyRunning`,
# so it appears to have been executed while a Twisted reactor was already running
# in this kernel — presumably a kernel restart is needed before re-running; confirm.
process = CrawlerProcess()
process.crawl(QuotesToCsv)  # schedule the spider class defined in the previous cell
process.start()  # kicks off the crawl; see the log output below

2023-01-02 15:55:09 [scrapy.utils.log] INFO: Scrapy 2.6.2 started (bot: scrapybot)
2023-01-02 15:55:09 [scrapy.utils.log] INFO: Versions: lxml 4.9.1.0, libxml2 2.9.14, cssselect 1.2.0, parsel 1.6.0, w3lib 1.21.0, Twisted 22.2.0, Python 3.9.13 (main, Aug 25 2022, 23:51:50) [MSC v.1916 64 bit (AMD64)], pyOpenSSL 22.0.0 (OpenSSL 1.1.1q  5 Jul 2022), cryptography 37.0.1, Platform Windows-10-10.0.22621-SP0
2023-01-02 15:55:09 [scrapy.crawler] INFO: Overridden settings:
{}
2023-01-02 15:55:09 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.selectreactor.SelectReactor
2023-01-02 15:55:09 [scrapy.extensions.telnet] INFO: Telnet Password: acd1825619af7039
2023-01-02 15:55:09 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions.logstats.LogStats']
2023-01-02 15:55:09 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloaderm

ReactorAlreadyRunning: 

2023-01-02 15:55:10 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://en.wikiquote.org/wiki/Maynard_James_Keenan> (referer: None)
2023-01-02 15:55:10 [scrapy.core.scraper] DEBUG: Scraped from <200 https://en.wikiquote.org/wiki/Maynard_James_Keenan>
{'quote': "Tool is not Slayer. I went to art school. I spent three years in the military. There's more to me than throwing devil horns."}
2023-01-02 15:55:10 [scrapy.core.scraper] DEBUG: Scraped from <200 https://en.wikiquote.org/wiki/Maynard_James_Keenan>
{'quote': 'I think there’s a reason why wine figures into so many religions. There’s something transcendent about it. It’s sort of the way that music is more than the sum of its parts. You have all these elements that make up the terroir that wine can communicate.'}
2023-01-02 15:55:10 [scrapy.core.scraper] DEBUG: Scraped from <200 https://en.wikiquote.org/wiki/Maynard_James_Keenan>
{'quote': "You can grow grapes in almost any part of the world. You just have to develop your palate en