In [1]:
import scrapy
from scrapy.crawler import CrawlerProcess
import re
import pandas as pd

In [2]:
class ESSpider(scrapy.Spider):
    """Scrape the posts listed on everydaysexism.com.

    Starting from ``start_urls``, every ``<article>`` element on a page is
    yielded as a dict with the keys ``name``, ``date``, ``text`` and ``tags``.
    The spider then follows the link inside the ``nav-previous`` div (which it
    treats as the next page to scrape) until the page number found in that
    link exceeds ``max_pages`` or no such link is present.
    """

    # Naming the spider is important if you are running more than one spider of
    # this class simultaneously
    name = "ESS"

    # URL(s) to start with
    start_urls = [
        'http://www.everydaysexism.com',
    ]

    # There are a LOT of pages here.  For our example, we'll just scrape the
    # first 9.  Override this attribute to crawl deeper.
    max_pages = 9

    def parse(self, response):
        """Yield one dict per article on the page, then a request for the next page.

        Args:
            response: the Scrapy response for a listing page.

        Yields:
            dict: ``name`` and ``date`` (first match or ``None``), plus ``text``
                and ``tags`` (lists of every matching text node).
            scrapy.Request: a follow-up request handled by this same method,
                emitted only while the linked page number is within
                ``max_pages``.
        """
        # Iterate over every <article> element on the page
        for article in response.xpath('//article'):

            # Yield a dictionary with the values we want
            yield {
                'name': article.xpath('header/h2/a/@title').extract_first(),
                'date': article.xpath('header/section/span[@class="entry-date"]/text()').extract_first(),
                'text': article.xpath('section[@class="entry-content"]/p/text()').extract(),
                'tags': article.xpath('*/span[@class="tag-links"]/a/text()').extract()
            }

        # Get the URL of the page to scrape next (the "nav-previous" link)
        next_page = response.xpath('//div[@class="nav-previous"]/a/@href').extract_first()

        # extract_first() returns None when the link is absent (e.g. on the
        # last page); check this BEFORE handing the value to re.findall, which
        # would otherwise raise a TypeError.
        if next_page is None:
            return

        # The first run of digits in the link is taken as its page number.
        page_numbers = re.findall(r'\d+', next_page)
        if not page_numbers:
            # Without a page number the cap cannot be enforced, so stop here
            # rather than risk crawling the whole site.
            return

        # Request the next page and recursively parse it the same way we did
        # above, as long as we stay within the page cap.
        if int(page_numbers[0]) <= self.max_pages:
            yield scrapy.Request(response.urljoin(next_page), callback=self.parse)

In [3]:
# Scrapy's local-file feed storage appends to an existing file by default, so
# a data.json left over from an earlier run would end up holding two
# concatenated JSON arrays that pd.read_json cannot parse.  Truncate the feed
# file first so every run of this cell produces a single valid document.
with open('data.json', 'w'):
    pass

# Instantiate our crawler.
process = CrawlerProcess({
    'FEED_FORMAT': 'json',         # Store data in JSON format
    'FEED_URI': 'data.json',       # Name our storage file
    'LOG_ENABLED': False,          # Turn off logging for now
    'ROBOTSTXT_OBEY': True,        # Respect the site's robots.txt
    'USER_AGENT': 'ThinkfulDataScienceBootcampCrawler (thinkful.com)',
    'AUTOTHROTTLE_ENABLED': True,  # Let Scrapy adapt the request rate to the server
    'HTTPCACHE_ENABLED': True      # Cache responses so repeat runs don't re-hit the site
})


# Start the crawler with our spider.
# NOTE: process.start() blocks until the crawl finishes; the underlying
# Twisted reactor cannot be started twice, so re-running this cell requires
# restarting the kernel.
process.crawl(ESSpider)
process.start()
print('Success!')

Success!


In [4]:
# Load the scraped feed back in; the row count shows whether all 9 pages
# were collected.
ESSdf = pd.read_json(
    path_or_buf='data.json',
    orient='records',
)
print(ESSdf.shape)

# Preview the first few scraped records
ESSdf.head()

(90, 4)


Unnamed: 0,name,date,text,tags
0,Katie,2020-05-06,[I was raped by my female lover in the same ro...,"[#lesbianrape #invisabletopic, School]"
1,Anna,2020-05-06,[This happened a few months ago but I remember...,[Media]
2,Still Affected,2020-05-06,[I was 14 doing a Saturday job in a garden cen...,"[Underage, Workplace]"
3,Emilia,2020-05-05,[When I was a smaller I loved to play sports. ...,"[#child #genderabusers #trauma, Home, School]"
4,Other girls,2020-05-05,"[I’m a twenty-year-old woman, and I have a lot...","[Home, University]"
