In [1]:
# This lets me iterate faster: delete the previous feed file up front so each
# run starts from a clean output file.
import os
try:
    os.remove('../uk_pro.json')
except FileNotFoundError:
    # Nothing to clean up (first run, or the last crawl wrote no file).
    pass

# Importing in each cell because of the kernel restarts.
import scrapy
from scrapy.crawler import CrawlerProcess

class EssayItem(scrapy.Item):
    """One scraped essay, as filled in by ``SESpider.parse_essays``."""

    # Value of the ``id`` attribute of the subject heading on the index page.
    subject = scrapy.Field()
    # First title text found on the essay page (None when nothing matches).
    title = scrapy.Field()
    # List of text fragments, one per matched paragraph text node.
    text = scrapy.Field()

class SESpider(scrapy.Spider):
    """Spider that collects sample essays from the UK Essays samples page.

    ``parse`` walks the essay links on the index page and schedules one
    request per link; ``parse_essays`` turns each essay page into an
    ``EssayItem`` with ``subject``, ``title`` and ``text`` filled in.
    """

    # Scrapy identifies a spider by its name, so it should be unique among
    # the spiders being run.
    name = "SES"  # for student essay spider

    # URL(s) to start with.
    start_urls = [
        'https://www.ukessays.com/services/samples/',
    ]

    def parse(self, response):
        """Yield one essay-page request per link found on the index page.

        Args:
            response: Response for one of ``start_urls``.

        Yields:
            ``scrapy.Request`` objects whose callback is ``parse_essays`` and
            whose ``meta`` carries an empty ``EssayItem`` (``'item'``) and the
            subject id (``'subject'``).
        """
        # div[1]..div[3] of the second <section> are visited; each one is
        # paired with the h2 of the same index for its subject.
        for i in range(1, 4):
            for essay in response.xpath('//main/section[2]/div/div[{}]/a'.format(i)):
                href = essay.xpath('@href').extract_first()
                if not href:
                    # Joining a missing/empty href resolves back to the index
                    # URL, which would then be scraped as if it were an essay.
                    continue

                # Subject comes from the matching heading on this index page.
                subject = essay.xpath('../../h2[{}]/@id'.format(i)).extract_first()

                # dont_filter=True bypasses Scrapy's duplicate-request filter,
                # so a URL linked more than once is fetched each time.
                yield scrapy.Request(
                    response.urljoin(href),
                    callback=self.parse_essays,
                    dont_filter=True,
                    meta={'item': EssayItem(), 'subject': subject},
                )

    def parse_essays(self, response):
        """Fill in and yield the ``EssayItem`` for a single essay page.

        Args:
            response: Response for an essay page; its ``meta`` must contain
                the ``'item'`` and ``'subject'`` entries set by ``parse``.

        Yields:
            The populated ``EssayItem``. ``title`` is None when the title
            XPath matches nothing; ``text`` is a (possibly empty) list.
        """
        # response.meta is a shortcut for response.request.meta.
        essay_item = response.meta['item']

        essay_item['subject'] = response.meta['subject']
        # First matching <h2> text node is taken as the essay title.
        essay_item['title'] = response.xpath(
            '//main/section/div/div/div/h2/text()').extract_first()
        # Direct text nodes of each matched <p>, kept as a list.
        essay_item['text'] = response.xpath(
            '//main/section/div/div/div/p/text()').extract()

        yield essay_item
        
        
# Tell the script how to run the crawler by passing in settings.
process = CrawlerProcess({
    'FEED_FORMAT': 'json',          # Store data in JSON format.
    'FEED_URI': '../uk_pro.json',   # Name our storage file.
    'LOG_ENABLED': False,           # Turn off logging for now.
    'AUTOTHROTTLE_ENABLED': False,
    'HTTPCACHE_ENABLED': True,
    # The Scrapy setting is ROBOTSTXT_OBEY; 'ROBOTSTXT_ENABLED' is not a
    # recognised setting name and was silently ignored.
    'ROBOTSTXT_OBEY': False,
    'DOWNLOAD_DELAY': 1.5,          # Seconds to wait between requests.
})

# Start the crawler with our spider. start() blocks until the crawl finishes.
process.crawl(SESpider)
process.start()
# NOTE: printed unconditionally once start() returns; with logging disabled,
# check the feed file to confirm that items were actually scraped.
print('Success!')

import pandas as pd

# Load the scraped feed. JSON text is UTF-8 (Scrapy's default JSON export is
# ASCII-escaped, a subset of it); decoding as latin-1 would garble any
# non-ASCII bytes.
uk_pro = pd.read_json('../uk_pro.json', orient='records', encoding='utf-8')
# Label column; presumably 0 marks professionally written samples (TODO confirm).
uk_pro['student'] = 0
print(uk_pro.shape)
uk_pro.head()

Success!
(39, 4)


Unnamed: 0,subject,text,title,student
0,nursing,"[In 2017, the government will end NHS bursarie...",What is the Impact of Cutting down Nursing Stu...,0
1,nursing,[There is increasing prevalence of people with...,Discuss the impact of living with a long term ...,0
2,nursing,"[Leadership consists of various qualities, ski...",Discuss what effective leadership is and argue...,0
3,nursing,[This essay will critically explore the impact...,What is the Impact of Cutting down Nursing Stu...,0
4,law,"[Dworkin, R, 'Taking Rights Seriously' (1977) ...",Should judges aspire to be 'formalist' or 'nat...,0


In [2]:
import pickle

# Persist the DataFrame; the context manager closes (and flushes) the file
# handle even if pickling raises.
with open('../uk_pro.pkl', 'wb') as pkl_file:
    pickle.dump(uk_pro, pkl_file)

That was part one of scraping professional essay samples. I may need to find many more. I am not sure whether I should balance my dataset so that the class proportions match real-world distributions, or whether the samples collected so far would work as they are.