In [1]:
# This lets me iterate faster by deleting the old output file at the beginning
import os
# os.remove('../uk_students.json')

# Importing in each cell because of the kernel restarts.
import scrapy
from scrapy.crawler import CrawlerProcess

class EssayItem(scrapy.Item):
    """Scrapy item holding one scraped essay.

    The fields are filled in by ``SESpider.collect_essay``.
    """
    # Subject label taken from the link text on the top-level essays page.
    subject = scrapy.Field()
    # Essay title taken from the link text on the subject listing page.
    title = scrapy.Field()
    # List of text fragments extracted from the essay page's <p> elements.
    text = scrapy.Field()

class SESpider(scrapy.Spider):
    """Student essay spider for ukessays.com.

    Crawl flow:
      1. ``parse``          - start page; picks the subject links to follow.
      2. ``parse_essays``   - subject listing page; follows each essay link
                              and the next listing page (up to ``max_pages``).
      3. ``collect_essay``  - essay page; yields one ``EssayItem``.

    The subject and title are carried between callbacks in ``Request.meta``.
    """
    # Naming the spider is important if you are running more than one spider of
    # this class simultaneously.
    name = "SES"  # for student essay spider

    # URL(s) to start with.
    start_urls = [
        'https://www.ukessays.com/essays/',
    ]

    # Only pull essays from the "Big Three". The values are compared against
    # the raw link text, so the trailing space is significant
    # (presumably the site's link text ends with a space - verify if the
    # spider stops matching any subject).
    subject_list = ('Business ', 'Law ', 'Nursing ')

    # Listing pages are followed while the current page number is below this.
    max_pages = 7

    # Use XPath to parse the response from the start_urls we declared.
    def parse(self, response):
        """Yield one listing-page request per wanted subject.

        Args:
            response: response for one of ``start_urls``.

        Yields:
            scrapy.Request: request for the subject's essay listing, handled
            by ``parse_essays``, with the subject name stored in ``meta``.
        """
        # Iterate over every column element on the page.
        for subjects in response.xpath('//div[@class="col-lg-3 col-sm-6"]/div/ul/li'):
            # Extract the link text once; it is both the filter key and the
            # subject label attached to every essay found under this link.
            subject = subjects.xpath('a/text()').extract_first()
            if subject not in self.subject_list:
                continue

            href = subjects.xpath('a/@href').extract_first()
            if href is None:
                # Without an href, urljoin() would hand back the current page
                # URL and we would re-crawl it with the wrong callback.
                continue

            # dont_filter=True so the listing is fetched even if the
            # duplicate filter has already seen the URL.
            yield scrapy.Request(response.urljoin(href),
                                 callback=self.parse_essays,
                                 dont_filter=True,
                                 meta={'subject': subject})

    def parse_essays(self, response):
        """Follow every essay link on a listing page, then the next page.

        Args:
            response: response for a subject listing page; ``meta`` must
                contain ``'subject'``.

        Yields:
            scrapy.Request: one request per essay (handled by
            ``collect_essay``) and, while the active page number is below
            ``max_pages``, one request for the following listing page.
        """
        subject = response.meta['subject']

        for essay in response.xpath('//div/article/div/h4'):
            # Xpath to the essay's title
            title = essay.xpath('a/text()').extract_first()

            # Xpath into the actual essay's link
            href = essay.xpath('a/@href').extract_first()
            if href is None:
                # Heading without a link: nothing to follow.
                continue

            # This one should return the actual text, along with the rest of
            # the item fields
            yield scrapy.Request(response.urljoin(href),
                                 callback=self.collect_essay,
                                 dont_filter=False,
                                 meta={'subject': subject,
                                       'title': title})

        # After the loop, work out the current page number from the pager.
        page_text = response.xpath(
            '//li[@class="page-item active"]/span/text()').extract_first()
        try:
            page_num = int(page_text)
        except (TypeError, ValueError):
            # No pager on this page (or unexpected text): stop paginating
            # instead of raising inside the callback.
            return

        if page_num < self.max_pages:
            next_page = '?page={}'.format(page_num + 1)
            yield scrapy.Request(response.urljoin(next_page),
                                 callback=self.parse_essays,
                                 dont_filter=False,
                                 meta={'subject': subject})

    def collect_essay(self, response):
        """Build the ``EssayItem`` for a single essay page.

        Args:
            response: response for an essay page; ``meta`` must contain
                ``'subject'`` and ``'title'``.

        Yields:
            EssayItem: item with ``text`` (list of paragraph text fragments),
            ``subject`` and ``title`` populated.
        """
        # Create a fresh item per response. Sharing one item object between
        # all requests of a subject lets concurrently processed responses
        # overwrite each other's fields before the item is exported.
        essay_item = EssayItem()

        essay_item['text'] = response.xpath('//body/main/div/div/article/p/text()').extract()
        essay_item['subject'] = response.meta['subject']
        essay_item['title'] = response.meta['title']

        yield essay_item
        
        
        
# Tell the script how to run the crawler by passing in settings.
# NOTE(review): newer Scrapy releases prefer the single FEEDS setting over
# FEED_FORMAT / FEED_URI - confirm against the installed version.
process = CrawlerProcess({
    'FEED_FORMAT': 'json',               # Store data in JSON format.
    'FEED_URI': '../uk_students.json',   # Name our storage file.
    'LOG_ENABLED': False,                # Turn off logging for now.
    'AUTOTHROTTLE_ENABLED': True,
    'HTTPCACHE_ENABLED': True,
    # The robots.txt switch in Scrapy is ROBOTSTXT_OBEY; the previously used
    # 'ROBOTSTXT_ENABLED' is not a Scrapy setting and was silently ignored.
    'ROBOTSTXT_OBEY': False,
    'DOWNLOAD_DELAY': 2,
})

# Start the crawler with our spider. start() blocks until the crawl finishes.
process.crawl(SESpider)
process.start()
print('Success!')

import pandas as pd

# Load the scraped feed and tag every row as student-written (student == 1).
uk_students = pd.read_json(
    '../uk_students.json',
    orient='records',
    encoding='latin',
).assign(student=1)

# Quick sanity check: row/column counts, then a preview of the first rows.
print(uk_students.shape)
uk_students.head()

Success!
(734, 4)


Unnamed: 0,subject,text,title,student
0,Nursing,"[\n, \n, \n, \n, \n, Total Word Count: , This ...",Inquiry into Patient Death,1
1,Nursing,"[\n, \n, \n, \n, \n, Leadership has been descr...",Motivation Skills Development Plan for Nursing,1
2,Nursing,"[\n, \n, \n, \n, \n, Depression among pregnant...",Depression among Pregnant Adolescents: Literat...,1
3,Nursing,"[\n, \n, \n, \n, \n, Davidson, E., Daly, J., B...",Family Support Programme for ICU Patient Relat...,1
4,Nursing,"[\n, \n, \n, \n, \n, Idiopathic pulmonary fibr...",Idiopathic Pulmonary Fibrosis: An Overview,1


In [2]:
# Number of distinct titles (missing values counted too, as with unique()).
uk_students['title'].nunique(dropna=False)

732

In [3]:
# Audible notification that the long-running cells above have finished.
frequency = 2500  # Set Frequency To 2500 Hertz
duration = 1000  # Set Duration To 1000 ms == 1 second
try:
    import winsound
except ImportError:
    # winsound ships only with Windows builds of Python; fall back to the
    # terminal bell so the cell does not fail on other platforms.
    print('\a', end='', flush=True)
else:
    winsound.Beep(frequency, duration)

In [4]:
import pickle

# Persist the DataFrame for later notebooks. The context manager guarantees
# the file handle is flushed and closed even if pickling raises.
with open('../uk_students.pkl', 'wb') as pkl_file:
    pickle.dump(uk_students, pkl_file)

Woo!! All done with that one. On to the harder part: finding professional samples. The websites I've found generally do not post very many samples, unlike this UK Student Sample Bank. There were tens of thousands of essays to choose from, but the same website only posted 40 or so samples from their professionals. I will have to manually find a bunch of websites and make scrapers for all of them :(