In [1]:
# Delete the previous export at the beginning of every run; this lets me
# iterate faster because each crawl starts from a clean output file.
import os

try:
    os.remove('../grademiner_pro.json')
except FileNotFoundError:
    # First run (or the file was already cleaned up): nothing to delete, and
    # that must not stop the rest of the cell from running.
    pass

# Importing in each cell because of the kernel restarts.
import scrapy
from scrapy.crawler import CrawlerProcess

class EssayItem(scrapy.Item):
    """One scraped essay, as written to the JSON feed by GMSpider."""

    # Subject heading the essay was listed under; taken from the link text on
    # the start page (e.g. 'Business', 'Law', 'Nursing').
    subject = scrapy.Field()
    # Essay title; taken from the link text on the subject's listing page.
    title = scrapy.Field()
    # Essay body as a list of raw text nodes extracted from the essay page
    # (whitespace-only nodes are not filtered out).
    text = scrapy.Field()

class GMSpider(scrapy.Spider):
    """Scrape the free sample essays for a fixed set of subjects.

    The crawl is three levels deep:

    1. ``parse`` reads the start page and follows the wanted subjects.
    2. ``parse_essays`` walks a subject's paginated essay listing.
    3. ``collect_essay`` scrapes one essay page and yields an ``EssayItem``.

    Data collected on the way down (subject, title) travels with each request
    in ``Request.meta``.
    """

    # Naming the spider is important if you are running more than one spider of
    # this class simultaneously.
    name = "GMS"  # for student essay spider

    # URL(s) to start with.
    start_urls = [
        'https://grademiners.com/free-papers',
    ]

    # Only pull essays from the "Big Three".
    subject_list = ('Business', 'Law', 'Nursing')

    def parse(self, response):
        """Follow the listing page of every wanted subject on the start page.

        Yields one request per matching subject link, handled by
        ``parse_essays``, with the subject name stored in ``meta['subject']``.
        """
        # Iterate over every subject link on the page.
        subject_links = response.xpath(
            '//div[@class="col-xs-9"]/h2[@class="subjects__item-title"]/a')
        for link in subject_links:
            # Extract the link text once; it is both the filter key and the
            # value stored on the item.
            subject = link.xpath('text()').extract_first()
            if subject not in self.subject_list:
                continue

            href = link.xpath('@href').extract_first()
            if not href:
                # An anchor without an href cannot be followed; Request would
                # raise on a None URL.
                continue

            # urljoin leaves absolute URLs untouched and resolves relative ones.
            yield scrapy.Request(response.urljoin(href),
                                 callback=self.parse_essays,
                                 dont_filter=True,
                                 meta={'subject': subject})

    def parse_essays(self, response):
        """Follow every essay on a listing page, then the next listing page.

        Reads ``meta['subject']`` and forwards it, together with the essay
        title, to ``collect_essay``. Pagination requests come back to this
        same callback and stop when the page has no "next" link.
        """
        subject = response.meta['subject']

        for link in response.xpath('//h2[@class="samples__title"]/a'):
            # XPath to the essay's title.
            title = link.xpath('text()').extract_first()

            # XPath into the actual essay's link.
            href = link.xpath('@href').extract_first()
            if not href:
                continue

            # Only plain values go into meta. The item itself is built in
            # collect_essay so that every essay gets its own instance.
            yield scrapy.Request(response.urljoin(href),
                                 callback=self.collect_essay,
                                 meta={'subject': subject,
                                       'title': title})

        # Get the next page. The last page has no "next" link, in which case
        # extract_first() returns None and there is nothing left to request.
        next_href = response.xpath(
            '//a[@class="next page-numbers"]/@href').extract_first()
        if next_href:
            yield scrapy.Request(response.urljoin(next_href),
                                 callback=self.parse_essays,
                                 meta={'subject': subject})

    def collect_essay(self, response):
        """Build and yield the ``EssayItem`` for a single essay page.

        ``text`` is the list of all text nodes below the essay's content
        wrapper; ``subject`` and ``title`` come from the request's meta.
        """
        # Collect every text node of the essay body (not just one paragraph).
        text_nodes = response.xpath(
            '//div[@class="sample__content-wrap"]//text()').extract()

        # A fresh item per response: previously one item object was shared by
        # all requests of a subject, so concurrent callbacks overwrote each
        # other's fields.
        yield EssayItem(text=text_nodes,
                        subject=response.meta['subject'],
                        title=response.meta['title'])
        
        
        
# Tell the script how to run the crawler by passing in settings.
process = CrawlerProcess({
    'FEED_FORMAT': 'json',                 # Store data in JSON format.
    'FEED_URI': '../grademiner_pro.json',  # Name our storage file.
    'LOG_ENABLED': False,                  # Turn off logging for now.
    'AUTOTHROTTLE_ENABLED': True,          # Adapt the crawl speed to the server.
    'HTTPCACHE_ENABLED': True,             # Cache responses between runs.
    # The setting is called ROBOTSTXT_OBEY; the previous key
    # 'ROBOTSTXT_ENABLED' is not a Scrapy setting and was silently ignored.
    'ROBOTSTXT_OBEY': False,
    'DOWNLOAD_DELAY': 2,                   # Seconds to wait between requests.
})

# Start the crawler with our spider. start() blocks until the crawl is done.
process.crawl(GMSpider)
process.start()
# NOTE: this prints unconditionally; with logging disabled a failed crawl is
# only noticeable from the size of the exported file.
print('Success!')

import pandas as pd

# Load the feed written by the crawler: a JSON array with one record per essay.
# JSON is UTF-8 (or pure ASCII with \u escapes), so decode it as UTF-8;
# 'latin' would garble any non-ASCII character written unescaped.
grademiner_pro = pd.read_json('../grademiner_pro.json', orient='records',
                              encoding='utf-8')
# Label column; presumably 0 marks these as non-student (professional)
# samples -- confirm against the notebook that builds the student set.
grademiner_pro['student'] = 0
print(grademiner_pro.shape)
grademiner_pro.head()

Success!
(59, 4)


Unnamed: 0,subject,text,title,student
0,Nursing,"[\n , Introduction, \r\...",Patient safety in the operating room,0
1,Nursing,"[\n , Statement of the ...",Pain Management and Alternative Therapies,0
2,Nursing,[\n The purpose of vent...,The purpose of ventilatory management,0
3,Business,[\n Family owned busine...,Family Business Succession,0
4,Business,"[\n , Business Case Ana...",Business case report essay,0


In [3]:
# Audible notification that the long-running cell above has finished.
frequency = 2500  # Set Frequency To 2500 Hertz
duration = 1000  # Set Duration To 1000 ms == 1 second
try:
    import winsound
except ImportError:
    # winsound only exists on Windows. Fall back to the terminal bell so the
    # notebook does not fail on other platforms because of a notification.
    print('\a', end='', flush=True)
else:
    winsound.Beep(frequency, duration)

In [2]:
import pickle

# Persist the DataFrame for later notebooks. The context manager guarantees
# the file is flushed and closed even if pickling fails part-way.
with open('../grademiner_pro.pkl', 'wb') as pkl_file:
    pickle.dump(grademiner_pro, pkl_file)

Woo!! All done with that one. On to the harder part: finding professional samples. The websites I've found generally do not post very many samples, unlike this UK Student Sample Bank. There were tens of thousands of essays to choose from, but the same website only posted 40 or so samples from their professionals. I will have to manually find a bunch of websites and make scrapers for all of them :(