In [1]:
# This let's me iterate faster by deleting it at the beginning
import os
os.remove('../nursing_pro.json')

# Importing in each cell because of the kernel restarts.
import scrapy
from scrapy.crawler import CrawlerProcess

class EssayItem(scrapy.Item):
    subject = scrapy.Field()
    title = scrapy.Field()
    text = scrapy.Field()

class NSpider(scrapy.Spider):
    # Naming the spider is important if you are running more than one spider of
    # this class simultaneously.
    name = "NS" # for nursing spider
    
    # URL(s) to start with.
    start_urls = [
        'https://www.nursingwritingservices.com/samples?limitstart=0',
    ]

    # Use XPath to parse the response from the start_urls we declared.
    def parse(self, response):
        # Iterate snippets on each page.
        for essay in response.xpath('//header[@class="entry-header"]/h2/a'):
            # Create our subject and title from this top-level page
            subject = 'Nursing'
            title = essay.xpath('text()').extract_first()
            
            # Create an EssayItem called essay_item, empty for now
            essay_item = EssayItem()

            # Create new url for parse_essays to use
            essay_url = response.urljoin(essay.xpath('@href').extract_first())

            # New request with essay_url, pass in our essay_item
            yield scrapy.Request(essay_url, callback=self.parse_essays, 
                                 dont_filter=True,
                                 meta={'item':essay_item,
                                       'subject':subject,
                                       'title':title})
        if response.xpath('//a[@title="Next"]/@href').extract_first():
            current_url = response.url
            next_url = current_url[:-1] + str(5 + int(current_url[-1]))
            yield scrapy.Request(next_url, callback=self.parse,
                                 dont_filter=True)
            
    def parse_essays(self, response):
        # Retrieve essay items from metadata
        essay_item = response.request.meta['item']
        subject = response.request.meta['subject']
        title = response.request.meta['title']
        
        # Capture each paragraph as a list
        text = response.xpath('//span[@itemprop="articleBody"]/p/span//text()').extract()
        
        # Pack all of our variables into out essay_item
        essay_item['subject'] = subject
        essay_item['title'] = title
        essay_item['text'] = text
        
        # This one should return the actual text, along with the rest of the item fields
        yield essay_item        
        
        
# Tell the script how to run the crawler by passing in settings.
process = CrawlerProcess({
    'FEED_FORMAT': 'json',         # Store data in JSON format.
    'FEED_URI': '../nursing_pro.json',  # Name our storage file.
    'LOG_ENABLED': False,           # Turn off logging for now.
    'AUTOTHROTTLE_ENABLED' : True,
    'HTTPCACHE_ENABLED' : True,
    'ROBOTSTXT_ENABLED' : False,
    'DOWNLOAD_DELAY' : 2
})

# Start the crawler with our spider.
process.crawl(NSpider)
process.start()
print('Success!')

import pandas as pd

nursing_pro = pd.read_json('../nursing_pro.json', orient='records', encoding='latin')
nursing_pro['student'] = 0
print(nursing_pro.shape)
nursing_pro.head()

Success!
(20, 4)


Unnamed: 0,subject,text,title,student
0,Nursing,[Treatment Alternatives and Stepwise Managemen...,\n\t\t\tASTHMA AND STEPWISE MANAGEMENT,0
1,Nursing,[The basic theoretical assertions or propositi...,\n\t\t\tNursing Theories,0
2,Nursing,[Discuss why a nursing center should be starte...,\n\t\t\tThe Nursing Center and Nurse Practitio...,0
3,Nursing,"[For a majority of women, missing a menstrual ...",\n\t\t\tNursing Care of the Family During Preg...,0
4,Nursing,[The Health Belief Model is the most used theo...,\n\t\t\tThe Health Belief Model,0


In [2]:
nursing_pro.dropna(inplace=True)
print(len(nursing_pro))
nursing_pro.head()

20


Unnamed: 0,subject,text,title,student
0,Nursing,[Treatment Alternatives and Stepwise Managemen...,\n\t\t\tASTHMA AND STEPWISE MANAGEMENT,0
1,Nursing,[The basic theoretical assertions or propositi...,\n\t\t\tNursing Theories,0
2,Nursing,[Discuss why a nursing center should be starte...,\n\t\t\tThe Nursing Center and Nurse Practitio...,0
3,Nursing,"[For a majority of women, missing a menstrual ...",\n\t\t\tNursing Care of the Family During Preg...,0
4,Nursing,[The Health Belief Model is the most used theo...,\n\t\t\tThe Health Belief Model,0


In [3]:
import pickle

pickle.dump(nursing_pro, open('../nursing_pro.pkl', 'wb'))

That was part one of professional sample scraping. I (might) need to find a crap-ton more. I do not know if I should try to balance my dataset so that it matches real-world distributions or if this would work.