In [13]:
import scrapy
import time
import json
import logging
import pandas as pd
from scrapy.crawler import CrawlerProcess
from bs4 import BeautifulSoup

In [2]:
# Activity IDs to scrape are taken from the 'Activity ID' column of the MISLE
# incident-investigations workbook. For a quick trial run, a short hand-picked
# list can be substituted instead, e.g. ['3673761', '3662467', '3669435'].
activity_workbook = '../data/misle/MISLE Incident Investigations DT.xlsx'
activity_data = pd.read_excel(activity_workbook)
activity_list = activity_data.loc[:, 'Activity ID'].tolist()

In [19]:
# Sanity check: number of activity IDs loaded into activity_list.
print(len(activity_list))

3393


In [20]:
# Load the briefs already saved on disk. This is the same path the MISLE crawl
# cell uses as its FEED_URI, so the frame reflects an earlier crawl run.
scrape_path = '../data/misle/scrape/misle-scraped-brief.json'
misle_scrape = pd.read_json(scrape_path)

print(misle_scrape)

    activity_id                                     incident_brief
0       3743747  At approximately 1925 on May 11th, 2010 the M/...
1       3748349  UTV Ocie Clark had a buoy stuck in the starboa...
2       3709328  M/V Jean Akin reported alliding with the Sherm...
3       3720297  Campbell transportation advised the USCG that ...
4       3697617  On 16Mar2010 at 2230L, the UTV SANDY DRAKE was...
..          ...                                                ...
73      3757646  Received a report of the UTV James H. Hunter a...
74      3766718  While sliding barges on board the M/V Sally Br...
75      3795297  05 MAY 2010, The M/V GINNY STONE caught an obj...
76      3732642  UTV Hazel (Crounse Corp) allided with the Clar...
77      3744432  Pikes Island L/D operator advised the UTV MONG...

[78 rows x 2 columns]


In [7]:
def getData(cssID, soup):
    """Return the text of the element whose ``id`` attribute equals ``cssID``.

    Args:
        cssID: value of the ``id`` attribute to look up.
        soup: object exposing ``find(id=...)`` (a parsed BeautifulSoup tree).

    Returns:
        The matched element's ``.text`` (content without HTML tags), or an
        empty string when no element with that id exists.
    """
    element = soup.find(id=cssID)
    return '' if element is None else element.text
    
# In-memory accumulator: MISLEViewStateSpider.parse_results appends every
# scraped record here, in addition to yielding it to Scrapy's feed export.
briefs = []

class MISLEViewStateSpider(scrapy.Spider):
    """Scrape the incident brief for a single MISLE activity from the USCG IIR site.

    The crawl is a chain of three requests against the same ASP.NET page:

    1. GET the search form (``start_urls``) to obtain its hidden state fields.
    2. POST the search form filtered by activity number (``parse``).
    3. POST a postback targeting ``GridViewIIR$ctl02$ReportButton``
       (``parse_activity``) -- presumably the report button of the first
       result row; confirm against the live page.

    The final response is read by ``parse_results``. Each scraped record is
    yielded as an item and also appended to the module-level ``briefs`` list.
    """

    name = 'misle-viewstate'
    search_url = 'https://cgmix.uscg.mil/IIR/IIRSearch.aspx'
    start_urls = [search_url]
    download_delay = 1.5

    def __init__(self, activity_id=None, **kwargs):
        """Create a spider for one activity.

        Args:
            activity_id: MISLE activity number to look up. Any value is
                accepted and stored as ``str`` because form data must be text
                (IDs read from a spreadsheet are often integers).
            **kwargs: forwarded to ``scrapy.Spider.__init__``.
        """
        # Let the base class run its own initialisation (name check, spider
        # arguments) instead of silently skipping it.
        super().__init__(**kwargs)
        self.activity_id = None if activity_id is None else str(activity_id)

    @staticmethod
    def _hidden_field(response, field_id):
        """Return the ``value`` attribute of ``input#<field_id>`` (``None`` if absent)."""
        return response.css('input#%s::attr(value)' % field_id).extract_first()

    def parse(self, response):
        """Submit the search form for ``self.activity_id``."""
        if self.activity_id is None:
            # Without an ID there is nothing to search for; bail out with a
            # clear message rather than failing while encoding the form.
            self.logger.error('MISLEViewStateSpider started without an activity_id')
            return

        yield scrapy.FormRequest(
            self.search_url,
            formdata={
                '__EVENTVALIDATION': self._hidden_field(response, '__EVENTVALIDATION'),
                'TextBoxActivityNumber': self.activity_id,
                'DropDownListVesselService': 'ALL',
                'TextBoxFromDate': '01/01/2010',
                'TextBoxToDate': '10/16/2019',
                'ButtonSearch': 'Search',
                '__VIEWSTATE': self._hidden_field(response, '__VIEWSTATE'),
            },
            callback=self.parse_activity)

    def parse_activity(self, response):
        """Trigger the report postback from the search-results page."""
        yield scrapy.FormRequest(
            self.search_url,
            formdata={
                '__EVENTVALIDATION': self._hidden_field(response, '__EVENTVALIDATION'),
                '__VIEWSTATEGENERATOR': self._hidden_field(response, '__VIEWSTATEGENERATOR'),
                '__EVENTTARGET': 'GridViewIIR$ctl02$ReportButton',
                '__VIEWSTATE': self._hidden_field(response, '__VIEWSTATE'),
            },
            callback=self.parse_results)

    def parse_results(self, response):
        """Extract the activity number and incident brief from the report page.

        Yields one dict with keys ``activity_id`` and ``incident_brief``. If
        the page has no ``LabelActivityNumber`` element (e.g. the search found
        nothing), a warning is logged and nothing is yielded.
        """
        soup = BeautifulSoup(response.body, 'html.parser')
        activity_label = soup.find(id='LabelActivityNumber')
        if activity_label is None:
            self.logger.warning('No incident report found for activity %s',
                                self.activity_id)
            return

        brief_label = soup.find(id='LabelIncidentBrief')
        brief_result = {
            'activity_id': activity_label.text,
            # A report without a brief is still recorded, with empty text.
            'incident_brief': '' if brief_label is None else brief_label.text,
        }
        briefs.append(brief_result)

        yield brief_result

In [None]:
# Upper bound on how many activities are scheduled in a single run.
MAX_ACTIVITIES = 100

process = CrawlerProcess(settings={
    'FEED_FORMAT':'json',
    'FEED_URI': '../data/misle/scrape/misle-scraped-brief.json',
    'LOG_LEVEL': logging.WARNING,
})

# process.crawl() only schedules a crawler; no request is sent until
# process.start() runs the reactor. Sleeping between crawl() calls therefore
# delayed this cell without throttling any traffic, so it is not done here.
# Per-spider pacing comes from MISLEViewStateSpider.download_delay.
# NOTE(review): every crawler scheduled here shares one FEED_URI -- confirm
# the JSON exporter yields a single valid document in that setup.
for activity_id in activity_list[:MAX_ACTIVITIES]:
    process.crawl(MISLEViewStateSpider, activity_id)

process.start() # the script will block here until the crawling is finished

In [None]:
# Display the records MISLEViewStateSpider.parse_results appended during the crawl.
briefs

In [None]:
# Filter values read by SpidyQuotesViewStateSpider: every author is submitted
# first, then every tag is submitted against the author-filtered page.
authors = ['J.K. Rowling']
tags = ['live-death-love','friends']

In [None]:
class SpidyQuotesViewStateSpider(scrapy.Spider):
    """Demo spider for the view-state filter form on quotes.toscrape.com.

    For every entry in the module-level ``authors`` list the author filter is
    posted; on each resulting page every entry of the module-level ``tags``
    list is posted as well, and the quotes on the final page are yielded.
    """

    name = 'spidyquotes-viewstate'
    start_urls = ['http://quotes.toscrape.com/search.aspx']
    download_delay = 1.5

    def parse(self, response):
        """Post the author filter once per configured author."""
        # The hidden state token is the same for every request built from this page.
        view_state = response.css('input#__VIEWSTATE::attr(value)').extract_first()
        for author in authors:
            form = {'author': author, '__VIEWSTATE': view_state}
            yield scrapy.FormRequest('http://quotes.toscrape.com/filter.aspx',
                                     formdata=form,
                                     callback=self.parse_tags)

    def parse_tags(self, response):
        """Post the tag filter once per configured tag, keeping the selected author."""
        selected_author = response.css(
            'select#author > option[selected] ::attr(value)').extract_first()
        view_state = response.css('input#__VIEWSTATE::attr(value)').extract_first()
        for tag in tags:
            form = {
                'author': selected_author,
                'tag': tag,
                '__VIEWSTATE': view_state,
            }
            yield scrapy.FormRequest('http://quotes.toscrape.com/filter.aspx',
                                     formdata=form,
                                     callback=self.parse_results)

    def parse_results(self, response):
        """Yield one dict (quote text, author, first tag) per quote on the page."""
        for quote in response.css("div.quote"):
            text = quote.css('span.content ::text').extract_first()
            author = quote.css('span.author ::text').extract_first()
            first_tag = quote.css('span.tag ::text').extract_first()
            yield {
                'quote': text,
                'author': author,
                'tag': first_tag,
            }

In [None]:
# Run the demo spider and export its items to items.json.
# NOTE(review): Twisted's reactor cannot be restarted, so this cell is expected
# to fail if an earlier cell in the same kernel already called
# process.start() -- restart the kernel before running it. Confirm.
quotes_settings = {
    'FEED_FORMAT': 'json',
    'FEED_URI': 'items.json',
}
process = CrawlerProcess(settings=quotes_settings)
process.crawl(SpidyQuotesViewStateSpider)

process.start() # the script will block here until the crawling is finished

```
2019-10-16 15:42:25 [scrapy.utils.log] INFO: Scrapy 1.7.3 started (bot: scrapybot)
2019-10-16 15:42:25 [scrapy.utils.log] INFO: Versions: lxml 4.4.1.0, libxml2 2.9.9, cssselect 1.1.0, parsel 1.5.2, w3lib 1.21.0, Twisted 19.7.0, Python 3.7.4 (default, Sep 11 2019, 09:35:55) - [Clang 8.0.0 (clang-800.0.42.1)], pyOpenSSL 19.0.0 (OpenSSL 1.1.1c  28 May 2019), cryptography 2.7, Platform Darwin-15.6.0-x86_64-i386-64bit
2019-10-16 15:42:25 [scrapy.crawler] INFO: Overridden settings: {'FEED_FORMAT': 'json', 'FEED_URI': 'items.json'}
2019-10-16 15:42:25 [scrapy.extensions.telnet] INFO: Telnet Password: a9d3096c91063f7f
2019-10-16 15:42:25 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions.logstats.LogStats']
2019-10-16 15:42:25 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
 'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',
 'scrapy.downloadermiddlewares.cookies.CookiesMiddleware',
 'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware',
 'scrapy.downloadermiddlewares.stats.DownloaderStats']
2019-10-16 15:42:25 [scrapy.middleware] INFO: Enabled spider middlewares:
['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',
 'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',
 'scrapy.spidermiddlewares.referer.RefererMiddleware',
 'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',
 'scrapy.spidermiddlewares.depth.DepthMiddleware']
2019-10-16 15:42:25 [scrapy.middleware] INFO: Enabled item pipelines:
[]
2019-10-16 15:42:25 [scrapy.core.engine] INFO: Spider opened
2019-10-16 15:42:25 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2019-10-16 15:42:25 [scrapy.extensions.telnet] INFO: Telnet console listening on 127.0.0.1:6023
2019-10-16 15:42:25 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://quotes.toscrape.com/search.aspx> (referer: None)
2019-10-16 15:42:27 [scrapy.core.engine] DEBUG: Crawled (200) <POST http://quotes.toscrape.com/filter.aspx> (referer: http://quotes.toscrape.com/search.aspx)
2019-10-16 15:42:28 [scrapy.core.engine] DEBUG: Crawled (200) <POST http://quotes.toscrape.com/filter.aspx> (referer: http://quotes.toscrape.com/filter.aspx)
2019-10-16 15:42:28 [scrapy.core.scraper] DEBUG: Scraped from <200 http://quotes.toscrape.com/filter.aspx>
{'quote': '“It takes a great deal of bravery to stand up to our enemies, but just as much to stand up to our friends.”', 'author': 'J.K. Rowling', 'tag': 'friends'}
2019-10-16 15:42:29 [scrapy.core.engine] DEBUG: Crawled (200) <POST http://quotes.toscrape.com/filter.aspx> (referer: http://quotes.toscrape.com/filter.aspx)
2019-10-16 15:42:29 [scrapy.core.scraper] DEBUG: Scraped from <200 http://quotes.toscrape.com/filter.aspx>
{'quote': '“Do not pity the dead, Harry. Pity the living, and, above all those who live without love.”', 'author': 'J.K. Rowling', 'tag': 'live-death-love'}
2019-10-16 15:42:29 [scrapy.core.engine] INFO: Closing spider (finished)
2019-10-16 15:42:29 [scrapy.extensions.feedexport] INFO: Stored json feed (2 items) in: items.json
2019-10-16 15:42:29 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 4437,
 'downloader/request_count': 4,
 'downloader/request_method_count/GET': 1,
 'downloader/request_method_count/POST': 3,
 'downloader/response_bytes': 11386,
 'downloader/response_count': 4,
 'downloader/response_status_count/200': 4,
 'elapsed_time_seconds': 4.152423,
 'finish_reason': 'finished',
 'finish_time': datetime.datetime(2019, 10, 16, 19, 42, 29, 780676),
 'item_scraped_count': 2,
 'log_count/DEBUG': 6,
 'log_count/INFO': 11,
 'memusage/max': 99852288,
 'memusage/startup': 99852288,
 'request_depth_max': 2,
 'response_received_count': 4,
 'scheduler/dequeued': 4,
 'scheduler/dequeued/memory': 4,
 'scheduler/enqueued': 4,
 'scheduler/enqueued/memory': 4,
 'start_time': datetime.datetime(2019, 10, 16, 19, 42, 25, 628253)}
2019-10-16 15:42:29 [scrapy.core.engine] INFO: Spider closed (finished)
```