## Scraper code to extract reports of NAICS id: 213111, 213112
- NAICS 213111 - Drilling Oil and Gas Wells Accidents
- NAICS 213112 - Oil and Gas Field Services and Not Elsewhere Classified Accidents
- This code downloads accident reports for the five years from 2013 to 2017
- The final data will have seven fields - report id, event date, age, nature, establishment, summary and degree
- The fields of each report are appended to Python lists, which are finally combined into a pandas dataframe and exported to a csv file

<n/>
This code is more concise than the previous *naics_237120_scraper.ipynb* notebook. It works the same way but uses fewer lines of code and offers little to no explanation of how it works. If you have followed the previous scraper code, this one should be no different and easy to understand.

In [39]:
#import libraries
from bs4 import BeautifulSoup 
import requests
import pandas as pd

In [40]:
# Starting point of the crawl: the OSHA accident search with naics=21311.
# NOTE(review): the query string spans 01/01/2001 through 12/31/2017, which is
# wider than the 2013-2017 window stated in the notebook intro -- confirm.
url_header = "https://www.osha.gov/pls/imis/"   # prefix for relative links (summary reports, next pages)
home_page_url = (
    "https://www.osha.gov/pls/imis/AccidentSearch.search"
    "?p_logger=1&acc_description=&acc_Abstract=&acc_keyword=&sic="
    "&naics=21311&Office=All&officetype=All"
    "&endmonth=01&endday=01&endyear=2001"
    "&startmonth=12&startday=31&startyear=2017"
    "&InspNr="
)
page_url = home_page_url   # the page currently being crawled; starts at the first results page

In [41]:
# One independent, initially empty accumulator list per scraped field.
(event_date, report_id, summary,
 company, degree, age, nature) = ([] for _ in range(7))
flag = True   # sentinel that keeps the page loop running until it is set to False

In [42]:
# Iterate the entire scraping process over every results page and every row on
# a page. Reads page_url, url_header and flag, and appends one value per report
# to each of the field lists (event_date, report_id, summary, company, degree,
# age, nature).
while flag:
    # A timeout keeps a stalled connection from hanging the crawl forever, and
    # raise_for_status reports HTTP errors directly instead of letting them
    # surface later as an opaque parsing failure.
    response = requests.get(page_url, timeout=60)
    response.raise_for_status()
    page_content = BeautifulSoup(response.content, 'lxml')

    # the results are taken from the second bordered/striped table on the page
    records = (page_content.find("div", id="wrapper")
               .find("div", id="maincontain", class_="container")
               .find_all("table", class_="table table-bordered table-striped")[1])
    rcd_rows = records.find_all('tr')[1:]   # skip the first (presumably header) row
    for tr in rcd_rows:
        td = tr.find_all('td')
        row = [i.text for i in td]
        date_text = row[3]
        id_text = row[4]

        # navigate to summary url
        rpt_url = url_header + str(tr.a.get('href'))
        rqst = requests.get(rpt_url, timeout=60)
        rqst.raise_for_status()
        soup = BeautifulSoup(rqst.content, "lxml")
        smr = (soup.find("div", id="maincontain", class_="container")
               .find("table", class_="tablei_100 table-borderedi_100 table-striped"))
        smr_rows = smr.find_all('tr')[1:]

        # The summary text is the first row whose leading cell has colspan 8.
        # .get() tolerates leading cells that carry no colspan attribute, and
        # the empty default keeps `summary` the same length as the other lists
        # when a report has no such row.
        summary_text = ""
        for r in smr_rows:
            first_cell = r.td
            if first_cell is not None and first_cell.get('colspan') == '8':
                summary_text = first_cell.text.strip()
                break

        company_name = smr_rows[1].find_all('td')[3].text
        last_cells = smr_rows[-1].find_all('td')   # parsed once, used for three fields
        degree_text = last_cells[4].text
        age_text = last_cells[2].text
        nature_text = last_cells[5].text

        # Append only after every field was extracted, so a failure part-way
        # through a report cannot leave the lists with different lengths.
        event_date.append(date_text)
        report_id.append(id_text)
        summary.append(summary_text)
        company.append(company_name)
        degree.append(degree_text)
        age.append(age_text)
        nature.append(nature_text)

    # follow the "Next Page" link if there is one, otherwise stop the loop
    nxt_page = (page_content
                .find("div", id="maincontain", class_="container")
                .find_all('div', class_="text-right")[1]
                .find('a', title="Next Page"))
    if nxt_page is not None:
        page_url = url_header + str(nxt_page.get('href'))
    else:
        flag = False

In [43]:
# number of establishment names collected (the scraping loop appends one per report)
len(company)

999

In [44]:
# Combine the per-field lists into a single table, one column per field.
acc_rpts = pd.DataFrame(
    dict(
        Report_ID=report_id,
        Event_Date=event_date,
        Age=age,
        Nature=nature,
        Establishment=company,
        Summary=summary,
        Degree=degree,
    )
)
acc_rpts.head()   # preview the first rows

Unnamed: 0,Report_ID,Event_Date,Age,Nature,Establishment,Summary,Degree
0,627700,12/31/2017,50,Other,"Ada Energy Service, Llc","At 9:30 a.m. on December 31, 2017, an employee...",Fatality
1,627700,12/30/2017,28,Fracture,"H.L. Morris Farms, Inc.","At 11:00 a.m. on December 30, 2017, an employe...",Fatality
2,627700,12/21/2017,35,Fracture,Ricks Well Service Llc,"At 10:45 a.m. on December 21, 2017, an employe...",Fatality
3,950647,12/04/2017,21,Fracture,Applied Technologies Associates,"At 11:40 a.m. on December 4, 2017, an employee...",Hospitalized injury
4,626300,11/28/2017,30,Concussion,"Southern Petroleum Laboratories, Inc.","At 12:18 p.m. on November 28, 2017, Employee #...",Hospitalized injury


In [45]:
# Write the table to a comma-separated UTF-8 file, leaving out the row index.
acc_rpts.to_csv(
    'naics_21311_data.csv',
    sep=',',
    encoding='utf-8',
    index=False,
)