# Intro to Scraping

## Part 1: BeautifulSoup

BeautifulSoup Documentation: https://www.crummy.com/software/BeautifulSoup/bs4/doc/#

In [1]:
# Imports
# If this isn't working, uncomment the following line to install (this is not the recommended way)
# !pip install beautifulsoup4
from bs4 import BeautifulSoup
import pandas as pd

# Import the requests library, which underlies most of this tutorial
# You don't actually need to know much more than requests.get(url) though # https://2.python-requests.org/en/v2.5.3/user/advanced/
import requests 

In [2]:
# Identify a target URL and fetch the HTML
# (warn_url ends with "/" because a later cell appends relative hrefs to it)
warn_url = "https://labor.ny.gov/app/warn/"
response = requests.get(warn_url)
# Print the HTTP status code of the response (the run below shows 200)
print(response.status_code)

200


In [3]:
# Assuming we only want 2020 data, request the listing filtered by year.
# Note that we fetch warn_2020_url here, not the unfiltered warn_url from above.
warn_2020_url = "https://labor.ny.gov/app/warn/default.asp?warnYr=2020"
response = requests.get(warn_2020_url)

### Create a soup object to help parse the html

In [4]:
# Parse the raw HTML text using Python's built-in 'html.parser' backend
soup = BeautifulSoup(response.text, 'html.parser')
# prettify() returns the parsed document as an indented string, handy for eyeballing the structure
print(soup.prettify())

<!DOCTYPE html>
<html class="no-js" lang="en">
 <head>
  <meta charset="utf-8"/>
  <meta content="IE=edge,chrome=1" http-equiv="X-UA-Compatible"/>
  <link href="//www.labor.ny.gov/css/apps/v.0.5.0/css/nys-global-nav.min.css" media="screen" rel="stylesheet"/>
  <link href="//www.labor.ny.gov/css/apps/v.0.5.0/css/nys-global-nav-fonts.min.css" media="screen" rel="stylesheet"/>
  <link href="//www.labor.ny.gov/css/apps/v.0.5.0/css/business.min.css" media="screen" rel="stylesheet"/>
  <link href="//www.labor.ny.gov/css/apps/v.0.5.0/css/ux-style.min.css" media="screen" rel="stylesheet"/>
  <link href="//www.labor.ny.gov/css/apps/v.0.5.0/css/ux-print.min.css" media="print" rel="stylesheet"/>
  <title>
   Worker Adjustment and Retraining Notification - New York State Department of Labor
  </title>
  <meta content="" name="Keywords">
   <meta content="" name="Keyphrases">
    <meta content="" name="Keysentences"/>
    <meta content="" name="Summary"/>
    <meta content="17" name="revisionNumber

### Let's look at the page

[](img/warn.png)

![WARN](img/warn.png)

It looks like we want to parse this table.



### Quick HTML Syntax Review
![HTML tag syntax](img/html-tag.png)

### Get the table with .find()

In [5]:
# Get the table: find() returns the first matching tag
table = soup.find('table')
print('1', type(table))

# This does the same thing, but the syntax is a little cleaner
table = soup.table
print('2', type(table))

1 <class 'bs4.element.Tag'>
2 <class 'bs4.element.Tag'>


### Look at the items

In [6]:
# Here's one way to do it: iterating over a tag walks through its direct children
for item in table:
    print(item)



<tr>
<td><span class="style2"><font face="Arial, Helvetica, sans-serif">4 / 1 / 2020 - WARN Notice Dated 3/26/2020<br/>
<a href="details.asp?id=7387"><strong>38 West 26th Street Restaurant Corp. dba Flatiron Hall - New York City Region</strong></a></font></span>
</td>
</tr>


<tr>
<td><span class="style2"><font face="Arial, Helvetica, sans-serif">4 / 1 / 2020 - WARN Notice Dated 3/25/2020<br/>
<a href="details.asp?id=7388"><strong>Havatequila Restaurant Partners, LLC - New York City Region</strong></a></font></span>
</td>
</tr>


<tr>
<td><span class="style2"><font face="Arial, Helvetica, sans-serif">4 / 1 / 2020 - WARN Notice Dated 3/25/2020<br/>
<a href="details.asp?id=7389"><strong>Forest Electric Corp. - New York City Region</strong></a></font></span>
</td>
</tr>


<tr>
<td><span class="style2"><font face="Arial, Helvetica, sans-serif">4 / 1 / 2020 - WARN Notice Dated 3/20/2020<br/>
<a href="details.asp?id=7390"><strong>GFB Restaurant Corp. - New York City Region</strong></a></fo

In [7]:
# Or you could just extract the links with find_all

# find_all() returns a list of every <a> tag found inside the table
link_tags = table.find_all('a')
print('number of links:', len(link_tags))

# Pretty-print each link tag so we can see its href attribute and inner text
for link in link_tags:
    print(link.prettify())

number of links: 365
<a href="details.asp?id=7387">
 <strong>
  38 West 26th Street Restaurant Corp. dba Flatiron Hall - New York City Region
 </strong>
</a>
<a href="details.asp?id=7388">
 <strong>
  Havatequila Restaurant Partners, LLC - New York City Region
 </strong>
</a>
<a href="details.asp?id=7389">
 <strong>
  Forest Electric Corp. - New York City Region
 </strong>
</a>
<a href="details.asp?id=7390">
 <strong>
  GFB Restaurant Corp. - New York City Region
 </strong>
</a>
<a href="details.asp?id=7391">
 <strong>
  IMNY GS, LLC - New York City Region
 </strong>
</a>
<a href="details.asp?id=7392">
 <strong>
  II Mulino Gramercy, LLC - New York City Region
 </strong>
</a>
<a href="details.asp?id=7393">
 <strong>
  Chateau Briand Caterers - Long Island Region
 </strong>
</a>
<a href="details.asp?id=7394">
 <strong>
  The Fox Hollow - Long Island Region
 </strong>
</a>
<a href="details.asp?id=7395">
 <strong>
  Watermill Caterers - Long Island Region
 </strong>
</a>
<a href="details.

In [8]:
# But we just want the text for the url to go to

# Build the list of full URLs in one pass.
# Bracket notation (tag['href']) reads an attribute from a tag; each href is
# relative, so we prepend the base warn_url to it.
links = [warn_url + tag['href'] for tag in link_tags]

In [9]:
# Print every detail-page URL we just built
for link in links:
    print(link)

https://labor.ny.gov/app/warn/details.asp?id=7387
https://labor.ny.gov/app/warn/details.asp?id=7388
https://labor.ny.gov/app/warn/details.asp?id=7389
https://labor.ny.gov/app/warn/details.asp?id=7390
https://labor.ny.gov/app/warn/details.asp?id=7391
https://labor.ny.gov/app/warn/details.asp?id=7392
https://labor.ny.gov/app/warn/details.asp?id=7393
https://labor.ny.gov/app/warn/details.asp?id=7394
https://labor.ny.gov/app/warn/details.asp?id=7395
https://labor.ny.gov/app/warn/details.asp?id=7396
https://labor.ny.gov/app/warn/details.asp?id=7397
https://labor.ny.gov/app/warn/details.asp?id=7398
https://labor.ny.gov/app/warn/details.asp?id=7399
https://labor.ny.gov/app/warn/details.asp?id=7400
https://labor.ny.gov/app/warn/details.asp?id=7401
https://labor.ny.gov/app/warn/details.asp?id=7402
https://labor.ny.gov/app/warn/details.asp?id=7403
https://labor.ny.gov/app/warn/details.asp?id=7404
https://labor.ny.gov/app/warn/details.asp?id=7405
https://labor.ny.gov/app/warn/details.asp?id=7406


In [10]:
# Create a list to store the data we scrape (it gets turned into a pandas DataFrame later).
# Each item in the list will be a dictionary corresponding to a single WARN listing
# Each key in that dictionary will be a single labeled piece of information from the listing
data = []

def scrape_single_page(url):
    """Scrape one WARN detail page and append its fields to the global ``data`` list.

    Every <p> inside the page's first <table> is printed (raw text, then the
    split result) so the parsing can be inspected. Paragraphs that contain a
    colon are split on the first colon into a label and a value, which become
    one key/value pair of this listing's record. Paragraphs without a colon
    are printed but not stored.

    Args:
        url: Address of a single WARN detail page.

    Returns:
        None. The record for the page is appended to ``data``.
    """
    # Create a dictionary to store the data for a single WARN listing
    page_data = {}

    # Fetch the page
    response = requests.get(url)
    page_soup = BeautifulSoup(response.text, 'html.parser')

    # Sanity check
    # print(page_soup.prettify())

    # Get the first/only table
    table = page_soup.table

    # Get each paragraph tag
    paragraphs = table.find_all('p')

    # Use .text to get the inner text for each p
    for paragraph in paragraphs:
        text = paragraph.text
        print(text)

        # We are going to split on only the first colon in each row (':') by using text.split(delimiter, 1)
        split_text = text.split(':', 1)

        print(split_text)

        # Only "label: value" paragraphs produce two pieces; indexing [1] on a
        # paragraph without a colon would raise an IndexError, so skip those
        if len(split_text) == 2:
            # Add this paragraph to the page data
            page_data[split_text[0]] = split_text[1]

    # After looping through each paragraph, add this listing to the list of records
    data.append(page_data)

# Try the scraper on just the first link before looping over all of them
scrape_single_page(links[0])

Date of Notice:  3/26/2020
['Date of Notice', '\xa0 3/26/2020']
Event Number:  2019-0734
['Event Number', '\xa0 2019-0734']
Rapid Response Specialist:  Stuart Goldberg
['Rapid Response Specialist', '\xa0 Stuart Goldberg']
Reason Stated for Filing:  Temporary Plant Closing
['Reason Stated for Filing', '\xa0 Temporary Plant Closing']
Company:           38 West 26th Street Restaurant Corp. dba Flatiron Hall 38 West 26th Street New York, NY 10010
['Company', '\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0 38 West 26th Street Restaurant Corp. dba Flatiron Hall 38 West 26th Street New York, NY 10010']
County:  New York | WDB Name:  NEW YORK CITY | Region:  New York City
['County', '\xa0 New York | WDB Name:\xa0 NEW YORK CITY | Region:\xa0 New York City']
Contact:  Jon Bloostein, CEO
['Contact', '\xa0 Jon Bloostein, CEO']
Phone:  (917) 999-6532
['Phone', '\xa0 (917) 999-6532']
Business Type:  Restaurant
['Business Type', '\xa0 Restaurant']
Number Affected:  49
['Number Affected', '\xa0 49']
Total E

In [11]:
import time
import gzip
import unicodedata

# Create a list to store the data we scrape.
# Each item in the list will correspond to a single WARN listing
# Each column (once the list becomes a DataFrame) will be a single labeled piece of information from the listing
data = []

def scrape_single_page(url, timeout=30):
    """Scrape one WARN detail page and append its fields to the global ``data`` list.

    Each <p> inside the page's first <table> that contains a colon is split on
    its first colon into a label and a value, which become one key/value pair
    of this listing's record. Paragraphs without a colon are skipped, and a
    page without any <table> yields an empty record.

    Args:
        url: Address of a single WARN detail page.
        timeout: Seconds to wait for the server, passed straight to
            ``requests.get`` so a stalled connection cannot hang the scrape.

    Returns:
        None. The record for the page is appended to ``data``.
    """
    print('scraping', url)

    # Create a dictionary to store the data for a single WARN listing
    page_data = {}

    # Fetch the page
    my_headers = {'accept-encoding':'gzip'}
    response = requests.get(url, headers=my_headers, timeout=timeout)

    # This is pretty atypical
    # If the request didn't automatically unzip the html, we have to do it ourselves.
    # A gzip stream always starts with the two magic bytes 0x1f 0x8b, so check for
    # those directly instead of relying on charset detection failing.
    raw_body = response.content
    if raw_body[:2] == b'\x1f\x8b':
        html = gzip.decompress(raw_body).decode('utf-8')
    else:
        html = response.text

    # Remove non-breaking space characters in the HTML
    html = html.replace('&nbsp;', ' ')

    # Make the soup for the single page
    page_soup = BeautifulSoup(html, 'html.parser')

    # Sanity check
    # print(page_soup.prettify())

    # Get the first table (there should only be one)
    table = page_soup.table

    # Get each paragraph tag; a page with no table simply has nothing to parse
    paragraphs = table.find_all('p') if table is not None else []

    # Use .text to get the inner text for each p
    for paragraph in paragraphs:
        # We are going to split on only the first colon in each row (':') by using text.split(delim, 1)
        split_text = paragraph.text.split(':', 1)

        if len(split_text) == 2:
            # Add this paragraph to the page data
            page_data[split_text[0]] = split_text[1]

    # After looping through each paragraph, add this listing to the list of records
    data.append(page_data)

# Scrape every detail page, one request at a time
for link in links:
    scrape_single_page(link)
    
    # This is the most important line in the entire notebook
    # Pausing between requests ensures that you won't crash servers and make people come knocking on your door
    # Don't be an idiot!
    time.sleep(0.5)

scraping https://labor.ny.gov/app/warn/details.asp?id=7387
scraping https://labor.ny.gov/app/warn/details.asp?id=7388
scraping https://labor.ny.gov/app/warn/details.asp?id=7389
scraping https://labor.ny.gov/app/warn/details.asp?id=7390
scraping https://labor.ny.gov/app/warn/details.asp?id=7391
scraping https://labor.ny.gov/app/warn/details.asp?id=7392
scraping https://labor.ny.gov/app/warn/details.asp?id=7393
scraping https://labor.ny.gov/app/warn/details.asp?id=7394
scraping https://labor.ny.gov/app/warn/details.asp?id=7395
scraping https://labor.ny.gov/app/warn/details.asp?id=7396
scraping https://labor.ny.gov/app/warn/details.asp?id=7397
scraping https://labor.ny.gov/app/warn/details.asp?id=7398
scraping https://labor.ny.gov/app/warn/details.asp?id=7399
scraping https://labor.ny.gov/app/warn/details.asp?id=7400
scraping https://labor.ny.gov/app/warn/details.asp?id=7401
scraping https://labor.ny.gov/app/warn/details.asp?id=7402
scraping https://labor.ny.gov/app/warn/details.asp?id=74

scraping https://labor.ny.gov/app/warn/details.asp?id=7311
scraping https://labor.ny.gov/app/warn/details.asp?id=7312
scraping https://labor.ny.gov/app/warn/details.asp?id=7313
scraping https://labor.ny.gov/app/warn/details.asp?id=7314
scraping https://labor.ny.gov/app/warn/details.asp?id=7315
scraping https://labor.ny.gov/app/warn/details.asp?id=7180
scraping https://labor.ny.gov/app/warn/details.asp?id=7181
scraping https://labor.ny.gov/app/warn/details.asp?id=7182
scraping https://labor.ny.gov/app/warn/details.asp?id=7183
scraping https://labor.ny.gov/app/warn/details.asp?id=7184
scraping https://labor.ny.gov/app/warn/details.asp?id=7185
scraping https://labor.ny.gov/app/warn/details.asp?id=7186
scraping https://labor.ny.gov/app/warn/details.asp?id=7187
scraping https://labor.ny.gov/app/warn/details.asp?id=7188
scraping https://labor.ny.gov/app/warn/details.asp?id=7189
scraping https://labor.ny.gov/app/warn/details.asp?id=7190
scraping https://labor.ny.gov/app/warn/details.asp?id=71

scraping https://labor.ny.gov/app/warn/details.asp?id=7137
scraping https://labor.ny.gov/app/warn/details.asp?id=7138
scraping https://labor.ny.gov/app/warn/details.asp?id=7139
scraping https://labor.ny.gov/app/warn/details.asp?id=7140
scraping https://labor.ny.gov/app/warn/details.asp?id=7141
scraping https://labor.ny.gov/app/warn/details.asp?id=7142
scraping https://labor.ny.gov/app/warn/details.asp?id=7143
scraping https://labor.ny.gov/app/warn/details.asp?id=7144
scraping https://labor.ny.gov/app/warn/details.asp?id=7145
scraping https://labor.ny.gov/app/warn/details.asp?id=7146
scraping https://labor.ny.gov/app/warn/details.asp?id=7147
scraping https://labor.ny.gov/app/warn/details.asp?id=7127
scraping https://labor.ny.gov/app/warn/details.asp?id=7125
scraping https://labor.ny.gov/app/warn/details.asp?id=7126
scraping https://labor.ny.gov/app/warn/details.asp?id=7124
scraping https://labor.ny.gov/app/warn/details.asp?id=7122
scraping https://labor.ny.gov/app/warn/details.asp?id=71

In [12]:
# How many listings did we scrape?
print(len(data))

# Each dictionary in data becomes a row; the union of all keys becomes the columns
df = pd.DataFrame(data)

# Let pandas write the file itself instead of pushing the to_csv() string through
# a text-mode open(): pandas picks the encoding (UTF-8) and newline handling, so the
# output does not depend on the platform's locale or get doubled line endings on Windows
df.to_csv('warn_data.tsv', sep='\t')

df

365


Unnamed: 0,Date of Notice,Event Number,Rapid Response Specialist,Reason Stated for Filing,Company,County,Contact,Phone,Business Type,Number Affected,...,Other Brooklyn Community Services (BCS) sites,2019-0238,2019-0231,Other A.C. Moore Incorporated sites,2019-0213,Layoff Dates,"Other Macy's Retail Holdings, Inc. site",2019-0208,"Other New York Express and Logistics, LLC sites",2019-0202
0,3/26/2020,2019-0734,Stuart Goldberg,Temporary Plant Closing,38 West 26th Street Restaurant Corp...,New York | WDB Name: NEW YORK CITY | Region...,"Jon Bloostein, CEO",(917) 999-6532,Restaurant,49,...,,,,,,,,,,
1,3/25/2020,2019-0733,Stuart Goldberg,Temporary Plant Closing,"Havatequila Restaurant Partners, LL...",New York | WDB Name: NEW YORK CITY | Region...,"Melissa MacLeod, Vice President and General ...",(267) 312-1644,Restaurant,138,...,,,,,,,,,,
2,3/25/2020,2019-0732,Stuart Goldberg,Temporary Plant Layoff,Forest Electric Corp. 1375 Broadway...,New York | WDB Name: NEW YORK CITY | Region...,"Donna M. Lucas, Director of Administration",(212) 318-1735,Electric Company,72,...,,,,,,,,,,
3,3/20/2020,2019-0612,Stuart Goldberg,Temporary Plant Closing,GFB Restaurant Corp. 86 West 3rd St...,New York | WDB Name: NEW YORK CITY | Region...,Payroll Department,(215) 321-2200,Restaurant,47,...,,,,,,,,,,
4,3/20/2020,2019-0615,Stuart Goldberg,Temporary Plant Closing,"IMNY GS, LLC 361 Greenwich Street N...",New York | WDB Name: NEW YORK CITY | Region...,Payroll Department,(215) 321-2200,Restaurant,32,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
360,1/6/2020,2019-0207,Frederick Danks,Plant Closing,Macy's Broadway Mall Store (Macy's Retail Hold...,Nassau | WDB Name: OYSTER BAY | Region: Long ...,"Heath R. Salit, Human Resources Business Partner",(646) 429-7462,Retail Store,155,...,,,,,,,,Macy's Commack Shopping Center Store (Macy's ...,,
361,12/27/2019,2019-0206,Regenna Darrah,Temporary Plant Closing,Wesley Gardens Nursing Home 3 Upton Park Roche...,Monroe | WDB Name: MONROE | Region: Finger Lakes,"Sharon Davis, Human Resources Manager",(585) 241-2105,Nursing Home,132,...,,,,,,,,,,
362,12/30/2019,2019-0205,Stuart Goldberg,Plant Closing,"127 W. 43rd St. Chophouse, Inc. (He...",New York | WDB Name: NEW YORK CITY | Region: ...,"Jon Bloostein, Chief Executive Officer",(917) 999-6532,Restaurant,106,...,,,,,,,,,,
363,12/30/2019,2019-0201,"Jacqueline Huertas, Karl Price, Regenna Darra...",Plant Closing,"New York Express and Logistics, LLC 292 Wolf R...",Albany | WDB Name: CAPITAL DISTRICT | Region:...,"Chris Kalavantis, Operations Manager",(617) 968-5311,Trucking company providing freight transporta...,48,...,,,,,,,,,,"16 Corporate Circle, East Syracuse, NY 13057 ..."
