# Web Scraping Examples - Online Police Logs

In [1]:
from IPython.display import IFrame
IFrame('https://www.cambridgema.gov/cpd/newsandalerts/Archives', width=800, height=300)

In [2]:
import requests
from bs4 import BeautifulSoup
import re

In [3]:
# Check the site's robots.txt before scraping to see which paths are disallowed.
r = requests.get('https://www.cambridgema.gov/robots.txt')
# The file begins with a UTF-8 byte-order mark; decoding as 'utf-8-sig' strips it
# instead of rendering it as "ï»¿" in front of the first line.
r.encoding = 'utf-8-sig'
print(r.text)

ï»¿User-Agent: *
Disallow: /propertydatabase/


In [4]:
# Download the police news archive landing page and keep its body as a string.
url = 'https://www.cambridgema.gov/cpd/newsandalerts/Archives'
r = requests.get(url)
landing_page = r.text  # response body decoded to text by requests


In [5]:
r.status_code


200

In [6]:
r.encoding

'utf-8'

In [8]:
print(landing_page[:1500])





<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
    "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
<head id="Head1"><title>
	News - Police Department - City of Cambridge, Massachusetts
</title><meta name="description" content="CPD News" /><meta name="keywords" content="news, cambridge police" /><meta name="robots" content="noindex, follow" /><meta name="author" content="City of Cambridge" /><meta http-equiv="Content-Type" content="text/html;charset=utf-8" /><meta http-equiv="X-UA-Compatible" content="IE=8" /><link rel="stylesheet" type="text/css" href="/Layouts/CambridgePD/_resources/css/global.css" media="all"/><link rel="stylesheet" type="text/css" href="/Layouts/CambridgePD/_resources/css/cpd.css" media="all"/><link rel="stylesheet" type="text/css" href="/Layouts/CambridgePD/_resources/css/print.css" media="print"/><link rel="stylesheet" type="text/css" href="/Layouts/SharedPresentation/_reso

In [9]:
soup = BeautifulSoup(landing_page,'lxml')


In [10]:
def make_soup(url, params=None):
    """Fetch ``url`` with HTTP GET and parse the body into a BeautifulSoup tree.

    Parameters
    ----------
    url : str
        Address to request.
    params : dict, optional
        Query-string parameters forwarded to ``requests.get``.

    Returns
    -------
    bs4.BeautifulSoup
        The response text parsed with the ``lxml`` parser.

    Raises
    ------
    requests.HTTPError
        If the response status code is anything other than 200.
    """
    response = requests.get(url, params=params)
    if response.status_code != 200:
        # A bare ``raise`` outside an ``except`` block only produces
        # "RuntimeError: No active exception to reraise"; raise an error
        # that names the status and URL and carries the response object.
        raise requests.HTTPError(
            'Unexpected status %d for %s' % (response.status_code, url),
            response=response,
        )
    return BeautifulSoup(response.text, 'lxml')


In [14]:
soup = make_soup(url)
print(soup.prettify()[:1500])


<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html lang="en" xml:lang="en" xmlns="http://www.w3.org/1999/xhtml">
 <head id="Head1">
  <title>
   News - Police Department - City of Cambridge, Massachusetts
  </title>
  <meta content="CPD News" name="description"/>
  <meta content="news, cambridge police" name="keywords"/>
  <meta content="noindex, follow" name="robots"/>
  <meta content="City of Cambridge" name="author"/>
  <meta content="text/html;charset=utf-8" http-equiv="Content-Type"/>
  <meta content="IE=8" http-equiv="X-UA-Compatible"/>
  <link href="/Layouts/CambridgePD/_resources/css/global.css" media="all" rel="stylesheet" type="text/css"/>
  <link href="/Layouts/CambridgePD/_resources/css/cpd.css" media="all" rel="stylesheet" type="text/css"/>
  <link href="/Layouts/CambridgePD/_resources/css/print.css" media="print" rel="stylesheet" type="text/css"/>
  <link href="/Layouts/SharedPresentation/_resources/css/univ

In [16]:
# Collect every anchor tag on the page; ``find_all`` is the spelling the rest
# of this notebook uses (``findAll`` is the legacy alias).
all_a = soup.find_all('a')

In [17]:
all_a[:10]


[<a href="/cpd" id="brand">Police Department</a>,
 <a href="/cpd/newsandalerts/Calendar">Calendar</a>,
 <a href="/cpd/ContactUs">Contact Us</a>,
 <a href="/cpd/Publications">Publications</a>,
 <a class="reg" href="#" onclick="setActiveStyleSheet('default', 'true', 'false'); return false;">A</a>,
 <a class="med" href="#" onclick="setActiveStyleSheet('medium', 'true', 'false'); return false;">A</a>,
 <a class="large" href="#" onclick="setActiveStyleSheet('large', 'true', 'false'); return false;">A</a>,
 <a class="nav-home" href="/cpd">Police Department</a>,
 <a class="nav-units" href="/cpd/policeunits">Police Units</a>,
 <a class="nav-resources" href="/cpd/communityresources">Community Resources</a>]

In [18]:
a = all_a[0]


In [19]:
a.text


'Police Department'

In [20]:
a.attrs['href']


'/cpd'

In [None]:
# bucket => dictionary
# queue => urls of all pages
# while loop: scraping



## Intermezzo: Handling Exceptions in Python

In [31]:
import sys

try:
    # Code that may fail goes here.
    x = 1/2
    x = 1/0  # raises ZeroDivisionError, so x keeps the value 0.5
except ZeroDivisionError:
    # Handler for the specific error we anticipate.
    print('Zero division error')
except Exception:
    # Fallback for any other error. ``Exception`` (rather than a bare
    # ``except:``) lets KeyboardInterrupt and SystemExit propagate.
    print('I have an unknown error..', sys.exc_info()[0])
    raise NameError('... an error ....')
else:
    # Runs only when the ``try`` body finished without raising.
    print('The result of the division is %1.2f' % x)
finally:
    # Runs in every case, whether or not an exception occurred.
    print('Done!')

Zero division error
Done!


## Back to scraping..

### Get the URLs of the police daily logs

In [44]:
from time import sleep

In [45]:
sleep(10)

In [46]:
# bucket => dictionary that collects the results
# queue  => urls of the archive pages still to visit
# while loop: scrape until the queue is empty
BASE_URL = 'https://www.cambridgema.gov'
# Matches dates written as month.day.year, e.g. "9.27.2017".
DATE_PATTERN = re.compile(r'(\d{1,2})\.(\d{1,2})\.(\d{4})')

Logs = {} # key is date, value is a dict holding the log's url
queue = ['https://www.cambridgema.gov/cpd/newsandalerts/Archives?Year=%d' % year for year in range(2014,2018)]
visited = set()  # archive pages already requested

while queue:
    page_url = queue.pop()
    if page_url in visited:
        # Do not request the same archive page twice.
        continue
    visited.add(page_url)

    soup = make_soup(page_url) # http request
    for a in soup.find_all('a'):
        label = a.text
        if 'Daily Log' not in label:
            continue
        href = a.get('href')
        if not href:
            # An anchor without a target cannot be stored as a log url.
            continue
        match = DATE_PATTERN.search(label)
        if match is None:
            # Link text mentions "Daily Log" but carries no parsable date.
            print('!!! index error !!!')
            continue
        month, day, year = (int(part) for part in match.groups())
        pretty_date = '%02d/%02d/%d' % (month, day, year)
        print(pretty_date)
        Logs[pretty_date] = {'url': BASE_URL + href}

    # Follow the pagination link, if the page has one.
    next_page = soup.find('a', text='Next')
    if next_page:
        print('Next page found..')
        queue.append(BASE_URL + next_page.attrs['href'])
    sleep(2)  # pause between requests


09/27/2017
09/26/2017
09/25/2017
09/24/2017
09/21/2017
09/20/2017
!!! index error !!!
Next page found..
09/19/2017
09/18/2017
09/17/2017
09/14/2017
09/13/2017
09/12/2017
09/11/2017
!!! index error !!!
Next page found..
09/11/2017
09/10/2017
09/07/2017
09/06/2017
09/05/2017
09/04/2017
!!! index error !!!
Next page found..
09/04/2017
08/31/2017
08/30/2017
08/29/2017
08/28/2017
08/27/2017
08/24/2017
08/23/2017
!!! index error !!!
Next page found..
08/22/2017
08/21/2017
08/20/2017
08/17/2017
08/16/2017
08/15/2017
08/14/2017
08/13/2017
!!! index error !!!
Next page found..
08/10/2017
08/09/2017
08/08/2017
08/07/2017
08/06/2017
08/03/2017
08/02/2017
08/01/2017
07/31/2017
07/30/2017
!!! index error !!!
Next page found..


KeyboardInterrupt: 

In [48]:
Logs['07/30/2017']

{'url': 'https://www.cambridgema.gov/cpd/newsandalerts/Archives/detail.aspx?path=%2fsitecore%2fcontent%2fhome%2fcpd%2fnewsandalerts%2fArchives%2f2017%2f07%2f07302017'}

In [43]:
soup.find('a', text = 'Next111')

In [35]:
# Try the date pattern on a sample link label. The raw string keeps ``\d`` and
# ``\.`` as regex escapes instead of (invalid) Python string escapes.
label = 'Daily Log 9.27.2017'
re.findall(r'(\d{1,2})\.(\d{1,2})\.(\d{4})', label)[0]

('9', '27', '2017')

In [38]:
digit = 9

In [39]:
print('%02d' % digit)

09


In [36]:
x = []

In [37]:
x[0]

IndexError: list index out of range

### Get url of police daily logs for all the years

### Parse individual log pages

In [49]:
# Each value in Logs is a dict of the form {'url': ...}, not a bare url string,
# so the loop variable is named ``entry`` and the module-level ``url`` is left alone.
for log, entry in Logs.items():
    print(log, entry)

09/27/2017 {'url': 'https://www.cambridgema.gov/cpd/newsandalerts/Archives/detail.aspx?path=%2fsitecore%2fcontent%2fhome%2fcpd%2fnewsandalerts%2fArchives%2f2017%2f09%2f09272017'}
09/26/2017 {'url': 'https://www.cambridgema.gov/cpd/newsandalerts/Archives/detail.aspx?path=%2fsitecore%2fcontent%2fhome%2fcpd%2fnewsandalerts%2fArchives%2f2017%2f09%2f09262017'}
09/25/2017 {'url': 'https://www.cambridgema.gov/cpd/newsandalerts/Archives/detail.aspx?path=%2fsitecore%2fcontent%2fhome%2fcpd%2fnewsandalerts%2fArchives%2f2017%2f09%2f09252017'}
09/24/2017 {'url': 'https://www.cambridgema.gov/cpd/newsandalerts/Archives/detail.aspx?path=%2fsitecore%2fcontent%2fhome%2fcpd%2fnewsandalerts%2fArchives%2f2017%2f09%2f09242017'}
09/21/2017 {'url': 'https://www.cambridgema.gov/cpd/newsandalerts/Archives/detail.aspx?path=%2fsitecore%2fcontent%2fhome%2fcpd%2fnewsandalerts%2fArchives%2f2017%2f09%2f09212017'}
09/20/2017 {'url': 'https://www.cambridgema.gov/cpd/newsandalerts/Archives/detail.aspx?path=%2fsitecore%2

In [50]:
log

'07/30/2017'

In [51]:
url = Logs[log]['url']
print(url)
soup = make_soup(url)

https://www.cambridgema.gov/cpd/newsandalerts/Archives/detail.aspx?path=%2fsitecore%2fcontent%2fhome%2fcpd%2fnewsandalerts%2fArchives%2f2017%2f07%2f07302017


In [52]:
soup.find('tr')

<tr style="mso-yfti-irow: 0; mso-yfti-firstrow: yes;">
<td colspan="2" style="padding: 0.75pt; border: rgb(0, 0, 0); width: 464.2pt; background-color: transparent;">
<p style="margin: 0in 0in 0pt; text-align: center; line-height: normal;"><strong>Cambridge
            Police Daily Log July 30<sup>th</sup>, 2017</strong></p>
</td>
</tr>

In [53]:
# Lower-cased text of the first table row (the trailing "." alone was a syntax error).
soup.find('tr').getText().lower()

'\n\ncambridge\n            police daily log july 30th, 2017\n\n'

In [54]:
rows = soup.find_all('tr')

In [56]:
row = rows[3]

In [57]:
row

<tr style="mso-yfti-irow: 3;">
<td style="padding: 0.75pt; border: rgb(0, 0, 0); width: 160.35pt; background-color: transparent;" valign="top">
<p style="margin: 0in 0in 0pt; line-height: normal;">07/30/2017 <strong>06:45</strong><br/>
<strong>INCIDENT </strong> 17005254<br/>
<strong>B&amp;E NIGHTTIME FOR FELONY c266
            S16</strong></p>
</td>
<td style="padding: 0.75pt; border: rgb(0, 0, 0); width: 302.35pt; background-color: transparent;">
<p style="margin: 0in 0in 0pt; line-height: normal;"><strong>MASSACHUSETTS AVE</strong><br/>
            
            A restaurant on Massachusetts Avenue reported a night time breaking and entering
            and larceny to Cambridge Police.</p>
</td>
</tr>

In [58]:
cells = [cell for cell in row.find_all('td') if cell.getText().strip()]

In [63]:
cells[1]

<td style="padding: 0.75pt; border: rgb(0, 0, 0); width: 302.35pt; background-color: transparent;">
<p style="margin: 0in 0in 0pt; line-height: normal;"><strong>MASSACHUSETTS AVE</strong><br/>
            
            A restaurant on Massachusetts Avenue reported a night time breaking and entering
            and larceny to Cambridge Police.</p>
</td>

In [70]:
# Walk every table row of the log page and collect one
# (crime, place, description) tuple per incident/arrest row.
rows = []
for row in soup.find_all('tr'):
    row_text = row.getText().lower()
    if ('incident' not in row_text) and ('arrest' not in row_text):
        continue

    # Reset per row so values from an earlier row can never be appended again
    # when the current row has fewer than two non-empty cells.
    crime = place = description = ''
    try:
        cells = [cell for cell in row.find_all('td') if cell.getText().strip()]
        for i, cell in enumerate(cells):
            cell_text = cell.getText().strip()
            if i == 0:
                # First cell: the crime label is whatever follows the last
                # 8-digit number found in the cell text.
                incident_number = re.findall(r'(\d{8})', cell_text)[-1]
                crime = cell_text.split(incident_number)[-1]
                # Collapsing \s+ already covers newlines.
                crime = re.sub(r'\s+', ' ', crime.strip().lower())
                if len(crime) == 0:
                    crime = 'unclassified'
                print('crime: %s' % crime, end = '\t')
            elif i == 1:
                # Second cell: the <strong> text is the place, the remainder
                # of the cell text is the description.
                place = cell.find('strong').getText()
                description = cell_text[len(place):]
                place = place.strip().lower()
                description = description.strip().lower()
                print('place: %s' % place, end = '\t')
                print('description: %s' % description, end = '\t')
    except Exception as exc:
        # Best effort: report the problem and move on to the next row.
        print('Some error occurred:', type(exc))
        continue

    if crime and place and description:
        rows.append((crime, place, description))

crime: misc. report type	place: sciarappa st	description: cambridge police responded to reports of a loud party.	crime: b&e nighttime for felony c266 s16	place: massachusetts ave	description: a restaurant on massachusetts avenue reported a night time breaking and entering
            and larceny to cambridge police.	crime: destruction of property +$250, malicious c266 s12	place: cottage st	description: several cambridge residents reported multiple incidents of malicious
            destruction of property and vandalism to their motor vehicles.	crime: destruction of property +$250, malicious c266 s12	place: bowdoin st	description: a bowdoin street resident reported his vehicle was damaged around 4 a.m. by
            an unknown person while it was parked in his driveway.	crime: b&e daytime for felony c266 s18	place: cambridge st	description: a somerville resident reported an attempted break in at the office where she
            works in cambridge. the attempted break in took place on 7

___

In [72]:
rows[0]

('misc. report type',
 'sciarappa st',
 'cambridge police responded to reports of a loud party.')

In [73]:
rows[2]

('destruction of property +$250, malicious c266 s12',
 'cottage st',
 'several cambridge residents reported multiple incidents of malicious\n            destruction of property and vandalism to their motor vehicles.')