# Information

In [1]:
#title                :DC Council Calendar Web Scrape (bs4)
#description          :This will scrape a web page of calendar items listed in a table, place in a pandas DataFrame
#                      and export the DataFrame to a .csv file. 
#author               :alisonthaung
#date created         :2017-07-10
#date last modified   :2017-07-12
#python_version       :'3.6.1 |Anaconda 4.4.0 (x86_64)| (default, May 11 2017, 13:04:09)
#                      [GCC 4.2.1 Compatible Apple LLVM 6.0 (clang-600.0.57)]'
#operating system     :MacOS Sierra 10.12.5
#==============================================================================

In [2]:
import sys
sys.version

'3.6.1 |Anaconda 4.4.0 (x86_64)| (default, May 11 2017, 13:04:09) \n[GCC 4.2.1 Compatible Apple LLVM 6.0 (clang-600.0.57)]'

# Import libraries

In [3]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Specify URL from which to scrape, create BeautifulSoup object

In [4]:
url = 'http://dccouncil.us/calendar/'
r = requests.get(url)
soup = BeautifulSoup(r.text, 'lxml')

# Review underlying html structure of webpage

In [5]:
soup.prettify

<bound method Tag.prettify of <!DOCTYPE html>
<!--[if lt IE 7 ]>
<html lang="en" class="ie6">
<![endif]--><!--[if IE 7 ]>
<html lang="en" class="ie7">
<![endif]--><!--[if IE 8 ]>
<html lang="en" class="ie8">
<![endif]--><!--[if IE 9 ]>
<html lang="en" class="ie9">
<![endif]--><!--[if (gt IE 9)|!(IE)]><!--><html class="no-js" lang="en"> <!--<![endif]-->
<head>
<meta charset="utf-8"/>
<meta content="eSt4OvXWV9h88hl71sSKm4UBBRk9EBAy2R8__sAkCFc" name="google-site-verification"/>
<meta content="/files/site/images/fb_logo.jpeg" property="og:image"/>
<meta content="IE=edge" http-equiv="X-UA-Compatible"/>
<title>Council of the District of Columbia</title>
<link href="/files/site/assets/favicon.ico" rel="shortcut icon"/>
<link href="http://dccouncil.us/?css=styles/index.v.1444061289" rel="stylesheet"/>
<link href="//fonts.googleapis.com/css?family=Open+Sans:400,600,700" rel="stylesheet" type="text/css"/>
<link href="http://dccouncil.us/news/rss" rel="alternate" title="RSS" type="application/rss

# Create lists of data for columns

The dates, times, and locations of each meeting are listed in a single &lt;td&gt; within a &lt;div&gt; class of "event-description-dev-metabox". Each item is listed in successive &lt;p&gt; tags. The following code extracts the information from successive &lt;p&gt; tags.

In [6]:
date = [div.find('p').text for div in soup.find_all('div', {'class': 'event-description-dev-metabox'})]
time = [div.find('p').findNext('p').text for div in soup.find_all('div', {'class': 'event-description-dev-metabox'})]
location = [div.find('p').findNext('p').findNext('p').text for div in 
            soup.find_all('div', {'class': 'event-description-dev-metabox'})]

In [7]:
date

['Wednesday, 7/12/2017',
 'Wednesday, 7/12/2017',
 'Wednesday, 7/12/2017',
 'Wednesday, 7/12/2017',
 'Wednesday, 7/12/2017',
 'Thursday, 7/13/2017',
 'Thursday, 7/13/2017',
 'Friday, 7/14/2017']

In [8]:
time

['9:30am',
 '11:00am',
 '1:00pm',
 '2:30pm',
 '6:00pm',
 '11:00am',
 '6:30pm',
 '11:00am']

In [9]:
location

['Room 412',
 '(Cancelled)',
 'Room 120',
 'Room 500',
 'University of District of Columbia Student Center; 4200 Connecticut Avenue, N.W.; Washington, DC 20008',
 'Room 412',
 'Edward J. Pryzbyla Center, Great Room A; 620 Michigan Avenuem NE; Washington, DC 20064',
 'Room 500']

# Extract from div class: event-description-content-dev for title and contents of meeting

In [10]:
titles = [div.find('a').text for div in soup.find_all('div', {'class': 'event-description-content-dev'})]

In [11]:
titles

['Judiciary & Public Safety Public  Hearing',
 'Health Public Oversight Hearing (Cancelled)',
 'Human Services Additional Meeting',
 'Committee of the Whole Public Oversight Roundtable',
 'Transportation & the Environment & Business & Economic Development Joint Public Roundtable',
 'Transportation & the Environment & Education Joint Public Roundtable',
 'Transportation & the Environment Public Oversight Roundtable',
 'Transportation & the Environment Public Hearing']

In [12]:
children = soup.findChildren('div', {'class': 'event-description-content-dev'})

content = []
for item in children:
    # Don't need Title info as that's already been extracted. Start reading content at index = 1 and extract data to end
    item = [content for content in item.text.split('\n') if len(content)>0][1:]
    
    # Create string from separate list items to all be listed in
    item = ' '.join(item)
    content.append(item)

content

['The Committee on the Judiciary & Public Safety will hold a Public Hearing on the following Legislation: Bill 22-0080, the "Access to Justice for Immigrants Amendment Act of 2017" Bill 22-0129, the "Street Harassment Prevention Act of 2017" The Committee invites the public to testify or to submit written testimony. Anyone wishing to\xa0testify at the hearing should contact the Committee via email at judiciary@dccouncil.us or at (202)\xa0727-8275, and provide their name, telephone number, organizational affiliation, and title (if any),\xa0by close of business Friday, July 7. Representatives of organizations will be allowed a maximum\xa0of five minutes for oral testimony, and individuals will be allowed a maximum of three minutes. ',
 'The Committee on Health has cancelled this Public Oversight Hearing. ',
 'The Committee on Human Services will hold an Additional Meeting to consider and vote on B22-194, the “DC Healthcare Alliance Recertification Simplification Amendment Act of 2017.” '

# Check that all lists are the same length

Checking that all lists are the same length to "sanity check" the scrape and ensure that the number of rows of data is lined up appropriately for each variable

In [13]:
len(content) == len(titles) == len(location) == len(time) == len(date)

True

# Create DataFrame for data and export to csv

In [14]:
data = {'Date': date,
       'Time' : time,
       'Location': location,
       'Titles': titles,
       'Content': content}

df = pd.DataFrame(data, columns = ['Date', 'Time', 'Location', 'Titles', 'Content'])

Look at dataframe and review to make sure it looks like the data structure you want to export. Spot check against data on website

In [15]:
df.head()

Unnamed: 0,Date,Time,Location,Titles,Content
0,"Wednesday, 7/12/2017",9:30am,Room 412,Judiciary & Public Safety Public Hearing,The Committee on the Judiciary & Public Safety...
1,"Wednesday, 7/12/2017",11:00am,(Cancelled),Health Public Oversight Hearing (Cancelled),The Committee on Health has cancelled this Pub...
2,"Wednesday, 7/12/2017",1:00pm,Room 120,Human Services Additional Meeting,The Committee on Human Services will hold an A...
3,"Wednesday, 7/12/2017",2:30pm,Room 500,Committee of the Whole Public Oversight Roundt...,The Committee of the Whole will hold a Public ...
4,"Wednesday, 7/12/2017",6:00pm,University of District of Columbia Student Cen...,Transportation & the Environment & Business & ...,The Committee on Transportation and the Enviro...


In [16]:
df.to_csv('DC Council Calendar - 2017-07-12')