In [1]:
# Dependencies
import json
import requests
from bs4 import BeautifulSoup as bs
import pymongo

In [2]:
# Initialize PyMongo to work with MongoDBs
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(conn)

In [3]:
# Define database and collection
db = client.etl_project_db
collection = db.oc_data

In [2]:
# URL of page to be scraped
# Orange County: https://data-ocpw.opendata.arcgis.com/datasets/d625d46014c44e68a483ab0e74be2aa2_7/data
url_oc = 'https://opendata.arcgis.com/datasets/d625d46014c44e68a483ab0e74be2aa2_7.geojson'

# Los Angeles: https://data.lacity.org/dataset/2010-Census-Populations-by-Zip-Code/nxs9-385f/data
url_la = 'https://data.lacity.org/resource/nxs9-385f.json'

url_oc_city_data = 'https://www.ciclt.net/sn/clt/capitolimpact/gw_ziplist.aspx?FIPS=06059'
url_oc_city_data2 = 'https://www.zipcodestogo.com/Orange/CA/'

In [4]:
# Retrieve page with the requests module
response = requests.get(url_oc).json()
response_oc_city = requests.get(url_oc_city_data)

# Create BeautifulSoup object; parse with 'lxml'
soup = bs(response_oc_city.text, 'html.parser')
print(soup)


<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">

<html xmlns="http://www.w3.org/1999/xhtml">
<head><title>
	CI Gateway Zip Code List
</title><link href="/sn/clt/capitolimpact/StyleSheet.css" rel="stylesheet" title="Default" type="text/css"/></head>
<body id="MPBody" style="margin-top: 0; background-color: #0a1451;">
<script src="/milonic/milonic_src.js" type="text/javascript"></script>
<param copyright="JavaScript Menu by Milonic - http://www.milonic.com"/>
<script type="text/javascript"></script>
<script language="JavaScript" src="/milonic/mmenudom.js"></script>
<script src="/sn/clt/capitolimpact/MenuGW_Public.js" type="text/javascript"></script>
<!-- Header Section -->
<center>
<table border="0" cellpadding="0" cellspacing="0" style="border-right: white thin solid; border-top: white thin solid; border-left: white thin solid; border-bottom: white thin solid; background-color: white" width="800"><tr>
<td valig

In [6]:
for i in range(len(response['features'])):
    zipcode = response['features'][i]["properties"]["GEOID10"]
    total_population = response['features'][i]["properties"]["DP0010001"]
    male_population = response['features'][i]["properties"]["DP0010020"]
    female_population = response['features'][i]["properties"]["DP0010039"]
    total_household = response['features'][i]["properties"]["DP0130001"]
    
    post = {
        'zipcode':zipcode,
        'total_population':total_population,
        'male_population':male_population,
        'female_population':female_population,
        'total_household':total_household
    }
    
    # Dictionary to be inserted as a MongoDB document
    collection.insert_one(post)

In [7]:
# Display items in MongoDB collection
oc_data = db.oc_data.find()

for data in oc_data:
    print(data)

{'_id': ObjectId('5dcccbcf26aec451d00c337d'), 'zipcode': '90620', 'total_population': 45113, 'male_population': 22241, 'female_population': 22872, 'total_household': 13268}
{'_id': ObjectId('5dcccbcf26aec451d00c337e'), 'zipcode': '90621', 'total_population': 35153, 'male_population': 17377, 'female_population': 17776, 'total_household': 10304}
{'_id': ObjectId('5dcccbcf26aec451d00c337f'), 'zipcode': '90623', 'total_population': 15554, 'male_population': 7516, 'female_population': 8038, 'total_household': 5072}
{'_id': ObjectId('5dcccbcf26aec451d00c3380'), 'zipcode': '90630', 'total_population': 47993, 'male_population': 23204, 'female_population': 24789, 'total_household': 15785}
{'_id': ObjectId('5dcccbcf26aec451d00c3381'), 'zipcode': '90631', 'total_population': 67619, 'male_population': 33320, 'female_population': 34299, 'total_household': 21452}
{'_id': ObjectId('5dcccbcf26aec451d00c3382'), 'zipcode': '90680', 'total_population': 29945, 'male_population': 14811, 'female_population'

In [5]:
# Examine the results, then determine element that contains sought info
# results are returned as an iterable list
results = soup.find_all('tr')

print(results)


[<tr>
<td valign="top"><img alt="" src="http://www.ciclt.net/ul/capitolimpact_gw/images/CI_Gateway.jpg"/></td>
</tr>, <tr>
<!-- Body Section -->
<td align="center" valign="top">
<center>
<script type="text/javascript">
with(new menuname("GWSub")){
style=menuStyle2;
alwaysvisible=1;
top=152;
screenposition="center";
orientation="horizontal";
aI("text=US;url=/sn/clt/capitolimpact/gw_default.aspx;");
aI("text=California;url=/sn/clt/capitolimpact/gw_state.aspx?state=ca&stfips=06&stname=California");
aI("text=Orange County, CA;url=/sn/clt/capitolimpact/gw_countydet.aspx?FIPS=06059&state=ca&stfips=06&stname=California");
aI("text=Help;url=/sn/clt/clthelp/hpage.aspx?ClientCode=clthelp&H_Key=gw_state&H_System=net&H_Type=Start;target=_blank;");
}
drawMenus();
</script>
<br/><br/>
<h3>Zip Code List<br/><span id="ContentPlaceHolder1_litHeader">County - Orange County, California</span></h3>
<table><tr><td valign="top"><td><table border="3" cellpadding="3" cellspacing="3"><tr style="ba

In [6]:
for result in results:
    if result.find('a'):
        print(result.find_all('a')[0].text)
        print(result.find_all('a')[1].text)


90620
Buena Park
90620
Buena Park
90620
Buena Park
90621
Buena Park
90622
Buena Park
90623
Buena Park
90623
La Palma
90624
Buena Park
90630
Cypress
90631
La Habra
90631
La Habra Heights
90632
La Habra
90633
La Habra
90680
Stanton
90720
Rossmoor
90720
Los Alamitos
90721
Los Alamitos
90740
Seal Beach
90742
Sunset Beach
90743
Surfside
92602
Irvine
92603
Irvine
92604
Irvine
92605
Huntington Beach
92606
Irvine
92607
Laguna Beach
92607
Laguna Niguel
92609
El Toro
92609
Lake Forest
92610
El Toro
92610
Foothill Ranch
92612
Irvine
92614
Irvine
92615
Huntington Beach
92616
Irvine
92618
Irvine
92619
Irvine
92620
Irvine
92623
Irvine
92624
Capistrano Beach
92624
Dana Point
92625
Corona Del Mar
92626
Costa Mesa
92627
Costa Mesa
92628
Costa Mesa
92629
Monarch Bay
92629
Monarch Beach
92629
Dana Point
92630
El Toro
92630
Lake Forest
92637
Laguna Hills
92646
Huntington Beach
92647
Huntington Beach
92648
Huntington Beach
92649
Huntington Beach
92650
East Irvine
92650
Irvine
92651
Laguna Beach
92652
Lagun

In [51]:
# Examine the results, then determine element that contains sought info
# results are returned as an iterable list
results = soup.find_all('td')

print(results)

[]
