In [1]:
# Dependencies
import json
import requests
from bs4 import BeautifulSoup as bs
import pymongo

In [2]:
# Initialize PyMongo to work with MongoDB.
# The connection URI targets a server on localhost, port 27017.
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(conn)

In [3]:
# Select the project database and the collection that receives the
# Orange County records (dictionary-style access, same objects as attribute access)
db = client['etl_project_db']
collection = db['oc_data']

In [8]:
# URLs of the pages/endpoints to be scraped.
# Each dataset comment links the human-readable page; the assigned URL is the one actually requested.
# Orange County: https://data-ocpw.opendata.arcgis.com/datasets/d625d46014c44e68a483ab0e74be2aa2_7/data
url_oc = 'https://opendata.arcgis.com/datasets/d625d46014c44e68a483ab0e74be2aa2_7.geojson'

# Los Angeles: https://data.lacity.org/dataset/2010-Census-Populations-by-Zip-Code/nxs9-385f/data
# NOTE(review): url_la is not requested in the cells visible here — confirm it is used later.
url_la = 'https://data.lacity.org/resource/nxs9-385f.json'

# HTML page that is downloaded and parsed with BeautifulSoup further down.
# NOTE(review): presumably the ZIP-code/city list for Orange County (FIPS=06059) — confirm.
url_oc_city_data = 'https://www.ciclt.net/sn/clt/capitolimpact/gw_ziplist.aspx?FIPS=06059'


In [9]:
# Retrieve both sources with the requests module.
# - timeout: without one, requests waits indefinitely on an unresponsive server
#   and the cell never finishes.
# - raise_for_status(): fail here with a clear HTTPError instead of passing an
#   error page on to the JSON decoder / HTML parser.
REQUEST_TIMEOUT_SECONDS = 30

response_oc_geojson = requests.get(url_oc, timeout=REQUEST_TIMEOUT_SECONDS)
response_oc_geojson.raise_for_status()
# Parsed GeoJSON body; later cells read response['features']
response = response_oc_geojson.json()

response_oc_city = requests.get(url_oc_city_data, timeout=REQUEST_TIMEOUT_SECONDS)
response_oc_city.raise_for_status()

# Create BeautifulSoup object; parse with Python's built-in 'html.parser'
soup = bs(response_oc_city.text, 'html.parser')

In [6]:
# Load one MongoDB document per GeoJSON feature.
#
# Maps each stored field name to the key it is read from in the feature's
# "properties" object. Dict order is preserved, so documents keep the same
# field order as before.
FIELD_SOURCES = {
    'zipcode': 'GEOID10',
    'total_population': 'DP0010001',
    'male_population': 'DP0010020',
    'female_population': 'DP0010039',
    'total_household': 'DP0130001',
}

for feature in response['features']:
    properties = feature['properties']

    # Dictionary to be stored as a MongoDB document
    post = {field: properties[source_key] for field, source_key in FIELD_SOURCES.items()}

    # Upsert keyed on zipcode instead of insert_one: re-running this cell
    # replaces the existing document rather than adding a duplicate.
    # (Duplicates already created by earlier runs are not removed by this.)
    collection.replace_one({'zipcode': post['zipcode']}, post, upsert=True)

In [7]:
# Print every document currently stored in the oc_data collection
oc_data = db.oc_data.find()

for document in oc_data:
    print(document)

{'_id': ObjectId('5dcccbcf26aec451d00c337d'), 'zipcode': '90620', 'total_population': 45113, 'male_population': 22241, 'female_population': 22872, 'total_household': 13268}
{'_id': ObjectId('5dcccbcf26aec451d00c337e'), 'zipcode': '90621', 'total_population': 35153, 'male_population': 17377, 'female_population': 17776, 'total_household': 10304}
{'_id': ObjectId('5dcccbcf26aec451d00c337f'), 'zipcode': '90623', 'total_population': 15554, 'male_population': 7516, 'female_population': 8038, 'total_household': 5072}
{'_id': ObjectId('5dcccbcf26aec451d00c3380'), 'zipcode': '90630', 'total_population': 47993, 'male_population': 23204, 'female_population': 24789, 'total_household': 15785}
{'_id': ObjectId('5dcccbcf26aec451d00c3381'), 'zipcode': '90631', 'total_population': 67619, 'male_population': 33320, 'female_population': 34299, 'total_household': 21452}
{'_id': ObjectId('5dcccbcf26aec451d00c3382'), 'zipcode': '90680', 'total_population': 29945, 'male_population': 14811, 'female_population'

In [85]:
# Collect every table row (<tr>) on the page, then print a single row
# to work out which elements contain the information we are after.
# find_all returns a list-like, iterable collection of matches.
results = soup.find_all('tr')

sample_row = results[6]
print(sample_row)


<tr><td align="left"><a href="/sn/clt/capitolimpact/gw_zipdet.aspx?state=ca&amp;stfips=06&amp;stname=California&amp;zip=90622">90622</a></td>
<td align="left"><a href="/sn/clt/capitolimpact/gw_citydet.aspx?state=ca&amp;stfips=06&amp;stname=California&amp;citykey=0608786">Buena Park</a></td>
<td align="left"><a href="/sn/clt/capitolimpact/gw_countydet.aspx?state=ca&amp;stfips=06&amp;stname=California&amp;fips=06059">Orange County</a></td>
</tr>


In [97]:
# For every row that contains a link, print the text and target of its first <a>
for row in results:
    first_link = row.find('a')  # None when the row has no <a> element
    if first_link:
        print(first_link.text)
        print(first_link['href'])

90620
/sn/clt/capitolimpact/gw_zipdet.aspx?state=ca&stfips=06&stname=California&zip=90620
90620
/sn/clt/capitolimpact/gw_zipdet.aspx?state=ca&stfips=06&stname=California&zip=90620
90620
/sn/clt/capitolimpact/gw_zipdet.aspx?state=ca&stfips=06&stname=California&zip=90620
90621
/sn/clt/capitolimpact/gw_zipdet.aspx?state=ca&stfips=06&stname=California&zip=90621
90622
/sn/clt/capitolimpact/gw_zipdet.aspx?state=ca&stfips=06&stname=California&zip=90622
90623
/sn/clt/capitolimpact/gw_zipdet.aspx?state=ca&stfips=06&stname=California&zip=90623
90623
/sn/clt/capitolimpact/gw_zipdet.aspx?state=ca&stfips=06&stname=California&zip=90623
90624
/sn/clt/capitolimpact/gw_zipdet.aspx?state=ca&stfips=06&stname=California&zip=90624
90630
/sn/clt/capitolimpact/gw_zipdet.aspx?state=ca&stfips=06&stname=California&zip=90630
90631
/sn/clt/capitolimpact/gw_zipdet.aspx?state=ca&stfips=06&stname=California&zip=90631
90631
/sn/clt/capitolimpact/gw_zipdet.aspx?state=ca&stfips=06&stname=California&zip=90631
90632
/sn/