# City-Data Web Scraping

In [1]:
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time, os
import numpy as np
import random
import re
import pickle
import urllib.request, urllib.error, urllib.parse
import pandas as pd

### Pulling Zipcode-Specific Information from City-Data.com

In [None]:
#Pull City-Data HTML with link

city_data_link = 'https://www.city-data.com/zipmaps/Philadelphia-Pennsylvania.html'

#A timeout keeps the cell from hanging forever on a stalled connection, and
#raise_for_status() stops an HTTP error page from being parsed as if it were data
city_data_response = requests.get(city_data_link, timeout=30)
city_data_response.raise_for_status()

city_data_html = city_data_response.text
city_data_soup = BeautifulSoup(city_data_html, "lxml")

In [2]:
#Load the copy of the City-Data page that was saved to disk

saved_page_path = 'philadelphia_info_by_zip.html'
with open(saved_page_path, 'r') as page:
    property_html = page.read()

#Parse the saved markup with the lxml parser
city_soup = BeautifulSoup(property_html, features="lxml")

In [3]:
#Get all zipcodes function

def get_zipcodes(soup):
    """Return the ``id`` attribute of every zipcode block found in ``soup``.

    Parameters
    ----------
    soup : object exposing ``find_all`` (e.g. a parsed BeautifulSoup document).

    Returns
    -------
    list
        One entry per ``<div>`` matched by the class filter
        ``'zip data-block'``, in the order ``find_all`` yields them. Each
        entry is whatever ``div.get('id')`` returns for that div.
    """
    # find_all is the current Beautiful Soup spelling; findAll is the legacy
    # alias, and get_zipcode_info in this notebook already uses find_all.
    zip_blocks = soup.find_all('div', attrs={'class' : 'zip data-block'})
    return [block.get('id') for block in zip_blocks]

In [4]:
#Get Philly zipcodes

#Zipcodes deliberately left out of the analysis:
#19112 - Philadelphia Naval Yard area
excluded_zipcodes = {'19112'}

#Filtering (rather than list.remove) does not raise if an excluded zipcode
#is missing from the page
phila_zip_list = [zipcode for zipcode in get_zipcodes(city_soup)
                  if zipcode not in excluded_zipcodes]

#Check final zipcode list
phila_zip_list

['19102',
 '19103',
 '19104',
 '19106',
 '19107',
 '19111',
 '19114',
 '19115',
 '19116',
 '19118',
 '19119',
 '19120',
 '19121',
 '19122',
 '19123',
 '19124',
 '19125',
 '19126',
 '19127',
 '19128',
 '19129',
 '19130',
 '19131',
 '19132',
 '19133',
 '19134',
 '19135',
 '19136',
 '19137',
 '19138',
 '19139',
 '19140',
 '19141',
 '19142',
 '19143',
 '19144',
 '19145',
 '19146',
 '19147',
 '19148',
 '19149',
 '19150',
 '19151',
 '19152',
 '19153',
 '19154']

In [5]:
#Serialize the zipcode list to the file 'philly_zipcodes'

zipcode_pickle_path = 'philly_zipcodes'
with open(zipcode_pickle_path, mode='wb') as zipcode_file:
    pickle.dump(phila_zip_list, zipcode_file)

In [6]:
#Pull information for single zipcode

def get_zipcode_info(soup,zipcode):
    """Extract one zipcode's statistics from ``soup`` and append them to ``city_data``.

    The function looks up the ``<div>`` whose ``id`` equals ``zipcode``, takes
    the node that follows each ``<b>`` tag inside it as a raw value, cleans the
    values found at a fixed set of positions and stores them as a dict keyed
    by the module-level ``headers`` list.

    Parameters
    ----------
    soup : parsed page exposing ``find`` (e.g. a BeautifulSoup document).
    zipcode : str
        ``id`` of the div to read; also stored as the first value of the row.

    Returns
    -------
    None
        The row is appended to the module-level ``city_data`` list and
        ``<zipcode> added`` is printed.

    Raises
    ------
    AttributeError
        If ``soup.find`` returns ``None`` (no div with that id), or if a
        value at one of the positions read below is not a string.
    IndexError
        If the div holds fewer ``<b>`` tags than the positions read below.

    Notes
    -----
    The positions (0-3, 5, 7, 9, 11-22) are hard-coded and presumably mirror
    the order of the labels on the page - TODO confirm if the page layout
    changes.
    """
    stat_labels = soup.find('div', attrs={'id' : zipcode}).find_all('b')

    # The value of each statistic is the node directly after its <b> label.
    raw = [label.next_sibling for label in stat_labels]

    def plain(text):
        # Whitespace-trimmed value, otherwise untouched.
        return text.strip()

    def count(text):
        # Remove thousands separators: ' 4,936 ' -> '4936'.
        return text.replace(',','').strip()

    def dollars(text):
        # Remove the currency sign and thousands separators: ' $4,331 ' -> '4331'.
        return text.replace('$','').replace(',','').strip()

    def dollars_without_suffix(text):
        # Drops the last six characters (presumably a trailing parenthesised
        # percentage such as '(1.0%)' - confirm against the page), then strips
        # again so the space that preceded them is not left on the value.
        return dollars(text)[:-6].strip()

    row_values = [
        zipcode,
        count(raw[0]),                                    # population_2016
        count(raw[1]),                                    # population_2010
        count(raw[2]),                                    # population_2000
        plain(raw[3]),                                    # costofliving_index_2016
        plain(raw[5]),                                    # land_area_sqmi
        plain(raw[7]),                                    # water_area_sqmi
        count(raw[9]),                                    # population density per sq mi
        re.sub('[A-Za-z,]','',raw[11]).strip(),           # population_2016_male
        count(raw[12]),                                   # population_2016_female
        raw[13].replace('%','').strip()[:3],              # property_tax (first 3 chars)
        dollars_without_suffix(raw[14]),                  # med_tax_morg_2016
        dollars_without_suffix(raw[15]),                  # med_tax_nomorg_2016
        dollars(raw[16]),                                 # med_house_value_2016
        dollars(raw[17]),                                 # med_house_income_2016
        dollars(raw[18]),                                 # med_owner_cost_morg
        dollars(raw[19]),                                 # med_owner_cost_nomorg
        dollars(raw[20]),                                 # med_rent_2016
        dollars(raw[21]),                                 # med_asking_price_2016
        raw[22].replace('%','').replace(':','').strip(),  # unemployment
    ]

    # headers and city_data are module-level names defined in the cell that
    # also defines pull_all_zipcodes.
    city_data.append(dict(zip(headers, row_values)))

    print(zipcode, 'added')

In [7]:
#Pull information for all zipcodes

#Shared list that get_zipcode_info appends one dict per zipcode to
city_data = list()

#Keys for each zipcode dict, in the same order as the values built in get_zipcode_info
headers = [
    'zipcode',
    'population_2016',
    'population_2010',
    'population_2000',
    'costofliving_index_2016',
    'land_area_sqmi',
    'water_area_sqmi',
    'population_dentisty_per_sqmi',
    'population_2016_male',
    'population_2016_female',
    'property_tax',
    'med_tax_morg_2016',
    'med_tax_nomorg_2016',
    'med_house_value_2016',
    'med_house_income_2016',
    'med_owner_cost_morg',
    'med_owner_cost_nomorg',
    'med_rent_2016',
    'med_asking_price_2016',
    'unemployment',
]

def pull_all_zipcodes(soup,zipcode_list):
    """Run ``get_zipcode_info`` once for every entry of ``zipcode_list``.

    ``soup`` is passed through unchanged to each call. When the list has been
    exhausted, the module-level ``city_data`` list is returned.
    """
    for current_zip in zipcode_list:
        get_zipcode_info(soup, current_zip)
    return city_data

In [8]:
#Pull all zipcode information through function

#Empty the shared list first so that re-running this cell rebuilds the rows
#instead of appending a second copy of every zipcode
city_data.clear()
pull_all_zipcodes(city_soup, phila_zip_list)

#Preview the first rows rather than echoing the whole list
city_data[:2]

19102 added
19103 added
19104 added
19106 added
19107 added
19111 added
19114 added
19115 added
19116 added
19118 added
19119 added
19120 added
19121 added
19122 added
19123 added
19124 added
19125 added
19126 added
19127 added
19128 added
19129 added
19130 added
19131 added
19132 added
19133 added
19134 added
19135 added
19136 added
19137 added
19138 added
19139 added
19140 added
19141 added
19142 added
19143 added
19144 added
19145 added
19146 added
19147 added
19148 added
19149 added
19150 added
19151 added
19152 added
19153 added
19154 added


[{'zipcode': '19102',
  'population_2016': '4936',
  'population_2010': '4705',
  'population_2000': '4396',
  'costofliving_index_2016': '110.5',
  'land_area_sqmi': '0.2',
  'water_area_sqmi': '0.0',
  'population_dentisty_per_sqmi': '26137',
  'population_2016_male': '2504',
  'population_2016_female': '2432',
  'property_tax': '1.0',
  'med_tax_morg_2016': '4331 ',
  'med_tax_nomorg_2016': '4014 ',
  'med_house_value_2016': '427870',
  'med_house_income_2016': '101898',
  'med_owner_cost_morg': '3001',
  'med_owner_cost_nomorg': '978',
  'med_rent_2016': '2082',
  'med_asking_price_2016': '1081578',
  'unemployment': '2.5'},
 {'zipcode': '19103',
  'population_2016': '23239',
  'population_2010': '21908',
  'population_2000': '19714',
  'costofliving_index_2016': '112.3',
  'land_area_sqmi': '0.6',
  'water_area_sqmi': '0.0',
  'population_dentisty_per_sqmi': '36367',
  'population_2016_male': '10223',
  'population_2016_female': '13015',
  'property_tax': '0.9',
  'med_tax_morg_20

In [9]:
#Build a DataFrame from the list of per-zipcode dicts

city_dataframe = pd.DataFrame(data=city_data)

In [10]:
#Check dataframe (first rows only, to keep the cell output short)

city_dataframe.head(10)

Unnamed: 0,zipcode,population_2016,population_2010,population_2000,costofliving_index_2016,land_area_sqmi,water_area_sqmi,population_dentisty_per_sqmi,population_2016_male,population_2016_female,property_tax,med_tax_morg_2016,med_tax_nomorg_2016,med_house_value_2016,med_house_income_2016,med_owner_cost_morg,med_owner_cost_nomorg,med_rent_2016,med_asking_price_2016,unemployment
0,19102,4936,4705,4396,110.5,0.2,0.0,26137,2504,2432,1.0,4331,4014,427870,101898,3001,978,2082,1081578,2.5
1,19103,23239,21908,19714,112.3,0.6,0.0,36367,10223,13015,0.9,5126,3242,521102,73681,2940,795,1601,362650,2.9
2,19104,55677,51808,50125,107.1,3.0,0.1,18519,28105,27571,0.8,1781,729,128194,22676,1187,413,941,222828,7.0
3,19106,12268,11740,8359,109.6,0.8,0.4,15180,6230,6037,0.9,3562,6231,497794,105334,2615,1242,1737,567543,2.5
4,19107,13657,14875,12340,110.6,0.5,0.0,24867,6736,6920,0.8,3092,3049,374150,52804,1830,936,1237,408260,5.4
5,19111,70919,63090,58874,104.9,4.8,0.0,14679,34417,36501,1.1,2091,2207,188684,46593,1364,505,945,173483,9.1
6,19114,31078,30907,31083,105.0,5.6,0.3,5550,14983,16094,1.1,2288,2365,209884,56397,1486,527,1068,177689,6.1
7,19115,33050,33207,31853,106.8,5.6,0.1,5887,15570,17479,1.1,2872,2915,261605,49357,1675,583,976,231775,4.9
8,19116,33688,33112,32560,106.7,5.0,0.0,6731,15874,17813,1.1,2746,2827,259274,48247,1629,568,931,203806,6.1
9,19118,10690,9808,9608,110.8,3.2,0.0,3351,4758,5931,1.0,5407,6147,538638,82904,2601,945,1468,846508,3.8


In [11]:
#Serialize the zipcode dataframe to the file 'philly_zipcodes_dataframe'

dataframe_pickle_path = 'philly_zipcodes_dataframe'
with open(dataframe_pickle_path, mode='wb') as dataframe_file:
    pickle.dump(city_dataframe, dataframe_file)