In [1]:
# Dependencies
import os
from io import StringIO

import lxml.html as lh
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs
from sqlalchemy import create_engine, inspect

## Extract housing data

In [2]:
# URL of the Kiplinger page listing home prices for 100 U.S. metro areas
url='https://www.kiplinger.com/tool/real-estate/T010-S003-home-prices-in-100-top-u-s-metro-areas/index.php'

# Retrieve the page. The timeout keeps the cell from hanging indefinitely on a
# stalled connection, and raise_for_status() stops here on an HTTP error instead
# of letting an error page reach the parser and fail obscurely further down.
res = requests.get(url, timeout=30)
res.raise_for_status()

# Parse the response body with Beautiful Soup (lxml parser)
soup = bs(res.content,'lxml')

# Take the first <table> element on the page; fail with a clear message if none
table = soup.find('table')
if table is None:
    raise ValueError(f'No <table> element found at {url}')

# Build a DataFrame from the table markup. The markup is wrapped in StringIO
# because passing a literal HTML string to read_html is deprecated (pandas >= 2.1).
housePrice_df = pd.read_html(StringIO(str(table)))[0]
housePrice_df.head(10)

Unnamed: 0,Metro Area,Median Home Price,% Change in 2018,% Change Since Peak*,% Change Since Bottom†,Affordability Index
0,"Akron, Ohio","$135,000",11.3,-9.0,51.8,3
1,"Albany, N.Y.",182000,8.7,5.0,18.5,8
2,"Albuquerque, N.M.",167000,3.2,7.6,28.8,7
3,"Allentown, Pa.",175000,11.8,-10.2,32.4,5
4,"Atlanta, Ga.",210000,8.9,5.8,106.4,4
5,"Augusta, Ga.",155000,5.7,6.8,25.3,1
6,"Austin, Texas",290000,2.2,81.1,80.0,9
7,"Bakersfield, Calif.",215000,6.8,-21.3,81.5,7
8,"Baltimore, Md.",248000,6.0,-15.8,26.7,5
9,"Baton Rouge, La.",168000,1.7,31.6,14.4,3


In [3]:
# List the names of all columns (used to pick the exact labels selected below)
housePrice_df.columns

Index(['Metro Area', 'Median  Home Price', '% Change  in 2018',
       '% Change  Since Peak*', '% Change  Since Bottom†',
       'Affordability Index'],
      dtype='object')

## Transform data

In [4]:
# Work on an independent copy holding just the metro area and its median price
df_region_prices = housePrice_df.loc[:, ["Metro Area", "Median  Home Price"]].copy()
df_region_prices.head()

Unnamed: 0,Metro Area,Median Home Price
0,"Akron, Ohio","$135,000"
1,"Albany, N.Y.",182000
2,"Albuquerque, N.M.",167000
3,"Allentown, Pa.",175000
4,"Atlanta, Ga.",210000


In [5]:
# Keep only the city: the text before the first comma of "City, State"
df_region_prices["Metro Area"] = df_region_prices["Metro Area"].str.split(",", n=1).str[0]

In [6]:
# Preview the frame after the split (up to the first 100 rows)
df_region_prices.head(100)

Unnamed: 0,Metro Area,Median Home Price
0,Akron,"$135,000"
1,Albany,182000
2,Albuquerque,167000
3,Allentown,175000
4,Atlanta,210000
5,Augusta,155000
6,Austin,290000
7,Bakersfield,215000
8,Baltimore,248000
9,Baton Rouge,168000


In [7]:
# Switch to snake_case labels; note the source price header contains a double space
df_region_prices = df_region_prices.rename(
    columns={
        'Metro Area': 'metro_city',
        'Median  Home Price': 'median_home_price',
    }
)
df_region_prices.head(10)

Unnamed: 0,metro_city,median_home_price
0,Akron,"$135,000"
1,Albany,182000
2,Albuquerque,167000
3,Allentown,175000
4,Atlanta,210000
5,Augusta,155000
6,Austin,290000
7,Bakersfield,215000
8,Baltimore,248000
9,Baton Rouge,168000


In [8]:
# Strip every non-digit character (e.g. "$" and ",") from the price strings.
# The result is assigned back to the column instead of calling
# replace(inplace=True) on the selected Series: that chained in-place form acts
# on a temporary object under pandas Copy-on-Write and leaves the DataFrame
# unchanged (pandas 2.x emits a FutureWarning about it).
df_region_prices['median_home_price'] = df_region_prices['median_home_price'].replace(
    to_replace=r'\D', value=r'', regex=True
)

In [9]:
# Preview the first rows to check the price column
df_region_prices.head(10)

Unnamed: 0,metro_city,median_home_price
0,Akron,135000
1,Albany,182000
2,Albuquerque,167000
3,Allentown,175000
4,Atlanta,210000
5,Augusta,155000
6,Austin,290000
7,Bakersfield,215000
8,Baltimore,248000
9,Baton Rouge,168000


In [10]:
# Write the city/price frame to CSV, omitting the positional index
df_region_prices.to_csv(
    path_or_buf='DATA/housePrices_bs.csv',
    index=False,
)

## Store the avocado CSV into a DataFrame

In [11]:
# Read the avocado records from the local CSV file
csv_file1 = "DATA/avocado.csv"
avocado_df = pd.read_csv(filepath_or_buffer=csv_file1)
avocado_df.head()

Unnamed: 0.1,Unnamed: 0,Date,AveragePrice,Total Volume,4046,4225,4770,Total Bags,Small Bags,Large Bags,XLarge Bags,type,year,region
0,0,2015-12-27,1.33,64236.62,1036.74,54454.85,48.16,8696.87,8603.62,93.25,0.0,conventional,2015,Albany
1,1,2015-12-20,1.35,54876.98,674.28,44638.81,58.33,9505.56,9408.07,97.49,0.0,conventional,2015,Albany
2,2,2015-12-13,0.93,118220.22,794.7,109149.67,130.5,8145.35,8042.21,103.14,0.0,conventional,2015,Albany
3,3,2015-12-06,1.08,78992.15,1132.0,71976.41,72.58,5811.16,5677.4,133.76,0.0,conventional,2015,Albany
4,4,2015-11-29,1.28,51039.6,941.48,43838.39,75.78,6183.95,5986.26,197.69,0.0,conventional,2015,Albany


## Create new data with select columns and rename the columns

In [12]:
# Independent copy with only the average price and the region it applies to
new_avocado_df = avocado_df.loc[:, ['AveragePrice', 'region']].copy()
new_avocado_df.head()

Unnamed: 0,AveragePrice,region
0,1.33,Albany
1,1.35,Albany
2,0.93,Albany
3,1.08,Albany
4,1.28,Albany


In [13]:
# Rename the columns to avocado_price and city
new_avocado_df = new_avocado_df.rename(
    columns={'AveragePrice': 'avocado_price', 'region': 'city'}
)
new_avocado_df.head()

Unnamed: 0,avocado_price,city
0,1.33,Albany
1,1.35,Albany
2,0.93,Albany
3,1.08,Albany
4,1.28,Albany


In [14]:
# Average avocado price per city; the result is a Series indexed by city
price_by_city = new_avocado_df['avocado_price'].groupby(new_avocado_df['city'])
clean_avocado_df = price_by_city.mean()

clean_avocado_df.head()

city
Albany                 1.561036
Atlanta                1.337959
BaltimoreWashington    1.534231
Boise                  1.348136
Boston                 1.530888
Name: avocado_price, dtype: float64

In [15]:
# Write the per-city averages to CSV with a header row, encoded as UTF-8
clean_avocado_df.to_csv(
    path_or_buf="DATA/avocado_data.csv",
    header=True,
    encoding="utf-8",
)

## Load: Connect to local database

In [16]:
# Connection target in the form "user:password@host:port/database".
# Read from the AVOCADO_DB_CONNECTION environment variable when it is set, so
# real credentials need not be committed with the notebook; otherwise fall back
# to the local development database this notebook was written against.
rds_connection_string = os.environ.get(
    "AVOCADO_DB_CONNECTION",
    "postgres:postgres@localhost:5433/avocado_vs_house",
)
engine = create_engine(f'postgresql://{rds_connection_string}')

In [17]:
# List the tables present in the connected database. Engine.table_names() was
# deprecated in SQLAlchemy 1.4 and removed in 2.0; the inspector API is the
# supported replacement and returns the same list of table names.
inspect(engine).get_table_names()

['avocado', 'houseprices']

In [18]:
# Insert the city/price rows into the existing "houseprices" table, without the
# DataFrame index. NOTE(review): if_exists='append' adds rows on every run, so
# re-executing this cell inserts the same rows again unless a table constraint
# rejects them — confirm against the table definition.
df_region_prices.to_sql(name='houseprices', con=engine, if_exists='append', index=False)

In [19]:
# Insert the per-city averages into the existing "avocado" table. The index is
# written too (index is left at its default), which is how the city reaches the
# table. NOTE(review): if_exists='append' adds rows on every run, so
# re-executing this cell inserts the same rows again unless a table constraint
# rejects them — confirm against the table definition.
clean_avocado_df.to_sql(name='avocado', con=engine, if_exists='append')

In [20]:
# Read the houseprices table back from the database and preview the first rows
pd.read_sql_query('select * from houseprices', con=engine).head()

Unnamed: 0,house_id,metro_city,median_home_price
0,1,Akron,135000
1,2,Albany,182000
2,3,Albuquerque,167000
3,4,Allentown,175000
4,5,Atlanta,210000


In [21]:
# Read the avocado table back from the database and preview the first rows
pd.read_sql_query('select * from avocado', con=engine).head()

Unnamed: 0,id,avocado_price,city
0,1,1.5610355029585792,Albany
1,2,1.3379585798816571,Atlanta
2,3,1.5342307692307684,BaltimoreWashington
3,4,1.348136094674556,Boise
4,5,1.5308875739644972,Boston


In [26]:
# Pair each avocado city with the house-price row whose metro_city it matches
# (via LIKE), ordered by median home price, and preview the first rows.
# Adjacent string literals are concatenated into the same single-line query.
pd.read_sql_query(
    'select avocado.city, avocado.avocado_price, houseprices.median_home_price '
    'from avocado '
    'join houseprices on avocado.city like houseprices.metro_city '
    'order by houseprices.median_home_price',
    con=engine,
).head()


Unnamed: 0,city,avocado_price,median_home_price
0,Syracuse,1.5203254437869826,112000
1,Indianapolis,1.313994082840236,125000
2,Pittsburgh,1.3643195266272197,138000
3,Detroit,1.2760946745562132,157000
4,Louisville,1.286686390532545,168000
