In [1]:
import pandas as pd
import sqlite3
import src.data_enrichment as enrich
import src.helpers as helpers

In [2]:
# Read in data: pull the whole "df_listings" table from the local SQLite file
# into a DataFrame.
conn = sqlite3.connect("housing_information.db")
try:
    data_df = pd.read_sql_query('SELECT * FROM df_listings', conn)
finally:
    # Close explicitly so the database file handle is released as soon as
    # the load finishes, even if the query raises.
    conn.close()

In [3]:
# Derived columns, all computed by helpers from the project's enrich module.

# Element-wise: enrich.age_of_house applied to each "approx_age" value.
data_df['house_age'] = data_df['approx_age'].apply(enrich.age_of_house)

# Row-wise: enrich.calculate_days_to_sell receives each row's list_date and end_date.
data_df['difference_in_days'] = data_df.apply(
    lambda listing: enrich.calculate_days_to_sell(listing['list_date'], listing['end_date']),
    axis=1,
)

# Row-wise: the same price helper is called once per mode, each call filling
# its own output column from the row's list_price and sold_price.
for price_column, mode in (
    ('price_difference_abs', 'difference'),
    ('price_difference_pct', 'percentage'),
):
    data_df[price_column] = data_df.apply(
        # mode is bound as a default so each lambda keeps its own value.
        lambda listing, mode=mode: enrich.calculate_price_diff_or_pct(
            listing['list_price'], listing['sold_price'], mode
        ),
        axis=1,
    )

## Data Exploration

In [4]:
# Build "Address_Clean" by passing every raw "property_addr" value through
# enrich.regex_cleanup.
raw_addresses = data_df['property_addr']
data_df['Address_Clean'] = raw_addresses.apply(enrich.regex_cleanup)

In [5]:
# Show raw and cleaned addresses side by side for the first 50 rows.
address_columns = ['property_addr', 'Address_Clean']
data_df[address_columns].head(50)

Unnamed: 0,property_addr,Address_Clean
0,"Property Address: Juneau Street, Burnaby, Brit...",Juneau Street
1,"Property Address: 105 - 7180 Linden Avenue, Bu...",105 - 7180 Linden Avenue
2,"Property Address: 703 - 3737 Bartlett Court, B...",703 - 3737 Bartlett Court
3,"Property Address: 1035 Holdom Avenue, Burnaby,...",1035 Holdom Avenue
4,"Property Address: 4304 - 4485 Skyline Drive, B...",4304 - 4485 Skyline Drive
5,"Property Address: 206 - 7478 Byrnepark Walk, B...",206 - 7478 Byrnepark Walk
6,"Property Address: 103 - 7428 19 Th Avenue, Bur...",103 - 7428 19 Th Avenue
7,"Property Address: 2103 - 4508 Hazel Street, Bu...",2103 - 4508 Hazel Street
8,"Property Address: 301 - 4118 Dawson Street, Bu...",301 - 4118 Dawson Street
9,"Property Address: 706 - 6098 Station Street, B...",706 - 6098 Station Street


In [16]:
# A few rows come out of the cleanup with only a street name and need to be
# corrected by hand:
#   Juneau Street   -> 1301 - 4488 Juneau Street
#   Soball Street   -> 38 - 1295 Soball Street
#   Seaham Crescent -> 11300 Seaham Crescent
#   Yale Street     -> 2558 Yale Street (row is removed instead, since the lot was bought)
ADDRESS_CORRECTIONS = {
    'Juneau Street': '1301 - 4488 Juneau Street',
    'Soball Street': '38 - 1295 Soball Street',
    'Seaham Crescent': '11300 Seaham Crescent',
}

# Overwrite each incomplete address (exact match only) with its full form.
for incomplete_addr, corrected_addr in ADDRESS_CORRECTIONS.items():
    data_df.loc[data_df['Address_Clean'] == incomplete_addr, 'Address_Clean'] = corrected_addr

# Drop the Yale Street listing. The explicit .copy() makes the filtered frame
# independent, so later column assignments on data_df do not raise
# SettingWithCopyWarning.
data_df = data_df[data_df['Address_Clean'] != 'Yale Street'].copy()

In [None]:
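# Development helper: uncomment to re-import the enrich module after editing
# it, without restarting the kernel.
# import importlib
# importlib.reload(enrich)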