In [1]:
import pandas as pd
import sqlite3
import src.data_enrichment as enrich
import src.helpers as helpers

In [2]:
# Read in data: load every row of the `df_listings` table from the local SQLite
# file into a DataFrame.
# The connection is only needed for this one query, so it is closed explicitly
# once the read finishes (or fails). Note that `with sqlite3.connect(...)` on its
# own would only manage the transaction; it does not close the connection.
conn = sqlite3.connect("housing_information.db")
try:
    data_df = pd.read_sql_query('SELECT * FROM df_listings', conn)
finally:
    conn.close()

In [3]:
# Derive analysis columns from the raw listing fields. All parsing/arithmetic is
# delegated to the project's `src.data_enrichment` helpers (imported as `enrich`).

def _days_on_market(listing):
    """Hand one listing's list/end dates to enrich.calculate_days_to_sell."""
    return enrich.calculate_days_to_sell(listing['list_date'], listing['end_date'])


def _price_change(listing, mode):
    """Hand one listing's list/sold prices to enrich.calculate_price_diff_or_pct.

    `mode` is forwarded unchanged as the helper's third argument
    ('difference' or 'percentage' in this notebook).
    """
    return enrich.calculate_price_diff_or_pct(listing['list_price'], listing['sold_price'], mode)


data_df['house_age'] = data_df['approx_age'].apply(enrich.age_of_house)
data_df['difference_in_days'] = data_df.apply(_days_on_market, axis=1)
data_df['price_difference_abs'] = data_df.apply(_price_change, axis=1, args=('difference',))
data_df['price_difference_pct'] = data_df.apply(_price_change, axis=1, args=('percentage',))

## Data Exploration

In [4]:
# Build "Address_Clean" by running every raw "property_addr" value through
# enrich.regex_cleanup (the cleanup rules themselves live in src.data_enrichment).
raw_addresses = data_df['property_addr']
data_df['Address_Clean'] = raw_addresses.apply(enrich.regex_cleanup)

In [5]:
# A few cleaned addresses are left with only a street name and need to be
# replaced by hand:
#   Juneau Street   -> 1301 - 4488 Juneau Street
#   Soball Street   -> 38 - 1295 Soball Street
#   Seaham Crescent -> 11300 Seaham Crescent
#   Yale Street     -> would be 2558 Yale Street, but the row is dropped
#                      instead because the lot was bought.
address_fixes = {
    'Juneau Street': '1301 - 4488 Juneau Street',
    'Soball Street': '38 - 1295 Soball Street',
    'Seaham Crescent': '11300 Seaham Crescent',
}
for street_only, full_address in address_fixes.items():
    needs_fix = data_df['Address_Clean'] == street_only
    data_df.loc[needs_fix, 'Address_Clean'] = full_address

# Drop the Yale Street listing(s) entirely.
not_yale_street = data_df['Address_Clean'] != 'Yale Street'
data_df = data_df[not_yale_street]

In [8]:
# How many listings carry each house_type label (most frequent first).
data_df['house_type'].value_counts()

house_type
Apartment/Condominium      7379
Single Family Residence    4326
Townhouse                  2751
Multi Family                344
Manufactured Home            43
Manufactured On Land         20
Duplex                       17
Detached                     10
Condo Apt                     6
Quadruplex                    4
Triplex                       3
Comm Element Condo            1
Semi-Detached                 1
Rural Resid                   1
Det Condo                     1
Condo Townhouse               1
Att/Row/Twnhouse              1
House                         1
Name: count, dtype: int64

In [29]:
# Notes on the rare house_type labels. The inspection lines are kept commented
# out; the relabelling they describe is not applied in the cells shown here.
#   data_df[data_df.house_type == 'House']                 # TODO: relabel House -> Single Family Residence
#   data_df[data_df.house_type == 'Quadruplex']            # TODO: relabel Quadruplex -> Single Family Residence
#   data_df[data_df.house_type == 'Manufactured On Land']  # TODO: relabel -> Single Family Residence
#                                                          #       (original note said "from House" -- confirm)
#   data_df[data_df.house_type == 'Det Condo']             # TODO: relabel Det Condo -> House
#   data_df[data_df.house_type == 'Comm Element Condo']    # TODO: relabel -> Single Family Residence
#                                                          #       (original note said "from House" -- confirm)
#
# FIXME: something is wrong with the Richmond listings. The single
# 'Att/Row/Twnhouse' row shown below is tagged City == 'richmond' -- is the
# address actually in Kitchener?
data_df.loc[data_df['house_type'] == 'Att/Row/Twnhouse']

Unnamed: 0,house_type,size,maint_fee,approx_age,mls_number,levels,garage,garage_size,taxes,avg_price_sqft,...,list_date,list_price,end_date,sold_price,City,house_age,difference_in_days,price_difference_abs,price_difference_pct,Address_Clean
5566,Att/Row/Twnhouse,1500-2000 sqft,,,X8144620,2-Storey,Built-In,2.0,"$4,840",$457,...,15/03/2024,"$799,900",24/03/2024,"$880,000",richmond,,9.0,80100.0,10.013752,120 Hollybrook Tr


In [16]:
# Number of listings per city (most frequent first).
data_df['City'].value_counts()

City
vancouver    5232
surrey       3385
richmond     2198
burnaby      1728
coquitlam    1465
delta         907
Name: count, dtype: int64

In [None]:
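# Development helper (disabled): re-import src.data_enrichment after editing it,
# so the changes are picked up without restarting the kernel.
# import importlib
# importlib.reload(enrich)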