In [1]:
import pandas as pd
import sqlite3
import ast

In [2]:
conn = sqlite3.connect("housing_info.db")
df_master = pd.read_sql_query('SELECT * FROM df_master', conn)

full_map_coords = pd.read_csv('data//full_map_coordinates.csv', index_col = [0])

In [3]:
df_master['geocode_json'] = df_master['geocode_json'].apply(ast.literal_eval)

In [4]:
df_master['geocode_json'][0]

[{'address_components': [{'long_name': '1301',
    'short_name': '1301',
    'types': ['subpremise']},
   {'long_name': '4488', 'short_name': '4488', 'types': ['street_number']},
   {'long_name': 'Juneau Street',
    'short_name': 'Juneau St',
    'types': ['route']},
   {'long_name': 'Coquitlam',
    'short_name': 'Coquitlam',
    'types': ['locality', 'political']},
   {'long_name': 'Metro Vancouver',
    'short_name': 'Metro Vancouver',
    'types': ['administrative_area_level_2', 'political']},
   {'long_name': 'British Columbia',
    'short_name': 'BC',
    'types': ['administrative_area_level_1', 'political']},
   {'long_name': 'Canada',
    'short_name': 'CA',
    'types': ['country', 'political']},
   {'long_name': 'V5C 0M4',
    'short_name': 'V5C 0M4',
    'types': ['postal_code']}],
  'formatted_address': '4488 Juneau St #1301, Coquitlam, BC V5C 0M4, Canada',
  'geometry': {'location': {'lat': 49.26337940000001, 'lng': -123.0037234},
   'location_type': 'ROOFTOP',
   'viewpo

In [5]:
df_master['geocode_json'][0][0]['geometry']['location']

{'lat': 49.26337940000001, 'lng': -123.0037234}

In [6]:
def extract_lat_lon(row):
    address_data = row['geocode_json'][0]['geometry']
    if isinstance(address_data, dict):
        location = address_data.get('location', {})
        lat = location.get('lat')
        lng = location.get('lng')
        return lat, lng
    else:
        return None, None

# Apply the extract_lat_lon function and create new columns
df_master['latitude'], df_master['longitude'] = zip(*df_master.apply(extract_lat_lon, axis=1))


In [7]:
from shapely.geometry import Point
from shapely.wkt import loads

def create_location_pt(row):
    return Point(row['longitude'],row['latitude'])

df_master['location_pt'] = df_master.apply(create_location_pt, axis = 1)
full_map_coords['geometry'] = full_map_coords['geometry'].apply(loads)

def point_in_polygons(point, polygons, names):
    nearest_polygon = None
    min_distance = float('inf')

    for polygon, name in zip(polygons, names):
        if point.within(polygon):
            return name
        # Calculate the distance from the point to the polygon's boundary
        distance = point.distance(polygon.boundary)
        # Update nearest polygon if distance is smaller
        if distance < min_distance:
            nearest_polygon = name
            min_distance = distance
    
    # Return the name of the nearest polygon if one exists
    if nearest_polygon is not None:
        return nearest_polygon
    else:
        return None  # No polygon found within approximation

df_master['Polygon_Name'] = df_master.apply(lambda row: point_in_polygons(row['location_pt'], full_map_coords['geometry'], full_map_coords['name']), axis=1)

  return lib.distance(a, b, **kwargs)


In [8]:
df_master['Polygon_Name'].isna().sum() #681

0

In [9]:
df_master

Unnamed: 0,mls_number,address,city,home_type,yr_built,home_age,garage,garage_size,taxes,avg_price_sqft,...,difference_in_days,price_difference_abs,price_difference_pct,geocode_json,postal_code,index_col,latitude,longitude,location_pt,Polygon_Name
0,R2834693,1301 - 4488 Juneau Street,burnaby,Apartment/Condominium,2021.0,3.0,Yes,1.0,"$2,944","$1,157",...,8.0,-25000.0,-2.512563,"[{'address_components': [{'long_name': '1301',...",V5C 0M4,0,49.263379,-123.003723,POINT (-123.0037234 49.26337940000001),West Central
1,R2821009,105 - 7180 Linden Avenue,burnaby,Apartment/Condominium,1973.0,51.0,Yes,1.0,"$1,044",$600,...,164.0,-5000.0,-1.136364,"[{'address_components': [{'long_name': '105', ...",V5E 3G6,1,49.220089,-122.950739,POINT (-122.9507389 49.2200892),Richmond Park
2,R2773433,703 - 3737 Bartlett Court,burnaby,Apartment/Condominium,1975.0,49.0,Yes,1.0,"$1,404",$662,...,92.0,-10000.0,-1.336898,"[{'address_components': [{'long_name': '703', ...",V3J 7E3,2,49.250485,-122.900122,POINT (-122.9001222 49.2504851),Cameron
3,R2820239,1035 Holdom Avenue,burnaby,House,1969.0,55.0,Yes,5.0,"$5,066",$756,...,4.0,-50000.0,-2.702703,"[{'address_components': [{'long_name': '1035',...",V5B 3V5,3,49.274787,-122.981492,POINT (-122.9814919 49.2747869),Parkcrest-Aubrey
4,R2837353,4304 - 4485 Skyline Drive,burnaby,Apartment/Condominium,2017.0,7.0,Yes,2.0,"$2,696","$1,154",...,85.0,-35572.0,-3.690807,"[{'address_components': [{'long_name': '4304',...",V5C 0J2,4,49.265688,-123.003635,POINT (-123.0036354 49.2656881),West Central
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14910,R2592520,710 - 535 Smithe Street,vancouver,Apartment/Condominium,2010.0,14.0,No,0.0,"$1,635","$1,165",...,12.0,-4000.0,-0.667780,"[{'address_components': [{'long_name': '710', ...",V6B 0H2,14910,49.279445,-123.119668,POINT (-123.119668 49.27944549999999),Downtown
14911,R2592341,365 E 16,vancouver,Townhouse,2018.0,6.0,Yes,0.0,"$2,765",$905,...,12.0,-9000.0,-0.900901,"[{'address_components': [{'long_name': '365', ...",V5T 2T7,14911,49.256891,-123.097177,POINT (-123.0971773 49.25689149999999),Mount Pleasant
14912,R2550233,8469 French,vancouver,House,2004.0,20.0,Yes,0.0,"$5,405","$1,098",...,106.0,-59900.0,-3.546897,"[{'address_components': [{'long_name': '8469',...",V6P 4W3,14912,49.209577,-123.139504,POINT (-123.1395039 49.20957689999999),Marpole
14913,R2594052,528 E 2nd,vancouver,Townhouse,2020.0,4.0,Yes,0.0,,$797,...,1.0,0.0,0.000000,"[{'address_components': [{'long_name': '528', ...",V5T,14913,49.266163,-123.095995,POINT (-123.0959953 49.2661632),Mount Pleasant


In [10]:
df_master['home_type'].value_counts()

home_type
Apartment/Condominium    7382
House                    4773
Townhouse                2758
Name: count, dtype: int64

Visualize the location of the houses based on lat and lon

In [20]:
from bokeh.io import show
from bokeh.plotting import figure
from bokeh.tile_providers import get_provider, Vendors
from bokeh.models import ColumnDataSource
import pandas as pd

In [39]:
def latlon_to_mercator(lat, lon):
    import numpy as np
    k = 6378137
    x = lon * (k * np.pi / 180.0)
    y = np.log(np.tan((90 + lat) * np.pi / 360.0)) * k
    return x, y

# Define coord as tuple (lat,long)
df_master['coords'] = list(zip(df_master['latitude'], df_master['longitude']))
# Obtain list of mercator coordinates
mercators = [latlon_to_mercator(x, y) for x, y in df_master['coords']]

# Create mercator column in our df
df_master['mercator'] = mercators # Split that column out into two separate columns - mercator_x and mercator_y
df_master[['mercator_x', 'mercator_y']] = df_master['mercator'].apply(pd.Series)

In [53]:
abc = df_master[['mercator_x','mercator_y']]

In [57]:
from bokeh.io import output_notebook, show
# Select tile set to use
tile_provider  = get_provider(Vendors.CARTODBPOSITRON)

# Tell Bokeh to use df as the source of the data
source = ColumnDataSource(data=abc)
p = figure(title = 'GVA House Listings', x_axis_type="mercator", y_axis_type="mercator", x_axis_label = 'longitude', y_axis_label = 'latitude')
p.add_tile(tile_provider)

# Add points using mercator coordinates
p.circle(x = 'mercator_x', y = 'mercator_y', source=source, size=10, fill_alpha = 0.7)

output_notebook()
show(p)

In [59]:
df_master.columns

Index(['mls_number', 'address', 'city', 'home_type', 'yr_built', 'home_age',
       'garage', 'garage_size', 'taxes', 'avg_price_sqft', 'bedroom',
       'bathroom', 'list_date', 'list_price', 'end_date', 'sold_price',
       'difference_in_days', 'price_difference_abs', 'price_difference_pct',
       'geocode_json', 'postal_code', 'index_col', 'latitude', 'longitude',
       'location_pt', 'Polygon_Name', 'coords', 'mercator', 'mercator_x',
       'mercator_y'],
      dtype='object')

In [62]:
import seaborn as sns
import matplotlib.pyplot as plt

In [73]:
cols_to_excl = ['mls_number', 'address', 'geocode_json', 'postal_code', 'index_col', 
                'latitude', 'longitude', 'location_pt', 'Polygon_Name', 'coords', 
                'mercator', 'mercator_x', 'mercator_y']
quant_variables = []
qual_variables = ['city','home_type','garage']

# exclude columns you don't want
df_corr = df_master[df_master.columns[~df_master.columns.isin(cols_to_excl + qual_variables)]]

In [74]:
df_corr.columns

Index(['yr_built', 'home_age', 'garage_size', 'taxes', 'avg_price_sqft',
       'bedroom', 'bathroom', 'list_date', 'list_price', 'end_date',
       'sold_price', 'difference_in_days', 'price_difference_abs',
       'price_difference_pct'],
      dtype='object')

In [76]:
df_corr.corr()

ValueError: could not convert string to float: '$2,944'