In [1]:
import pandas as pd
import sqlite3
import src.data_enrichment as enrich
import src.helpers as helpers
import src.login_config as login_config

In [2]:
# Read in data
# Load every row of the `df_listings` table into a DataFrame. The connection is
# closed as soon as the read finishes (even if the query raises) so the database
# file is not held open for the lifetime of the kernel.
conn = sqlite3.connect("housing_information.db")
try:
    data_df = pd.read_sql_query('SELECT * FROM df_listings', conn)
finally:
    conn.close()

## Data enrichment

In [3]:
# Derive the enrichment columns from the raw listing fields. Every calculation
# is delegated to a helper in src.data_enrichment.
data_df['house_age'] = data_df['approx_age'].apply(enrich.age_of_house)

# Row-wise call: the helper receives both the list date and the end date.
data_df['difference_in_days'] = data_df.apply(
    lambda listing: enrich.calculate_days_to_sell(listing['list_date'], listing['end_date']),
    axis=1,
)

# The same helper is called once per output column, switching only its mode.
for target_column, mode in (
    ('price_difference_abs', 'difference'),
    ('price_difference_pct', 'percentage'),
):
    data_df[target_column] = data_df.apply(
        lambda listing, mode=mode: enrich.calculate_price_diff_or_pct(
            listing['list_price'], listing['sold_price'], mode
        ),
        axis=1,
    )

## Data Exploration

### Address 

Currently the address name is retrieved in the following format:
<br> - "Property Address: {name of address}, {city}, {province}"
<br> - Example: 'Property Address: Juneau Street, Burnaby, British Columbia'
<br>
<br> We only need the {name of address}. We will perform regex in order to retrieve it.

In [4]:
# Clean up "property_addr" field
# Each raw address is passed through enrich.regex_cleanup and the result is
# stored in a new column.
raw_addresses = data_df['property_addr']
data_df['Address_Clean'] = raw_addresses.apply(enrich.regex_cleanup)

The following addresses need to be updated. For some reason, the data scraper did not capture the full address for these listings, so each one has to be checked manually and replaced with the verified full address.

In [5]:
# A few rows need to be replaced with the manually verified full address:
#   Juneau Street   -> 1301 - 4488 Juneau Street
#   Soball Street   -> 38 - 1295 Soball Street
#   Seaham Crescent -> 11300 Seaham Crescent
#   Yale Street     -> 2558 Yale Street (dropped instead, since the lot was bought)
address_fixes = {
    'Juneau Street': '1301 - 4488 Juneau Street',
    'Soball Street': '38 - 1295 Soball Street',
    'Seaham Crescent': '11300 Seaham Crescent',
}

for partial_address, full_address in address_fixes.items():
    needs_fix = data_df['Address_Clean'] == partial_address
    data_df.loc[needs_fix, 'Address_Clean'] = full_address

# Drop the Yale Street listing entirely rather than correcting it.
data_df = data_df[data_df['Address_Clean'] != 'Yale Street']

### Home type

In [6]:
# Frequency of each raw house type as scraped.
data_df['house_type'].value_counts()

house_type
Apartment/Condominium      7383
Single Family Residence    4341
Townhouse                  2758
Multi Family                345
Manufactured Home            43
Manufactured On Land         20
Duplex                       17
Quadruplex                    4
Triplex                       3
Condo Apt                     1
House                         1
Name: count, dtype: int64

Notice above that house types are split into many categories. However, the ones with low value counts actually belong in one of the following broader categories:
<br> - Apartment/Condominium
<br> - House (Single Family Residence and the other house-like types)
<br> - Townhouse

In [7]:
# Collapse the raw house types into broader groups. Types not listed below
# keep their original label.
house_like_types = [
    'Quadruplex',
    'Manufactured On Land',
    'Triplex',
    'Duplex',
    'Manufactured Home',
    'Multi Family',
    'Single Family Residence',
]

# Start from the raw label, then overwrite only the rows that are regrouped.
data_df['Home_type'] = data_df['house_type']

is_house_like = data_df['house_type'].isin(house_like_types)
data_df.loc[is_house_like, 'Home_type'] = 'House'

is_condo_apt = data_df['house_type'] == 'Condo Apt'
data_df.loc[is_condo_apt, 'Home_type'] = 'Apartment/Condominium'

Let's verify that the changes have indeed been made

In [8]:
# Frequency of each grouped home type after the remapping.
data_df['Home_type'].value_counts()

Home_type
Apartment/Condominium    7384
House                    4774
Townhouse                2758
Name: count, dtype: int64

### Maintenance fee
Remove it, as almost every value is blank. We will revisit this later.
<br> For Apartment/Condominium, we could fetch this data from condos.ca. There may be a better way to retrieve the maintenance fee. In the meantime, let's not include this column in the master copy.

In [9]:
# Frequency of each non-null maintenance fee value.
data_df['maint_fee'].value_counts()

maint_fee
$843    1
Name: count, dtype: int64

### Levels

In [10]:
# Frequency of each value in the "levels" field.
data_df['levels'].value_counts()

levels
Residential           14891
Residential Income       24
Land                      3
Residential Lease         1
Apartment                 1
Detached                  1
Name: count, dtype: int64

Based on the distribution of the field "levels", this variable does not seem very useful, as the vast majority of the listings are classified as "Residential". Therefore, we will not include this variable in the final dataframe.

### Garage 

In [11]:
# Frequency of each garage value before cleanup.
data_df['garage'].value_counts()

garage
Yes          12830
No            2089
Undergrnd        1
Name: count, dtype: int64

Notice that one row is listed as "Undergrnd". We will update it from "Undergrnd" to "Yes".

In [12]:
# Fold the "Undergrnd" entry into the "Yes" category.
underground_rows = data_df['garage'].eq('Undergrnd')
data_df.loc[underground_rows, 'garage'] = 'Yes'

Verification

In [13]:
# Frequency of each garage value after cleanup.
data_df['garage'].value_counts()

garage
Yes    12831
No      2089
Name: count, dtype: int64

### House age
There are 6 rows (listings) where house age is not calculated. This is because no built age was provided.

In [14]:
# Drop listings with no house_age and renumber the index from zero.
# The two counts are reported so the number of removed rows can be checked
# (rows before - rows after = 6 rows).
rows_before = len(data_df)
data_df = data_df.dropna(subset=['house_age']).reset_index(drop=True)
rows_after = len(data_df)

print('The total # of rows before dropping nas is:', rows_before)
print('The total # of rows after dropping nas is:', rows_after)

The total # of rows before dropping nas is: 14921
The total # of rows after dropping nas is: 14915


Create master dataframe

In [15]:
# Columns carried over from the working frame into the master frame.
columns_to_keep = [
    'mls_number', 'Address_Clean', 'City', 'Home_type', 'approx_age', 'house_age',
    'garage', 'garage_size', 'taxes', 'avg_price_sqft', 'bedroom', 'bathroom', 'list_date',
    'list_price', 'end_date', 'sold_price', 'difference_in_days', 'price_difference_abs',
    'price_difference_pct',
]

# Build the master frame in one chain: select, give three columns their final
# names, then lower-case every column name.
df_master = (
    data_df[columns_to_keep]
    .rename(columns={
        'Address_Clean': 'address',
        'approx_age': 'yr_built',
        'house_age': 'home_age',
    })
    .rename(columns=str.lower)
)

## Features

In [68]:
import googlemaps

# Google Maps client authenticated with the key held in login_config.
gmaps = googlemaps.Client(key=login_config.GEOCODE_KEY)

# One enrich.geocode_json call per listing, using its address and city.
geocode_results = df_master.apply(
    lambda listing: enrich.geocode_json(listing['address'], listing['city'], gmaps),
    axis=1,
)

# The result is stored as its string representation.
# NOTE(review): presumably so the column can be written to SQLite - confirm.
df_master['geocode_json'] = geocode_results.astype(str)

In [69]:
# #Store this in a database, # saving progress

# # conn = sqlite3.connect("housing_information.db")

# ##push the dataframe to sql 
# df_master.to_sql("df_master", conn, if_exists="replace", index = False)

# ##create the table

# conn.execute(
#     """
#     create table my_table_master as 
#     select * from df_master
#     """)

In [78]:
# Read data back in

In [119]:
import ast


def _parse_geocode(value):
    """Turn a stringified geocode result back into Python objects.

    The geocode column is cast to ``str`` when it is first created, so on a
    fresh run it holds text rather than nested lists/dicts. Values that are
    already parsed (for example when this cell is re-run) are returned
    unchanged, which makes the cell safe to execute more than once. Strings
    that are not valid Python literals are also returned unchanged.
    """
    if not isinstance(value, str):
        return value
    try:
        return ast.literal_eval(value)
    except (ValueError, SyntaxError):
        return value


# Restore the nested structure, then pull the postal code out of each result.
df_master['geocode_json'] = df_master['geocode_json'].apply(_parse_geocode)
df_master['postal_code'] = df_master['geocode_json'].apply(enrich.extract_geocode_postal)

In [120]:
# Show the extracted postal codes.
df_master.loc[:, 'postal_code']

0        V5C 0M4
1        V5E 3G6
2        V3J 7E3
3        V5B 3V5
4        V5C 0J2
          ...   
14910    V6B 0H2
14911      98663
14912       None
14913        V5T
14914        V5R
Name: postal_code, Length: 14915, dtype: object

In [131]:
# Author's note: row 14911 (df_master['geocode_json'][14911]) was looked up in
# WA instead of Canada.
# Show the address of row 14912.
df_master.loc[14912, 'address']

'8469 French'

In [110]:
# Find the postal code in the first geocode match of the first listing.
# postal_code is the long_name of the first component tagged 'postal_code',
# or None when no component carries that tag.
first_match_components = df_master['geocode_json'][0][0]['address_components']
postal_code = next(
    (part['long_name'] for part in first_match_components if 'postal_code' in part['types']),
    None,
)
print("Postal Code:", postal_code)

Postal Code: V5C 0M4


In [99]:
# Show the geocode result stored for the first listing.
df_master.loc[0, 'geocode_json']

[{'address_components': [{'long_name': '1301',
    'short_name': '1301',
    'types': ['subpremise']},
   {'long_name': '4488', 'short_name': '4488', 'types': ['street_number']},
   {'long_name': 'Juneau Street',
    'short_name': 'Juneau St',
    'types': ['route']},
   {'long_name': 'Coquitlam',
    'short_name': 'Coquitlam',
    'types': ['locality', 'political']},
   {'long_name': 'Metro Vancouver',
    'short_name': 'Metro Vancouver',
    'types': ['administrative_area_level_2', 'political']},
   {'long_name': 'British Columbia',
    'short_name': 'BC',
    'types': ['administrative_area_level_1', 'political']},
   {'long_name': 'Canada',
    'short_name': 'CA',
    'types': ['country', 'political']},
   {'long_name': 'V5C 0M4',
    'short_name': 'V5C 0M4',
    'types': ['postal_code']}],
  'formatted_address': '4488 Juneau St #1301, Coquitlam, BC V5C 0M4, Canada',
  'geometry': {'location': {'lat': 49.26337940000001, 'lng': -123.0037234},
   'location_type': 'ROOFTOP',
   'viewpo

<sqlite3.Cursor at 0x2830ad79640>

In [None]:
# geocode_result = gmaps.geocode(df_master.address[0])

In [118]:
# Development helper: re-import src.data_enrichment (bound to `enrich`) so
# edits made to the module on disk are picked up without restarting the kernel.
import importlib
importlib.reload(enrich)

<module 'src.data_enrichment' from 'C:\\Users\\Alan\\Projects\\Housing Project\\src\\data_enrichment.py'>