In [1]:
import pandas as pd
import sqlite3
import src.data_enrichment as enrich
import src.helpers as helpers
import src.login_config as login_config

In [2]:
# Read in data
conn = sqlite3.connect("housing_information.db")
data_df = pd.read_sql_query('SELECT * FROM df_listings', conn)

## Data enrichment

In [3]:
data_df['house_age'] = data_df['approx_age'].apply(enrich.age_of_house)
data_df['difference_in_days'] = data_df.apply(lambda row: enrich.calculate_days_to_sell(row['list_date'], row['end_date']), axis=1)
data_df['price_difference_abs'] = data_df.apply(lambda row: enrich.calculate_price_diff_or_pct(row['list_price'], row['sold_price'], 'difference'), axis=1)
data_df['price_difference_pct'] = data_df.apply(lambda row: enrich.calculate_price_diff_or_pct(row['list_price'], row['sold_price'], 'percentage'), axis=1)

## Data Exploration

### Address 

Currently the address name is retrieved in the following format:
<br> - "Property Address: {name of address}, {city}, {province}"
<br> - Example: 'Property Address: Juneau Street, Burnaby, British Columbia'
<br>
<br> We only need the {name of address}. We will perform regex in order to retrieve it.

In [4]:
# Clean up "property_addr" field
data_df['Address_Clean'] = data_df['property_addr'].apply(enrich.regex_cleanup)

The following addresses need to be updated. For some odd reason, the data scrape algorithm did not catch the full address of these names. We will need to manually check and verify the correct name of the address.

In [5]:
# few rows that needs to be replaced
# Juneau Street -> 1301 - 4488 Juneau Street
# Soball Street -> 38 - 1295 Soball Street
# Seaham Crescent -> 11300 Seaham Crescent
# Yale Street -> 2558 Yale Street [ But remove this since lot was bought]

data_df.loc[data_df['Address_Clean'] == 'Juneau Street','Address_Clean'] ='1301 - 4488 Juneau Street'
data_df.loc[data_df['Address_Clean'] == 'Soball Street','Address_Clean'] ='38 - 1295 Soball Street'
data_df.loc[data_df['Address_Clean'] == 'Seaham Crescent','Address_Clean'] ='11300 Seaham Crescent'
data_df = data_df[data_df.Address_Clean != 'Yale Street']

### Home type

In [6]:
data_df.house_type.value_counts()

house_type
Apartment/Condominium      7383
Single Family Residence    4341
Townhouse                  2758
Multi Family                345
Manufactured Home            43
Manufactured On Land         20
Duplex                       17
Quadruplex                    4
Triplex                       3
Condo Apt                     1
House                         1
Name: count, dtype: int64

Notice above that house type are categorized into multiple categories. However, the ones with low value counts actually below in one of the following categories:
<br> - Apartment/Condominium
<br> - Single Family Residence
<br> - Townhouse

In [7]:
data_df['Home_type'] = data_df['house_type'] 

data_df.loc[data_df['house_type'] == 'Quadruplex','Home_type'] ='House'
data_df.loc[data_df['house_type'] == 'Manufactured On Land','Home_type'] ='House'
data_df.loc[data_df['house_type'] == 'Condo Apt','Home_type'] ='Apartment/Condominium'
data_df.loc[data_df['house_type'] == 'Triplex','Home_type'] ='House'
data_df.loc[data_df['house_type'] == 'Duplex','Home_type'] ='House'
data_df.loc[data_df['house_type'] == 'Manufactured Home','Home_type'] ='House'
data_df.loc[data_df['house_type'] == 'Multi Family','Home_type'] ='House'
data_df.loc[data_df['house_type'] == 'Single Family Residence','Home_type'] ='House'

Let's verify that the changes have indeed been made

In [8]:
data_df.Home_type.value_counts()

Home_type
Apartment/Condominium    7384
House                    4774
Townhouse                2758
Name: count, dtype: int64

### Maintanience fee
Remove it as they are all blanks. We will revisit this later
<br> For Apartment/Condominum, we could fetch this data off of condos.ca. There may be a better way to retrieve maintenacne fee. In the meantime, let's not include this column in the master copy.

In [9]:
data_df.maint_fee.value_counts()

maint_fee
$843    1
Name: count, dtype: int64

### Levels

In [10]:
data_df.levels.value_counts()

levels
Residential           14891
Residential Income       24
Land                      3
Residential Lease         1
Apartment                 1
Detached                  1
Name: count, dtype: int64

Based on the distribution of the field "levels". It seems that this variable is not that useful as majority of the listings are classified as "Residential". Therefore we will not include this variable in the final dataframe to work with.

### Garage 

In [11]:
data_df.garage.value_counts()

garage
Yes          12830
No            2089
Undergrnd        1
Name: count, dtype: int64

Notice there is one row that lists as "Undergrnd". Will update from "Undergrnd" to "Yes".

In [12]:
data_df.loc[data_df['garage'] == 'Undergrnd','garage'] ='Yes'

Verification

In [13]:
data_df.garage.value_counts()

garage
Yes    12831
No      2089
Name: count, dtype: int64

### House age
There are 6 rows (listings) where house age is not calculated. This it because no built age was provided.

In [14]:
print('The total # of rows before dropping nas is:', len(data_df)) # Before dropping nas
data_df = data_df.dropna(axis = 0, subset = ['house_age']).reset_index(drop = True)
print('The total # of rows after dropping nas is:', len(data_df)) # After dropping nas --  Before dropping nas - After dropping nas = 6 rows

The total # of rows before dropping nas is: 14921
The total # of rows after dropping nas is: 14915


Create master dataframe

In [15]:
columns_to_keep = ['mls_number', 'Address_Clean', 'City', 'Home_type', 'approx_age', 'house_age', 
                   'garage','garage_size','taxes', 'avg_price_sqft', 'bedroom','bathroom','list_date',
                   'list_price','end_date','sold_price', 'difference_in_days','price_difference_abs', 
                   'price_difference_pct']

df_master = data_df[columns_to_keep].copy()

df_master.rename(columns = {'Address_Clean': 'address',
                            'approx_age': 'yr_built',
                            'house_age': 'home_age'}, inplace = True)
df_master.columns = map(str.lower, df_master.columns)

## Features

In [30]:
import googlemaps
gmaps = googlemaps.Client(key = login_config.GEOCODE_KEY)

In [68]:
df_master['geocode_json'] = df_master.apply(lambda row: enrich.geocode_json(row['address'], row['city'], gmaps), axis=1)
df_master['geocode_json'] = df_master['geocode_json'].astype(str)

In [69]:
# #Store this in a database, # saving progress

# # conn = sqlite3.connect("housing_information.db")

# ##push the dataframe to sql 
# df_master.to_sql("df_master", conn, if_exists="replace", index = False)

# ##create the table

# conn.execute(
#     """
#     create table my_table_master as 
#     select * from df_master
#     """)

In [8]:
# # Read data back in
# # Read in data
# conn = sqlite3.connect("housing_information.db")
# df_master = pd.read_sql_query('SELECT * FROM df_master', conn)

In [9]:
# import ast
df_master['geocode_json'] = df_master['geocode_json'].apply(ast.literal_eval)
df_master['postal_code'] = df_master['geocode_json'].apply(enrich.extract_geocode_postal)

In [None]:
df_master['index_col'] = range(0,len(df_master))

There are some rows where the postal code is not length 7. By definition, in Canada, postal code should be in the following format: 
<br> - A1A 1A1, where A is letter and 1 is a digit, with a space separating the third and fourth characters. Therefore the length of the postal code is 7.

In [32]:
filtered_df_equal_7  = df_master[df_master['postal_code'].str.len() == 7]
print(f"The number of rows where postal code == 7 is: {len(filtered_df_equal_7 )}")

filtered_df_not_equal_7 = df_master[df_master['postal_code'].str.len() != 7]
print(f"The number of rows where postal code != 7 is: {len(filtered_df_not_equal_7 )}")

The number of rows where postal code == 7 is: 13838
The number of rows where postal code != 7 is: 1077


There are a total of 1077 rows (listings) where the postal codes do not match the format of a canadian postal code. Let's explore.

Example #1:

In [36]:
filtered_df_not_equal_7['geocode_json'][38] # Doesn't seem like searching address with just street + city is enough.

[{'address_components': [{'long_name': 'Burnaby',
    'short_name': 'Burnaby',
    'types': ['locality', 'political']},
   {'long_name': 'Metro Vancouver',
    'short_name': 'Metro Vancouver',
    'types': ['administrative_area_level_2', 'political']},
   {'long_name': 'British Columbia',
    'short_name': 'BC',
    'types': ['administrative_area_level_1', 'political']},
   {'long_name': 'Canada',
    'short_name': 'CA',
    'types': ['country', 'political']}],
  'formatted_address': 'Burnaby, BC, Canada',
  'geometry': {'bounds': {'northeast': {'lat': 49.2993349, 'lng': -122.891689},
    'southwest': {'lat': 49.180637, 'lng': -123.0246499}},
   'location': {'lat': 49.2488091, 'lng': -122.9805104},
   'location_type': 'APPROXIMATE',
   'viewport': {'northeast': {'lat': 49.2993349, 'lng': -122.891689},
    'southwest': {'lat': 49.180637, 'lng': -123.0246499}}},
  'partial_match': True,
  'place_id': 'ChIJc4OcIah3hlQRabFSh3NyCt0',
  'types': ['locality', 'political']}]

In [47]:
# Example 2:
filtered_df_not_equal_7['geocode_json'][14911] # seems like there is a similar location in the USA. There is a city in USA called Vancouver

[{'address_components': [{'long_name': '365',
    'short_name': '365',
    'types': ['street_number']},
   {'long_name': 'East 16th Street',
    'short_name': 'E 16th St',
    'types': ['route']},
   {'long_name': 'Esther Short',
    'short_name': 'Esther Short',
    'types': ['neighborhood', 'political']},
   {'long_name': 'Vancouver',
    'short_name': 'Vancouver',
    'types': ['locality', 'political']},
   {'long_name': 'Clark County',
    'short_name': 'Clark County',
    'types': ['administrative_area_level_2', 'political']},
   {'long_name': 'Washington',
    'short_name': 'WA',
    'types': ['administrative_area_level_1', 'political']},
   {'long_name': 'United States',
    'short_name': 'US',
    'types': ['country', 'political']},
   {'long_name': '98663', 'short_name': '98663', 'types': ['postal_code']},
   {'long_name': '3410',
    'short_name': '3410',
    'types': ['postal_code_suffix']}],
  'formatted_address': '365 E 16th St, Vancouver, WA 98663, USA',
  'geometry': {'l

Let's re-run the addresses in filtered_df_not_equal_7, where postal codes did not have length == 7. Let's tweak our function enrich.geocode_json to include provincial and country during the search of the geocode json

In [53]:
filtered_df_not_equal_7['postal_code_correction'] = filtered_df_not_equal_7.apply(lambda row: enrich.geocode_json(row['address'], row['city'], gmaps), axis=1)
filtered_df_not_equal_7['postal_code_correction'] = filtered_df_not_equal_7['geocode_json_rerun'].apply(enrich.extract_geocode_postal)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df_not_equal_7['postal_code_correction'] = filtered_df_not_equal_7.apply(lambda row: enrich.geocode_json(row['address'], row['city'], gmaps), axis=1)


In [88]:
# rerun_filtered_df_equal_7  = filtered_df_not_equal_7[filtered_df_not_equal_7['postal_code_correction'].str.len() == 7]
# print(f"The number of rows where postal code == 7 is: {len(rerun_filtered_df_equal_7 )}")

# rerun_filtered_df_not_equal_7 = filtered_df_not_equal_7[filtered_df_not_equal_7['postal_code_correction'].str.len() != 7]
# print(f"The number of rows where postal code != 7 is: {len(rerun_filtered_df_not_equal_7 )}")

In [89]:
# rerun_filtered_df_not_equal_3 = rerun_filtered_df_not_equal_7[rerun_filtered_df_not_equal_7['postal_code_correction'].str.len() != 3]
# print(f"The number of rows where postal code != 3 is: {len(rerun_filtered_df_not_equal_3 )}")

In [112]:
df1 = filtered_df_equal_7
cols_to_add = ['mls_number', 'address', 'city', 'home_type', 'yr_built', 'home_age',
       'garage', 'garage_size', 'taxes', 'avg_price_sqft', 'bedroom',
       'bathroom', 'list_date', 'list_price', 'end_date', 'sold_price',
       'difference_in_days', 'price_difference_abs', 'price_difference_pct',
       'geocode_json_rerun', 'postal_code_correction', 'index_col']
df2 = filtered_df_not_equal_7[cols_to_add]
df2 = df2.rename(columns = {'geocode_json_rerun': 'geocode_json',
                  'postal_code_correction': 'postal_code'})
# geocode_json_rerun, postal_code_correction
remaster_df = pd.concat([df1, df2])
remaster_df = remaster_df.sort_values(by = ['index_col'], ascending = True)

In [116]:
remaster_df['geocode_json'] = remaster_df['geocode_json'].astype(str)

In [118]:
#Store this in a database, # saving progress

conn = sqlite3.connect("housing_information.db")

##push the dataframe to sql 
remaster_df.to_sql("remaster_df", conn, if_exists="replace", index = False)

##create the table

conn.execute(
    """
    create table my_table_remaster as 
    select * from remaster_df
    """)

<sqlite3.Cursor at 0x1ba9ee20240>

In [119]:
conn = sqlite3.connect("housing_information.db")
df_master = pd.read_sql_query('SELECT * FROM remaster_df', conn)

In [86]:
import geopy
geo_locator = geopy.Nominatim(user_agent='1234')
                        # Latitude, Longitude
r = geo_locator.reverse((49.28376261, -122.7932065))
# print(r.raw['address']['postcode'])

In [87]:
r

Location(Caffe Divano, 3003, Burlington Drive, Hoy Creek Housing Coop, River Springs, Coquitlam, Metro Vancouver Regional District, British Columbia, V3B 6X1, Canada, (49.2839076, -122.793435, 0.0))

In [None]:
# geocode_result = gmaps.geocode(df_master.address[0])

In [52]:
import importlib
importlib.reload(enrich)

<module 'src.data_enrichment' from 'C:\\Users\\Alan\\Projects\\Housing Project\\src\\data_enrichment.py'>