# Properties Data Preprocess

Preprocess the properties data previously scrapped to raw



## Import Packages


In [1]:
import pandas as pd
import json
import re
import numpy as np

## Read in the data


In [3]:
# Read in JSON file
with open('../../data/landing/domain_properties.json', 'r') as json_file:
    data = json.load(json_file)

# Convert the JSON data to a DataFrame
df = pd.DataFrame.from_dict(data, orient='index')

# Display the first few rows of the DataFrame for a preview
df.columns

Index(['address_line1', 'address_line2', 'suburb', 'state', 'postcode',
       'price', 'area', 'type', 'description', 'latitude', 'longitude', 'bed',
       'bath', 'parking', 'bond'],
      dtype='object')

## Standardized column name

In [13]:
# Reset the index and rename the columns as per the provided instructions
df.reset_index(inplace=True)
df.columns = ['properties_URL'] + list(df.columns[1:])

# Display the first few rows of the DataFrame for a preview
df.head()

Unnamed: 0,properties_URL,address_line1,address_line2,suburb,state,postcode,price,area,type,description,latitude,longitude,bed,bath,parking,bond
0,https://www.domain.com.au/12-mckenzie-road-ech...,"12 McKenzie Road,",ECHUCA VIC 3564,ECHUCA,VIC,3564,Contact Agent,,,,,,,,,
1,https://www.domain.com.au/667-glenhuntly-road-...,"667 Glenhuntly Road,",CAULFIELD VIC 3162,CAULFIELD,VIC,3162,"$38,000 p.a. Incl. Outgoings + GST",154m²,House,GLAMOUR ON GLENHUNTLYCan you hear it? Opportun...,-37.8860233,145.0173065,,,,
2,https://www.domain.com.au/upstairs-2c-staley-s...,"Upstairs 2C Staley Street,",BRUNSWICK VIC 3056,BRUNSWICK,VIC,3056,"$35,000 Annually",337m²,House,Be Creative1st floor offices/studiosEasy walki...,-37.7655919,144.9633048,−,−,2.0,
3,https://www.domain.com.au/ground-floor-325-vic...,"Ground Floor 325 Victoria Street,",BRUNSWICK VIC 3056,BRUNSWICK,VIC,3056,Contact Agent,136m²,House,Prime Commercial SpaceNestled in a dynamic Bru...,-37.7663981,144.9594197,−,1,1.0,
4,https://www.domain.com.au/8-chamberlain-road-r...,"8 Chamberlain Road,",REDCASTLE VIC 3523,REDCASTLE,VIC,3523,"$28,000 Per Year !!",,Vacant land,"8 Chamberlin Road, RedcastleCentury 21 Paramou...",-36.7247292,144.7609424,,,,


**Check Nan Counts**

In [14]:
# Check NaN counts for each column in the DataFrame
nan_counts = df.isnull().sum()
nan_counts

properties_URL       0
address_line1      135
address_line2        0
suburb               0
state                0
postcode             0
price                0
area              8911
type                10
description         10
latitude            10
longitude           10
bed                 41
bath                41
parking             41
bond               804
dtype: int64

**Drop 'area'**

In [15]:
# Drop 'price' and 'bond' columns from the DataFrame
df.drop(columns=['area'], inplace=True)

# Display the first few rows of the DataFrame for verification
df.head()

Unnamed: 0,properties_URL,address_line1,address_line2,suburb,state,postcode,price,type,description,latitude,longitude,bed,bath,parking,bond
0,https://www.domain.com.au/12-mckenzie-road-ech...,"12 McKenzie Road,",ECHUCA VIC 3564,ECHUCA,VIC,3564,Contact Agent,,,,,,,,
1,https://www.domain.com.au/667-glenhuntly-road-...,"667 Glenhuntly Road,",CAULFIELD VIC 3162,CAULFIELD,VIC,3162,"$38,000 p.a. Incl. Outgoings + GST",House,GLAMOUR ON GLENHUNTLYCan you hear it? Opportun...,-37.8860233,145.0173065,,,,
2,https://www.domain.com.au/upstairs-2c-staley-s...,"Upstairs 2C Staley Street,",BRUNSWICK VIC 3056,BRUNSWICK,VIC,3056,"$35,000 Annually",House,Be Creative1st floor offices/studiosEasy walki...,-37.7655919,144.9633048,−,−,2.0,
3,https://www.domain.com.au/ground-floor-325-vic...,"Ground Floor 325 Victoria Street,",BRUNSWICK VIC 3056,BRUNSWICK,VIC,3056,Contact Agent,House,Prime Commercial SpaceNestled in a dynamic Bru...,-37.7663981,144.9594197,−,1,1.0,
4,https://www.domain.com.au/8-chamberlain-road-r...,"8 Chamberlain Road,",REDCASTLE VIC 3523,REDCASTLE,VIC,3523,"$28,000 Per Year !!",Vacant land,"8 Chamberlin Road, RedcastleCentury 21 Paramou...",-36.7247292,144.7609424,,,,


In [16]:
# Drop rows with NaN values in the 'address' column
df_cleaned = df.dropna(subset=['address_line1'])
df_cleaned.head()

Unnamed: 0,properties_URL,address_line1,address_line2,suburb,state,postcode,price,type,description,latitude,longitude,bed,bath,parking,bond
0,https://www.domain.com.au/12-mckenzie-road-ech...,"12 McKenzie Road,",ECHUCA VIC 3564,ECHUCA,VIC,3564,Contact Agent,,,,,,,,
1,https://www.domain.com.au/667-glenhuntly-road-...,"667 Glenhuntly Road,",CAULFIELD VIC 3162,CAULFIELD,VIC,3162,"$38,000 p.a. Incl. Outgoings + GST",House,GLAMOUR ON GLENHUNTLYCan you hear it? Opportun...,-37.8860233,145.0173065,,,,
2,https://www.domain.com.au/upstairs-2c-staley-s...,"Upstairs 2C Staley Street,",BRUNSWICK VIC 3056,BRUNSWICK,VIC,3056,"$35,000 Annually",House,Be Creative1st floor offices/studiosEasy walki...,-37.7655919,144.9633048,−,−,2.0,
3,https://www.domain.com.au/ground-floor-325-vic...,"Ground Floor 325 Victoria Street,",BRUNSWICK VIC 3056,BRUNSWICK,VIC,3056,Contact Agent,House,Prime Commercial SpaceNestled in a dynamic Bru...,-37.7663981,144.9594197,−,1,1.0,
4,https://www.domain.com.au/8-chamberlain-road-r...,"8 Chamberlain Road,",REDCASTLE VIC 3523,REDCASTLE,VIC,3523,"$28,000 Per Year !!",Vacant land,"8 Chamberlin Road, RedcastleCentury 21 Paramou...",-36.7247292,144.7609424,,,,


## Cast data types

In [17]:
# Convert the 'bed', 'bath', 'parking' column to string temporarily to identify non-numeric entries
df_cleaned['bed_temp'] = df_cleaned['bed'].astype(str)
df_cleaned['bath_temp'] = df_cleaned['bath'].astype(str)
df_cleaned['parking_temp'] = df_cleaned['parking'].astype(str)

# Convert 'bed,'bath' and 'parking' to float.
df_cleaned['bed'] = df_cleaned['bed_temp'].replace('− ', np.nan).str.strip().astype(float)
df_cleaned.drop('bed_temp', axis=1, inplace=True)

df_cleaned['bath'] = df_cleaned['bath_temp'].replace('− ', np.nan).str.strip().astype(float)
df_cleaned.drop('bath_temp', axis=1, inplace=True)

df_cleaned['parking'] = df_cleaned['parking_temp'].replace('− ', np.nan).str.strip().astype(float)
df_cleaned.drop('parking_temp', axis=1, inplace=True)


# Convert other columns to the desired data types
df_cleaned['properties_URL'] = df_cleaned['properties_URL'].astype(str)
df_cleaned['type'] = df_cleaned['type'].astype(str)
df_cleaned['description'] = df_cleaned['description'].astype(str)
df_cleaned['address_line1'] = df_cleaned['address_line1'].astype(str)
df_cleaned['address_line2'] = df_cleaned['address_line2'].astype(str)
df_cleaned['suburb'] = df_cleaned['suburb'].astype(str)
df_cleaned['state'] = df_cleaned['state'].astype(str)
df_cleaned['postcode'] = df_cleaned['postcode'].astype(str)
df_cleaned['price'] = df_cleaned['price'].astype(str)
df_cleaned['bond'] = df_cleaned['bond'].astype(str)

# Display the data types of the DataFrame columns to verify the changes
df_cleaned.dtypes

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['bed_temp'] = df_cleaned['bed'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['bath_temp'] = df_cleaned['bath'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['parking_temp'] = df_cleaned['parking'].astype(str)
A value is trying to be set on

properties_URL     object
address_line1      object
address_line2      object
suburb             object
state              object
postcode           object
price              object
type               object
description        object
latitude           object
longitude          object
bed               float64
bath              float64
parking           float64
bond               object
dtype: object

In [18]:
# Check that the object data type is string, change the column selected for different column
all_strings = df_cleaned['price'].apply(lambda x: isinstance(x, str)).all()
all_strings

True

In [19]:
# Change column order
desired_order = [
    'properties_URL', 'address_line1', 'address_line2', 'suburb', 'postcode', 'type', 
    'price', 'bond', 'description', 'bed', 'bath', 'parking', 'latitude', 'longitude'
]
df_cleaned = df_cleaned[desired_order]

# Display the first few rows of the DataFrame to verify the new column order
df_cleaned.head()

Unnamed: 0,properties_URL,address_line1,address_line2,suburb,postcode,type,price,bond,description,bed,bath,parking,latitude,longitude
0,https://www.domain.com.au/12-mckenzie-road-ech...,"12 McKenzie Road,",ECHUCA VIC 3564,ECHUCA,3564,,Contact Agent,,,,,,,
1,https://www.domain.com.au/667-glenhuntly-road-...,"667 Glenhuntly Road,",CAULFIELD VIC 3162,CAULFIELD,3162,House,"$38,000 p.a. Incl. Outgoings + GST",,GLAMOUR ON GLENHUNTLYCan you hear it? Opportun...,,,,-37.8860233,145.0173065
2,https://www.domain.com.au/upstairs-2c-staley-s...,"Upstairs 2C Staley Street,",BRUNSWICK VIC 3056,BRUNSWICK,3056,House,"$35,000 Annually",,Be Creative1st floor offices/studiosEasy walki...,,,2.0,-37.7655919,144.9633048
3,https://www.domain.com.au/ground-floor-325-vic...,"Ground Floor 325 Victoria Street,",BRUNSWICK VIC 3056,BRUNSWICK,3056,House,Contact Agent,,Prime Commercial SpaceNestled in a dynamic Bru...,,1.0,1.0,-37.7663981,144.9594197
4,https://www.domain.com.au/8-chamberlain-road-r...,"8 Chamberlain Road,",REDCASTLE VIC 3523,REDCASTLE,3523,Vacant land,"$28,000 Per Year !!",,"8 Chamberlin Road, RedcastleCentury 21 Paramou...",,,,-36.7247292,144.7609424


In [20]:
# Save the raw data
df_cleaned.to_csv('../../data/raw/raw_domain_properties.csv', index = False)