In [1]:
# Import the necessary libraries
import pandas as pd

In [2]:
# Loading datasets
domain_properties_data = pd.read_csv("data/domain_properties.csv")
sydney_suburbs_data = pd.read_csv("data/syd_sub_rev.csv")

In [3]:
# Understanding basic structure
print("DOMAIN PROPERTIES")
print("shape: ", domain_properties_data.shape, "\n")
print("information: ", domain_properties_data.info(), "\n")
print("first 5 rows: ", domain_properties_data.head(), "\n")
print("column names: ", domain_properties_data.columns, "\n")

print("\n\n\n")

print("SYDNEY SUBURBS REVIEW")
print("shape: ", sydney_suburbs_data.shape, "\n")
print("information: ", sydney_suburbs_data.info(), "\n")
print("first 5 rows: ", sydney_suburbs_data.head(), "\n")
print("column names: ", sydney_suburbs_data.columns, "\n")

DOMAIN PROPERTIES
shape:  (11160, 17) 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11160 entries, 0 to 11159
Data columns (total 17 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   price                     11160 non-null  int64  
 1   date_sold                 11160 non-null  object 
 2   suburb                    11160 non-null  object 
 3   num_bath                  11160 non-null  int64  
 4   num_bed                   11160 non-null  int64  
 5   num_parking               11160 non-null  int64  
 6   property_size             11160 non-null  int64  
 7   type                      11160 non-null  object 
 8   suburb_population         11160 non-null  int64  
 9   suburb_median_income      11160 non-null  int64  
 10  suburb_sqkm               11160 non-null  float64
 11  suburb_lat                11160 non-null  float64
 12  suburb_lng                11160 non-null  float64
 13  suburb_elevation     

In [4]:
# Cleaning the datasets
domain_properties_data.columns = domain_properties_data.columns.str.strip().str.lower().str.replace(" ", "_") # fixing the feature names
sydney_suburbs_data.columns = sydney_suburbs_data.columns.str.strip().str.lower().str.replace(" ", "_") # fixing the feature names

sydney_suburbs_data.rename(columns={"name":"suburb"}, inplace=True)
# Merge the datasets
Sydney_merged = domain_properties_data.merge(sydney_suburbs_data, on="suburb", how="left")

print("Shape:", Sydney_merged.shape)
print("Nulls:\n", Sydney_merged.isnull().sum().sort_values(ascending=False).head(10))

Shape: (11160, 46)
Nulls:
 review_link                                      8656
highlights/attractions                           8641
ideal_for                                        8610
median_apartment_price_(2020)                    5361
nearest_train_station                            5105
median_apartment_rent_(per_week)                 4641
avg._years_held                                  4305
median_house_rent_(per_week)                     4178
median_house_price_(2021)                        4122
time_to_cbd_(public_transport)_[town_hall_st]    4098
dtype: int64


In [5]:
# Checking numeric correlation with target variable price
num_cols = Sydney_merged.select_dtypes(include=["float64", "int64"]).columns
corr = Sydney_merged[num_cols].corr()["price"].sort_values(ascending=False)
print(corr)

price                       1.000000
num_bath                    0.395310
suburb_median_income        0.365452
num_bed                     0.314329
property_inflation_index    0.229929
num_parking                 0.228236
suburb_lng                  0.177976
safety                      0.139487
nature                      0.136236
noise                       0.129818
family-friendliness         0.126623
pet_friendliness            0.126469
traffic                     0.113520
overall_rating              0.113188
things_to_see/do            0.109225
avg._years_held             0.105331
property_size               0.099212
public_transport            0.091391
affordability_(rental)      0.056713
affordability_(buying)      0.003874
suburb_elevation           -0.008412
suburb_population          -0.038954
suburb_lat                 -0.109136
suburb_sqkm                -0.153286
cash_rate                  -0.236636
postcode                   -0.339870
km_from_cbd                -0.358321
N

In [6]:
print(Sydney_merged["type"].value_counts())
print(Sydney_merged.groupby("type")["price"].median())

type
House                            9583
Apartment / Unit / Flat           688
Townhouse                         211
Semi-Detached                     170
Vacant land                       163
Villa                             114
Duplex                             67
Terrace                            63
Block of Units                     37
Acreage / Semi-Rural               21
New House & Land                   15
New Apartments / Off the Plan       9
Development Site                    7
Studio                              5
Rural                               4
New land                            3
Name: count, dtype: int64
type
Acreage / Semi-Rural             2050000.0
Apartment / Unit / Flat          1025000.0
Block of Units                   2900000.0
Development Site                 2000000.0
Duplex                           1240000.0
House                            1442000.0
New Apartments / Off the Plan    1400000.0
New House & Land                 1000000.0
New land    

In [7]:
Sydney_merged.shape

(11160, 46)

In [8]:
# Dropping the columns with corr less than 0.05
low_corr_cols = [col for col in corr.index if abs(corr[col]) < 0.05 and col != "price"]
Sydney_merged.drop(columns=low_corr_cols, inplace=True)

print("Dropped columns: ", low_corr_cols, "\n")
print("Shape after drop: ", Sydney_merged.shape, "\n")
print("Top nulls: \n", Sydney_merged.isnull().sum().sort_values(ascending=False).head(10))

Dropped columns:  ['affordability_(buying)', 'suburb_elevation', 'suburb_population'] 

Shape after drop:  (11160, 43) 

Top nulls: 
 review_link                                      8656
highlights/attractions                           8641
ideal_for                                        8610
median_apartment_price_(2020)                    5361
nearest_train_station                            5105
median_apartment_rent_(per_week)                 4641
avg._years_held                                  4305
median_house_rent_(per_week)                     4178
median_house_price_(2021)                        4122
time_to_cbd_(public_transport)_[town_hall_st]    4098
dtype: int64


In [9]:
# Adding manual dropping for some columns with even low corr
manual_drop = ["cash_rate", "postcode", "suburb_lat", "suburb_lng", "suburb_sqkm"]
Sydney_merged.drop(columns=manual_drop, inplace=True, errors='ignore')

In [10]:
# Handling missing values
missing = Sydney_merged.isnull().sum().sort_values(ascending=False)
print(missing.head(15))

review_link                                      8656
highlights/attractions                           8641
ideal_for                                        8610
median_apartment_price_(2020)                    5361
nearest_train_station                            5105
median_apartment_rent_(per_week)                 4641
avg._years_held                                  4305
median_house_rent_(per_week)                     4178
median_house_price_(2021)                        4122
time_to_cbd_(public_transport)_[town_hall_st]    4098
traffic                                          4081
public_housing_%                                 4070
safety                                           4059
overall_rating                                   4059
family-friendliness                              4059
dtype: int64


In [11]:
# Strip $ and , and convert to float
for col in median_cols:
    if col in Sydney_merged.columns:
        Sydney_merged[col] = (
            Sydney_merged[col]
            .astype(str)
            .str.replace("$", "", regex=False)
            .str.replace(",", "", regex=False)
        )
        Sydney_merged[col] = pd.to_numeric(Sydney_merged[col], errors="coerce")

NameError: name 'median_cols' is not defined

In [None]:
Sydney_merged.drop(columns=[
    "review_link",
    "highlights/attractions",
    "ideal_for",
    "nearest_train_station",
    "things_to_see/do"  # if present
], inplace=True, errors="ignore")

Sydney_merged = Sydney_merged[Sydney_merged["price"].notnull() & (Sydney_merged["price"] > 0)]
Sydney_merged.dropna(subset=["num_bed", "num_bath", "num_parking", "type", "km_from_cbd"], inplace=True)

median_cols = [
    "property_size",
    "suburb_median_income",
    "avg._years_held",
    "median_house_price_(2021)",
    "median_house_rent_(per_week)",
    "median_apartment_price_(2020)",
    "median_apartment_rent_(per_week)"
]

for col in median_cols:
    if col in Sydney_merged.columns:
        Sydney_merged[col].fillna(Sydney_merged[col].median(), inplace=True)

mode_cols = [
    "traffic",
    "safety",
    "overall_rating",
    "family-friendliness",
    "public_housing_%"
]

for col in mode_cols:
    if col in Sydney_merged.columns:
        Sydney_merged[col].fillna(Sydney_merged[col].mode()[0], inplace=True)
        
print(Sydney_merged.shape)
print(Sydney_merged.isnull().sum().sort_values(ascending=False).head(10))

(11160, 33)
time_to_cbd_(public_transport)_[town_hall_st]    4098
time_to_cbd_(driving)_[town_hall_st]             4059
pet_friendliness                                 4059
nature                                           4059
public_transport                                 4059
noise                                            4059
affordability_(rental)                           4059
median_house_price_(2020)                        4056
%_change                                         4049
population_(rounded)*                            4049
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  Sydney_merged[col].fillna(Sydney_merged[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  Sydney_merged[col].fillna(Sydney_merged[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the interm