In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [2]:
df = pd.read_csv('Melbourne_housing_FULL.csv')

In [3]:
df.head()

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,68 Studley St,2,h,,SS,Jellis,3/09/2016,2.5,3067.0,...,1.0,1.0,126.0,,,Yarra City Council,-37.8014,144.9958,Northern Metropolitan,4019.0
1,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,...,1.0,1.0,202.0,,,Yarra City Council,-37.7996,144.9984,Northern Metropolitan,4019.0
2,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra City Council,-37.8079,144.9934,Northern Metropolitan,4019.0
3,Abbotsford,18/659 Victoria St,3,u,,VB,Rounds,4/02/2016,2.5,3067.0,...,2.0,1.0,0.0,,,Yarra City Council,-37.8114,145.0116,Northern Metropolitan,4019.0
4,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,...,2.0,0.0,134.0,150.0,1900.0,Yarra City Council,-37.8093,144.9944,Northern Metropolitan,4019.0


In [4]:
df.describe()

Unnamed: 0,Rooms,Price,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Lattitude,Longtitude,Propertycount
count,34857.0,27247.0,34856.0,34856.0,26640.0,26631.0,26129.0,23047.0,13742.0,15551.0,26881.0,26881.0,34854.0
mean,3.031012,1050173.0,11.184929,3116.062859,3.084647,1.624798,1.728845,593.598993,160.2564,1965.289885,-37.810634,145.001851,7572.888306
std,0.969933,641467.1,6.788892,109.023903,0.98069,0.724212,1.010771,3398.841946,401.26706,37.328178,0.090279,0.120169,4428.090313
min,1.0,85000.0,0.0,3000.0,0.0,0.0,0.0,0.0,0.0,1196.0,-38.19043,144.42379,83.0
25%,2.0,635000.0,6.4,3051.0,2.0,1.0,1.0,224.0,102.0,1940.0,-37.86295,144.9335,4385.0
50%,3.0,870000.0,10.3,3103.0,3.0,2.0,2.0,521.0,136.0,1970.0,-37.8076,145.0078,6763.0
75%,4.0,1295000.0,14.0,3156.0,4.0,2.0,2.0,670.0,188.0,2000.0,-37.7541,145.0719,10412.0
max,16.0,11200000.0,48.1,3978.0,30.0,12.0,26.0,433014.0,44515.0,2106.0,-37.3902,145.52635,21650.0


In [5]:
df.shape

(34857, 21)

In [6]:
cols_to_use = ['Suburb', 'Rooms', 'Type', 'Method', 'SellerG', 'Regionname', 'Propertycount', 
               'Distance', 'CouncilArea', 'Bedroom2', 'Bathroom', 'Car', 'Landsize', 'BuildingArea', 'Price']
df = df[cols_to_use]

In [7]:
df.isnull().sum()

Suburb               0
Rooms                0
Type                 0
Method               0
SellerG              0
Regionname           3
Propertycount        3
Distance             1
CouncilArea          3
Bedroom2          8217
Bathroom          8226
Car               8728
Landsize         11810
BuildingArea     21115
Price             7610
dtype: int64

In [8]:
df.dropna(subset=['Price'], inplace=True)

In [9]:
cols_to_fill_zero = ['Propertycount', 'Car']
df[cols_to_fill_zero] = df[cols_to_fill_zero].fillna(0)

In [10]:
df['Landsize'] = df['Landsize'].fillna(df.Landsize.mean())

In [11]:
df['Bedroom2'] = df['Bedroom2'].fillna(df.Bedroom2.mode()[0])
df['Bathroom'] = df['Bathroom'].fillna(df.Bathroom.mode()[0])

### KNN imputation

In [None]:
from sklearn.impute import KNNImputer
imputer = KNNImputer(n_neighbors=5)
df['BuildingArea'] = imputer.fit_transform(df[['BuildingArea', 'Price', 'Rooms', 'Landsize']])[:, 0]

In [44]:
df['BuildingArea'].isnull().sum()

np.int64(0)

### MICE Imputation

In [13]:
#label encoding
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()

df['CouncilArea'] = label_encoder.fit_transform(df['CouncilArea'])

In [14]:
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer

imputer = IterativeImputer(max_iter=10, random_state=0)
df['CouncilArea'] = imputer.fit_transform(df[['CouncilArea']])

In [15]:
df.dropna(inplace=True)

In [16]:
df.isnull().sum()

Suburb           0
Rooms            0
Type             0
Method           0
SellerG          0
Regionname       0
Propertycount    0
Distance         0
CouncilArea      0
Bedroom2         0
Bathroom         0
Car              0
Landsize         0
BuildingArea     0
Price            0
dtype: int64

# Handling Outliers

In [17]:
from scipy import stats

### Z-score method

In [18]:
z_scores = np.abs(stats.zscore(df['Price']))
print(z_scores)

outliers = df[z_scores > 3]
outliers

[0.66999691 0.02371083 0.64661351 ... 0.53814578 0.13997302 0.04709423]


Unnamed: 0,Suburb,Rooms,Type,Method,SellerG,Regionname,Propertycount,Distance,CouncilArea,Bedroom2,Bathroom,Car,Landsize,BuildingArea,Price
166,Albert Park,3,h,S,Greg,Southern Metropolitan,3280.0,3.3,26.0,3.0,2.0,0.0,147.0,146.00000,3010000.0
178,Albert Park,4,h,S,Marshall,Southern Metropolitan,3280.0,3.3,26.0,4.0,2.0,1.0,330.0,207.00000,4735000.0
182,Albert Park,3,h,S,Greg,Southern Metropolitan,3280.0,3.3,26.0,3.0,2.0,2.0,325.0,312.00000,3755000.0
387,Armadale,2,u,S,Jellis,Southern Metropolitan,4836.0,6.3,27.0,3.0,2.0,2.0,0.0,236.40000,3625000.0
388,Armadale,4,h,VB,Jellis,Southern Metropolitan,4836.0,6.3,27.0,4.0,2.0,2.0,1581.0,316.96734,3000000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34404,Kew,3,h,PI,Marshall,Southern Metropolitan,10331.0,5.4,2.0,3.0,2.0,2.0,1332.0,241.60000,4650000.0
34455,Malvern East,5,h,S,Marshall,Southern Metropolitan,8801.0,8.4,27.0,5.0,3.0,2.0,807.0,264.20000,3680000.0
34494,Middle Park,6,h,S,Marshall,Southern Metropolitan,2019.0,3.0,26.0,6.0,4.0,4.0,420.0,324.00000,5575000.0
34496,Middle Park,3,h,SA,Greg,Southern Metropolitan,2019.0,3.0,26.0,3.0,1.0,0.0,284.0,148.00000,3750000.0


In [19]:
df['Price'].shape

(27244,)

In [20]:
df_cleaned = df[z_scores <= 3]
df_cleaned.shape

(26770, 15)

### IQR method

In [21]:
Q1 = df['Price'].quantile(0.25)
Q3 = df['Price'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

outliers = df[(df['Price'] < lower_bound) | (df['Price'] > upper_bound)]
outliers

Unnamed: 0,Suburb,Rooms,Type,Method,SellerG,Regionname,Propertycount,Distance,CouncilArea,Bedroom2,Bathroom,Car,Landsize,BuildingArea,Price
135,Albert Park,3,h,PI,Buxton,Southern Metropolitan,3280.0,3.3,26.0,3.0,2.0,0.0,211.000000,198.0,2850000.0
142,Albert Park,4,h,S,Marshall,Southern Metropolitan,3280.0,3.3,26.0,4.0,2.0,1.0,153.000000,180.0,2300000.0
146,Albert Park,3,h,S,Cayzer,Southern Metropolitan,3280.0,3.3,26.0,3.0,1.0,0.0,593.488933,308.2,2485000.0
152,Albert Park,3,h,S,Cayzer,Southern Metropolitan,3280.0,3.3,26.0,3.0,2.0,1.0,177.000000,181.0,2615000.0
158,Albert Park,3,h,S,Greg,Southern Metropolitan,3280.0,3.3,26.0,3.0,2.0,1.0,228.000000,278.0,2575000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34619,Port Melbourne,4,h,PI,Biggin,Southern Metropolitan,8648.0,3.5,18.0,4.0,2.0,2.0,376.000000,208.9,2300000.0
34623,Port Melbourne,3,u,S,RT,Southern Metropolitan,8648.0,3.5,18.0,3.0,1.0,0.0,593.488933,778.8,2610000.0
34632,Prahran,3,h,VB,Marshall,Southern Metropolitan,7717.0,4.6,27.0,3.0,2.0,2.0,330.000000,175.0,2500000.0
34714,South Yarra,3,h,PI,Jellis,Southern Metropolitan,14887.0,2.7,18.0,3.0,2.0,1.0,298.000000,256.4,2855000.0


### Removing Outliers

In [22]:
df_cleaned = df[(df['Price'] >= lower_bound) & (df['Price'] <= upper_bound)]

### IQR (Capping & Flooring)

In [23]:
df['Capped_Price'] = df['Price'].apply(lambda x:
                                      lower_bound if x<lower_bound
                                      else upper_bound if x>upper_bound
                                      else x)

### Transformations(Log & sqrt)

In [24]:
df['Log_Price'] = np.log1p(df['Price'])
df['Sqrt_Price'] = np.sqrt(df['Price'])

In [25]:
numerical_features = df.select_dtypes(include=['number'])
numerical_features

Unnamed: 0,Rooms,Propertycount,Distance,CouncilArea,Bedroom2,Bathroom,Car,Landsize,BuildingArea,Price,Capped_Price,Log_Price,Sqrt_Price
1,2,4019.0,2.5,31.0,2.0,1.0,1.0,202.000000,163.4,1480000.0,1480000.0,14.207553,1216.552506
2,2,4019.0,2.5,31.0,2.0,1.0,0.0,156.000000,79.0,1035000.0,1035000.0,13.849913,1017.349497
4,3,4019.0,2.5,31.0,3.0,2.0,0.0,134.000000,150.0,1465000.0,1465000.0,14.197366,1210.371844
5,3,4019.0,2.5,31.0,3.0,2.0,1.0,94.000000,106.4,850000.0,850000.0,13.652993,921.954446
6,4,4019.0,2.5,31.0,3.0,1.0,2.0,120.000000,142.0,1600000.0,1600000.0,14.285515,1264.911064
...,...,...,...,...,...,...,...,...,...,...,...,...,...
34852,4,6543.0,6.3,16.0,4.0,1.0,3.0,593.000000,212.0,1480000.0,1480000.0,14.207553,1216.552506
34853,2,6543.0,6.3,16.0,2.0,2.0,1.0,98.000000,104.0,888000.0,888000.0,13.696728,942.337519
34854,2,6543.0,6.3,16.0,2.0,1.0,2.0,220.000000,120.0,705000.0,705000.0,13.465955,839.642781
34855,3,6543.0,6.3,16.0,3.0,1.0,0.0,593.488933,144.6,1140000.0,1140000.0,13.946540,1067.707825


# Scaling

In [26]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler() #default range is [0,1]
df_scaled = df.copy()
df['Landsize_scaled'] = scaler.fit_transform(df[['Landsize']])

df[['Landsize', 'Landsize_scaled']].head()

Unnamed: 0,Landsize,Landsize_scaled
1,202.0,0.000466
2,156.0,0.00036
4,134.0,0.000309
5,94.0,0.000217
6,120.0,0.000277


## Standard Scaler
Mean = 0
Standard deviation = 1

In [27]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
df['BuildingArea_scaled'] = scaler.fit_transform(df[['BuildingArea']])

df[['BuildingArea', 'BuildingArea_scaled']].head()

Unnamed: 0,BuildingArea,BuildingArea_scaled
1,163.4,0.032712
2,79.0,-0.262465
4,150.0,-0.014153
5,106.4,-0.166637
6,142.0,-0.042131


#### Robust Scaler
is perfect when you have outliers in your data. Unlike MinMax or StandardScaler, it uses the median and interquartile range (IQR) to scale the data — so it's less sensitive to extreme values.

##### X_scaled = (X - median) / IQR


In [28]:
from sklearn.preprocessing import RobustScaler
scaler = RobustScaler()
df['Price_scaled'] = scaler.fit_transform(df[['Price']])
df[['Price', 'Price_scaled']].head()

Unnamed: 0,Price,Price_scaled
1,1480000.0,0.924242
2,1035000.0,0.25
4,1465000.0,0.901515
5,850000.0,-0.030303
6,1600000.0,1.106061


In [29]:
categorical_features = df.select_dtypes(include=['object'])
categorical_features

Unnamed: 0,Suburb,Type,Method,SellerG,Regionname
1,Abbotsford,h,S,Biggin,Northern Metropolitan
2,Abbotsford,h,S,Biggin,Northern Metropolitan
4,Abbotsford,h,SP,Biggin,Northern Metropolitan
5,Abbotsford,h,PI,Biggin,Northern Metropolitan
6,Abbotsford,h,VB,Nelson,Northern Metropolitan
...,...,...,...,...,...
34852,Yarraville,h,PI,Jas,Western Metropolitan
34853,Yarraville,h,SP,Sweeney,Western Metropolitan
34854,Yarraville,t,S,Jas,Western Metropolitan
34855,Yarraville,h,SP,hockingstuart,Western Metropolitan


In [30]:
df.CouncilArea.unique()

array([31., 22., 26.,  6., 10., 27.,  2., 21.,  8., 28., 16.,  1., 24.,
       15., 18.,  0.,  3., 12., 11., 13., 17.,  5., 19.,  9., 25., 29.,
        7., 14., 32., 30.,  4., 23., 20.])

In [31]:
df.CouncilArea.nunique()

33

In [32]:
df['CouncilArea'].value_counts()


CouncilArea
2.0     2520
6.0     2349
24.0    1790
8.0     1643
22.0    1584
18.0    1502
0.0     1457
3.0     1366
1.0     1311
16.0    1221
11.0    1036
21.0    1007
26.0     952
31.0     918
27.0     884
15.0     842
10.0     799
12.0     735
29.0     709
30.0     492
28.0     435
17.0     356
13.0     287
19.0     243
7.0      229
9.0      209
5.0      138
25.0      78
32.0      71
14.0      39
4.0       26
20.0      11
23.0       5
Name: count, dtype: int64

In [33]:
#df = pd.get_dummies(df, drop_first = True).astype(int)

### feature engineering

In [34]:
df['Price_per_m2'] = df['Price'] / df['Landsize']

In [35]:
df['Price_per_m2'].head()

1     7326.732673
2     6634.615385
4    10932.835821
5     9042.553191
6    13333.333333
Name: Price_per_m2, dtype: float64

In [36]:
# Apply log transformation (add 1 to avoid log(0))
df['Price_log'] = np.log(df['Price'] + 1)

##### Right-skewed data (like housing prices) gets "compressed," making it more normally distributed.

Helps improve model predictions

In [37]:
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(degree=2)
poly_features = poly.fit_transform(df[['BuildingArea']])
df_poly = pd.DataFrame(poly_features, columns=['BuildingArea', 'BuildingArea^2', 'BuildingArea^3'])

print(df[['BuildingArea']].head())
print(df_poly.head())

   BuildingArea
1         163.4
2          79.0
4         150.0
5         106.4
6         142.0
   BuildingArea  BuildingArea^2  BuildingArea^3
0           1.0           163.4        26699.56
1           1.0            79.0         6241.00
2           1.0           150.0        22500.00
3           1.0           106.4        11320.96
4           1.0           142.0        20164.00


###### Captures non-linear relationships between features and the target variable.

Interactions like x1 * x2 or x² can provide better insights and predictions.

In [38]:
df

Unnamed: 0,Suburb,Rooms,Type,Method,SellerG,Regionname,Propertycount,Distance,CouncilArea,Bedroom2,...,BuildingArea,Price,Capped_Price,Log_Price,Sqrt_Price,Landsize_scaled,BuildingArea_scaled,Price_scaled,Price_per_m2,Price_log
1,Abbotsford,2,h,S,Biggin,Northern Metropolitan,4019.0,2.5,31.0,2.0,...,163.4,1480000.0,1480000.0,14.207553,1216.552506,0.000466,0.032712,0.924242,7326.732673,14.207553
2,Abbotsford,2,h,S,Biggin,Northern Metropolitan,4019.0,2.5,31.0,2.0,...,79.0,1035000.0,1035000.0,13.849913,1017.349497,0.000360,-0.262465,0.250000,6634.615385,13.849913
4,Abbotsford,3,h,SP,Biggin,Northern Metropolitan,4019.0,2.5,31.0,3.0,...,150.0,1465000.0,1465000.0,14.197366,1210.371844,0.000309,-0.014153,0.901515,10932.835821,14.197366
5,Abbotsford,3,h,PI,Biggin,Northern Metropolitan,4019.0,2.5,31.0,3.0,...,106.4,850000.0,850000.0,13.652993,921.954446,0.000217,-0.166637,-0.030303,9042.553191,13.652993
6,Abbotsford,4,h,VB,Nelson,Northern Metropolitan,4019.0,2.5,31.0,3.0,...,142.0,1600000.0,1600000.0,14.285515,1264.911064,0.000277,-0.042131,1.106061,13333.333333,14.285515
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34852,Yarraville,4,h,PI,Jas,Western Metropolitan,6543.0,6.3,16.0,4.0,...,212.0,1480000.0,1480000.0,14.207553,1216.552506,0.001369,0.202683,0.924242,2495.784148,14.207553
34853,Yarraville,2,h,SP,Sweeney,Western Metropolitan,6543.0,6.3,16.0,2.0,...,104.0,888000.0,888000.0,13.696728,942.337519,0.000226,-0.175031,0.027273,9061.224490,13.696728
34854,Yarraville,2,t,S,Jas,Western Metropolitan,6543.0,6.3,16.0,2.0,...,120.0,705000.0,705000.0,13.465955,839.642781,0.000508,-0.119073,-0.250000,3204.545455,13.465955
34855,Yarraville,3,h,SP,hockingstuart,Western Metropolitan,6543.0,6.3,16.0,3.0,...,144.6,1140000.0,1140000.0,13.946540,1067.707825,0.001371,-0.033038,0.409091,1920.844578,13.946540


## Train_Test_Split

In [39]:
df.columns

Index(['Suburb', 'Rooms', 'Type', 'Method', 'SellerG', 'Regionname',
       'Propertycount', 'Distance', 'CouncilArea', 'Bedroom2', 'Bathroom',
       'Car', 'Landsize', 'BuildingArea', 'Price', 'Capped_Price', 'Log_Price',
       'Sqrt_Price', 'Landsize_scaled', 'BuildingArea_scaled', 'Price_scaled',
       'Price_per_m2', 'Price_log'],
      dtype='object')

In [40]:
X = df.drop('Price', axis=1)
y = df['Price']
from sklearn.model_selection import train_test_split
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.3, random_state=2)

In [41]:
from sklearn.linear_model import LinearRegression
reg = LinearRegression().fit(train_X, train_y)

ValueError: could not convert string to float: 'Flemington'

In [None]:
reg.score(test_X,test_y)
reg.score(train_X, train_y)

In [None]:
from sklearn import linear_model
lasso_reg = linear_model.Lasso(alpha=50, max_iter=100, tol=0.1)
lasso_reg.fit(train_X,train_y)

In [None]:
from sklearn.linear_model import Ridge
ridge_reg = Ridge(alpha=10, max_iter=100, tol=0.1)
ridge_reg.fit(train_X, train_y)