In [32]:
# Import required libraries for data analysis and visualization
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [33]:
# Read the Melbourne housing CSV (path is relative to the notebook's
# working directory) into the DataFrame used by every cell below.
df = pd.read_csv(filepath_or_buffer="./melb_data.csv")

In [34]:

# Count the missing (NaN) entries in each column; the resulting Series
# is the cell's displayed output.
df.isna().sum()

Suburb              0
Address             0
Rooms               0
Type                0
Price               0
Method              0
SellerG             0
Date                0
Distance            0
Postcode            0
Bedroom2            0
Bathroom            0
Car                62
Landsize            0
BuildingArea     6450
YearBuilt        5375
CouncilArea      1369
Lattitude           0
Longtitude          0
Regionname          0
Propertycount       0
dtype: int64

In [35]:
# Print a concise summary of the DataFrame: index range, each column's
# non-null count and dtype, and the frame's memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13580 entries, 0 to 13579
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Suburb         13580 non-null  object 
 1   Address        13580 non-null  object 
 2   Rooms          13580 non-null  int64  
 3   Type           13580 non-null  object 
 4   Price          13580 non-null  float64
 5   Method         13580 non-null  object 
 6   SellerG        13580 non-null  object 
 7   Date           13580 non-null  object 
 8   Distance       13580 non-null  float64
 9   Postcode       13580 non-null  float64
 10  Bedroom2       13580 non-null  float64
 11  Bathroom       13580 non-null  float64
 12  Car            13518 non-null  float64
 13  Landsize       13580 non-null  float64
 14  BuildingArea   7130 non-null   float64
 15  YearBuilt      8205 non-null   float64
 16  CouncilArea    12211 non-null  object 
 17  Lattitude      13580 non-null  float64
 18  Longti

In [36]:
# Impute the missing Car entries with the column mean.
# The mean is computed first (NaNs are skipped by default) and the filled
# Series is assigned back to the column.
car_mean = df["Car"].mean()
df["Car"] = df["Car"].fillna(value=car_mean)

In [37]:
# Fill missing values in BuildingArea with the column mean.
# The result is assigned back to the column rather than calling
# fillna(..., inplace=True) on the df['BuildingArea'] selection: that is
# chained in-place mutation, which warns in pandas 2.x and leaves df
# unchanged once Copy-on-Write is active (the selection is a temporary).
df['BuildingArea'] = df['BuildingArea'].fillna(df['BuildingArea'].mean())

In [38]:
# Fill missing values in YearBuilt with the mode (most frequent value).
# mode() returns a Series (there can be ties); [0] takes its first entry.
# The result is assigned back to the column rather than using
# inplace=True on the df['YearBuilt'] selection, which is chained in-place
# mutation: it warns in pandas 2.x and has no effect on df under
# Copy-on-Write.
df['YearBuilt'] = df['YearBuilt'].fillna(df['YearBuilt'].mode()[0])

In [39]:
# Fill missing values in CouncilArea by forward fill: each missing entry
# takes the value of the closest preceding non-missing row.
# Series.ffill() replaces the deprecated fillna(method='ffill'), and the
# result is assigned back instead of using inplace=True on the column
# selection (chained in-place mutation does not update df under
# Copy-on-Write).
# NOTE: forward fill cannot fill a missing value in the very first row;
# the null check in the next cell confirms whether any remain.
df['CouncilArea'] = df['CouncilArea'].ffill()

In [40]:
# Re-count missing values per column after imputation; every count
# should now be zero.
df.isna().sum()

Suburb           0
Address          0
Rooms            0
Type             0
Price            0
Method           0
SellerG          0
Date             0
Distance         0
Postcode         0
Bedroom2         0
Bathroom         0
Car              0
Landsize         0
BuildingArea     0
YearBuilt        0
CouncilArea      0
Lattitude        0
Longtitude       0
Regionname       0
Propertycount    0
dtype: int64

In [41]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [42]:
# One-hot encode the categorical columns into a dense 0/1 DataFrame.
# `sparse_output=False` makes fit_transform return a dense ndarray that can
# be wrapped directly in a DataFrame. It replaces the `sparse=False`
# keyword, which was deprecated in scikit-learn 1.2 and removed in 1.4
# (where passing it raises a TypeError).
categorical_cols = ['Type', 'Method', 'CouncilArea']
onehot = OneHotEncoder(sparse_output=False)
encoded_cats = pd.DataFrame(
    onehot.fit_transform(df[categorical_cols]),
    # Column labels are generated by the fitted encoder from the input
    # column names and the categories it learned.
    columns=onehot.get_feature_names_out(categorical_cols)
)



In [43]:
# Standardize the numeric columns and store them in a new DataFrame whose
# column names carry a '_scaled' suffix.
# NOTE(review): 'Price' is standardized together with the other columns; if
# it is the prediction target downstream it may need to be kept separate —
# confirm against later cells.
# NOTE(review): the scaler is fitted on the full frame; if a train/test
# split follows, fit it on the training portion only — confirm.
numerical_cols = [
    'Rooms', 'Price', 'Distance', 'Bedroom2', 'Bathroom', 'Car',
    'Landsize', 'BuildingArea', 'YearBuilt',
]
scaler = StandardScaler()
scaled_values = scaler.fit_transform(df[numerical_cols])
scaled_names = [f"{name}_scaled" for name in numerical_cols]
df_scaled = pd.DataFrame(scaled_values, columns=scaled_names)

In [44]:
# Place the scaled numeric columns and the one-hot columns side by side.
# Rows are matched on index labels.
df_processed = pd.concat(objs=[df_scaled, encoded_cats], axis="columns")

In [45]:

# Report the (rows, columns) shape of the processed frame, then show its
# first five rows as the cell output.
print(f"Shape of processed data: {df_processed.shape}")
df_processed.head(n=5)

Shape of processed data: (13580, 50)


Unnamed: 0,Rooms_scaled,Price_scaled,Distance_scaled,Bedroom2_scaled,Bathroom_scaled,Car_scaled,Landsize_scaled,BuildingArea_scaled,YearBuilt_scaled,Type_h,...,CouncilArea_Moreland,CouncilArea_Nillumbik,CouncilArea_Port Phillip,CouncilArea_Stonnington,CouncilArea_Unavailable,CouncilArea_Whitehorse,CouncilArea_Whittlesea,CouncilArea_Wyndham,CouncilArea_Yarra,CouncilArea_Yarra Ranges
0,-0.981463,0.632448,-1.301485,-0.947035,-0.772376,-0.635232,-0.089316,0.0,0.110418,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,-0.981463,-0.06364,-1.301485,-0.947035,-0.772376,-1.676467,-0.100843,-0.186147,-2.296108,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.064876,0.608984,-1.301485,0.088284,0.673367,-1.676467,-0.106356,-0.00502,-2.296108,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.064876,-0.353025,-1.301485,0.088284,0.673367,-0.635232,-0.11638,0.0,0.110418,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,1.111216,0.820157,-1.301485,0.088284,-0.772376,0.406003,-0.109864,-0.025428,1.623091,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
