<a href="https://colab.research.google.com/github/agarwalpratik/aiml/blob/main/FeatureEngg_Melbourne_Housing_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np

mb = pd.read_csv('melb_data.csv')
mb = mb[['Car', 'Landsize', 'BuildingArea', 'YearBuilt','Price']]
mb.head()

Unnamed: 0,Car,Landsize,BuildingArea,YearBuilt,Price
0,1.0,202.0,,,1480000.0
1,0.0,156.0,79.0,1900.0,1035000.0
2,0.0,134.0,150.0,1900.0,1465000.0
3,1.0,94.0,,,850000.0
4,2.0,120.0,142.0,2014.0,1600000.0


In [None]:
mb.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13580 entries, 0 to 13579
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Car           13518 non-null  float64
 1   Landsize      13580 non-null  float64
 2   BuildingArea  7130 non-null   float64
 3   YearBuilt     8205 non-null   float64
 4   Price         13580 non-null  float64
dtypes: float64(5)
memory usage: 530.6 KB


In [None]:
mb['Landsize'].isna().sum(), mb['Car'].isna().sum(), mb['BuildingArea'].isna().sum(), mb['YearBuilt'].isna().sum()

(0, 62, 6450, 5375)

In [None]:
#Statistical approach
mb_stat = mb

# Considering Car parking & YearBuilt as discrete numeric and could be defaulted with median()
mb_stat['Car'].fillna(mb_stat['Car'].median(), inplace=True)
mb_stat['YearBuilt'].fillna(mb_stat['YearBuilt'].median(), inplace=True)

# Considering Landsize & BuildingArea as continuous numeric and could be defaulted with mean()
#mb_stat['Landsize'].fillna(mb_stat['Landsize'].mean(), inplace=True)
mb_stat['BuildingArea'].fillna(mb_stat['BuildingArea'].mean(), inplace=True)
mb_stat.describe()

Unnamed: 0,Car,Landsize,BuildingArea,YearBuilt,Price
count,13580.0,13580.0,13580.0,13580.0,13580.0
mean,1.611856,558.416127,151.96765,1966.788218,1075684.0
std,0.960793,3990.669241,392.002962,29.088642,639310.7
min,0.0,0.0,0.0,1196.0,85000.0
25%,1.0,177.0,122.0,1960.0,650000.0
50%,2.0,440.0,151.96765,1970.0,903000.0
75%,2.0,651.0,151.96765,1975.0,1330000.0
max,10.0,433014.0,44515.0,2018.0,9000000.0


In [None]:
#Domain approach
mb_domain = mb
mb_domain['Car'].fillna(1, inplace=True)
mb_domain['Car'].replace(0,1,inplace=True)
mb_domain['BuildingArea'].fillna(80, inplace=True)
mb_domain['BuildingArea'].replace(0,80, inplace=True)
mb_domain['YearBuilt'].fillna(2000, inplace=True)
mb_domain.describe()


Unnamed: 0,Car,Landsize,BuildingArea,YearBuilt,Price
count,13580.0,13580.0,13580.0,13580.0,13580.0
mean,1.687408,558.416127,152.067797,1966.788218,1075684.0
std,0.865677,3990.669241,391.974341,29.088642,639310.7
min,1.0,0.0,1.0,1196.0,85000.0
25%,1.0,177.0,122.0,1960.0,650000.0
50%,2.0,440.0,151.96765,1970.0,903000.0
75%,2.0,651.0,151.96765,1975.0,1330000.0
max,10.0,433014.0,44515.0,2018.0,9000000.0


In [None]:
#Statistical approach
features = mb_stat.drop('Price', axis=1).values
label = mb_stat['Price'].values

In [None]:
#Domain approach
features = mb_domain.drop('Price', axis=1).values
label = mb_domain['Price'].values

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

def getBestModel(technique,features,label,CL=0.95,test_size=0.2):
  approvedModels = []
  for rs in range(1,100):
    xTrain,xTest,yTrain,yTest = train_test_split(features,label,test_size=0.2,random_state=rs)

    model = LinearRegression()
    model.fit(xTrain,yTrain)
    trainScore = model.score(xTrain,yTrain)
    testScore = model.score(xTest,yTest)

    if testScore > trainScore and testScore >= CL:
      approvedModels.append([testScore,trainScore,rs,model])
      #print(f"Model APPROVED, Test score: {testScore}, Train score: {trainScore} & Random State: {rs}")

  if len(approvedModels) > 0:
    approvedModels.sort(reverse=True)

    bestModel = approvedModels[0][3]
    print(f"Best model[{technique}]: Test score: {approvedModels[0][0]}, Train score: {approvedModels[0][1]} & Random State: {approvedModels[0][2]}")
    return bestModel
  else:
    print("No model approved")
    return None


In [None]:
bestModel_lr = getBestModel('Linear Regression',features,label,0.10,0.2)


Best model[Linear Regression]: Test score: 0.17938530230102467, Train score: 0.13679385743379702 & Random State: 57


# 2) Correlation Analysis Method

In [None]:
mb_domain.corr()

Unnamed: 0,Car,Landsize,BuildingArea,YearBuilt,Price
Car,1.0,0.024783,0.072387,0.024016,0.265846
Landsize,0.024783,1.0,0.094663,0.008806,0.037507
BuildingArea,0.072387,0.094663,1.0,0.005255,0.069724
YearBuilt,0.024016,0.008806,0.005255,1.0,-0.259387
Price,0.265846,0.037507,0.069724,-0.259387,1.0


In [None]:
#Guideline:
# Select feature columns that have corr greater than or equal to 50% w.r.t label column
# Therefore selected features: RDspend and MarkSpend

#No columns have corr() of more than 50% (0.5), hence Correlation Analysis does not help with feature selection for this dataset.

# Conclusion: Linear Model remains unchanged!

# 3) OLS Method

In [None]:
#Step1: Perform all in (Adding intercept feature column in existing feature array)

featuresAllIn = np.concatenate((np.ones((len(features),1)),features),axis=1)
featuresAllIn


array([[1.0000000e+00, 1.0000000e+00, 2.0200000e+02, 1.5196765e+02,
        1.9700000e+03],
       [1.0000000e+00, 1.0000000e+00, 1.5600000e+02, 7.9000000e+01,
        1.9000000e+03],
       [1.0000000e+00, 1.0000000e+00, 1.3400000e+02, 1.5000000e+02,
        1.9000000e+03],
       ...,
       [1.0000000e+00, 4.0000000e+00, 4.3600000e+02, 1.5196765e+02,
        1.9970000e+03],
       [1.0000000e+00, 5.0000000e+00, 8.6600000e+02, 1.5700000e+02,
        1.9200000e+03],
       [1.0000000e+00, 1.0000000e+00, 3.6200000e+02, 1.1200000e+02,
        1.9200000e+03]])

In [None]:
#Step2: Decide SL
SL = 0.05

In [None]:
#Step3: Perform OLS
# OLS
# endog -- label column
# exog --- feature column

import statsmodels.regression.linear_model as stat

ols_model = stat.OLS(endog=label, exog=featuresAllIn).fit()
ols_model.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.145
Model:,OLS,Adj. R-squared:,0.145
Method:,Least Squares,F-statistic:,574.7
Date:,"Sun, 15 Sep 2024",Prob (F-statistic):,0.0
Time:,05:32:18,Log-Likelihood:,-199750.0
No. Observations:,13580,AIC:,399500.0
Df Residuals:,13575,BIC:,399500.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,1.224e+07,3.43e+05,35.669,0.000,1.16e+07,1.29e+07
x1,1.979e+05,5879.609,33.660,0.000,1.86e+05,2.09e+05
x2,4.5771,1.278,3.583,0.000,2.073,7.081
x3,79.9527,13.036,6.133,0.000,54.400,105.505
x4,-5853.4579,174.498,-33.545,0.000,-6195.498,-5511.418

0,1,2,3
Omnibus:,6846.247,Durbin-Watson:,1.466
Prob(Omnibus):,0.0,Jarque-Bera (JB):,79079.501
Skew:,2.151,Prob(JB):,0.0
Kurtosis:,14.011,Cond. No.,273000.0


In [None]:
#Step4: Select the feature that has the highest p value

ols_model.pvalues[ols_model.pvalues > 0.5]

array([], dtype=float64)

In [None]:
#Step5: if pvalue > SL : eliminate that feature and recreate new one
# else: stop

#Conclusion: None of the columns can be eliminated so the model remains unchanged.

# 4) RFE: Recursive Feature Elimination

In [None]:
#Step 1: Select a linear regression model as the estimator model and create RFE model
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE

rfe_estimator_model = LinearRegression()
rfe = RFE(estimator=rfe_estimator_model)

#Step 2: Fit the fearure
rfe.fit(features,label)


In [None]:
rfe.ranking_

array([1, 3, 2, 1])

In [None]:
#Step 3: Preserve features with ranking 1 & 2, eliminate others
# features_rfe = features[:,[0,2,3]]

features_rfe = rfe.transform(features)


In [None]:
features_rfe

array([[1.000e+00, 1.970e+03],
       [1.000e+00, 1.900e+03],
       [1.000e+00, 1.900e+03],
       ...,
       [4.000e+00, 1.997e+03],
       [5.000e+00, 1.920e+03],
       [1.000e+00, 1.920e+03]])

In [None]:
#Step 4: Create model using RFE feature set
bestModel_rfe = getBestModel('RFE',features_rfe,label,0.10,0.2)


Best model[RFE]: Test score: 0.17436419367023448, Train score: 0.13379291201880505 & Random State: 57


# 5) SME: Select From Model

In [None]:
#Step 1: Select a linear regression model as the estimator model and create RFE model
from sklearn.feature_selection import SelectFromModel
sfm_estimator_model = LinearRegression()
sfm = SelectFromModel(estimator=sfm_estimator_model)

#Step 2: Fit the fearure
sfm.fit(features,label)

In [None]:
# Step 3: Get Support
sfm.get_support()

array([ True, False, False, False])

In [None]:
features_sfm = sfm.transform(features)
# label_sfm = label

features_sfm

array([[1.],
       [1.],
       [1.],
       ...,
       [4.],
       [5.],
       [1.]])

In [None]:
# Step 4: Get best model based on SFM features
bestModel_sfm = getBestModel('SFM',features_sfm,label,0.0,0.2)

Best model[SFM]: Test score: 0.09523742022768156, Train score: 0.06460424744071447 & Random State: 42


In [None]:
#Summary

bestModel_lr = getBestModel('LR',features,label,0.0,0.2)
bestModel_rfe = getBestModel('RFE',features_rfe,label,0.0,0.2)
bestModel_sfm = getBestModel('SFM',features_sfm,label,0.0,0.2)

Best model[LR]: Test score: 0.17938530230102467, Train score: 0.13679385743379702 & Random State: 57
Best model[RFE]: Test score: 0.17436419367023448, Train score: 0.13379291201880505 & Random State: 57
Best model[SFM]: Test score: 0.09523742022768156, Train score: 0.06460424744071447 & Random State: 42


In [None]:
#Export model
# import pickle
# pickle.dump(bestModel,open('melbourneHousingPricePredictor.mdl','wb'))