In [33]:
import numpy as np
import pandas as pd
import gc
import warnings
warnings.filterwarnings('ignore')

In [34]:
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor

In [35]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.model_selection import cross_val_score, KFold
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler, PowerTransformer

In [36]:
df = pd.read_csv('/content/gurgaon_properties_final.csv', on_bad_lines = 'skip')

df.head()

Unnamed: 0,property_type,sector,built_up_area,agepossession,bedroom,bathroom,study room,servant room,store room,balcony,furnishing_type,luxury_category,floor_category,price_in_cr
0,flat,Manesar,2233,Moderately Old,4,4,0,0,0,3,Semi-furnished,Low,Mid-rise,0.9
1,house,Sector 48,3229,Moderately Old,4,2,1,1,1,1,Un-furnished,Medium,Low-rise,7.35
2,house,Sector 10A,2367,Moderately Old,4,3,0,0,0,2,Semi-furnished,Low,Low-rise,4.25
3,flat,Sector 81,1300,Relatively New,2,2,0,0,0,3,Semi-furnished,Medium,Mid-rise,0.87
4,flat,Sector 88A,1582,New Property,3,3,0,0,0,3+,Semi-furnished,Medium,High-rise,2.65


In [37]:
X = df.iloc[:,:-1]
y = df.iloc[:,-1].values

# y_transformed = np.log1p(y)

scaler = PowerTransformer(method = 'yeo-johnson', standardize = True)

y_transformed = scaler.fit_transform(y.reshape(-1,1))

In [38]:
def scorer(model_name, model, preprocessor, pca = False):

    output = []

    output.append(model_name)

    if(pca == True):
       pipeline = Pipeline(steps = [
                  ('preprocessor', preprocessor),
                  ('pca', PCA(n_components = 0.95)),
                  ('regressor', model)
                 ])
    else:
        pipeline = Pipeline(steps = [
                   ('preprocessor', preprocessor),
                   ('regressor', model)
                  ])

    # K-fold cross-validation

    kfold = KFold(n_splits = 10, shuffle = True)

    r2 =  np.mean(cross_val_score(pipeline, X, y_transformed, cv = kfold, scoring = 'r2', n_jobs = -1))
    mae = np.mean(cross_val_score(pipeline, X, y_transformed, cv = kfold, scoring = 'neg_mean_absolute_error', n_jobs = -1))

    output.append(r2) # kfold r2
    output.append(-mae) # kfolr mae

    return output

In [39]:
model_dict = {
    'SVR': SVR(),
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(),
    'LASSO Regression': Lasso(),
    'Decision Tree': DecisionTreeRegressor(),
    'Random Forest': RandomForestRegressor(),
    'Extra Trees': ExtraTreesRegressor(),
    'Gradient Boosting': GradientBoostingRegressor()
}

# One Hot Encoding without sector

In [40]:
ordinal_encoding_cols = ['sector', 'balcony', 'furnishing_type', 'luxury_category', 'floor_category']

ohe_encoding_cols = ['property_type', 'agepossession']

columns_to_scale  =  X.select_dtypes(include = 'number').columns.tolist()

In [41]:
matrix = pd.pivot_table(df, index = 'sector', values = 'price_in_cr', aggfunc = 'mean')

matrix.sort_values(by = 'price_in_cr', ascending = True, inplace = True)


In [42]:
l2 = matrix.index.tolist()
l3 = ['1', '0', '2', '3', '3+']
l4 = ['Semi-furnished', 'Un-furnished', 'Furnished']
l5 = ['Low', 'Average', 'Medium', 'High']
l6 = ['Basement/Ground Floor', 'Mid-rise', 'High-rise', 'Low-rise', 'Skyscraper']

oe_categories = [l2,l3,l4,l5,l6]

## Type1: OHE with Yeo Johnson Transformation

In [43]:
trf_applied1 = [('category_ordinal', OrdinalEncoder(categories = oe_categories, dtype = int, handle_unknown = 'use_encoded_value', unknown_value = -1), ordinal_encoding_cols),
               ('category_ohe', OneHotEncoder(dtype = int, drop = 'first', sparse_output = False, handle_unknown = 'ignore'), ohe_encoding_cols),
               ('numerical', PowerTransformer(method = 'yeo-johnson', standardize = True), columns_to_scale)
               ]

preprocessor1 = ColumnTransformer(transformers = trf_applied1, remainder = 'passthrough')


In [44]:
models_output = []
model_df = pd.DataFrame()

for i, (model_name, model) in enumerate(model_dict.items()):

    models_output.append(scorer(model_name, model, preprocessor1))

    model_df = pd.DataFrame(models_output, columns = ['name', 'kfold_r2', 'kfold_mae'])

    model_df.sort_values(by = ['kfold_mae', 'kfold_r2'], ascending = [True, False], inplace = True, ignore_index = True)


print('OHE without sector and with Yeo Johson Transformation')
model_df

OHE without sector and with Yeo Johson Transformation


Unnamed: 0,name,kfold_r2,kfold_mae
0,Extra Trees,0.895594,0.216705
1,Random Forest,0.897957,0.221092
2,Gradient Boosting,0.884,0.255438
3,Linear Regression,0.859675,0.285979
4,Ridge Regression,0.861052,0.286045
5,Decision Tree,0.796987,0.288594
6,SVR,0.805489,0.333615
7,LASSO Regression,0.542536,0.534298


In [45]:
del model_df

gc.collect()

369

## Type2: OHE with Yeo-Johsnson Tranformation & PCA

In [46]:
models_output = []

model_df = pd.DataFrame()

for i, (model_name, model) in enumerate(model_dict.items()):

    models_output.append(scorer(model_name, model, preprocessor1, True))

    model_df = pd.DataFrame(models_output, columns = ['name', 'kfold_r2', 'kfold_mae'])

    model_df.sort_values(by = ['kfold_mae', 'kfold_r2'], ascending = [True, False], inplace = True, ignore_index = True)


print('OHE without sector and with Yeo Johson Transformation & PCA')
model_df


OHE without sector and with Yeo Johson Transformation & PCA


Unnamed: 0,name,kfold_r2,kfold_mae
0,Random Forest,0.744888,0.34828
1,Extra Trees,0.720931,0.358679
2,Decision Tree,0.661954,0.382911
3,Gradient Boosting,0.658361,0.441962
4,SVR,0.549047,0.51694
5,Linear Regression,0.542853,0.533033
6,Ridge Regression,0.545416,0.533378
7,LASSO Regression,0.54365,0.533982


In [47]:
del model_df

gc.collect()

187

## Type3: OHE with Standard Scaler

In [48]:
trf_applied = [('category_ordinal', OrdinalEncoder(categories = oe_categories, dtype = int, handle_unknown = 'use_encoded_value', unknown_value = -1), ordinal_encoding_cols),
               ('category_ohe', OneHotEncoder(dtype = int, drop = 'first', sparse_output = False, handle_unknown = 'ignore'), ohe_encoding_cols),
               ('numerical', StandardScaler(), columns_to_scale)
               ]

preprocessor2 = ColumnTransformer(transformers = trf_applied, remainder = 'passthrough')


In [49]:
models_output = []

model_df = pd.DataFrame()

for i, (model_name, model) in enumerate(model_dict.items()):

    models_output.append(scorer(model_name, model, preprocessor2))

    model_df = pd.DataFrame(models_output, columns = ['name', 'kfold_r2', 'kfold_mae'])

    model_df.sort_values(by = ['kfold_mae', 'kfold_r2'], ascending = [True, False], inplace = True, ignore_index = True)


print('OHE without sector and with Standard Scaler')
model_df

OHE without sector and with Standard Scaler


Unnamed: 0,name,kfold_r2,kfold_mae
0,Extra Trees,0.897563,0.219345
1,Random Forest,0.89225,0.22182
2,Gradient Boosting,0.884198,0.256043
3,Decision Tree,0.79797,0.292384
4,Linear Regression,0.820467,0.328474
5,Ridge Regression,0.819745,0.32866
6,SVR,0.776671,0.357494
7,LASSO Regression,0.543328,0.534067


In [50]:
del model_df

gc.collect()

187

## Type4: OHE with Standard Scaler & PCA

In [51]:
models_output = []

model_df = pd.DataFrame()

for i, (model_name, model) in enumerate(model_dict.items()):

    models_output.append(scorer(model_name, model, preprocessor2, True))

    model_df = pd.DataFrame(models_output, columns = ['name', 'kfold_r2', 'kfold_mae'])

    model_df.sort_values(by = ['kfold_mae', 'kfold_r2'], ascending = [True, False], inplace = True, ignore_index = True)


print('OHE without sector and with Standar Scaler & PCA')
model_df

OHE without sector and with Standar Scaler & PCA


Unnamed: 0,name,kfold_r2,kfold_mae
0,Random Forest,0.736688,0.352924
1,Extra Trees,0.706492,0.363707
2,Decision Tree,0.66226,0.391238
3,Gradient Boosting,0.659008,0.444118
4,SVR,0.5528,0.516624
5,Ridge Regression,0.544856,0.532905
6,Linear Regression,0.544907,0.532982
7,LASSO Regression,0.541221,0.534091


In [52]:
del model_df

gc.collect()

135

# One Hot Encoding with sector

In [53]:
ordinal_encoding_cols = ['balcony', 'furnishing_type', 'luxury_category', 'floor_category']

ohe_encoding_cols = ['property_type', 'sector', 'agepossession']

columns_to_scale  =  X.select_dtypes(include = 'number').columns.tolist()

l1 = ['1', '0', '2', '3', '3+']
l2 = ['Semi-furnished', 'Un-furnished', 'Furnished']
l3 = ['Low', 'Average', 'Medium', 'High']
l4 = ['Basement/Ground Floor', 'Mid-rise', 'High-rise', 'Low-rise', 'Skyscraper']

oe_categories = [l1,l2,l3,l4]



## Typ1: OHE with and Yeo Johnson transformation

In [54]:
trf_applied3 = [('category_ordinal', OrdinalEncoder(categories = oe_categories, dtype = int, handle_unknown = 'use_encoded_value', unknown_value = -1), ordinal_encoding_cols),
                ('category_ohe', OneHotEncoder(dtype = int, drop = 'first', sparse_output = False, handle_unknown = 'ignore'), ohe_encoding_cols),
                ('numerical', PowerTransformer(method = 'yeo-johnson', standardize = True), columns_to_scale)
               ]

preprocessor3 = ColumnTransformer(transformers = trf_applied3, remainder = 'passthrough')


In [55]:
models_output = []

for i, (model_name, model) in enumerate(model_dict.items()):

    models_output.append(scorer(model_name, model, preprocessor3))

    model_df = pd.DataFrame(models_output, columns = ['name', 'kfold_r2', 'kfold_mae'])

    model_df.sort_values(by = ['kfold_mae', 'kfold_r2'], ascending = [True, False], inplace = True, ignore_index = True)


print('OHE with sector and Yeo Johnson transformation')
model_df

OHE with sector and Yeo Johnson transformation


Unnamed: 0,name,kfold_r2,kfold_mae
0,Extra Trees,0.880296,0.22594
1,SVR,0.883448,0.242707
2,Random Forest,0.869313,0.244986
3,Linear Regression,0.874182,0.256534
4,Ridge Regression,0.875455,0.259722
5,Gradient Boosting,0.849856,0.294286
6,Decision Tree,0.774383,0.301659
7,LASSO Regression,-0.00446,0.801924


In [56]:
del model_df

gc.collect()

161

## Type2: OHE with Yeo-Johsnson Tranformation & PCA

In [57]:
models_output = []

for i, (model_name, model) in enumerate(model_dict.items()):

    models_output.append(scorer(model_name, model, preprocessor3, True))

    model_df = pd.DataFrame(models_output, columns = ['name', 'kfold_r2', 'kfold_mae'])

    model_df.sort_values(by = ['kfold_mae', 'kfold_r2'], ascending = [True, False], inplace = True, ignore_index = True)

print('OHE with sector and Yeo Johnson transformation and PCA')
model_df

OHE with sector and Yeo Johnson transformation and PCA


Unnamed: 0,name,kfold_r2,kfold_mae
0,Extra Trees,0.835895,0.285543
1,Random Forest,0.829903,0.300942
2,SVR,0.825107,0.306619
3,Gradient Boosting,0.820421,0.319956
4,Ridge Regression,0.793705,0.350981
5,Linear Regression,0.794206,0.35128
6,Decision Tree,0.616385,0.410462
7,LASSO Regression,0.137219,0.742106


In [58]:
del model_df

gc.collect()

239

## Type3: OHE with Standard Scaler

In [59]:
trf_applied4 = [('category_ordinal', OrdinalEncoder(categories = oe_categories, dtype = int, handle_unknown = 'use_encoded_value', unknown_value = -1), ordinal_encoding_cols),
               ('category_ohe', OneHotEncoder(dtype = int, drop = 'first', sparse_output = False, handle_unknown = 'ignore'), ohe_encoding_cols),
               ('numerical', StandardScaler(), columns_to_scale)
               ]

preprocessor4 = ColumnTransformer(transformers = trf_applied4, remainder = 'passthrough')


In [60]:
models_output = []

for i, (model_name, model) in enumerate(model_dict.items()):

    models_output.append(scorer(model_name, model, preprocessor4))

    model_df = pd.DataFrame(models_output, columns = ['name', 'kfold_r2', 'kfold_mae'])

    model_df.sort_values(by = ['kfold_mae', 'kfold_r2'], ascending = [True, False], inplace = True, ignore_index = True)

print('OHE with sector and Standard Scaler')
model_df

OHE with sector and Standard Scaler


Unnamed: 0,name,kfold_r2,kfold_mae
0,Extra Trees,0.880271,0.226493
1,Random Forest,0.874972,0.243973
2,SVR,0.876794,0.248539
3,Gradient Boosting,0.849058,0.293721
4,Linear Regression,0.834158,0.29995
5,Ridge Regression,0.834258,0.300029
6,Decision Tree,0.776198,0.30525
7,LASSO Regression,-0.001594,0.801687


In [61]:
del model_df

gc.collect()

187

## Type4: OHE with Standard Scaler & PCA

In [62]:
models_output = []

for i, (model_name, model) in enumerate(model_dict.items()):

    models_output.append(scorer(model_name, model, preprocessor4))

    model_df = pd.DataFrame(models_output, columns = ['name', 'kfold_r2', 'kfold_mae'])

    model_df.sort_values(by = ['kfold_mae', 'kfold_r2'], ascending = [True, False], inplace = True, ignore_index = True)

print('OHE with sector and Standard Scaler &  PCA')
model_df

OHE with sector and Standard Scaler &  PCA


Unnamed: 0,name,kfold_r2,kfold_mae
0,Extra Trees,0.879637,0.22324
1,Random Forest,0.873152,0.244644
2,SVR,0.876272,0.24827
3,Gradient Boosting,0.851125,0.292797
4,Linear Regression,0.833712,0.299271
5,Ridge Regression,0.834301,0.299517
6,Decision Tree,0.780503,0.305543
7,LASSO Regression,-0.003975,0.801914


## Type5:

In [63]:
columns_to_encode = ['property_type', 'balcony' ,'luxury_category', 'floor_category']
columns_to_scale =  ['built_up_area','bedroom','bathroom', 'study room', 'servant room', 'store room']

preprocessor5 = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), columns_to_scale),
        ('cat', OrdinalEncoder(categories = oe_categories, dtype = int, handle_unknown = 'use_encoded_value', unknown_value = -1), columns_to_encode),
        ('cat1',OneHotEncoder(dtype = int, drop = 'first', sparse_output = False, handle_unknown = 'ignore'), ['sector','agepossession','furnishing_type'])
    ],
    remainder='passthrough'
)


In [64]:
models_output = []

for i, (model_name, model) in enumerate(model_dict.items()):

    models_output.append(scorer(model_name, model, preprocessor5))

    model_df = pd.DataFrame(models_output, columns = ['name', 'kfold_r2', 'kfold_mae'])

    model_df.sort_values(by = ['kfold_mae', 'kfold_r2'], ascending = [True, False], inplace = True, ignore_index = True)

model_df

Unnamed: 0,name,kfold_r2,kfold_mae
0,Extra Trees,0.858365,0.251153
1,SVR,0.858106,0.266696
2,Random Forest,0.84619,0.267525
3,Gradient Boosting,0.830556,0.315598
4,Decision Tree,0.743702,0.325461
5,Ridge Regression,0.804347,0.329763
6,Linear Regression,0.802171,0.330759
7,LASSO Regression,-0.000654,0.801778
