In [2]:
import numpy as np
import pandas as pd
import gc
import warnings
warnings.filterwarnings('ignore')

In [3]:
df = pd.read_csv('/content/gurgaon_properties_final.csv', on_bad_lines = 'skip')

df.head()


Unnamed: 0,property_type,sector,built_up_area,agepossession,bedroom,bathroom,study room,servant room,store room,balcony,furnishing_type,luxury_category,floor_category,price_in_cr
0,flat,Manesar,2233,Moderately Old,4,4,0,0,0,3,Semi-furnished,Low,Mid-rise,0.9
1,house,Sector 48,3229,Moderately Old,4,2,1,1,1,1,Un-furnished,Medium,Low-rise,7.35
2,house,Sector 10A,2367,Moderately Old,4,3,0,0,0,2,Semi-furnished,Low,Low-rise,4.25
3,flat,Sector 81,1300,Relatively New,2,2,0,0,0,3,Semi-furnished,Medium,Mid-rise,0.87
4,flat,Sector 88A,1582,New Property,3,3,0,0,0,3+,Semi-furnished,Medium,High-rise,2.65


In [4]:
df.isnull().sum().sum()

0

In [5]:
X = df.iloc[:,:-1]
y = df.iloc[:,-1]

In [6]:
# Applying the log1p transformation to the target variable

y_transformed = np.log1p(y).values

# Ordinal Encoding

In [7]:
from sklearn.svm import SVR
from xgboost import XGBRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor

In [8]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.preprocessing import OrdinalEncoder, StandardScaler, PowerTransformer


In [9]:
columns_to_encode  = X.select_dtypes(exclude = 'number').columns.tolist()

columns_to_scale  =  X.select_dtypes(include = 'number').columns.tolist()


In [10]:
def scorer(model_name, model, preprocessor):

    output = []

    output.append(model_name)

    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])

    # K-fold cross-validation

    kfold = KFold(n_splits = 10, shuffle = True)

    scores1 = cross_val_score(pipeline, X, y_transformed, cv = kfold, scoring = 'r2', n_jobs = -1)

    scores2 = cross_val_score(pipeline, X, y_transformed, cv = kfold, scoring = 'neg_mean_absolute_error', n_jobs = -1)

    output.append(scores1.mean()) # kfold r2

    X_train, X_test, y_train, y_test = train_test_split(X, y_transformed, test_size=0.2, random_state=42)

    pipeline.fit(X_train,y_train)

    y_pred = pipeline.predict(X_test)

    y_pred = np.expm1(y_pred)

    output.append((r2_score(np.expm1(y_test), y_pred))) # r2
    output.append(mean_absolute_error(np.expm1(y_test),y_pred)) # mae

    return output


##Trying Multiple Models with Ordinal Encoder

In [11]:
model_dict = {
    'SVR': SVR(),
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(),
    'LASSO Regression': Lasso(),
    'Decision Tree': DecisionTreeRegressor(),
    'Random Forest': RandomForestRegressor(n_estimators = 200),
    'Extra Trees': ExtraTreesRegressor(n_estimators = 200),
    'Gradient Boosting': GradientBoostingRegressor(n_estimators = 200)
}

### Type1: Ordinal Encoding with StandardScaler

In [12]:
trf_applied1 = [('categorical', OrdinalEncoder(dtype = int, handle_unknown = 'use_encoded_value', unknown_value = -1), columns_to_encode),
                ('numerical', StandardScaler(), columns_to_scale)]

preprocessor1 = ColumnTransformer(transformers = trf_applied1, remainder = 'passthrough')



### Type2: Ordinal Encoding with PowerTransformer

In [13]:
trf_applied2 = [('categorical', OrdinalEncoder(dtype = int, handle_unknown = 'use_encoded_value', unknown_value = -1), columns_to_encode),
               ('numerical', PowerTransformer(method = 'yeo-johnson', standardize = True), columns_to_scale)]


preprocessor2 = ColumnTransformer(transformers = trf_applied2, remainder = 'passthrough')


### Type3 : Ordinal Encoder with defined categories1 and Standard Scaler

In [14]:
matrix = pd.pivot_table(df, index = 'sector', values = 'price_in_cr', aggfunc = 'mean')

matrix.sort_values(by = 'price_in_cr', ascending = True, inplace = True)

l1 = ['flat', 'house']
l2 = matrix.index.tolist()
l3 = ['New Property', 'Relatively New', 'Moderately Old', 'Old','Under Construction']
l4 = ['1', '0', '2', '3', '3+']
l5 = ['Semi-furnished', 'Un-furnished', 'Furnished']
l6 = ['Low', 'Average', 'Medium', 'High']
l7 = ['Basement/Ground Floor', 'Mid-rise', 'High-rise', 'Low-rise', 'Skyscraper']

oe_categories = [l1,l2,l3,l4,l5,l6,l7]

In [15]:
trf_applied3 = [('categorical', OrdinalEncoder(categories = oe_categories, dtype = int, handle_unknown = 'use_encoded_value', unknown_value = -1), columns_to_encode),
               ('numerical', StandardScaler(), columns_to_scale)]


preprocessor3 = ColumnTransformer(transformers = trf_applied3, remainder = 'passthrough')


### Type4 : Ordinal Encoder with defined categories1 and Power Transformer

In [16]:
trf_applied4 = [('categorical', OrdinalEncoder(categories = oe_categories, dtype = int, handle_unknown = 'use_encoded_value', unknown_value = -1), columns_to_encode),
                ('numerical', PowerTransformer(method = 'yeo-johnson', standardize = True), columns_to_scale)]


preprocessor4 = ColumnTransformer(transformers = trf_applied4, remainder = 'passthrough')


###Type5: Ordinal Encoder with defined categories2 and StandardScaler

In [17]:
matrix = pd.pivot_table(df, index = 'sector', values = 'price_in_cr', aggfunc = 'mean')

matrix.sort_values(by = 'price_in_cr', ascending = True, inplace = True)

l1 = ['flat', 'house']
l2 = matrix.index.tolist()
l3 = ['Old', 'Moderately Old', 'Relatively New', 'New Property', 'Under Construction']
l4 = ['0', '1','2', '3', '3+']
l5 = ['Un-furnished', 'Semi-furnished', 'Furnished']
l6 = ['Low', 'Average', 'Medium', 'High']
l7 = ['Basement/Ground Floor', 'Mid-rise', 'High-rise', 'Low-rise', 'Skyscraper']

oe_categories = [l1,l2,l3,l4,l5,l6,l7]

trf_applied5 = [('categorical', OrdinalEncoder(categories = oe_categories, dtype = int, handle_unknown = 'use_encoded_value', unknown_value = -1), columns_to_encode),
               ('numerical', StandardScaler(), columns_to_scale)]


preprocessor5  = ColumnTransformer(transformers = trf_applied5, remainder = 'passthrough')


### Type6: Ordinal Encoder with defined categories2 and Power Transformer

In [18]:
trf_applied6 = [('categorical', OrdinalEncoder(categories = oe_categories, dtype = int, handle_unknown = 'use_encoded_value', unknown_value = -1), columns_to_encode),
                ('numerical', PowerTransformer(method = 'yeo-johnson', standardize = True), columns_to_scale)]


preprocessor6  = ColumnTransformer(transformers = trf_applied6, remainder = 'passthrough')


In [19]:
models_output = []

for i, (model_name, model) in enumerate(model_dict.items()):

    models_output.append(scorer(model_name, model, preprocessor1))

    model_df = pd.DataFrame(models_output, columns = ['name', 'kfold_r2', 'r2','mae'])

    model_df.sort_values(by = ['mae', 'kfold_r2', 'r2' ], ascending=[True, False, False], inplace = True, ignore_index = True)


model_df

Unnamed: 0,name,kfold_r2,r2,mae
0,Gradient Boosting,0.87783,0.814456,0.578789
1,Random Forest,0.876009,0.804075,0.585847
2,Extra Trees,0.857623,0.777532,0.652423
3,Decision Tree,0.767947,0.773376,0.694193
4,Ridge Regression,0.711317,0.567207,0.92809
5,Linear Regression,0.709858,0.56694,0.928118
6,SVR,0.707331,0.58289,0.965761
7,LASSO Regression,0.042026,-0.038118,1.577734


In [20]:
# Delete the DataFrame

del model_df

gc.collect()


123

In [21]:
models_output = []

for i, (model_name, model) in enumerate(model_dict.items()):

    models_output.append(scorer(model_name, model, preprocessor2))

    model_df = pd.DataFrame(models_output, columns = ['name', 'kfold_r2', 'r2','mae'])

    model_df.sort_values(by = ['mae', 'kfold_r2', 'r2' ], ascending=[True, False, False], inplace = True, ignore_index = True)


model_df



Unnamed: 0,name,kfold_r2,r2,mae
0,Gradient Boosting,0.878747,0.8135,0.580301
1,Random Forest,0.871959,0.809256,0.584612
2,Extra Trees,0.861152,0.772623,0.654455
3,Decision Tree,0.756698,0.758118,0.698799
4,Linear Regression,0.744885,0.688616,0.874219
5,Ridge Regression,0.742699,0.688423,0.874304
6,SVR,0.717333,0.495725,0.979273
7,LASSO Regression,0.043608,-0.038118,1.577734


In [22]:
del model_df

gc.collect()

231

In [23]:
models_output = []

for i, (model_name, model) in enumerate(model_dict.items()):

    models_output.append(scorer(model_name, model, preprocessor3))

    model_df = pd.DataFrame(models_output, columns = ['name', 'kfold_r2', 'r2','mae'])

    model_df.sort_values(by = ['mae', 'kfold_r2', 'r2' ], ascending=[True, False, False], inplace = True, ignore_index = True)


model_df

Unnamed: 0,name,kfold_r2,r2,mae
0,Extra Trees,0.890518,0.803279,0.545273
1,Random Forest,0.890133,0.776581,0.557935
2,Gradient Boosting,0.890543,0.806875,0.560675
3,Decision Tree,0.7884,0.664309,0.687369
4,Linear Regression,0.825607,0.767305,0.72443
5,Ridge Regression,0.824637,0.767293,0.724463
6,SVR,0.808377,0.742466,0.754071
7,LASSO Regression,0.528832,0.317659,1.225029


In [24]:
del model_df

gc.collect()

257

In [25]:
models_output = []

for i, (model_name, model) in enumerate(model_dict.items()):

    models_output.append(scorer(model_name, model, preprocessor4))

    model_df = pd.DataFrame(models_output, columns = ['name', 'kfold_r2', 'r2','mae'])

    model_df.sort_values(by = ['mae', 'kfold_r2', 'r2' ], ascending=[True, False, False], inplace = True, ignore_index = True)


model_df



Unnamed: 0,name,kfold_r2,r2,mae
0,Extra Trees,0.894215,0.804496,0.539739
1,Random Forest,0.892302,0.781345,0.553819
2,Gradient Boosting,0.892261,0.80745,0.560981
3,Decision Tree,0.799889,0.681797,0.672713
4,Linear Regression,0.835061,0.75419,0.699094
5,Ridge Regression,0.834903,0.754082,0.6992
6,SVR,0.816187,0.688588,0.755977
7,LASSO Regression,0.528293,0.317659,1.225029


In [26]:
del model_df

gc.collect()

257

In [27]:
models_output = []

for i, (model_name, model) in enumerate(model_dict.items()):

    models_output.append(scorer(model_name, model, preprocessor5))

    model_df = pd.DataFrame(models_output, columns = ['name', 'kfold_r2', 'r2','mae'])

    model_df.sort_values(by = ['mae', 'kfold_r2', 'r2' ], ascending=[True, False, False], inplace = True, ignore_index = True)


model_df

Unnamed: 0,name,kfold_r2,r2,mae
0,Extra Trees,0.892537,0.8004,0.549486
1,Gradient Boosting,0.894836,0.807969,0.556785
2,Random Forest,0.888266,0.772324,0.561207
3,Decision Tree,0.784574,0.672213,0.680282
4,Linear Regression,0.826058,0.770566,0.728251
5,Ridge Regression,0.826269,0.770556,0.728284
6,SVR,0.810463,0.737284,0.759277
7,LASSO Regression,0.530359,0.317659,1.225029


In [28]:
del model_df

gc.collect()

257

In [29]:
models_output = []

model_df = pd.DataFrame()

for i, (model_name, model) in enumerate(model_dict.items()):

    models_output.append(scorer(model_name, model, preprocessor6))

    model_df = pd.DataFrame(models_output, columns = ['name', 'kfold_r2', 'r2', 'mae'])

    model_df.sort_values(by = ['mae', 'kfold_r2', 'r2' ], ascending = [True, False, False], inplace = True, ignore_index = True)


model_df

Unnamed: 0,name,kfold_r2,r2,mae
0,Extra Trees,0.893103,0.80074,0.540752
1,Gradient Boosting,0.89089,0.808844,0.556504
2,Random Forest,0.892156,0.774821,0.561552
3,Decision Tree,0.812726,0.666934,0.689511
4,Linear Regression,0.838531,0.755365,0.701386
5,Ridge Regression,0.838557,0.75524,0.70148
6,SVR,0.815901,0.677821,0.764873
7,LASSO Regression,0.530071,0.317659,1.225029


## Best among all

In [30]:
# Type3 > Type 5 > Type 4 > Type 5 > Type2