In [148]:
import numpy as np
import pandas as pd

from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor
from sklearn.neural_network import MLPRegressor

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer

from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

from sklearn.decomposition import PCA
import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('gurgaon_properties_post_feature_selection_v2.csv')
df.head()

Unnamed: 0,property_type,sector,price,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category
0,flat,sector 36,0.82,3.0,2.0,2,New Property,850.0,0.0,0.0,0.0,Low,Low Floor
1,flat,sector 89,0.95,2.0,2.0,2,New Property,1226.0,1.0,0.0,0.0,Low,Mid Floor
2,flat,sohna road,0.32,2.0,2.0,1,New Property,1000.0,0.0,0.0,0.0,Low,High Floor
3,flat,sector 92,1.6,3.0,4.0,3+,Relatively New,1615.0,1.0,0.0,1.0,High,Mid Floor
4,flat,sector 102,0.48,2.0,2.0,1,Relatively New,582.0,0.0,1.0,0.0,High,Mid Floor


In [3]:
df['furnishing_type'].value_counts()

0.0    2349
1.0    1018
2.0     187
Name: furnishing_type, dtype: int64

In [4]:
# 0 -> unfurnished
# 1 -> semifurnished
# 2 -> furnished
df['furnishing_type'] = df['furnishing_type'].replace({0.0:'unfurnished',1.0:'semifurnished',2.0:'furnished'})

In [5]:
df.head()

Unnamed: 0,property_type,sector,price,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category
0,flat,sector 36,0.82,3.0,2.0,2,New Property,850.0,0.0,0.0,unfurnished,Low,Low Floor
1,flat,sector 89,0.95,2.0,2.0,2,New Property,1226.0,1.0,0.0,unfurnished,Low,Mid Floor
2,flat,sohna road,0.32,2.0,2.0,1,New Property,1000.0,0.0,0.0,unfurnished,Low,High Floor
3,flat,sector 92,1.6,3.0,4.0,3+,Relatively New,1615.0,1.0,0.0,semifurnished,High,Mid Floor
4,flat,sector 102,0.48,2.0,2.0,1,Relatively New,582.0,0.0,1.0,unfurnished,High,Mid Floor


In [6]:
X = df.drop(columns=['price'])
y = df['price']

In [7]:
# Applying the lop1p transformation to the target variable
y_transformed = np.log1p(y)

### Ordinal Encoding

In [10]:
cols_to_encode =  ['property_type','sector', 'balcony', 'agePossession', 'furnishing_type', 'luxury_category', 'floor_category']
cols_to_normalize = ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']

In [39]:
# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), cols_to_normalize),
    ('cat', OrdinalEncoder(), cols_to_encode)
    ], remainder='passthrough')

In [40]:
# Creating a pipeline
pipeline = Pipeline([
    ('preprocess',preprocessor),
    ('model',LinearRegression())
])

In [16]:
# Kfold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

In [18]:
scores.mean(), scores.std()

(0.7363096633436829, 0.03238005754429933)

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y_transformed, test_size=0.2, random_state=42)

In [20]:
pipeline.fit(X_train, y_train)

In [21]:
y_pred = pipeline.predict(X_test)

In [22]:
y_pred = np.expm1(y_pred)

In [23]:
mean_absolute_error(np.expm1(y_test),y_pred)

0.9463822160089363

In [44]:
def scorer(model_name, model):
    
    output = []
    
    output.append(model_name)
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])
    
    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')
    
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)
    
    pipeline.fit(X_train,y_train)
    
    y_pred = pipeline.predict(X_test)
    
    y_pred = np.expm1(y_pred)
    
    output.append(mean_absolute_error(np.expm1(y_test),y_pred))
    
    return output

In [45]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost':XGBRegressor()
}

In [46]:
model_output = []
for model_name, model in model_dict.items():
    model_output.append(scorer(model_name, model))

In [47]:
model_output[]

[['linear_reg', 0.7363096633436829, 0.9463822160089363],
 ['svr', 0.7642012011196353, 0.847263647348393],
 ['ridge', 0.7363125343993554, 0.946338774185337],
 ['LASSO', 0.05943378064493573, 1.528905986892753],
 ['decision tree', 0.7755715737273194, 0.7394832144711485],
 ['random forest', 0.8819833876776532, 0.5297279501441099],
 ['extra trees', 0.868903275505134, 0.5547221861368898],
 ['gradient boosting', 0.8725322190800296, 0.5757044663233726],
 ['adaboost', 0.7574452055691104, 0.8471205323269242],
 ['mlp', 0.8125981597083479, 0.691185495272455],
 ['xgboost', 0.8917010012719994, 0.5113240614244203]]

In [54]:
scores_df = pd.DataFrame(model_output, columns=['model', 'r2_score', 'mae_score'])
scores_df.set_index('model').sort_values(by='mae_score')

Unnamed: 0_level_0,r2_score,mae_score
model,Unnamed: 1_level_1,Unnamed: 2_level_1
xgboost,0.891701,0.511324
random forest,0.881983,0.529728
extra trees,0.868903,0.554722
gradient boosting,0.872532,0.575704
mlp,0.812598,0.691185
decision tree,0.775572,0.739483
adaboost,0.757445,0.847121
svr,0.764201,0.847264
ridge,0.736313,0.946339
linear_reg,0.73631,0.946382


## OneHotEncoding

In [69]:
cols_to_encode =  ['property_type', 'balcony', 'luxury_category', 'floor_category']
cols_to_normalize = ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']
cols_to_ohe = ['sector','agePossession','furnishing_type']

In [70]:
# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
    ('num',StandardScaler(), cols_to_normalize),
    ('cat1',OrdinalEncoder(), cols_to_encode),
    ('cat2',OneHotEncoder(drop='first'), cols_to_ohe)
],remainder='passthrough')

In [71]:
# Creating Pipeline
pipeline = Pipeline([
    ('preprocessor',preprocessor),
    ('model', LinearRegression())
])

In [72]:
# K-fold cross-validation
kfold= KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

In [73]:
scores.mean(), scores.std()

(0.8546150935945891, 0.015992535217722974)

In [74]:
X_train, X_test, y_train, y_test = train_test_split(X, y_transformed, test_size=0.2, random_state=42)

In [75]:
pipeline.fit(X_train, y_train)

In [76]:
y_pred = pipeline.predict(X_test)

In [77]:
mean_absolute_error(np.expm1(y_test),np.expm1(y_pred))

0.6496724551831464

In [78]:
def scorer(model_name, model):
    
    output = []
    
    output.append(model_name)
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])
    
    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')
    
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)
    
    pipeline.fit(X_train,y_train)
    
    y_pred = pipeline.predict(X_test)
    
    y_pred = np.expm1(y_pred)
    
    output.append(mean_absolute_error(np.expm1(y_test),y_pred))
    
    return output

In [79]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost':XGBRegressor()
}

In [80]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))

In [81]:
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
6,extra trees,0.886858,0.474162
10,xgboost,0.89237,0.503674
5,random forest,0.871636,0.527982
1,svr,0.882774,0.533683
9,mlp,0.879417,0.542892
7,gradient boosting,0.856518,0.602299
4,decision tree,0.79288,0.639896
0,linear_reg,0.854615,0.649672
2,ridge,0.854924,0.653746
8,adaboost,0.726843,0.889195


## OneHotEncoding with PCA

In [109]:
cols_to_encode =  ['property_type', 'balcony', 'luxury_category', 'floor_category']
cols_to_normalize = ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']
cols_to_ohe = ['sector','agePossession','furnishing_type']

In [110]:
# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), cols_to_normalize),
        ('cat', OrdinalEncoder(), cols_to_encode),
        ('cat1',OneHotEncoder(drop='first',sparse_output=False),cols_to_ohe)
    ], 
    remainder='passthrough'
)

In [111]:
# Creating a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('pca', PCA(n_components=0.95)),
    ('regressor', LinearRegression())
])

In [112]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

In [114]:
scores.mean(), scores.std()

(0.7644414727636657, 0.028840370479173635)

In [115]:
X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)

In [116]:
pipeline.fit(X_train,y_train)

In [117]:
y_pred = pipeline.predict(X_test)

In [118]:
y_pred = np.expm1(y_pred)

In [119]:
mean_absolute_error(np.expm1(y_test),y_pred)

0.9061825500645798

## Target Encoder

In [122]:
# !pip install category_encoders

In [123]:
cols_to_encode =  ['property_type', 'balcony', 'luxury_category', 'floor_category','furnishing_type']
cols_to_normalize = ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']

In [124]:
import category_encoders as ce

columns_to_encode = ['property_type','sector', 'balcony', 'agePossession', 'furnishing_type', 'luxury_category', 'floor_category']

# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), cols_to_normalize),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first',sparse_output=False),['agePossession']),
        ('target_enc', ce.TargetEncoder(), ['sector'])
    ], 
    remainder='passthrough'
)

In [125]:
# Creating a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

In [126]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

In [127]:
scores.mean(),scores.std()

(0.829521918225536, 0.018384463379122893)

In [129]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost':XGBRegressor()
}

In [130]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))

In [132]:
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])

In [133]:
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
5,random forest,0.900674,0.45003
6,extra trees,0.902291,0.458448
10,xgboost,0.900643,0.483409
7,gradient boosting,0.889073,0.507164
4,decision tree,0.829546,0.549107
9,mlp,0.846759,0.614024
8,adaboost,0.817379,0.683274
0,linear_reg,0.829522,0.713011
2,ridge,0.829536,0.713523
1,svr,0.782917,0.818851


## Hyperparameter Tuning

In [195]:
from sklearn.model_selection import GridSearchCV

In [196]:
param_grid = {
    'regressor__n_estimators':[50,100,200,300,400,500],
    'regressor__max_depth': [None,10,20,30,40,50],
    'regressor__max_samples': [0.1, 0.25, 0.5, 0.75, 1.0],
    'regressor__max_features': ['auto','sqrt']
}

In [197]:
import category_encoders as ce

columns_to_encode = ['property_type','sector', 'balcony', 'agePossession', 'furnishing_type', 'luxury_category', 'floor_category']

# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), cols_to_normalize),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first',sparse_output=False),['agePossession']),
        ('target_enc', ce.TargetEncoder(), ['sector'])
    ], 
    remainder='passthrough'
)

In [198]:
# Creating a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor())
])

In [199]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
search = GridSearchCV(pipeline,param_grid, cv=kfold, scoring='r2', n_jobs=-1, verbose=4)

In [200]:
search.fit(X, y_transformed)

Fitting 10 folds for each of 360 candidates, totalling 3600 fits


In [201]:
search.best_params_

{'regressor__max_depth': 30,
 'regressor__max_features': 'sqrt',
 'regressor__max_samples': 1.0,
 'regressor__n_estimators': 200}

In [202]:
search.best_score_

0.9030580411373524

In [203]:
final_pipe = search.best_estimator_

## Exporting the model

In [204]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first',sparse_output=False),['sector','agePossession'])
    ], 
    remainder='passthrough'
)

In [205]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=400, max_depth=30, max_features='sqrt',max_samples=1.0 ))
])

In [206]:
pipeline.fit(X,y_transformed)

In [170]:
import pickle

with open('pipeline.pkl','wb') as file:
    pickle.dump(pipeline, file)

In [171]:
with open('df.pkl','wb') as file:
    pickle.dump(X, file)

## Trying out the predictions

In [207]:
X.columns

Index(['property_type', 'sector', 'bedRoom', 'bathroom', 'balcony',
       'agePossession', 'built_up_area', 'servant room', 'store room',
       'furnishing_type', 'luxury_category', 'floor_category'],
      dtype='object')

In [208]:
data = [['house', 'sector 102', 4, 3, '3+', 'New Property', 2750, 0, 0, 'unfurnished', 'Low', 'Low Floor']]
columns = ['property_type', 'sector', 'bedRoom', 'bathroom', 'balcony',
       'agePossession', 'built_up_area', 'servant room', 'store room',
       'furnishing_type', 'luxury_category', 'floor_category']

# Convert to DataFrame
one_df = pd.DataFrame(data, columns=columns)

one_df

Unnamed: 0,property_type,sector,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category
0,house,sector 102,4,3,3+,New Property,2750,0,0,unfurnished,Low,Low Floor


In [209]:
np.expm1(pipeline.predict(one_df))

array([2.95621636])

In [212]:
'Flat'.upper()

'FLAT'