In [134]:
import numpy as np
import pandas as pd

from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVR

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

from sklearn.decomposition import PCA

In [135]:
# gurgaon_properties_post_feature_selection_v2.csv
df = pd.read_csv('gurgaon_properties_post_feature_selection_v2.csv')

In [136]:
df.head()

Unnamed: 0,property_type,sector,price,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category
0,flat,sector 36,0.82,3.0,2.0,2,New Property,850.0,0.0,0.0,0.0,Low,Low Floor
1,flat,sector 89,0.95,2.0,2.0,2,New Property,1226.0,1.0,0.0,0.0,Low,Mid Floor
2,flat,sohna road,0.32,2.0,2.0,1,New Property,1000.0,0.0,0.0,0.0,Low,High Floor
3,flat,sector 92,1.6,3.0,4.0,3+,Relatively New,1615.0,1.0,0.0,1.0,High,Mid Floor
4,flat,sector 102,0.48,2.0,2.0,1,Relatively New,582.0,0.0,1.0,0.0,High,Mid Floor


In [137]:
df['agePossession'].value_counts()

Relatively New        1732
Moderately Old         619
New Property           599
Old Property           327
Under Construction     277
Name: agePossession, dtype: int64

In [138]:
df['furnishing_type'] = df['furnishing_type'].replace({0.0:'unfurnished',1.0:'semifurnished',2.0:'furnished'})

In [139]:
df.head()

Unnamed: 0,property_type,sector,price,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category
0,flat,sector 36,0.82,3.0,2.0,2,New Property,850.0,0.0,0.0,unfurnished,Low,Low Floor
1,flat,sector 89,0.95,2.0,2.0,2,New Property,1226.0,1.0,0.0,unfurnished,Low,Mid Floor
2,flat,sohna road,0.32,2.0,2.0,1,New Property,1000.0,0.0,0.0,unfurnished,Low,High Floor
3,flat,sector 92,1.6,3.0,4.0,3+,Relatively New,1615.0,1.0,0.0,semifurnished,High,Mid Floor
4,flat,sector 102,0.48,2.0,2.0,1,Relatively New,582.0,0.0,1.0,unfurnished,High,Mid Floor


In [140]:
X = df.drop(columns='price')
y = df['price']

In [141]:
y_transformed = np.log1p(y)

# Ordinal Encoding

In [142]:
columns_to_encode = ['property_type','sector', 'balcony', 'agePossession', 'furnishing_type', 'luxury_category', 'floor_category']

In [143]:
preprocessor = ColumnTransformer(
transformers = [
    ('num',StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
    ('cat',OrdinalEncoder(),columns_to_encode)
],
remainder='passthrough')

In [144]:
pipeline = Pipeline([
    ('preprocessor',preprocessor),
    ('regressor',LinearRegression())
])

In [145]:
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

In [146]:
scores.mean()

0.7363096633436828

In [147]:
scores.std()

0.03238005754429936

In [148]:
X_train, X_test,y_train,y_test=train_test_split(X,y_transformed,test_size=0.2,random_state=42)

In [149]:
pipeline.fit(X_train,y_train)

In [150]:
y_pred = pipeline.predict(X_test)


In [151]:
y_pred = np.expm1(y_pred)

In [152]:
mean_absolute_error(np.expm1(y_test),y_pred)

0.9463822160089356

In [153]:
def scorer(model_name, model):
    output = []
    output.append(model_name)
    
    pipeline = Pipeline([
    ('preprocessor',preprocessor),
    ('regressor',model)])
    
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')
    
    output.append(scores.mean())
    
    X_train, X_test,y_train,y_test=train_test_split(X,y_transformed,test_size=0.2,random_state=42)
    pipeline.fit(X_train,y_train)
    y_pred = pipeline.predict(X_test)
    y_pred = np.expm1(y_pred)
    
    output.append(mean_absolute_error(np.expm1(y_test),y_pred))
    
    return output
    

In [154]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost':XGBRegressor()
}

In [155]:
model_output =[]
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))
    



In [156]:
model_output

[['linear_reg', 0.7363096633436828, 0.9463822160089356],
 ['svr', 0.7642012011196353, 0.8472636473483922],
 ['ridge', 0.7363125343993554, 0.946338774185337],
 ['LASSO', 0.05943378064493573, 1.528905986892753],
 ['decision tree', 0.777933275770901, 0.734830703728776],
 ['random forest', 0.8814554692468896, 0.5358366537837798],
 ['extra trees', 0.8680207395698734, 0.5437809278216167],
 ['gradient boosting', 0.872533974696393, 0.5759395411646321],
 ['adaboost', 0.7544384073282353, 0.8524896414617488],
 ['mlp', 0.811902938564938, 0.7224343239846016],
 ['xgboost', 0.8917010012719994, 0.5113240614244203]]

In [157]:
model_df =pd.DataFrame(model_output,columns=['name','r2','mae'])

In [158]:
model_df.sort_values('r2',ascending=False)

Unnamed: 0,name,r2,mae
10,xgboost,0.891701,0.511324
5,random forest,0.881455,0.535837
7,gradient boosting,0.872534,0.57594
6,extra trees,0.868021,0.543781
9,mlp,0.811903,0.722434
4,decision tree,0.777933,0.734831
1,svr,0.764201,0.847264
8,adaboost,0.754438,0.85249
2,ridge,0.736313,0.946339
0,linear_reg,0.73631,0.946382


### OneHotEncoding

In [159]:
preprocessor = ColumnTransformer(
transformers = [
    ('num',StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
    ('cat',OrdinalEncoder(),columns_to_encode),
    ('cat1',OneHotEncoder(drop='first'),['sector','agePossession','furnishing_type'])
],
remainder='passthrough')

In [160]:
# Creating a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

In [161]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

In [162]:
scores.mean()

0.8546094810971422

In [163]:
scores.std()

0.015997422908695623

In [164]:
X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)

In [165]:
pipeline.fit(X_train,y_train)

In [166]:
y_pred = pipeline.predict(X_test)

In [167]:
y_pred = np.expm1(y_pred)

In [168]:
mean_absolute_error(np.expm1(y_test),y_pred)

0.6497514315131458

In [169]:
def scorer(model_name, model):
    
    output = []
    
    output.append(model_name)
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])
    
    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')
    
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)
    
    pipeline.fit(X_train,y_train)
    
    y_pred = pipeline.predict(X_test)
    
    y_pred = np.expm1(y_pred)
    
    output.append(mean_absolute_error(np.expm1(y_test),y_pred))
    
    return output

In [170]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost':XGBRegressor()
}

In [171]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))

In [172]:
model_output

[['linear_reg', 0.8546094810971422, 0.6497514315131458],
 ['svr', 0.7697413260547326, 0.8341243500492146],
 ['ridge', 0.8546784656319515, 0.6528940620586247],
 ['LASSO', 0.05943378064493578, 1.528905986892753],
 ['decision tree', 0.8022076157892455, 0.6906597431376241],
 ['random forest', 0.8911400100489415, 0.4929285473916106],
 ['extra trees', 0.8940543933414954, 0.47251305460865967],
 ['gradient boosting', 0.8766775733377171, 0.5709798019708069],
 ['adaboost', 0.7462510880988102, 0.8473533973663244],
 ['mlp', 0.8716936301214944, 0.5437317355318642],
 ['xgboost', 0.8962183265106202, 0.488796256017752]]

In [173]:
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])

In [174]:
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
6,extra trees,0.894054,0.472513
10,xgboost,0.896218,0.488796
5,random forest,0.89114,0.492929
9,mlp,0.871694,0.543732
7,gradient boosting,0.876678,0.57098
0,linear_reg,0.854609,0.649751
2,ridge,0.854678,0.652894
4,decision tree,0.802208,0.69066
1,svr,0.769741,0.834124
8,adaboost,0.746251,0.847353


## OneHotEncoding With PCA

In [175]:
# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first'),['sector','agePossession'])
    ], 
    remainder='passthrough'
)

In [176]:
# Creating a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('pca', PCA(n_components=0.95)),
    ('regressor', LinearRegression())
])

In [177]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

ValueError: 
All the 10 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Asus\AppData\Roaming\Python\Python38\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Asus\AppData\Roaming\Python\Python38\site-packages\sklearn\base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "C:\Users\Asus\AppData\Roaming\Python\Python38\site-packages\sklearn\pipeline.py", line 416, in fit
    Xt = self._fit(X, y, **fit_params_steps)
  File "C:\Users\Asus\AppData\Roaming\Python\Python38\site-packages\sklearn\pipeline.py", line 370, in _fit
    X, fitted_transformer = fit_transform_one_cached(
  File "C:\Users\Asus\AppData\Roaming\Python\Python38\site-packages\joblib\memory.py", line 349, in __call__
    return self.func(*args, **kwargs)
  File "C:\Users\Asus\AppData\Roaming\Python\Python38\site-packages\sklearn\pipeline.py", line 950, in _fit_transform_one
    res = transformer.fit_transform(X, y, **fit_params)
  File "C:\Users\Asus\AppData\Roaming\Python\Python38\site-packages\sklearn\utils\_set_output.py", line 140, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
  File "C:\Users\Asus\AppData\Roaming\Python\Python38\site-packages\sklearn\base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "C:\Users\Asus\AppData\Roaming\Python\Python38\site-packages\sklearn\decomposition\_pca.py", line 460, in fit_transform
    U, S, Vt = self._fit(X)
  File "C:\Users\Asus\AppData\Roaming\Python\Python38\site-packages\sklearn\decomposition\_pca.py", line 478, in _fit
    raise TypeError(
TypeError: PCA does not support sparse input. See TruncatedSVD for a possible alternative.


In [None]:
scores.mean()

## Target Encoder

In [63]:
# import category_encoders as ce
# !pip install statsmodels==0.12.2
# !pip list
# !pip install numpy --upgrade --ignore-installed --user
# !pip install --upgrade pip --user




Collecting numpy
  Obtaining dependency information for numpy from https://files.pythonhosted.org/packages/69/65/0d47953afa0ad569d12de5f65d964321c208492064c38fe3b0b9744f8d44/numpy-1.24.4-cp38-cp38-win_amd64.whl.metadata
  Downloading numpy-1.24.4-cp38-cp38-win_amd64.whl.metadata (5.6 kB)
Using cached numpy-1.24.4-cp38-cp38-win_amd64.whl (14.9 MB)
Installing collected packages: numpy
Successfully installed numpy-1.24.4


DEPRECATION: pyodbc 4.0.0-unsupported has a non-standard version number. pip 23.3 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of pyodbc or contact the author to suggest that they release a version with a conforming version number. Discussion can be found at https://github.com/pypa/pip/issues/12063
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
arviz 0.11.4 requires typing-extensions<4,>=3.7.4.3, but you have typing-extensions 4.1.1 which is incompatible.
dtale 2.1.2 requires scikit-learn==0.24.2; python_version > "3.0", but you have scikit-learn 1.3.0 which is incompatible.
pandas-profiling 3.1.0 requires joblib~=1.0.1, but you have joblib 1.1.1 which is incompatible.
ppscore 1.2.0 requires scikit-learn<1.0.0,>=0.20.2, but you have scikit-learn 1.3.0 which is incompatible.
tensorflow-intel 2.12.0 requires keras<2

In [45]:
import category_encoders as ce

columns_to_encode = ['property_type','sector', 'balcony', 'agePossession', 'furnishing_type', 'luxury_category', 'floor_category']

# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first',sparse_output=False),['agePossession']),
        ('target_enc', ce.TargetEncoder(), ['sector'])
    ], 
    remainder='passthrough'
)

AttributeError: module 'numpy' has no attribute 'MachAr'

In [None]:

import category_encoders as ce

columns_to_encode = ['property_type','sector', 'balcony', 'agePossession', 'furnishing_type', 'luxury_category', 'floor_category']

# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first',sparse_output=False),['agePossession']),
        ('target_enc', ce.TargetEncoder(), ['sector'])
    ], 
    remainder='passthrough'
)

## Hyperparameter Tuning

In [178]:
from sklearn.model_selection import GridSearchCV
import hashlib
from sklearn.preprocessing import FunctionTransformer

In [196]:
# param_grid = {
#     'regressor__n_estimators': [50, 100, 200, 300],
#     'regressor__max_depth': [None, 10, 20, 30],
#     'regressor__max_samples':[0.1, 0.25, 0.75, 1.0],
#     'regressor__max_features': ['auto', 'sqrt']
# }

param_grid = {
    'regressor__n_estimators': [100, 200, 300],
    'regressor__max_depth': [None, 10, 20, 30],
    'regressor__min_samples_split': [2, 5, 10],
    'regressor__min_samples_leaf': [1, 2, 4],
    'regressor__max_features': ['auto', 'sqrt']
}

In [197]:
def hash_encode(data, num_buckets):
    def hash_value(value):
        hashed_value = int(hashlib.md5(value.encode()).hexdigest(), 16)
        return hashed_value % num_buckets

    return data.applymap(hash_value)

In [226]:


columns_to_encode = ['property_type','sector', 'balcony', 'agePossession', 'furnishing_type', 'luxury_category', 'floor_category']
# columns_to_encode = ['furnishing_type', 'luxury_category', 'floor_category']
# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first'),['agePossession'])
        
    ], 
    remainder='passthrough'
)

In [227]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', ExtraTreesRegressor())
])

In [228]:
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

In [229]:
# search = GridSearchCV(pipeline, param_grid, cv=kfold, scoring='r2', n_jobs=-1, verbose=4)
scores.mean()

0.8697883879948953

In [223]:
search.fit(X, y_transformed)

Fitting 15 folds for each of 216 candidates, totalling 3240 fits


1620 fits failed out of a total of 3240.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
762 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Asus\AppData\Roaming\Python\Python38\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Asus\AppData\Roaming\Python\Python38\site-packages\sklearn\base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "C:\Users\Asus\AppData\Roaming\Python\Python38\site-packages\sklearn\pipeline.py", line 420, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "C:\Users\Asus\AppData\Roaming\Python\Python38\site-packages\

In [209]:
final_pipe = search.best_estimator_

In [210]:
search.best_params_

{'regressor__max_depth': 20,
 'regressor__max_features': 'sqrt',
 'regressor__min_samples_leaf': 1,
 'regressor__min_samples_split': 2,
 'regressor__n_estimators': 200}

In [211]:
search.best_score_

0.8443003201149789

In [19]:
final_pipe.fit(X,y_transformed)

## Exporting model

In [96]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first',sparse_output=False),['sector','agePossession'])
    ], 
    remainder='passthrough'
)


# columns_to_encode = ['property_type','sector', 'balcony', 'agePossession', 'furnishing_type', 'luxury_category', 'floor_category']

# Creating a column transformer for preprocessing
# preprocessor = ColumnTransformer(
#     transformers=[
#         ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
#         ('cat', OrdinalEncoder(), columns_to_encode),
#         ('cat1',OneHotEncoder(drop='first',sparse_output=False),['agePossession']),
#         ('cat2', FunctionTransformer(func=hash_encode, kw_args={'num_buckets': 4}, validate=False), ['sector'])
        
#     ], 
#     remainder='passthrough'
# )

In [97]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=500))
])

<IPython.core.display.Javascript object>

In [98]:
pipeline.fit(X,y_transformed)

In [99]:
import pickle

with open('pipeline.pkl', 'wb') as file:
    pickle.dump(pipeline, file)

In [24]:
with open('df.pkl', 'wb') as file:
    pickle.dump(X, file)

In [25]:
X

Unnamed: 0,property_type,sector,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category
0,flat,sector 36,3.0,2.0,2,New Property,850.0,0.0,0.0,unfurnished,Low,Low Floor
1,flat,sector 89,2.0,2.0,2,New Property,1226.0,1.0,0.0,unfurnished,Low,Mid Floor
2,flat,sohna road,2.0,2.0,1,New Property,1000.0,0.0,0.0,unfurnished,Low,High Floor
3,flat,sector 92,3.0,4.0,3+,Relatively New,1615.0,1.0,0.0,semifurnished,High,Mid Floor
4,flat,sector 102,2.0,2.0,1,Relatively New,582.0,0.0,1.0,unfurnished,High,Mid Floor
...,...,...,...,...,...,...,...,...,...,...,...,...
3549,flat,sector 84,2.0,2.0,1,Relatively New,532.0,0.0,0.0,unfurnished,Medium,Mid Floor
3550,house,sector 109,5.0,5.0,3+,Relatively New,6228.0,1.0,1.0,unfurnished,High,Low Floor
3551,flat,sector 2,1.0,1.0,1,Moderately Old,665.0,0.0,0.0,semifurnished,Medium,Mid Floor
3552,house,sector 43,5.0,6.0,3,Moderately Old,5490.0,1.0,1.0,unfurnished,Medium,Mid Floor
