In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,ExtraTreesRegressor,GradientBoostingRegressor,AdaBoostRegressor
from xgboost import XGBRegressor
from sklearn.neural_network import MLPRegressor

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

from sklearn.decomposition import PCA
from pathlib import Path

In [2]:
path = Path('../data/interim')

In [4]:
df = pd.read_csv(path / 'properties_post_feature_selection_v2.csv')

In [5]:
df.head()

Unnamed: 0,property_type,sector,price,bedRoom,bathroom,balcony,agePossession,built_up_area,store room,pooja room,furnishing_type,luxury_category,floor_category
0,house,gurukul,2.4,3.0,3.0,3,Relatively New,2160.0,1.0,1.0,unfurnished,Medium,Mid Floor
1,house,thaltej,4.75,4.0,4.0,1,Old Property,3150.0,1.0,1.0,semifurnished,High,Low Floor
2,flat,nava naroda,0.27,2.0,2.0,1,Moderately Old,1125.0,0.0,0.0,unfurnished,Not available,Mid Floor
3,flat,usmanpura,2.1,3.0,3.0,2,Moderately Old,1814.0,0.0,1.0,semifurnished,Medium,High Floor
4,flat,gota,0.98,3.0,3.0,1,Moderately Old,1680.0,1.0,0.0,unfurnished,Medium,Mid Floor


In [6]:
df['furnishing_type'].value_counts()

unfurnished      4146
furnished         962
semifurnished     704
Name: furnishing_type, dtype: int64

In [7]:
df.head()

Unnamed: 0,property_type,sector,price,bedRoom,bathroom,balcony,agePossession,built_up_area,store room,pooja room,furnishing_type,luxury_category,floor_category
0,house,gurukul,2.4,3.0,3.0,3,Relatively New,2160.0,1.0,1.0,unfurnished,Medium,Mid Floor
1,house,thaltej,4.75,4.0,4.0,1,Old Property,3150.0,1.0,1.0,semifurnished,High,Low Floor
2,flat,nava naroda,0.27,2.0,2.0,1,Moderately Old,1125.0,0.0,0.0,unfurnished,Not available,Mid Floor
3,flat,usmanpura,2.1,3.0,3.0,2,Moderately Old,1814.0,0.0,1.0,semifurnished,Medium,High Floor
4,flat,gota,0.98,3.0,3.0,1,Moderately Old,1680.0,1.0,0.0,unfurnished,Medium,Mid Floor


In [8]:
X = df.drop(columns=['price'])
y = df['price']

In [9]:
# Applying the log1p transformation to the target variable
y_transformed = np.log1p(y)

### Ordinal Encoding

In [10]:
columns_to_encode = ['property_type','sector', 'balcony', 'agePossession', 'furnishing_type', 'luxury_category', 'floor_category']

In [11]:
# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'pooja room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode)
    ], 
    remainder='passthrough'
)

In [12]:
# Creating a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

In [13]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

In [14]:
scores.mean(),scores.std()

(0.8504849306339821, 0.017663928609078165)

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)

In [16]:
pipeline.fit(X_train,y_train)

In [17]:
y_pred = pipeline.predict(X_test)

In [18]:
y_pred = np.expm1(y_pred)

In [19]:
mean_absolute_error(np.expm1(y_test),y_pred)

0.4015976051560783

In [20]:
def scorer(model_name, model):
    
    output = []
    
    output.append(model_name)
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])
    
    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')
    
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)
    
    pipeline.fit(X_train,y_train)
    
    y_pred = pipeline.predict(X_test)
    
    y_pred = np.expm1(y_pred)
    
    output.append(mean_absolute_error(np.expm1(y_test),y_pred))
    
    return output
    

In [21]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost':XGBRegressor()
}

In [22]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))

In [23]:
model_output

[['linear_reg', 0.8504849306339821, 0.4015976051560783],
 ['svr', 0.8681740967739294, 0.3858090332152926],
 ['ridge', 0.8504853112136258, 0.4016191038481374],
 ['LASSO', -0.002200636769258102, 1.0617060098836015],
 ['decision tree', 0.8325442782135066, 0.40541115564169594],
 ['random forest', 0.9049303740221004, 0.3146860679003189],
 ['extra trees', 0.8991762757594784, 0.3201733424993991],
 ['gradient boosting', 0.9066425423555788, 0.31759661169354636],
 ['adaboost', 0.8186400041597606, 0.4406037559623419],
 ['mlp', 0.875962940182782, 0.3859965872798738],
 ['xgboost', 0.9178015945774923, 0.29891120886864175]]

In [24]:
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])

In [25]:
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
10,xgboost,0.917802,0.298911
5,random forest,0.90493,0.314686
7,gradient boosting,0.906643,0.317597
6,extra trees,0.899176,0.320173
1,svr,0.868174,0.385809
9,mlp,0.875963,0.385997
0,linear_reg,0.850485,0.401598
2,ridge,0.850485,0.401619
4,decision tree,0.832544,0.405411
8,adaboost,0.81864,0.440604


### OneHotEncoding

In [26]:
# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'pooja room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first'),['sector','agePossession','furnishing_type'])
    ], 
    remainder='passthrough'
)

In [27]:
columns_to_encode = ['property_type','sector', 'balcony', 'agePossession', 'furnishing_type', 'luxury_category', 'floor_category']

In [28]:
columns_to_encode = ['property_type','balcony', 'agePossession', 'furnishing_type', 'luxury_category', 'floor_category']
numerical_columns = ['bedRoom', 'bathroom', 'built_up_area', 'pooja room', 'store room']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'pooja room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('sector', OneHotEncoder(drop='first'), ['sector'])
    ], 
    remainder='passthrough'
)

In [29]:
# Creating a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

In [30]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

In [31]:
scores.mean()

0.8967338183147078

In [32]:
scores.std()

0.016394246682749487

In [33]:
X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)

In [34]:
pipeline.fit(X_train,y_train)

In [35]:
y_pred = pipeline.predict(X_test)

In [36]:
y_pred = np.expm1(y_pred)

In [37]:
mean_absolute_error(np.expm1(y_test),y_pred)

0.33253321347451215

In [38]:
def scorer(model_name, model):
    
    output = []
    
    output.append(model_name)
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])
    
    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')
    
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)
    
    pipeline.fit(X_train,y_train)
    
    y_pred = pipeline.predict(X_test)
    
    y_pred = np.expm1(y_pred)
    
    output.append(mean_absolute_error(np.expm1(y_test),y_pred))
    
    return output
    

In [39]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost':XGBRegressor()
}

In [40]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))

In [41]:
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])

In [42]:
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
6,extra trees,0.913394,0.282436
5,random forest,0.90628,0.307525
1,svr,0.918018,0.312094
10,xgboost,0.916991,0.314131
7,gradient boosting,0.907258,0.315467
9,mlp,0.912685,0.327496
0,linear_reg,0.896734,0.332533
2,ridge,0.896191,0.333858
4,decision tree,0.839788,0.398637
8,adaboost,0.808273,0.472685


### OneHotEncoding With PCA

In [43]:
# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'pooja room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first',sparse_output=False),['sector','agePossession'])
    ], 
    remainder='passthrough'
)

In [44]:
# Creating a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('pca', PCA(n_components=0.95)),
    ('regressor', LinearRegression())
])

In [45]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

In [46]:
scores.mean()

0.8526224980800633

In [47]:
scores.std()

0.017851659658089877

In [48]:
def scorer(model_name, model):
    
    output = []
    
    output.append(model_name)
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('pca', PCA(n_components=0.95)),
        ('regressor', model)
    ])
    
    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')
    
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)
    
    pipeline.fit(X_train,y_train)
    
    y_pred = pipeline.predict(X_test)
    
    y_pred = np.expm1(y_pred)
    
    output.append(mean_absolute_error(np.expm1(y_test),y_pred))
    
    return output
    

In [49]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost':XGBRegressor()
}

In [50]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))

In [51]:
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])

In [52]:
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
10,xgboost,0.883298,0.340798
6,extra trees,0.886419,0.345916
1,svr,0.888986,0.355076
5,random forest,0.882372,0.359861
9,mlp,0.891407,0.360083
7,gradient boosting,0.874662,0.38873
0,linear_reg,0.852622,0.398127
2,ridge,0.852624,0.398163
4,decision tree,0.756995,0.521398
8,adaboost,0.703796,0.599982


### Target Encoder

In [53]:
!pip install category_encoders




[notice] A new release of pip is available: 23.3.2 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [54]:
import category_encoders as ce

columns_to_encode = ['property_type','sector', 'balcony', 'agePossession', 'furnishing_type', 'luxury_category', 'floor_category']

# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'pooja room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first',sparse_output=False),['agePossession']),
        ('target_enc', ce.TargetEncoder(), ['sector'])
    ], 
    remainder='passthrough'
)

In [55]:
# Creating a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

In [56]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

In [57]:
scores.mean(),scores.std()

(0.8878298021064414, 0.01672757227713949)

In [58]:
def scorer(model_name, model):
    
    output = []
    
    output.append(model_name)
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])
    
    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')
    
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)
    
    pipeline.fit(X_train,y_train)
    
    y_pred = pipeline.predict(X_test)
    
    y_pred = np.expm1(y_pred)
    
    output.append(mean_absolute_error(np.expm1(y_test),y_pred))
    
    return output
    

In [59]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost':XGBRegressor()
}

In [60]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))

In [61]:
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])

In [62]:
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
6,extra trees,0.923166,0.281083
5,random forest,0.92132,0.288553
10,xgboost,0.921859,0.294859
7,gradient boosting,0.918263,0.299497
2,ridge,0.887833,0.35179
0,linear_reg,0.88783,0.351811
1,svr,0.889614,0.355918
9,mlp,0.898444,0.384572
4,decision tree,0.851723,0.391403
8,adaboost,0.844453,0.42839


### Hyperparameter Tuning

In [63]:
from sklearn.model_selection import GridSearchCV

In [64]:
param_grid = {
    'regressor__n_estimators': [50, 100, 200, 300],
    'regressor__max_depth': [None, 10, 20, 30],
    'regressor__max_samples':[0.1, 0.25, 0.5, 1.0],
    'regressor__max_features': ['auto', 'sqrt']
}

In [65]:
columns_to_encode = ['property_type','sector', 'balcony', 'agePossession', 'furnishing_type', 'luxury_category', 'floor_category']

# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'pooja room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first',sparse_output=False),['agePossession']),
        ('target_enc', ce.TargetEncoder(), ['sector'])
    ], 
    remainder='passthrough'
)

In [66]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor())
])

In [67]:
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

In [68]:
search = GridSearchCV(pipeline, param_grid, cv=kfold, scoring='r2', n_jobs=-1, verbose=4)

In [70]:
# search.fit(X, y_transformed)

In [None]:
# final_pipe = search.best_estimator_

In [None]:
# search.best_params_

In [None]:
# search.best_score_

In [None]:
# final_pipe.fit(X,y_transformed)

### Exporting the model

In [75]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'pooja room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first',sparse_output=False),['sector','agePossession'])
    ], 
    remainder='passthrough'
)

In [76]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=500))
])

In [77]:
pipeline.fit(X,y_transformed)

In [None]:
import pickle

# with open('pipeline.pkl', 'wb') as file:
#     pickle.dump(pipeline, file)

In [None]:
# with open('df.pkl', 'wb') as file:
#     pickle.dump(X, file)

### Trying out the predictions

In [78]:
X.columns

Index(['property_type', 'sector', 'bedRoom', 'bathroom', 'balcony',
       'agePossession', 'built_up_area', 'store room', 'pooja room',
       'furnishing_type', 'luxury_category', 'floor_category'],
      dtype='object')

In [79]:
X.iloc[0].values

array(['house', 'gurukul', 3.0, 3.0, '3', 'Relatively New', 2160.0, 1.0,
       1.0, 'unfurnished', 'Medium', 'Mid Floor'], dtype=object)

In [84]:
data = [['house', 'vastral', 4, 3, '3+', 'New Property', 2750, 0, 0, 'unfurnished', 'Low', 'Low Floor']]
columns = ['property_type', 'sector', 'bedRoom', 'bathroom', 'balcony',
       'agePossession', 'built_up_area', 'pooja room', 'store room',
       'furnishing_type', 'luxury_category', 'floor_category']

# Convert to DataFrame
one_df = pd.DataFrame(data, columns=columns)

one_df


Unnamed: 0,property_type,sector,bedRoom,bathroom,balcony,agePossession,built_up_area,pooja room,store room,furnishing_type,luxury_category,floor_category
0,house,vastral,4,3,3+,New Property,2750,0,0,unfurnished,Low,Low Floor


In [85]:
np.expm1(pipeline.predict(one_df))

array([2.3711612])

In [86]:
X.dtypes

property_type       object
sector              object
bedRoom            float64
bathroom           float64
balcony             object
agePossession       object
built_up_area      float64
store room         float64
pooja room         float64
furnishing_type     object
luxury_category     object
floor_category      object
dtype: object

In [87]:
sorted(X['sector'].unique().tolist())

['bapunagar',
 'bhadaj',
 'bodakdev',
 'chandkheda',
 'chandlodia',
 'charodi',
 'ctm',
 'ghatlodia',
 'ghuma',
 'gota',
 'gurukul',
 'isanpur',
 'jagatpur',
 'jodhpur',
 'khokhara',
 'koteshwar',
 'krishna nagar',
 'maninagar',
 'memnagar',
 'motera',
 'nana chiloda',
 'naranpura',
 'naroda',
 'narol',
 'nava naroda',
 'navrangpura',
 'new maninagar',
 'new ranip',
 'nikol',
 'odhav',
 'paldi',
 'prahlad nagar',
 'sarkhej',
 'satellite',
 'science city',
 'sg highway',
 'shahibaug',
 'shela',
 'shilaj',
 'sola',
 'south bopal',
 'thaltej',
 'usmanpura',
 'vaishnodevi circle',
 'vasna',
 'vastral',
 'vastrapur',
 'vatva']