In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

from sklearn.decomposition import PCA

In [2]:
df = pd.read_csv('../Data_Cleaning/Data/gurgaon_properties_post_feature_selection_v2.csv')
df = df.drop(columns=['pooja room', 'study room', 'others'])

In [3]:
df.head()

Unnamed: 0,property_type,sector,price,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category
0,flat,sector 85,2.2,4.0,5.0,3+,Relatively New,2547.0,0.0,0.0,1.0,Medium,Mid Floor
1,flat,sector 68,1.08,2.0,2.0,3,Relatively New,1130.0,0.0,0.0,0.0,Medium,Low Floor
2,flat,sohna road,0.99,2.0,2.0,2,Relatively New,1046.0,0.0,0.0,1.0,Low,Mid Floor
3,house,sector 38,8.0,10.0,10.0,3+,Relatively New,3123.0,0.0,0.0,1.0,Low,Low Floor
4,flat,sector 111,3.7,4.0,5.0,3+,Relatively New,2650.0,1.0,0.0,0.0,high,High Floor


In [4]:
df['furnishing_type'].value_counts()

1.0    2449
0.0    1031
2.0     195
Name: furnishing_type, dtype: int64

In [5]:
# 0 -> unfurnished
# 1 -> semifurnished
# 2 -> furnished
df['furnishing_type'] = df['furnishing_type'].replace({0.0:'unfurnished', 1.0:'semifurnished', 2.0:'furnished'})

In [6]:
df.sample(5)

Unnamed: 0,property_type,sector,price,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category
2436,flat,sector 60,2.82,3.0,3.0,3,Relatively New,2400.0,1.0,0.0,unfurnished,high,Low Floor
301,flat,sohna road,0.78,2.0,2.0,3,Under Construction,1223.0,0.0,0.0,semifurnished,Medium,Mid Floor
3111,house,sector 76,1.75,4.0,4.0,0,New Property,3111.0,0.0,0.0,semifurnished,Low,High Floor
1726,flat,sector 81,0.89,2.0,2.0,2,Relatively New,1194.0,0.0,0.0,semifurnished,Low,Mid Floor
2911,flat,sector 108,0.98,2.0,3.0,3,Relatively New,978.223232,1.0,1.0,semifurnished,Low,High Floor


In [7]:
X = df.drop(columns=['price'])
y = df['price']

In [8]:
# Applying the Log1p transformation to the target variable
y_transformed = np.log1p(y)

# Ordinal Encoding

In [9]:
categorical_columns = X.select_dtypes(include=['O']).columns
numerical_columns = X.select_dtypes(exclude=['O']).columns

In [10]:
df.head(5)

Unnamed: 0,property_type,sector,price,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category
0,flat,sector 85,2.2,4.0,5.0,3+,Relatively New,2547.0,0.0,0.0,semifurnished,Medium,Mid Floor
1,flat,sector 68,1.08,2.0,2.0,3,Relatively New,1130.0,0.0,0.0,unfurnished,Medium,Low Floor
2,flat,sohna road,0.99,2.0,2.0,2,Relatively New,1046.0,0.0,0.0,semifurnished,Low,Mid Floor
3,house,sector 38,8.0,10.0,10.0,3+,Relatively New,3123.0,0.0,0.0,semifurnished,Low,Low Floor
4,flat,sector 111,3.7,4.0,5.0,3+,Relatively New,2650.0,1.0,0.0,unfurnished,high,High Floor


In [11]:
# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_columns),
        ('cat', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), categorical_columns)
    ], 
    remainder='passthrough'
)

In [12]:
# Creating a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

In [13]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

In [14]:
scores.mean(), scores.std()

(0.7320061599419375, 0.022593455540507137)

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)

In [16]:
pipeline.fit(X_train, y_train)

In [17]:
y_pred = pipeline.predict(X_test)

In [18]:
y_pred = np.expm1(y_pred)

In [19]:
mean_absolute_error(np.expm1(y_test), y_pred)

0.9244403769277214

In [20]:
def scorer(model_name, model):
    
    output = []
    
    output.append(model_name)
    
    preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_columns),
        ('cat', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), categorical_columns)
    ], 
    remainder='passthrough'
)
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])
    
    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')
    
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)
    
    pipeline.fit(X_train,y_train)
    
    y_pred = pipeline.predict(X_test)
    
    y_pred = np.expm1(y_pred)
    
    output.append(mean_absolute_error(np.expm1(y_test),y_pred))
    
    return output

In [21]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost':XGBRegressor()
}

In [22]:
model_output = []
for model_name, model in model_dict.items():
    model_output.append(scorer(model_name, model))

In [23]:
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])

In [24]:
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
10,xgboost,0.897223,0.48433
5,random forest,0.886553,0.517003
6,extra trees,0.875434,0.552792
7,gradient boosting,0.873095,0.563789
4,decision tree,0.790906,0.622576
9,mlp,0.800808,0.693771
8,adaboost,0.759351,0.79605
1,svr,0.756437,0.848942
2,ridge,0.732008,0.924335
0,linear_reg,0.732006,0.92444


In [25]:
categorical_columns

Index(['property_type', 'sector', 'balcony', 'agePossession',
       'furnishing_type', 'luxury_category', 'floor_category'],
      dtype='object')

# OneHotEncoding

In [35]:
# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_columns),
        ('cat', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), categorical_columns),
        ('cat1',OneHotEncoder(drop='first', handle_unknown='ignore'),['sector','agePossession','furnishing_type'])
    ], 
    remainder='passthrough'
)

In [36]:
# Creating a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

In [37]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')



In [38]:
scores.mean()

0.8554802494516961

In [39]:
scores.std()

0.020076954445690832

In [40]:
X_train, X_test, y_train, y_test = train_test_split(X, y_transformed, test_size=0.2, random_state=42)

In [41]:
pipeline.fit(X_train, y_train)

In [42]:
y_pred = pipeline.predict(X_test)

In [43]:
y_pred = np.expm1(y_pred)

In [44]:
mean_absolute_error(np.expm1(y_test), y_pred)

0.6558800235587575

In [46]:
def scorer(model_name, model):
    
    output = []
    
    output.append(model_name)
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])
    
    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')
    
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)
    
    pipeline.fit(X_train,y_train)
    
    y_pred = pipeline.predict(X_test)
    
    y_pred = np.expm1(y_pred)
    
    output.append(mean_absolute_error(np.expm1(y_test),y_pred))
    
    return output
    

In [47]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost':XGBRegressor()
}

In [None]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))



In [None]:
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])