1. Importing Libraries

In [27]:
import numpy as np, pandas as pd, matplotlib.pyplot as plt, seaborn as sns, os , warnings 
warnings.filterwarnings("ignore")

Setting up display options

In [28]:
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
pd.set_option("display.max_colwidth", None)

2. Reading the Data

In [29]:
PROJECT_DIR  = "/Users/abhisheksaurav/Desktop/real_estate_project/"
DATA_DIR = "Data Collection/Data/City/chandigarh/cleaned_data"

In [30]:
def get_data(name):
    file_name = "{}.csv".format(name)
    file_path = os.path.join(PROJECT_DIR, DATA_DIR, file_name)
    return pd.read_csv(file_path)

In [31]:
data = get_data("gurgaon_properties_post_feature_selection")

In [32]:
data.head()

Unnamed: 0,property type,sector,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxary_category,floor_category,price
0,1.0,34.0,5.0,5,3.0,0.0,1620.0,0,1,2,2.0,2.0,4.25
1,1.0,0.0,4.0,3,1.0,0.0,1284.0,0,0,0,1.0,2.0,1.85
2,1.0,53.0,6.0,5,2.0,0.0,3078.0,1,0,2,3.0,2.0,12.5
3,0.0,74.0,3.0,3,2.0,3.0,1600.0,0,0,0,2.0,0.0,1.82
4,0.0,53.0,3.0,3,2.0,0.0,2093.0,0,1,2,2.0,0.0,2.51


- Since we are going to perform linear regression model on our data we need to perform one hot encoding for our cateogorical columns
- One hot encoding on "sector", "agePossession", "luxary_category", "furnishing_type", "balcony" and "floor_category"



In [33]:
X = (
    data 
    .drop("price", axis= 1)
    
)

y = (
    data['price']
)

In [34]:
# Importing libraries required for the baseline linear regression model 

from sklearn.model_selection import KFold, cross_val_score 
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer


In [35]:
# colums to encode 

cols_encode =["property type", "sector", "agePossession", "luxary_category", "furnishing_type", "balcony" , "floor_category"]

for i in cols_encode:
    X[i] = X[i].astype("O")

In [36]:
# applying log tranformation on the price olumn because it was highly right skewed 

y_transformed = np.log1p(y)

In [37]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3807 entries, 0 to 3806
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   property type    3807 non-null   object 
 1   sector           3807 non-null   object 
 2   bedRoom          3807 non-null   float64
 3   bathroom         3807 non-null   int64  
 4   balcony          3807 non-null   object 
 5   agePossession    3807 non-null   object 
 6   built_up_area    3807 non-null   float64
 7   servant room     3807 non-null   int64  
 8   store room       3807 non-null   int64  
 9   furnishing_type  3807 non-null   object 
 10  luxary_category  3807 non-null   object 
 11  floor_category   3807 non-null   object 
dtypes: float64(2), int64(3), object(7)
memory usage: 357.0+ KB


In [38]:
# applying column tranformations

preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']), 
        ("cat", OneHotEncoder(drop= "first", handle_unknown= "ignore"), cols_encode)
    ], 
    remainder= "passthrough"
)

In [39]:
# creating a pipeline 

pipeline = Pipeline(
    [
    ("preprocessor", preprocessor), 
    ("regressor", LinearRegression())
    ]
)

In [40]:
# K-fold cross validation

kfold = KFold(n_splits= 10, shuffle= True, random_state= 42)
scores = cross_val_score(
    pipeline, 
    X, 
    y_transformed, 
    cv= kfold, 
    scoring= "r2"
    )

In [41]:
scores.mean()

np.float64(0.8389116670617722)

In [42]:
scores.std()

np.float64(0.02755586708122629)

In [43]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y_transformed, test_size=0.2, random_state= 42)

In [44]:
pipeline.fit(X_train, y_train)

In [45]:
y_pred = pipeline.predict(X_test)

In [46]:
from sklearn.metrics import mean_absolute_error

mean_absolute_error(np.expm1(y_test), y_pred)

np.float64(1.4942906758018761)

Using SVM

In [47]:
# applying column tranformations

preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']), 
        ("cat", OneHotEncoder(drop= "first", handle_unknown= "ignore"), cols_encode)
    ], 
    remainder= "passthrough"
)

# creating a pipeline 

from sklearn.svm import SVR

pipeline1 = Pipeline(
    [
    ("preprocessor", preprocessor), 
    ("regressor", SVR(kernel= "rbf"))
    ]
)

# K-fold cross validation

kfold = KFold(n_splits= 10, shuffle= True, random_state= 42)
scores = cross_val_score(
    pipeline1, 
    X, 
    y_transformed, 
    cv= kfold, 
    scoring= "r2"
    )

pipeline1.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

print(scores.mean())
print(scores.std())
print(mean_absolute_error(np.expm1(y_test), y_pred))

0.8829070795384089
0.01711698807612729
1.4942906758018761
