In [27]:
import numpy as np
import pandas as pd
from pathlib import Path

In [28]:
# Directory the input CSV is read from (relative path, so it is resolved
# against the kernel's current working directory).
path = Path('../data/interim')

In [29]:
# Load the feature-selected property table from the input directory.
df = pd.read_csv(path.joinpath('properties_post_feature_selection_v2.csv'))

In [30]:
# Preview the first five rows to sanity-check the load.
df.head(n=5)

Unnamed: 0,property_type,sector,price,bedRoom,bathroom,balcony,agePossession,built_up_area,store room,pooja room,furnishing_type,luxury_category,floor_category
0,flat,gota,0.9,3,3,1,Relatively New,1005.0,1,0,unfurnished,High,Mid Floor
1,house,south bopal,6.4,5,5,2,Relatively New,3988.0,0,0,semifurnished,Low,Low Floor
2,flat,gota,0.62,2,2,1,Relatively New,1068.0,1,0,unfurnished,Medium,Low Floor
3,flat,memnagar,0.85,3,3,1,Old Property,1373.0,1,0,semifurnished,Low,Mid Floor
4,house,south bopal,2.3,3,3,1,Moderately Old,2412.0,1,0,unfurnished,Low,Low Floor


In [31]:
# Categorical columns to one-hot encode: property_type, sector, balcony, agePossession, furnishing_type, luxury_category, floor_category

In [32]:
# Split the frame into the target (price) and the feature matrix (everything else).
y = df['price']
X = df.drop('price', axis=1)

In [33]:
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler,LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

In [34]:
# Model the target on the log1p scale; predictions are mapped back with
# np.expm1 further down.
y_transformed = y.pipe(np.log1p)

In [35]:
import category_encoders as ce
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder

In [36]:
# Preprocessing: one-hot encode the categorical features and standardize the
# numeric ones before they reach the regressor.
columns_to_encode = ['sector','property_type', 'balcony', 'agePossession', 'furnishing_type', 'luxury_category', 'floor_category']
numerical_columns = ['bedRoom', 'bathroom', 'built_up_area', 'pooja room', 'store room']
preprocessor = ColumnTransformer(
    transformers=[
        # drop='first' removes one dummy column per categorical feature.
        # handle_unknown='ignore' stops transform() from raising when it meets a
        # category that was not present at fit time (e.g. a sector absent from
        # a training fold, or a new value in a hand-built prediction row).
        # Such a value is encoded as all zeros, i.e. the same encoding as the
        # dropped first category, and scikit-learn emits a warning.
        # NOTE(review): combining drop= with handle_unknown='ignore' needs a
        # reasonably recent scikit-learn - confirm the installed version.
        ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), columns_to_encode),
        # Standardize the numerical columns.
        ('num', StandardScaler(), numerical_columns)
    ],
    # Columns not named in either list are forwarded unchanged.
    remainder='passthrough'
)

In [37]:
# Chain preprocessing and the XGBoost regressor so both are fit together
# (and therefore re-fit inside every cross-validation fold).
steps = [
    ('preprocessor', preprocessor),
    ('regressor', XGBRegressor()),
]
pipeline = Pipeline(steps=steps)

In [38]:
# 10-fold shuffled cross-validation; R^2 is computed on the log1p-scale target.
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(
    estimator=pipeline,
    X=X,
    y=y_transformed,
    scoring='r2',
    cv=kfold,
)

In [39]:
# Average R^2 across the folds.
np.mean(scores)

0.918108479815924

In [40]:
# Spread of R^2 across the folds (population standard deviation).
np.std(scores)

0.014673912499371005

In [41]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for a final check; the target stays on the log1p scale.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_transformed,
    test_size=0.2,
    random_state=42,
)

In [42]:
# Fit the preprocessing steps and the regressor on the training split only.
pipeline.fit(X=X_train, y=y_train)

In [43]:
# Predictions for the held-out rows, still on the log1p scale.
y_pred = pipeline.predict(X=X_test)

In [44]:
# Map the log1p-scale predictions back to the original price scale.
# Derived straight from the model output instead of from y_pred itself, so
# re-running this cell cannot apply expm1 a second time.
y_pred = np.expm1(pipeline.predict(X_test))

In [45]:
from sklearn.metrics import mean_absolute_error
# MAE on the original price scale: y_test is mapped back with expm1,
# y_pred has already been mapped back above.
mean_absolute_error(y_true=np.expm1(y_test), y_pred=y_pred)

0.3388879114484973

In [46]:
# Show the feature names of X, for reference when building a row by hand.
X.columns

Index(['property_type', 'sector', 'bedRoom', 'bathroom', 'balcony',
       'agePossession', 'built_up_area', 'store room', 'pooja room',
       'furnishing_type', 'luxury_category', 'floor_category'],
      dtype='object')

In [49]:
# One hand-written listing, used to try the fitted pipeline on a single row.
sample_listing = {
    'property_type': 'flat',
    'sector': 'bapunagar',
    'bedRoom': 1,
    'bathroom': 1,
    'balcony': '0',
    'agePossession': 'Moderately Old',
    'built_up_area': 0.00,
    'pooja room': 1,
    'store room': 1,
    'furnishing_type': 'furnished',
    'luxury_category': 'High',
    'floor_category': 'High Floor',
}
columns = list(sample_listing)
data = [list(sample_listing.values())]

# Convert to a one-row DataFrame
one_df = pd.DataFrame(data=data, columns=columns)

one_df

Unnamed: 0,property_type,sector,bedRoom,bathroom,balcony,agePossession,built_up_area,pooja room,store room,furnishing_type,luxury_category,floor_category
0,flat,bapunagar,1,1,0,Moderately Old,0.0,1,1,furnished,High,High Floor


In [50]:
# Predict on the log1p scale, then invert with expm1 to get the price.
log_price = pipeline.predict(one_df)
np.expm1(log_price)

array([0.28822348], dtype=float32)