In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.svm import SVR

In [8]:
pd.set_option('display.max_rows', None)


In [9]:
df = pd.read_csv('../Data_Cleaning/Data/gurgaon_properties_post_feature_selection.csv')
df.drop(columns=['Unnamed: 0'], inplace=True)

In [10]:
df.head()

Unnamed: 0,property_type,sector,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category,price
0,0.0,99.0,4.0,5.0,4.0,3.0,2547.0,0.0,0.0,1.0,1.0,2.0,2.2
1,0.0,79.0,2.0,2.0,3.0,3.0,1130.0,0.0,0.0,0.0,1.0,1.0,1.08
2,0.0,113.0,2.0,2.0,2.0,3.0,1046.0,0.0,0.0,1.0,0.0,2.0,0.99
3,1.0,47.0,10.0,10.0,4.0,3.0,3123.0,0.0,0.0,1.0,0.0,1.0,8.0
4,0.0,17.0,4.0,5.0,4.0,3.0,2650.0,1.0,0.0,0.0,2.0,0.0,3.7


In [11]:
# One hot encode -> sector, balacony, agePossession, furnishing type, luxury category, floor category

In [18]:
df = df.drop(columns="bathroom")

In [19]:
correlation_matrix = df.corr()
correlation_matrix.unstack().sort_values(ascending=False)

property_type    property_type      1.000000
sector           sector             1.000000
floor_category   floor_category     1.000000
luxury_category  luxury_category    1.000000
furnishing_type  furnishing_type    1.000000
store room       store room         1.000000
servant room     servant room       1.000000
built_up_area    built_up_area      1.000000
agePossession    agePossession      1.000000
balcony          balcony            1.000000
bedRoom          bedRoom            1.000000
price            price              1.000000
built_up_area    price              0.744501
price            built_up_area      0.744501
bedRoom          built_up_area      0.614923
built_up_area    bedRoom            0.614923
price            bedRoom            0.593783
bedRoom          price              0.593783
property_type    price              0.505717
price            property_type      0.505717
property_type    bedRoom            0.436964
bedRoom          property_type      0.436964
servant ro

In [25]:
X = df.drop(columns=['price'])
y = df['price']

In [26]:
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

In [27]:
columns_to_encode = ['sector', 'balcony', 'agePossession', 'furnishing_type', 'luxury_category', 'floor_category']

In [28]:
# Applying the log1p transformation to the target variable
y_transformed = np.log1p(y)

In [29]:
# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['property_type', 'bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), columns_to_encode)
    ], 
    remainder='passthrough'
)

In [30]:
# Creating a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', SVR(kernel='rbf'))
])

In [31]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')



In [32]:
scores.mean()

0.8892986247310708

In [33]:
scores.std()

0.013047128163839168

In [34]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y_transformed, test_size=0.2, random_state=42)

In [35]:
pipeline.fit(X_train, y_train)

In [36]:
y_pred = pipeline.predict(X_test)

In [37]:
y_pred = np.expm1(y_pred)

In [54]:
from sklearn.metrics import mean_absolute_error,accuracy_score
mean_absolute_error(np.expm1(y_test),y_pred)

0.5155829459135702

In [50]:
pipeline.named_steps["regressor"]

In [52]:
from scipy.stats import t
# Set confidence level and calculate degrees of freedom
confidence_level = 0.95
degrees_of_freedom = len(y_train) - 1
alpha = 1 - confidence_level

# Calculate t-score for two-tailed test
t_score = t.ppf(1 - alpha / 2, degrees_of_freedom)
std_error = np.sqrt(mse / len(y_train))
margin_of_error = t_score * std_error
lower_bound = y_pred - margin_of_error
upper_bound = y_pred + margin_of_error

print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"Prediction: {y_pred[0]:.4f}")
print(f"Confidence Interval: ({lower_bound[0]:.4f}, {upper_bound[0]:.4f})")

Mean Squared Error (MSE): 0.5156
Prediction: 1.1339
Confidence Interval: (1.1080, 1.1599)
