In [90]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.svm import SVR

In [91]:
# Read the feature-selection CSV into a DataFrame.
csv_path = '/content/feature_selection.csv'
df = pd.read_csv(csv_path)

In [92]:
# Preview the first five rows to sanity-check the loaded columns.
df.head(n=5)

Unnamed: 0,City,Parking Spaces,Bedrooms,Bathrooms,Servant Quarters,Kitchens,Store Rooms,Age Possession,area,colony,province,property Type,price
0,2.0,1.0,3.0,3.0,0.0,1.0,0.0,3.0,1361.0,40.0,0.0,1.0,1.5
1,2.0,3.0,6.0,6.0,1.0,2.0,1.0,1.0,3812.0,22.0,0.0,1.0,4.75
2,2.0,2.0,3.0,3.0,1.0,1.0,1.0,0.0,2722.0,16.0,0.0,1.0,2.25
3,2.0,3.0,2.0,2.0,0.0,1.0,1.0,3.0,1171.0,32.0,0.0,0.0,1.15
4,2.0,1.0,4.0,5.0,0.0,0.0,0.0,4.0,1361.0,77.0,0.0,1.0,1.75


In [93]:
# one hot encode -> City, Age Possession, colony, property Type, province

Now we apply one-hot encoding to the categorical columns: `City`, `Age Possession`, `colony`, `property Type`, and `province`.

In [94]:
# Separate the predictors (every column except the target) from the target column.
target_column = 'price'
y = df[target_column]
X = df.drop(columns=[target_column])

In [95]:
# Categorical columns that will be one-hot encoded by the preprocessing step.
columns_to_encode = [
    'City',
    'Age Possession',
    'colony',
    'property Type',
    'province',
]

In [96]:
# Model log(1 + price) instead of the raw price; predictions are mapped back
# to the price scale with np.expm1 further down.
y_transformed = y.pipe(np.log1p)
y_transformed

Unnamed: 0,price
0,0.916291
1,1.749200
2,1.178655
3,0.765468
4,1.011601
...,...
754,1.190888
755,2.442347
756,1.163151
757,1.504077


In [97]:
# Show the first five rows of the source frame again before building the preprocessor.
df.head(5)

Unnamed: 0,City,Parking Spaces,Bedrooms,Bathrooms,Servant Quarters,Kitchens,Store Rooms,Age Possession,area,colony,province,property Type,price
0,2.0,1.0,3.0,3.0,0.0,1.0,0.0,3.0,1361.0,40.0,0.0,1.0,1.5
1,2.0,3.0,6.0,6.0,1.0,2.0,1.0,1.0,3812.0,22.0,0.0,1.0,4.75
2,2.0,2.0,3.0,3.0,1.0,1.0,1.0,0.0,2722.0,16.0,0.0,1.0,2.25
3,2.0,3.0,2.0,2.0,0.0,1.0,1.0,3.0,1171.0,32.0,0.0,0.0,1.15
4,2.0,1.0,4.0,5.0,0.0,0.0,0.0,4.0,1361.0,77.0,0.0,1.0,1.75


In [98]:
# Preprocessing: standardise the numeric columns and one-hot encode the categorical ones.
numeric_columns = [
    'Bedrooms', 'Bathrooms', 'area', 'Parking Spaces',
    'Servant Quarters', 'Kitchens', 'Store Rooms',
]

# NOTE(review): with drop='first' together with handle_unknown='ignore', a category
# that was not seen during fit is presumably encoded as all zeros, i.e. the same
# as the dropped first category. Confirm against the OneHotEncoder docs that this
# is the intended behaviour for unseen values.
categorical_encoder = OneHotEncoder(drop='first', handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_columns),
        ('cat', categorical_encoder, columns_to_encode),
    ],
    # Any column not named in a transformer above is forwarded unchanged.
    remainder='passthrough',
)

In [99]:
# Chain the preprocessing step and the RBF-kernel SVR into a single estimator,
# so the preprocessor is fitted as part of every pipeline.fit call.
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('regressor', SVR(kernel='rbf')),
]
pipeline = Pipeline(steps=pipeline_steps)

In [100]:
# 10-fold cross-validation (shuffled, seeded) of the whole pipeline,
# scored with R^2 on the log1p-transformed target.
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(
    estimator=pipeline,
    X=X,
    y=y_transformed,
    scoring='r2',
    cv=kfold,
)



In [108]:
# Average R^2 across the cross-validation folds.
np.mean(scores)

0.8188422083371065

In [109]:
# Spread (population standard deviation) of the per-fold R^2 scores.
np.std(scores)

0.04362913388706152

In [103]:
# Hold out 20% of the rows for a final check; the split is seeded so it is repeatable.
# The targets are the log1p-transformed prices, matching what cross-validation used.
# train_test_split is imported in the notebook's top import cell.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_transformed, test_size=0.2, random_state=42
)

In [104]:
# Fit the preprocessing and SVR steps on the training split only.
pipeline.fit(X=X_train, y=y_train)

In [105]:
# Predict on the held-out rows; the values are still on the log1p scale here.
y_pred = pipeline.predict(X=X_test)



In [106]:
# Back-transform the predictions from log1p space to the original price scale.
# Recompute from the pipeline instead of overwriting y_pred with expm1(y_pred):
# the self-referential form is not idempotent, so re-running the cell would
# apply expm1 twice and silently inflate the predictions.
y_pred = np.expm1(pipeline.predict(X_test))

In [107]:
# Mean absolute error in the original price units: y_test holds log1p-transformed
# targets, so invert them with expm1 before comparing against y_pred, which has
# already been mapped back to the price scale.
# mean_absolute_error is imported in the notebook's top import cell.
mean_absolute_error(np.expm1(y_test), y_pred)

1.1068502032864516

In [110]:
# Re-run of the 10-fold cross-validation with the same splitter settings and
# scoring as the earlier cross-validation cell; rebinds kfold and scores.
kfold = KFold(n_splits=10, random_state=42, shuffle=True)
scores = cross_val_score(pipeline, X, y=y_transformed, scoring='r2', cv=kfold)

