In [1]:
import csv
import warnings

import numpy as np
import pandas as pd

# NOTE(review): this silences *every* warning for the rest of the session,
# including scikit-learn conversion/convergence warnings raised by later cells.
# Kept for backward compatibility; consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

# csv.QUOTE_NONE (== 3) makes the parser treat quote characters as ordinary data
# instead of field delimiters; spelled out here instead of the bare literal 3.
dataset = pd.read_csv('Housing.csv', quoting=csv.QUOTE_NONE)

In [2]:
# Show only the object-typed columns of the raw frame.
dataset.select_dtypes(include=[object])

Unnamed: 0,mainroad,guestroom,basement,hotwaterheating,airconditioning,prefarea,furnishingstatus
0,yes,no,no,no,yes,yes,furnished
1,yes,no,no,no,yes,no,furnished
2,yes,no,yes,no,no,yes,semi-furnished
3,yes,no,yes,no,yes,yes,furnished
4,yes,yes,yes,no,yes,no,furnished
...,...,...,...,...,...,...,...
540,yes,no,yes,no,no,no,unfurnished
541,no,no,no,no,no,no,semi-furnished
542,yes,no,no,no,no,no,unfurnished
543,no,no,no,no,no,no,furnished


In [3]:
# List every column label of the loaded frame; the preprocessing cell below
# selects feature columns by their position once 'price' has been removed.
dataset.columns

Index(['price', 'area', 'bedrooms', 'bathrooms', 'stories', 'mainroad',
       'guestroom', 'basement', 'hotwaterheating', 'airconditioning',
       'parking', 'prefarea', 'furnishingstatus'],
      dtype='object')

In [4]:
# Features: every column except the 'price' target (still a DataFrame).
X = dataset.drop('price', axis=1)
# Target: the 'price' values as a single-column 2-D array, the layout the
# target scaler fitted later expects.
y = dataset['price'].to_numpy()[:, np.newaxis]

In [5]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
split_options = {'test_size': 0.1, 'random_state': 100}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [6]:
from sklearn.linear_model import BayesianRidge

# Hyper-parameters handed to the estimator, gathered in one place so they are
# easy to compare with the (disabled) grid-search ranges further down.
bayesian_ridge_params = {
    'max_iter': 20000,
    'alpha_1': 7e-7,
    'alpha_2': 12e-7,
    'lambda_1': 7e-7,
    'lambda_2': 9e-7,
}
regressor = BayesianRidge(**bayesian_ridge_params)


In [7]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder, StandardScaler, MinMaxScaler

# Categorical feature groups, identified by name rather than by hard-coded index.
binary_columns = ['mainroad', 'guestroom', 'basement', 'hotwaterheating',
                  'airconditioning', 'prefarea']
furnishing_columns = ['furnishingstatus']

# Resolve the names to integer positions within X. For the column order of the
# loaded frame (minus 'price') this yields [4, 5, 6, 7, 8, 10] and [11], i.e. the
# same positions that were previously written out by hand, but it keeps working
# if the column order of X ever changes. Positional selection is kept so the
# transformer itself still selects by position, not by label.
binary_positions = [X.columns.get_loc(name) for name in binary_columns]
furnishing_positions = [X.columns.get_loc(name) for name in furnishing_columns]

# One ['no', 'yes'] category list per binary column: 'no' -> 0, 'yes' -> 1.
no_yes_encoder = OrdinalEncoder(categories=[['no', 'yes'] for _ in binary_columns])

# Ordered so that a better-furnished home gets a larger code.
furnishing_encoder = OrdinalEncoder(
    categories=[['unfurnished', 'semi-furnished', 'furnished']])

# The former ('drop_columns', 'drop', ['prefarea', 'furnishingstatus']) entry was
# removed: columns already claimed by an encoder are never part of the remainder,
# and a 'drop' entry does not remove the encoded outputs, so it had no effect on
# the transformed matrix and only suggested otherwise.
steps = [
    ('no_yes', no_yes_encoder, binary_positions),
    ('furnishing', furnishing_encoder, furnishing_positions),
]

# Remaining columns are forwarded unchanged after the encoded ones.
preprocessor = ColumnTransformer(transformers=steps, remainder='passthrough')

In [8]:
# The target gets its own one-step pipeline so the identical scaling can be
# re-applied to held-out targets and persisted next to the model.
target_scaler = StandardScaler()
target_pipeline = Pipeline(steps=[('scaler', target_scaler)])
# Scaling statistics are learned from the training targets only.
target_pipeline.fit(y_train)

In [9]:

# Full prediction pipeline: encode categoricals -> rescale all features with
# MinMaxScaler -> Bayesian ridge regression.
bayesian_ridge_pipeline = Pipeline(
    steps=[('preprocessor', preprocessor),
           ('scaler', MinMaxScaler()),
           ('bayesian_ridge', regressor)])

# target_pipeline.transform returns a single-column 2-D array, while the
# regressor expects a 1-D target. Flatten it explicitly instead of relying on
# scikit-learn's implicit conversion, whose DataConversionWarning is hidden by
# the global warnings filter set at the top of the notebook.
y_train_scaled = target_pipeline.transform(y_train).ravel()

bayesian_ridge_pipeline.fit(X_train, y_train_scaled)

In [10]:
# from sklearn.model_selection import GridSearchCV
# 
# params = [{'bayesian_ridge__alpha_1': np.arange(5e-7,2e-6,1e-7),
#            'bayesian_ridge__alpha_2': np.arange(5e-7,2e-6,1e-7), 
#            'bayesian_ridge__lambda_1':np.arange(5e-7,2e-6,1e-7),
#            'bayesian_ridge__lambda_2':np.arange(5e-7,2e-6,1e-7)}]
# 
# grid = GridSearchCV(bayesian_ridge_pipeline,param_grid=params, n_jobs=-1, cv=5, scoring='r2',refit=True)
# grid.fit(X_train,target_pipeline.transform(y_train))

In [11]:
# best_pipeline = grid.best_estimator_

In [12]:
from sklearn.metrics import mean_squared_error, r2_score

# Both metrics are computed on the scaled target (the output of
# target_pipeline.transform), so RMSE is expressed in scaled units, not in the
# original price units.
y_true = target_pipeline.transform(y_test)
y_pred = bayesian_ridge_pipeline.predict(X_test)

rmse = np.sqrt(mean_squared_error(y_true, y_pred))
r2 = r2_score(y_true, y_pred)
results = {'RMSE': rmse, 'R2': r2}

In [14]:
import joblib

# Persist both fitted objects: the target scaler is saved alongside the model
# pipeline because the model was trained on the scaled target.
joblib.dump(value=target_pipeline, filename='target_pipeline')
joblib.dump(value=bayesian_ridge_pipeline, filename='pred_pipeline')

['pred_pipeline']

In [13]:
# Display the hold-out metrics computed above (both measured on the scaled target).
results

{'RMSE': 0.6660677692363948, 'R2': 0.5896879182456}