In [1]:
import pandas as pd
import numpy as np
import warnings
# Silence every warning category for the remainder of the session.
warnings.filterwarnings(action='ignore')

# quoting=3 is csv.QUOTE_NONE: quote characters in the file are treated as
# ordinary text rather than as field delimiters.
dataset = pd.read_csv(
    filepath_or_buffer='Housing.csv',
    quoting=3,
)

In [2]:
# Features: every column except the target.
X = dataset.drop(columns=['price'])
# Target as an (n_samples, 1) column so it can be fed to a scikit-learn scaler.
y = np.reshape(dataset['price'].values, (-1, 1))

In [3]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=100,
)

In [4]:
from sklearn.linear_model import BayesianRidge

# Final estimator of the prediction pipeline. max_iter caps the number of
# fitting iterations at 20000; the prior hyperparameters are tuned later by
# the grid search.
regressor = BayesianRidge(max_iter=20000)


In [5]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder, StandardScaler, MinMaxScaler


# One ordered category list per encoded column: 'no' -> 0, 'yes' -> 1.
no_yes_encoder = OrdinalEncoder(categories=[['no', 'yes'] for _ in range(6)])

# Columns are addressed by position in X.
# NOTE(review): presumably 4-8 and 10 are the yes/no columns and 11 is the
# furnishing status column of Housing.csv -- verify against the file header.
steps = [
    ('no_yes', no_yes_encoder, [4, 5, 6, 7, 8, 10]),
    (
        'furnishing',
        OrdinalEncoder(categories=[['unfurnished', 'semi-furnished', 'furnished']]),
        [11],
    ),
]

# Columns not listed above are passed through unchanged.
preprocessor = ColumnTransformer(
    transformers=steps,
    remainder='passthrough',
)

In [6]:
# Separate pipeline for the target: standardize price using statistics
# learned from the training targets only.
target_pipeline = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
    ]
)
target_pipeline.fit(y_train)

In [7]:

# End-to-end model: encode the categorical columns, min-max scale every
# feature, then fit the Bayesian ridge regressor. The step name
# 'bayesian_ridge' is the prefix used by the grid-search parameter names.
regressor_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('scaler', MinMaxScaler()),
    ('bayesian_ridge', regressor),
])


In [8]:
from sklearn.model_selection import GridSearchCV

# All four prior hyperparameters of BayesianRidge are searched over the same
# range, so the range is built once and shared.
# NOTE(review): np.arange with a float step may include or drop the end point
# depending on rounding. With roughly 15 values per axis this is on the order
# of 15**4 candidates x 5 folds -- expect a long run.
prior_grid = np.arange(5e-7, 2e-6, 1e-7)

params = [{
    'bayesian_ridge__alpha_1': prior_grid,
    'bayesian_ridge__alpha_2': prior_grid,
    'bayesian_ridge__lambda_1': prior_grid,
    'bayesian_ridge__lambda_2': prior_grid,
}]

grid = GridSearchCV(regressor_pipeline, param_grid=params, n_jobs=-1, cv=5, scoring='r2', refit=True)

# The target scaler returns an (n_samples, 1) column. Flatten it so the
# single-output regressor receives the 1-D target it expects instead of
# relying on an implicit conversion that raises a DataConversionWarning.
grid.fit(X_train, target_pipeline.transform(y_train).ravel())

In [9]:
# The search was created with refit=True, so best_estimator_ is the full
# pipeline re-trained on all of X_train with the best-scoring parameters.
best_pipeline = grid.best_estimator_

In [10]:
from sklearn.metrics import mean_squared_error, r2_score

def adjusted_r2(y_true: np.ndarray, y_pred: np.ndarray, num_rows: int, num_cols: int) -> float:
    """Return the adjusted R^2 of predictions against the true targets.

    Computes ``1 - (1 - R2) * (n - 1) / (n - p - 1)``, where ``R2`` is
    obtained from ``r2_score(y_true, y_pred)``.

    Args:
        y_true: Observed target values.
        y_pred: Predicted target values, aligned with ``y_true``.
        num_rows: Number of samples ``n`` the score is computed on.
        num_cols: Number of predictors ``p`` used by the model.

    Returns:
        The adjusted coefficient of determination.

    Raises:
        ValueError: If ``num_rows - num_cols - 1`` is not positive. The
            adjustment is undefined there (division by zero, or a negative
            denominator that flips the sign of the penalty).
    """
    # Validate the residual degrees of freedom before doing any scoring work.
    degrees_of_freedom = num_rows - num_cols - 1
    if degrees_of_freedom <= 0:
        raise ValueError(
            'adjusted R2 requires num_rows > num_cols + 1 '
            f'(got num_rows={num_rows}, num_cols={num_cols})'
        )
    return 1 - ((1 - r2_score(y_true, y_pred)) * (num_rows - 1) / degrees_of_freedom)

In [11]:


# Score on the held-out split in the same standardized target space the model
# was trained in.
y_true = target_pipeline.transform(y_test)
y_pred = best_pipeline.predict(X_test)

results = {
    'RMSE': np.sqrt(mean_squared_error(y_true, y_pred)),
    'Adjusted R2': adjusted_r2(
        y_true,
        y_pred,
        num_rows=len(y_true),
        num_cols=grid.n_features_in_,
    ),
}

In [12]:
import os

import joblib

# joblib.dump does not create missing parent directories, so make sure the
# output folder exists; otherwise a fresh Restart & Run All fails here.
os.makedirs('pipeline-bin', exist_ok=True)

# Persist both halves of the model: the target scaler (needed to map
# predictions back to prices) and the fitted prediction pipeline.
joblib.dump(target_pipeline,'pipeline-bin/target_pipeline')
joblib.dump(best_pipeline,'pipeline-bin/prediction_pipeline')

['pipeline-bin/prediction_pipeline']

In [13]:
# Show the hold-out metrics. RMSE is expressed in standardized price units
# because both y_true and the model's training target went through
# target_pipeline's StandardScaler.
results


{'RMSE': 0.5702827007060995, 'Adjusted R2': 0.6394012537961569}