In [9]:
import pandas as pd
import numpy as np
import warnings
# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides scikit-learn's data/convergence warnings,
# so problems in later cells will not be reported.
warnings.simplefilter('ignore')

# Read the raw housing table into a DataFrame.
# quoting=3 is the integer value of csv.QUOTE_NONE: quote characters in the
# file are treated as ordinary text instead of field delimiters.
# NOTE(review): presumably carried over from a text-processing template --
# confirm it is intended for this file.
dataset = pd.read_csv(filepath_or_buffer='Housing.csv', quoting=3)

In [10]:
# Separate the regression target from the predictors and convert both to
# NumPy arrays, so later cells can address feature columns by position.
target_column = 'price'
y = dataset[target_column].to_numpy()
X = dataset.drop(columns=[target_column]).to_numpy()

In [11]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder, StandardScaler




# Yes/no columns, keyed by step name, with their positional index in the
# array handed to the transformer (X, i.e. the table without 'price').
binary_columns = {
    'mainroad': 4,
    'guestroom': 5,
    'basement': 6,
    'hotwater': 7,
    'airconditioning': 8,
    'prefarea': 10,
}

# One independent encoder per binary column: "no" -> 0, "yes" -> 1.
steps = [
    (step_name, OrdinalEncoder(categories=[["no", "yes"]], dtype=np.int8), [column_index])
    for step_name, column_index in binary_columns.items()
]

# Furnishing status is ordered, so it is mapped onto 0 < 1 < 2 in the order given.
steps.append(
    (
        'furnishing',
        OrdinalEncoder(
            categories=[["unfurnished", "semi-furnished", "furnished"]],
            dtype=np.int8,
        ),
        [11],
    )
)

# Columns not listed above (indices 0-3 and 9) are forwarded unchanged.
transformer = ColumnTransformer(transformers=steps, remainder='passthrough')


In [12]:
from sklearn.ensemble import GradientBoostingRegressor

# Model under evaluation: a gradient-boosting regressor with every
# hyper-parameter left at the library default; only random_state is set, so
# the estimator's own randomness is seeded with a fixed value.
regressor = GradientBoostingRegressor(random_state=100)


In [13]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, r2_score

# Shuffled k-fold cross-validation of `regressor` on (X, y).
# Every preprocessing step (categorical encoding, feature scaling, target
# scaling) is fitted on the training part of each fold only and then applied
# to the held-out part, so no information from the test rows leaks in.
folds = 10
kf = KFold(n_splits=folds, shuffle=True, random_state=100)

r2s = []
# NOTE: despite its name this list collects the per-fold RMSE (not MAPE).
# The name is kept because the results cell further down reads `mapes`.
mapes = []

for fold_number, (train, test) in enumerate(kf.split(X), start=1):
    X_train, X_test, y_train, y_test = X[train], X[test], y[train], y[test]

    # Encode the categorical columns.
    X_train = transformer.fit_transform(X_train)
    X_test = transformer.transform(X_test)

    # Features and target get their own scaler instead of refitting one shared
    # object, so the feature scaling is not overwritten by the target fit.
    feature_scaler = StandardScaler()
    X_train = feature_scaler.fit_transform(X_train)
    X_test = feature_scaler.transform(X_test)

    # StandardScaler needs a 2-D input, but estimators expect a 1-D target:
    # flatten back with ravel() rather than relying on scikit-learn's implicit
    # conversion (which emits a DataConversionWarning).
    target_scaler = StandardScaler()
    y_train = target_scaler.fit_transform(y_train.reshape(-1, 1)).ravel()
    y_test = target_scaler.transform(y_test.reshape(-1, 1)).ravel()

    regressor.fit(X_train, y_train)
    y_pred = regressor.predict(X_test)

    # Both metrics are computed on the standardised target, so the RMSE is in
    # units of the training-fold standard deviation of the target.
    r2 = r2_score(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2s.append(r2)
    mapes.append(rmse)

    print("FOLD: " + str(fold_number))
    print("R squared: " + str(r2))
    print("Root Mean Squared Error :  " + str(rmse))
    print()

FOLD: 1
R squared: 0.6385911668078385
Root Mean Squared Error :  0.6251160472616952

FOLD: 2
R squared: 0.7361461766446712
Root Mean Squared Error :  0.5164400691675359

FOLD: 3
R squared: 0.5443008503098012
Root Mean Squared Error :  0.76866687028843

FOLD: 4
R squared: 0.7365244435243212
Root Mean Squared Error :  0.5343296466151318

FOLD: 5
R squared: 0.6295962257993222
Root Mean Squared Error :  0.5758828512627019

FOLD: 6
R squared: 0.6761204298749626
Root Mean Squared Error :  0.4339538075450592

FOLD: 7
R squared: 0.6178402799330638
Root Mean Squared Error :  0.5110105395539727

FOLD: 8
R squared: 0.6064681989604456
Root Mean Squared Error :  0.5847379762228102

FOLD: 9
R squared: 0.6918595868311821
Root Mean Squared Error :  0.41887279477711903

FOLD: 10
R squared: 0.5401122783255063
Root Mean Squared Error :  0.9460132671412057



In [14]:
# Summary row: estimator class name, number of folds, then mean and standard
# deviation (4 decimals) of the per-fold R^2 values followed by the same two
# statistics for the per-fold error values held in `mapes`.
results = [type(regressor).__name__, folds]
for fold_scores in (r2s, mapes):
    results.append(np.mean(fold_scores).round(4))
    results.append(np.std(fold_scores).round(4))

In [15]:
# Print the summary row, one labelled line per entry, in the order the
# entries are stored in `results`.
result_labels = (
    "Regressor: ",
    "Number of folds: ",
    "Mean R squared: ",
    "STD R squared: ",
    "Mean RMSE: ",
    "STD RMSE: ",
)
for label, value in zip(result_labels, results):
    print(label + str(value))

Regressor: GradientBoostingRegressor
Number of folds: 10
Mean R squared: 0.6418
STD R squared: 0.0659
Mean RMSE: 0.5915
STD RMSE: 0.1512


In [16]:
from csv import writer

# Append this run's summary as one row of results.csv (the file is created if
# it does not exist yet).
# newline='' is what the csv module requires for files it writes to; without
# it, platforms that translate '\n' (Windows) end up with a blank line after
# every row. The with-block closes the file, so no explicit close() is needed.
with open('results.csv', 'a', newline='') as f_object:
    writer_object = writer(f_object)
    writer_object.writerow(results)