In [1]:
import pandas as pd
import numpy as np
import warnings
# Suppress every warning for the remainder of the session.
warnings.filterwarnings(action='ignore')

# Load the housing data. quoting=3 corresponds to csv.QUOTE_NONE, so quote
# characters in the file are treated as ordinary data.
dataset = pd.read_csv(
    filepath_or_buffer='Housing.csv',
    quoting=3,
)

In [2]:
# Target vector: the 'price' column as an array.
y = dataset.loc[:, 'price'].values

# Feature matrix: every other column, in its original order. The positional
# indices used by the column transformer below rely on this ordering.
X = dataset.drop('price', axis=1).values

In [3]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder, StandardScaler




# Column positions refer to the feature matrix X, i.e. the dataset after the
# 'price' column has been dropped.
yes_no_columns = {
    'mainroad': 4,
    'guestroom': 5,
    'basement': 6,
    'hotwater': 7,
    'airconditioning': 8,
    'prefarea': 10,
}

# One independent encoder per binary column: "no" -> 0, "yes" -> 1.
steps = [
    (label, OrdinalEncoder(categories=[["no", "yes"]], dtype=np.int8), [position])
    for label, position in yes_no_columns.items()
]

# Furnishing status is encoded in the listed order:
# unfurnished -> 0, semi-furnished -> 1, furnished -> 2.
steps.append(
    (
        'furnishing',
        OrdinalEncoder(
            categories=[["unfurnished", "semi-furnished", "furnished"]],
            dtype=np.int8,
        ),
        [11],
    )
)

# Columns not named above are passed through unchanged.
transformer = ColumnTransformer(transformers=steps, remainder='passthrough')


In [4]:
from sklearn.linear_model import LinearRegression

# Linear regression model created with scikit-learn's default settings.
# The same instance is re-fitted on every cross-validation fold below.
regressor = LinearRegression()


In [5]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, r2_score

# K-fold cross-validation of `regressor`. Shuffling is seeded so the fold
# assignment is the same on every run.
folds = 10
kf = KFold(n_splits=folds, shuffle=True, random_state=100)

# Per-fold scores, read by the summary cell further down.
# NOTE: `mapes` stores root-mean-squared errors (not MAPE); the name is kept
# because the summary cell refers to it.
r2s = []
mapes = []

for fold_number, (train, test) in enumerate(kf.split(X), start=1):
    X_train, X_test, y_train, y_test = X[train], X[test], y[train], y[test]

    # Encode the categorical columns; fitted on the training fold only.
    X_train = transformer.fit_transform(X_train)
    X_test = transformer.transform(X_test)

    # Features and target each get their own scaler. Sharing one instance
    # only worked because the target fit happened after the last feature
    # transform; separate objects remove that ordering dependency.
    feature_scaler = StandardScaler()
    X_train = feature_scaler.fit_transform(X_train)
    X_test = feature_scaler.transform(X_test)

    # StandardScaler needs 2-D input, hence the reshape to a single column.
    target_scaler = StandardScaler()
    y_train = target_scaler.fit_transform(y_train.reshape(-1, 1))
    y_test = target_scaler.transform(y_test.reshape(-1, 1))

    regressor.fit(X_train, y_train)
    y_pred = regressor.predict(X_test)

    # Both metrics are computed on the standardized target, so the RMSE is
    # expressed in scaled units rather than in the original price units.
    r2 = r2_score(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2s.append(r2)
    mapes.append(rmse)

    print("FOLD: "+str(fold_number))
    print("R squared: "+ str(r2))
    print("Root Mean Squared Error :  "+ str(rmse))
    print()

FOLD: 1
R squared: 0.5899953960006645
Root Mean Squared Error :  0.6658181550840141

FOLD: 2
R squared: 0.7713682077104076
Root Mean Squared Error :  0.48073588892663693

FOLD: 3
R squared: 0.6636197223227707
Root Mean Squared Error :  0.660411027614568

FOLD: 4
R squared: 0.7106042399429051
Root Mean Squared Error :  0.5599963391414153

FOLD: 5
R squared: 0.6376172282338504
Root Mean Squared Error :  0.5696134241755251

FOLD: 6
R squared: 0.6900800570928693
Root Mean Squared Error :  0.4244988204865147

FOLD: 7
R squared: 0.6484552339097394
Root Mean Squared Error :  0.49011469114211514

FOLD: 8
R squared: 0.5921356740008685
Root Mean Squared Error :  0.5952909009149518

FOLD: 9
R squared: 0.6827224575683879
Root Mean Squared Error :  0.42503773699491454

FOLD: 10
R squared: 0.6125488962396692
Root Mean Squared Error :  0.8683199334931203



In [6]:
# Summary row: model class name, number of folds, then the mean and standard
# deviation of the per-fold R squared scores and of the per-fold RMSE values
# (held in `mapes`), each rounded to four decimals.
results = [
    regressor.__class__.__name__,
    folds,
    np.mean(r2s).round(4),
    np.std(r2s).round(4),
    np.mean(mapes).round(4),
    np.std(mapes).round(4),
]

In [7]:
# Labels in the same order as the entries of `results`.
labels = (
    "Regressor: ",
    "Number of folds: ",
    "Mean R squared: ",
    "STD R squared: ",
    "Mean RMSE: ",
    "STD RMSE: ",
)

# Print one "label value" line per summary entry.
for label, value in zip(labels, results):
    print(label + str(value))

Regressor: LinearRegression
Number of folds: 10
Mean R squared: 0.6599
STD R squared: 0.0537
Mean RMSE: 0.574
STD RMSE: 0.1279


In [8]:
from csv import writer

# Append this run's summary as one row of results.csv.
# newline='' leaves line-ending handling to the csv writer, as the csv module
# documentation requires; without it, platforms that translate '\n' (such as
# Windows) end up with an extra blank line after every row.
with open('results.csv', 'a', newline='') as f_object:
    writer_object = writer(f_object)
    writer_object.writerow(results)
# The with-statement closes the file on exit, so no explicit close() is needed.