In [9]:
import pandas as pd
import numpy as np
import warnings
# Silence every warning for the rest of the session (this also hides
# scikit-learn's data-conversion and convergence warnings further down).
warnings.filterwarnings(action='ignore')

# quoting=3 is the numeric value of csv.QUOTE_NONE: quote characters in the
# file are treated as ordinary data rather than field delimiters.
dataset = pd.read_csv(filepath_or_buffer='Housing.csv', quoting=3)

In [10]:
# Regression target: the 'price' column as a NumPy array.
y = dataset['price'].values
# Feature matrix: every remaining column, in file order, as a NumPy array.
# Column positions in X are what the ColumnTransformer indices refer to.
X = dataset.drop('price', axis=1).values

In [11]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder, StandardScaler




# Binary "no"/"yes" features: transformer name -> positional column index in X
# (X is the dataset with 'price' dropped).
# NOTE(review): indices presumably follow the column order of Housing.csv — verify
# against the file if its layout changes.
yes_no_columns = {
    'mainroad': 4,
    'guestroom': 5,
    'basement': 6,
    'hotwater': 7,
    'airconditioning': 8,
    'prefarea': 10,
}

# Each binary column is mapped no -> 0, yes -> 1 by its own encoder.
steps = [
    (name, OrdinalEncoder(categories=[["no", "yes"]], dtype=np.int8), [column_index])
    for name, column_index in yes_no_columns.items()
]

# Furnishing status is ordinal: unfurnished (0) < semi-furnished (1) < furnished (2).
steps.append(
    (
        'furnishing',
        OrdinalEncoder(
            categories=[["unfurnished", "semi-furnished", "furnished"]],
            dtype=np.int8,
        ),
        [11],
    )
)

# Columns not listed above are passed through unchanged.
transformer = ColumnTransformer(transformers=steps, remainder='passthrough')


In [12]:
from sklearn.linear_model import SGDRegressor

# Linear model trained with stochastic gradient descent; the fixed seed makes
# the per-epoch shuffling of training samples reproducible.
regressor = SGDRegressor(
    shuffle=True,
    random_state=100,
)


In [13]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, r2_score

# K-fold cross-validation of `regressor`.
#
# For every fold the categorical encoder and both scalers are fitted on the
# training split only and then applied to the held-out split, so no statistics
# leak from test to train. Metrics are computed on the *standardised* target,
# so the RMSE is expressed in training-fold standard deviations, not in price
# units.
folds = 10
kf = KFold(n_splits=folds, shuffle=True, random_state=100)

r2s = []
# NOTE: despite its name this list stores the per-fold RMSE; the name is kept
# because the summary cell below reads `mapes`.
mapes = []

for i, (train, test) in enumerate(kf.split(X), start=1):
    X_train, X_test, y_train, y_test = X[train], X[test], y[train], y[test]

    # Encode the categorical columns.
    X_train = transformer.fit_transform(X_train)
    X_test = transformer.transform(X_test)

    # Separate scalers for features and target: re-fitting one shared scaler
    # silently discards the feature statistics once the target is scaled.
    feature_scaler = StandardScaler()
    X_train = feature_scaler.fit_transform(X_train)
    X_test = feature_scaler.transform(X_test)

    # StandardScaler needs 2-D input, but estimators expect a 1-D target;
    # ravel() avoids the DataConversionWarning raised for column-vector y.
    target_scaler = StandardScaler()
    y_train = target_scaler.fit_transform(y_train.reshape(-1, 1)).ravel()
    y_test = target_scaler.transform(y_test.reshape(-1, 1)).ravel()

    # fit() retrains from scratch on each call (warm_start is not enabled).
    regressor.fit(X_train, y_train)
    y_pred = regressor.predict(X_test)

    r2 = r2_score(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2s.append(r2)
    mapes.append(rmse)

    print("FOLD: "+str(i))
    print("R squared: "+ str(r2))
    print("Root Mean Squared Error :  "+ str(rmse))
    print()

FOLD: 1
R squared: 0.5924341970617988
Root Mean Squared Error :  0.6638349823390809

FOLD: 2
R squared: 0.7726070840973647
Root Mean Squared Error :  0.47943164919583103

FOLD: 3
R squared: 0.6621430657679714
Root Mean Squared Error :  0.6618589905840481

FOLD: 4
R squared: 0.7084313430742526
Root Mean Squared Error :  0.5620947436699681

FOLD: 5
R squared: 0.6347277214060632
Root Mean Squared Error :  0.5718798591003335

FOLD: 6
R squared: 0.6883753848909746
Root Mean Squared Error :  0.4256646683514394

FOLD: 7
R squared: 0.645788633594638
Root Mean Squared Error :  0.4919700322854853

FOLD: 8
R squared: 0.5934392230035943
Root Mean Squared Error :  0.5943388540857039

FOLD: 9
R squared: 0.6803011245491047
Root Mean Squared Error :  0.4266565118812578

FOLD: 10
R squared: 0.6093116148386519
Root Mean Squared Error :  0.8719399369558518



In [14]:
# Summary row for this run: model name, fold count, then mean and standard
# deviation of the per-fold scores, each rounded to 4 decimal places.
results = [
    type(regressor).__name__,
    folds,
    np.round(np.mean(r2s), 4),
    np.round(np.std(r2s), 4),
    np.round(np.mean(mapes), 4),  # `mapes` holds the per-fold RMSE values
    np.round(np.std(mapes), 4),
]

In [15]:
# Human-readable recap of the summary row: one labelled line per field,
# in the same order the fields appear in `results`.
summary_labels = (
    "Regressor: ",
    "Number of folds: ",
    "Mean R squared: ",
    "STD R squared: ",
    "Mean RMSE: ",
    "STD RMSE: ",
)
for label, value in zip(summary_labels, results):
    print(label + str(value))

Regressor: SGDRegressor
Number of folds: 10
Mean R squared: 0.6588
STD R squared: 0.0535
Mean RMSE: 0.575
STD RMSE: 0.1283


In [17]:
from csv import writer

# Append this run's summary row to results.csv (created if it does not exist).
# newline='' hands line-ending control to the csv writer; without it, platforms
# that translate '\n' (e.g. Windows) end up with a blank row after every record.
# The `with` block closes the file on exit, so no explicit close() is needed.
with open('results.csv', 'a', newline='') as f_object:
    writer_object = writer(f_object)
    writer_object.writerow(results)