In [1]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

In [2]:
from sklearn.neighbors import KNeighborsRegressor
from math import sqrt

In [3]:
realdf = pd.read_csv('Real estate valuation data set.csv')
realdf.keys()

Index(['No', 'X1 transaction date', 'X2 house age',
       'X3 distance to the nearest MRT station',
       'X4 number of convenience stores', 'X5 latitude', 'X6 longitude',
       'Y house price of unit area'],
      dtype='object')

In [4]:
X = realdf.copy()
y = realdf['Y house price of unit area']
del X['Y house price of unit area']

In [5]:
train_X, test_X, train_Y, test_Y = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
kf = KFold(n_splits=5,shuffle=True,random_state=42)

In [7]:
def find_rmse(test, pred):
    return sqrt(np.sum((test-pred)**2))

In [8]:
def k_opt(train_X, test_X, train_Y, test_Y, kf):
    l = 5
    errors = [0]*l
    for k in range(1,l+1):
        neighb = KNeighborsRegressor(n_neighbors=k)
        neighb.fit(train_X, train_Y)
        #for i in range(len(test_Y)):
        pred = neighb.predict(test_X)
        rmse = find_rmse(test_Y, pred)
        errors[k-1] = rmse
    min_index = 0
    for i in range(l):
        if errors[i] < errors[min_index]:
            min_index = i
    return min_index+1

In [9]:
'''for train_index, test_index in kf.split(X):
...    print("TRAIN:", train_index, "TEST:", test_index)
...    X_train, X_test = X[train_index], X[test_index]
...    y_train, y_test = y[train_index], y[test_index]'''

'for train_index, test_index in kf.split(X):\n   print("TRAIN:", train_index, "TEST:", test_index)\n   X_train, X_test = X[train_index], X[test_index]\n   y_train, y_test = y[train_index], y[test_index]'

In [24]:
def cross_opt(train_X, test_X, train_Y, test_Y, kf):
    errors_tr = []
    errors_val = []
    errors_test = []

    i = 0
    for train_index, test_index in kf.split(train_X,train_Y):
        #print (train_index,test_index,len(train_X),len(train_Y))
        
        tr_X, val_X = train_X[train_index], train_X[test_index]
        tr_Y, val_Y = train_Y[train_index], train_Y[test_index]
        
        k = k_opt(tr_X, val_X, tr_Y, val_Y, kf)
        
        neighb = KNeighborsRegressor(n_neighbors=k)
        neighb.fit(tr_X, tr_Y)
        
        pred_tr = neighb.predict(tr_X)
        pred_val = neighb.predict(val_X)
        pred_test = neighb.predict(test_X)
        
        rmse_tr = find_rmse(tr_Y, pred_tr)
        rmse_val = find_rmse(val_Y, pred_val)
        rmse_test = find_rmse(test_Y, pred_test)
        
        errors_tr.append(rmse_tr)
        errors_val.append(rmse_val)
        errors_test.append(rmse_test)
        i += 1
    return errors_tr, errors_val, errors_test

In [25]:
errors_tr, errors_val, errors_test = cross_opt(train_X.values, test_X.values, train_Y.values, test_Y.values, kf)

In [35]:
print('RMSE')
for i in range(5):
    print ('Fold',i+1)
    print ('  Train Error:', errors_tr[i], '\n  Validation Error:', 
           errors_val[i], '\n  Test Error:', errors_test[i], '\n')

RMSE
Fold 1
  Train Error: 131.31679709770566 
  Validation Error: 67.89323677657444 
  Test Error: 72.49223406682952 

Fold 2
  Train Error: 105.48472638254317 
  Validation Error: 103.83303002898451 
  Test Error: 70.69771035896424 

Fold 3
  Train Error: 127.16362058387611 
  Validation Error: 71.59118660840872 
  Test Error: 72.43281576744066 

Fold 4
  Train Error: 133.04885418522025 
  Validation Error: 61.58665764595445 
  Test Error: 68.64650901538984 

Fold 5
  Train Error: 133.4774437873306 
  Validation Error: 69.21785896717697 
  Test Error: 72.65274943180059 



numpy.ndarray

# Q2.b)

In [36]:
from sklearn import preprocessing

In [37]:
realdf = pd.read_csv('Real estate valuation data set.csv')
realdf.keys()

Index(['No', 'X1 transaction date', 'X2 house age',
       'X3 distance to the nearest MRT station',
       'X4 number of convenience stores', 'X5 latitude', 'X6 longitude',
       'Y house price of unit area'],
      dtype='object')

In [41]:
X = realdf.copy()
y = realdf['Y house price of unit area']
del X['Y house price of unit area']

In [42]:
tmp = X.values #returns a numpy array
min_max_scaler = preprocessing.MinMaxScaler()
tmp_scaled = min_max_scaler.fit_transform(tmp)
X_norm = pd.DataFrame(tmp_scaled)

In [44]:
train_X_norm, test_X_norm, train_Y_norm, test_Y_norm = train_test_split(X_norm, y, test_size=0.2, random_state=42)

In [45]:
err_tr_norm, err_val_norm, err_test_norm = cross_opt(train_X_norm.values, test_X_norm.values,
                                                     train_Y_norm.values, test_Y_norm.values, kf)

In [46]:
print('RMSE for normalised dataset')
for i in range(5):
    print ('Fold',i+1)
    print ('  Train Error:', err_tr_norm[i], '\n  Validation Error:', 
           err_val_norm[i], '\n  Test Error:', err_test_norm[i], '\n')

RMSE for normalised dataset
Fold 1
  Train Error: 132.35352960914946 
  Validation Error: 60.51579298001472 
  Test Error: 66.34043714055554 

Fold 2
  Train Error: 106.03343057734197 
  Validation Error: 113.16477367096176 
  Test Error: 66.61618421975247 

Fold 3
  Train Error: 132.78120951399714 
  Validation Error: 70.27521326897556 
  Test Error: 63.07055414375237 

Fold 4
  Train Error: 130.3199401473159 
  Validation Error: 67.95367245410655 
  Test Error: 66.63665057609063 

Fold 5
  Train Error: 125.7419403204657 
  Validation Error: 64.72105101467717 
  Test Error: 69.1301992217905 



In [48]:
print('Difference of RMSE: Rmse - Rmse_norm \n')
for i in range(5):
    print ('Fold',i+1)
    print ('  Train Error:', errors_tr[i] - err_tr_norm[i], '\n  Validation Error:', 
           errors_val[i] - err_val_norm[i], '\n  Test Error:', errors_test[i] - err_test_norm[i], '\n')

Difference of RMSE: Rmse - Rmse_norm 

Fold 1
  Train Error: -1.0367325114438017 
  Validation Error: 7.377443796559717 
  Test Error: 6.151796926273988 

Fold 2
  Train Error: -0.5487041947988018 
  Validation Error: -9.331743641977255 
  Test Error: 4.081526139211775 

Fold 3
  Train Error: -5.61758893012103 
  Validation Error: 1.3159733394331568 
  Test Error: 9.36226162368829 

Fold 4
  Train Error: 2.72891403790436 
  Validation Error: -6.367014808152099 
  Test Error: 2.009858439299208 

Fold 5
  Train Error: 7.735503466864898 
  Validation Error: 4.496807952499793 
  Test Error: 3.522550210010081 



In general, we see that for linear regression decision tree and random forest, the rmse is more, but, for ridge and lassoregression, the rmse is lesser.

In [None]:
 '''if i not in [0,1]:
            tr_X = pd.concat([splits_X[0], splits_X[1]], ignore_index=True)
            tr_Y = pd.concat([splits_Y[0], splits_Y[1]], ignore_index=True)
            for j in range(2, len(splits_X)):
                if j != i:
                    tr_X = pd.concat([tr_X, splits_X[j]], ignore_index=True)
                    tr_Y = pd.concat([tr_Y, splits_Y[j]], ignore_index=True)
        elif i == 0:
            tr_X = pd.concat([splits_X[1], splits_X[2]], ignore_index=True)
            tr_Y = pd.concat([splits_Y[1], splits_Y[2]], ignore_index=True)
            for j in range(3, len(splits_X)):
                if j != i:
                    tr_X = pd.concat([tr_X, splits_X[j]], ignore_index=True)
                    tr_Y = pd.concat([tr_Y, splits_Y[j]], ignore_index=True)
        else: #i=1
            tr_X = pd.concat([splits_X[0], splits_X[2]], ignore_index=True)
            tr_Y = pd.concat([splits_Y[0], splits_Y[2]], ignore_index=True)
            for j in range(3, len(splits_X)):
                if j != i:
                    tr_X = pd.concat([tr_X, splits_X[j]], ignore_index=True)
                    tr_Y = pd.concat([tr_Y, splits_Y[j]], ignore_index=True)'''