In [16]:
#1
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error

housing = pd.read_csv("../housing_path/housing.csv")

train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)

# fill the missing values in the total bedrooms column
median = housing["total_bedrooms"].median() # option 3
housing["total_bedrooms"].fillna(median, inplace=True)

# drop the ocean proximity feature
housing = housing.drop("ocean_proximity", axis=1)

# split the data 80/20 into a training and testing dataset
training, testing = train_test_split(housing, test_size=0.2)

# scaling the datasets
std_scaler = StandardScaler()
training_std = std_scaler.fit_transform(training)
testing_std = std_scaler.transform(testing)

# convert back to dataframe
training_fn = pd.DataFrame(training_std, columns = list(training.columns)[0:])
testing_fn = pd.DataFrame(testing_std, columns = list(testing.columns)[0:])

# identify the targets for the regression model
training_labels = training_fn["median_house_value"]
testing_labels = testing_fn["median_house_value"]

# train k-NN regressor
k = 5
knn_reg = KNeighborsRegressor(n_neighbors=k)
knn_reg.fit(training_fn, training_labels)

# use the model
knn_pred = knn_reg.predict(testing_fn)

# compute the RMSE
rmse = np.sqrt(mean_squared_error(testing_labels, knn_pred))
print("RMSE:", rmse)

RMSE: 0.14359624756818085


Question 1

While the value of the RMSE may vary depending on the dataset and the model constructed, a lower value indicates the model is performing well, as predictions are closer to the actual values. The RMSE value is a good score.

In [26]:
#2 and 3
# dropped ocean proximity earlier
# dropping 'total_bedrooms' and 'households' as they are directly related to 'total rooms'
training_clean = training[['longitude', 'latitude', 'housing_median_age', 'total_rooms', 'population', 'median_income']]
testing_clean = testing[['longitude', 'latitude', 'housing_median_age', 'total_rooms', 'population', 'median_income']]

# target remains in the dataset, and doesnt change, so label is the same as above.

# obtaining the dimension of feature vectors
n = len(training_clean.columns)

# train k-NN regressor
k = 5
knn_reg_u = KNeighborsRegressor(n_neighbors=k, weights='uniform')
knn_reg_d = KNeighborsRegressor(n_neighbors=k, weights='distance')
knn_reg_u.fit(training_clean, training_labels)
knn_reg_d.fit(training_clean, training_labels)

# use the model
knn_pred_u = knn_reg_u.predict(testing_clean)
knn_pred_d = knn_reg_d.predict(testing_clean)

# compute the RMSE
rmse_u = np.sqrt(mean_squared_error(testing_labels, knn_pred_u))
rmse_d = np.sqrt(mean_squared_error(testing_labels, knn_pred_d))

print("RMSE (uniform):", rmse_u)
print("RMSE (distance):", rmse_d)


RMSE (uniform): 0.9235044665154809
RMSE (distance): 0.9244120581986693


Question 2/3

the two factors dont seem to drastically affect the RMSE, with the distance weights factor often yielding a slightly higher RMSE than the uniform factor, however this is negligible. The RMSE values are also larger than in question 1, this would be due to the removal of the three features, giving the model less data to train with, and therefor decreasing the accuracy of the model.