In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.linear_model import Ridge

In [2]:
# functions
def train_linear_regression_model(X, y, r=0.0):
    """Fit a linear regression with a bias term via the normal equation.

    Parameters
    ----------
    X : array of shape (n_samples, n_features)
        Feature matrix without a bias column; a column of ones is prepended
        here.
    y : array of shape (n_samples,)
        Target values.
    r : float, default 0.0
        Regularization strength. When non-zero, ``r`` is added to every
        diagonal entry of ``X^T X`` (the bias entry included). The default
        of 0 gives plain least squares.

    Returns
    -------
    tuple
        ``(w0, w)``: the bias weight and the array of per-feature weights.

    Raises
    ------
    numpy.linalg.LinAlgError
        If the (regularized) matrix ``X^T X`` is singular.
    """
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])
    XTX = X.T.dot(X)
    if r:
        XTX = XTX + r * np.eye(XTX.shape[0])
    # Solve (X^T X) w = X^T y directly instead of forming the explicit
    # inverse: it is cheaper and numerically better behaved.
    w_full = np.linalg.solve(XTX, X.T.dot(y))
    return w_full[0], w_full[1:]


def rmse(y, y_pred):
    """Return the root-mean-squared error between targets and predictions."""
    squared_residuals = (y - y_pred) ** 2
    return np.sqrt(squared_residuals.mean())


def prepare_X_with_mean(df, train_mean=None):
    """Turn a feature frame into a NumPy matrix, optionally imputing NaNs.

    When ``train_mean`` is given, missing values are filled from it with
    ``DataFrame.fillna``; when it is ``None`` the values are returned
    unfilled. The input frame itself is never modified.
    """
    if train_mean is None:
        filled = df.copy()
    else:
        filled = df.fillna(train_mean)
    return filled.values


def prepare_X_with_zero(df):
    """Turn a feature frame into a NumPy matrix with NaNs replaced by 0.

    The input frame is left untouched; ``fillna`` returns a new object.
    """
    return df.fillna(0).values

In [3]:
# Load the housing dataset from a CSV file in the working directory.
df = pd.read_csv("housing.csv")
# Show the frame's dimensions and column labels as the cell output.
df.shape, df.columns

((20640, 10),
 Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
        'total_bedrooms', 'population', 'households', 'median_income',
        'median_house_value', 'ocean_proximity'],
       dtype='object'))

In [None]:
# Histogram of median_house_value (the column later used as the target),
# drawn with 50 bins and alpha 0.5.
sns.histplot(df.median_house_value, bins=50, alpha=0.5)

In [5]:
# Columns kept for modelling. The last entry is the prediction target, which
# later cells split off into y before building the feature matrices.
FEATURES = [
    "latitude",
    "longitude",
    "housing_median_age",
    "total_rooms",
    "total_bedrooms",
    "population",
    "households",
    "median_income",
    "median_house_value",
]
# Keep only the "<1H OCEAN" and "INLAND" rows and the columns listed above.
# NOTE: this rebinds `df` to a frame without `ocean_proximity`, so the cell
# cannot be re-run unless the CSV is loaded again first.
df = df.loc[df["ocean_proximity"].isin(["<1H OCEAN", "INLAND"]), FEATURES]
df.shape, df.columns

((15687, 9),
 Index(['latitude', 'longitude', 'housing_median_age', 'total_rooms',
        'total_bedrooms', 'population', 'households', 'median_income',
        'median_house_value'],
       dtype='object'))

In [6]:
# Count the missing values in each column.
df.isnull().sum()

latitude                0
longitude               0
housing_median_age      0
total_rooms             0
total_bedrooms        157
population              0
households              0
median_income           0
median_house_value      0
dtype: int64

In [7]:
# Median of the population column.
df.population.median()

1195.0

In [8]:
# Split sizes: validation and test each take 20% of the rows (rounded down);
# training receives whatever remains.
n = len(df)
n_val = n_test = int(0.2 * n)
n_train = n - n_val - n_test
n, n_train, n_val, n_test

(15687, 9413, 3137, 3137)

In [9]:
# Shuffle the row positions reproducibly, then cut them into the
# train / validation / test partitions. (An earlier unshuffled split was
# removed: its result was overwritten before ever being used.)
idx = np.arange(n)

np.random.seed(42)
np.random.shuffle(idx)

df_train = df.iloc[idx[:n_train]].reset_index(drop=True)
df_val = df.iloc[idx[n_train : n_train + n_val]].reset_index(drop=True)
df_test = df.iloc[idx[n_train + n_val :]].reset_index(drop=True)

# Targets are modelled on the log1p scale.
y_train = np.log1p(df_train.median_house_value.values)
y_val = np.log1p(df_val.median_house_value.values)
y_test = np.log1p(df_test.median_house_value.values)

# Remove the target from the feature frames so it cannot end up in X.
df_train = df_train.drop(columns="median_house_value")
df_val = df_val.drop(columns="median_house_value")
df_test = df_test.drop(columns="median_house_value")

len(df_train), len(df_val), len(df_test)

(9413, 3137, 3137)

In [10]:
# Fill NA with the column means of the training split.
train_mean = df_train.mean()

# Build every design matrix up front; all of them are imputed with the
# training-split means.
df_full_train = pd.concat([df_train, df_val], ignore_index=True)
y_full_train = np.concatenate([y_train, y_val])

X_train = prepare_X_with_mean(df_train, train_mean)
X_val = prepare_X_with_mean(df_val, train_mean)
X_full_train = prepare_X_with_mean(df_full_train, train_mean)
X_test = prepare_X_with_mean(df_test, train_mean)

# Fit on the training split only, then score train and validation.
w0, w = train_linear_regression_model(X_train, y_train)

y_pred = w0 + X_train.dot(w)
rmse_train = rmse(y_train, y_pred)

y_pred = w0 + X_val.dot(w)
rmse_val = rmse(y_val, y_pred)

# Refit on train + validation and score the held-out test split.
w0, w = train_linear_regression_model(X_full_train, y_full_train)

y_pred = w0 + X_test.dot(w)
rmse_test = rmse(y_test, y_pred)

print("Train RMSE: {:.2f}".format(rmse_train))
print("Validation RMSE: {:.2f}".format(rmse_val))
print("Test RMSE: {:.2f}".format(rmse_test))

Train RMSE: 0.34
Validation RMSE: 0.34
Test RMSE: 0.33


In [11]:
# Candidate regularization strengths (r), passed to Ridge as `alpha`.
# NOTE(review): the scikit-learn docs advise against alpha=0 with Ridge and
# suggest LinearRegression for that case; kept because 0 is part of the sweep.
reg_values = [0, 0.000001, 0.0001, 0.001, 0.01, 0.1, 1, 5, 10]

# The zero-imputed design matrices do not depend on the regularization
# value, so build them once instead of on every loop iteration.
X_train = prepare_X_with_zero(df_train)
X_val = prepare_X_with_zero(df_val)

best_rmse = float("inf")
best_reg_value = None

for reg_value in reg_values:
    model = Ridge(alpha=reg_value)
    model.fit(X_train, y_train)

    y_pred_val = model.predict(X_val)
    rmse_val = rmse(y_val, y_pred_val)

    # Strict "<" keeps the earliest (smallest) value when scores tie,
    # because reg_values is listed in ascending order.
    if rmse_val < best_rmse:
        best_rmse = rmse_val
        best_reg_value = reg_value

print(
    f"Best RMSE on Validation set: {round(best_rmse, 2)} with regularization value r = {best_reg_value}"
)

# Refit with the selected strength on train + validation, then score test.
X_full_train = prepare_X_with_zero(pd.concat([df_train, df_val]))
y_full_train = np.concatenate([y_train, y_val])
model = Ridge(alpha=best_reg_value)
model.fit(X_full_train, y_full_train)

X_test = prepare_X_with_zero(df_test)
y_pred_test = model.predict(X_test)
rmse_test = rmse(y_test, y_pred_test)

print("Test RMSE with Best Regularization: {:.2f}".format(rmse_test))

Best RMSE on Validation set: 0.34 with regularization value r = 0
Test RMSE with Best Regularization: 0.33


In [12]:
# Repeat the shuffle / split / fit / validate cycle for ten seeds to see how
# much the validation RMSE depends on the particular split.
seed_values = list(range(10))

rmse_scores = []

for seed in seed_values:
    # Reproducible permutation of the row positions for this seed.
    np.random.seed(seed)
    idx = np.arange(n)
    np.random.shuffle(idx)

    df_train = df.iloc[idx[:n_train]].reset_index(drop=True)
    df_val = df.iloc[idx[n_train : n_train + n_val]].reset_index(drop=True)
    df_test = df.iloc[idx[n_train + n_val :]].reset_index(drop=True)

    # Log-scale targets, taken before the target column is dropped.
    y_train = np.log1p(df_train["median_house_value"].values)
    y_val = np.log1p(df_val["median_house_value"].values)
    y_test = np.log1p(df_test["median_house_value"].values)

    df_train = df_train.drop(columns="median_house_value")
    df_val = df_val.drop(columns="median_house_value")
    df_test = df_test.drop(columns="median_house_value")

    # Zero-imputed features, closed-form fit, validation score.
    X_train = prepare_X_with_zero(df_train)
    w0, w = train_linear_regression_model(X_train, y_train)
    y_pred_val = w0 + prepare_X_with_zero(df_val).dot(w)
    rmse_val = rmse(y_val, y_pred_val)

    rmse_scores.append(rmse_val)

std_deviation = round(np.std(rmse_scores), 3)

print(f"Standard Deviation of RMSE Scores: {std_deviation}")

Standard Deviation of RMSE Scores: 0.005


In [13]:
# Final evaluation: split with seed 9, train Ridge (alpha=0.001) on
# train + validation, and report RMSE on the test split.
seed = 9
np.random.seed(seed)

idx = np.arange(n)
np.random.shuffle(idx)

df_train = df.iloc[idx[:n_train]].reset_index(drop=True)
df_val = df.iloc[idx[n_train : n_train + n_val]].reset_index(drop=True)
df_test = df.iloc[idx[n_train + n_val :]].reset_index(drop=True)

# Log-scale targets, taken before the target column is dropped.
y_train = np.log1p(df_train["median_house_value"].values)
y_val = np.log1p(df_val["median_house_value"].values)
y_test = np.log1p(df_test["median_house_value"].values)

df_train = df_train.drop(columns="median_house_value")
df_val = df_val.drop(columns="median_house_value")
df_test = df_test.drop(columns="median_house_value")

# Merge train and validation into one training set.
df_combined_train = pd.concat([df_train, df_val], ignore_index=True)
y_combined_train = np.concatenate([y_train, y_val])

X_combined_train = prepare_X_with_zero(df_combined_train)
X_test = prepare_X_with_zero(df_test)

reg_value = 0.001
model = Ridge(alpha=reg_value)
model.fit(X_combined_train, y_combined_train)

y_pred_test = model.predict(X_test)
rmse_test = rmse(y_test, y_pred_test)

print("Test RMSE with r=0.001: {:.2f}".format(rmse_test))

Test RMSE with r=0.001: 0.33
