In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load only the columns this analysis uses: the feature columns plus the
# `median_house_value` column that later serves as the regression target.
df = pd.read_csv(
    'housing.csv',
    usecols=[
        'latitude', 'longitude',
        'housing_median_age',
        'total_rooms', 'total_bedrooms',
        'population', 'households',
        'median_income',
        'median_house_value',
    ],
)

In [3]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0


In [4]:
# Count missing values per column (`isna` is the alias of `isnull`).
df.isna().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
dtype: int64

In [5]:
# Median of the `population` column.
df['population'].median()

1166.0

In [6]:
def split_dataset(df, seed=42, val_frac=0.2, test_frac=0.2,
                  target='median_house_value'):
    """Shuffle `df` and split it into train / validation / test parts.

    The rows are shuffled with `df.sample(frac=1, random_state=seed)`, then
    cut into consecutive train, validation and test slices. The target column
    is removed from each returned frame and returned separately after a
    `np.log1p` transform. The input frame itself is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data; must contain the `target` column.
    seed : int, default 42
        Random state used for the shuffle.
    val_frac, test_frac : float, default 0.2
        Fractions of the rows assigned to the validation and test parts
        (each size is `int(n * frac)`); the remaining rows form the train part.
    target : str, default 'median_house_value'
        Name of the column to extract as the regression target.

    Returns
    -------
    tuple
        `(df_train, df_val, df_test, y_train, y_val, y_test)` where the
        frames have a fresh 0..n-1 index and no target column, and the `y_*`
        values are NumPy arrays of `log1p(target)`.

    Raises
    ------
    ValueError
        If a fraction is negative or the two fractions sum to more than 1.
    """
    if val_frac < 0 or test_frac < 0 or val_frac + test_frac > 1:
        raise ValueError(
            'val_frac and test_frac must be non-negative and sum to at most 1'
        )

    shuffled = df.sample(frac=1, random_state=seed)
    n = len(shuffled)

    n_val = int(n * val_frac)
    n_test = int(n * test_frac)
    n_train = n - n_val - n_test

    parts = (
        shuffled.iloc[:n_train],
        shuffled.iloc[n_train:n_train + n_val],
        shuffled.iloc[n_train + n_val:],
    )

    frames = []
    targets = []
    for part in parts:
        part = part.reset_index(drop=True)
        # Targets are modelled on the log1p scale.
        targets.append(np.log1p(part[target].values))
        # `drop` returns a new frame, so neither `df` nor `part` is mutated.
        frames.append(part.drop(columns=[target]))

    df_train, df_val, df_test = frames
    y_train, y_val, y_test = targets
    return df_train, df_val, df_test, y_train, y_val, y_test

In [7]:
# Split with the default shuffle seed (42, spelled out here for clarity).
(
    df_train, df_val, df_test,
    y_train, y_val, y_test,
) = split_dataset(df, seed=42)

In [8]:
def train_linear_regression(X, y):
    """Fit a linear regression with the normal equation.

    A column of ones is prepended to `X` so the first fitted weight acts as
    the intercept; the weights are then computed as
    `inv(X^T X) . X^T . y` using an explicit matrix inverse.

    Parameters
    ----------
    X : array-like
        Feature matrix with one row per observation.
    y : array-like
        Target values, one per row of `X`.

    Returns
    -------
    tuple
        `(w0, w)`: the intercept and the array of remaining weights, in the
        column order of `X`.
    """
    bias_column = np.ones(X.shape[0])
    design = np.column_stack([bias_column, X])

    gram = np.dot(design.T, design)
    gram_inverse = np.linalg.inv(gram)
    weights = np.dot(np.dot(gram_inverse, design.T), y)

    intercept, coefficients = weights[0], weights[1:]
    return intercept, coefficients

In [9]:
def rmse(y, y_pred):
    """Return the root mean squared error between `y` and `y_pred`.

    The difference `y - y_pred` is squared elementwise, averaged with the
    result's `.mean()` method, and the square root of that average is
    returned.
    """
    squared_residuals = (y - y_pred) ** 2
    return np.sqrt(squared_residuals.mean())

In [10]:
def prepare_X(df, fill_type = '0', central_value = None):
    """Build the feature matrix, filling missing `total_bedrooms` values.

    The input frame is left untouched: the fill is applied to a new frame, so
    the caller can run several fill strategies on the same data and still
    compute statistics (mean, median) on the original, unfilled column.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature frame. Must contain a `total_bedrooms` column when
        `fill_type` is one of the recognised values.
    fill_type : str, default '0'
        '0' fills missing values with 0; 'mean' and 'median' both fill with
        `central_value`. Any other value leaves missing values as they are.
    central_value : scalar, optional
        Fill value used for 'mean' / 'median'; the caller is expected to
        compute it (the function does not derive it from `df`).

    Returns
    -------
    numpy.ndarray
        `.values` of the (filled) frame, columns in the order of `df`.
    """
    if fill_type == '0':
        fill_value = 0
    elif fill_type in ('mean', 'median'):
        fill_value = central_value
    else:
        # Unrecognised fill types pass the data through without filling.
        return df.values

    # `assign` returns a new frame, so the caller's `df` keeps its NaNs.
    filled = df.assign(total_bedrooms=df['total_bedrooms'].fillna(fill_value))
    return filled.values

In [11]:
# Baseline: fill missing `total_bedrooms` with 0 and fit the unregularised model.
X_train = prepare_X(df_train, fill_type='0')
w0, w = train_linear_regression(X_train, y_train)

# Validation RMSE (targets are on the log1p scale), rounded to two decimals.
X_val = prepare_X(df_val, fill_type='0')
y_pred = X_val.dot(w) + w0
round(rmse(y_val, y_pred), 2)

0.33

In [12]:
# Fill missing `total_bedrooms` with the mean computed on the training split
# only; the same value is reused for the validation split.
# NOTE(review): this differs from the 0-fill run only if df_train/df_val still
# contain NaNs at this point — confirm no earlier prepare_X call filled them
# in place.
train_mean = df_train['total_bedrooms'].mean()
X_train = prepare_X(df_train, fill_type='mean', central_value=train_mean)
w0, w = train_linear_regression(X_train, y_train)

X_val = prepare_X(df_val, fill_type='mean', central_value=train_mean)
y_pred = X_val.dot(w) + w0
round(rmse(y_val, y_pred), 2)

0.33

In [13]:
# Fill missing `total_bedrooms` with the median computed on the training split
# only; the same value is reused for the validation split.
# NOTE(review): this differs from the earlier runs only if df_train/df_val
# still contain NaNs at this point — confirm no earlier prepare_X call filled
# them in place.
train_median = df_train['total_bedrooms'].median()
X_train = prepare_X(df_train, fill_type='median', central_value=train_median)
w0, w = train_linear_regression(X_train, y_train)

X_val = prepare_X(df_val, fill_type='median', central_value=train_median)
y_pred = X_val.dot(w) + w0
round(rmse(y_val, y_pred), 2)

0.33

In [14]:
def train_linear_regression_reg(X, y, r=0.001):
    """Fit a linear regression with a ridge-style penalty on the normal equation.

    A column of ones is prepended to `X`, then `r` times the identity matrix
    is added to `X^T X` before it is inverted. The identity spans the whole
    augmented matrix, so the intercept term receives the same penalty as the
    other weights.

    Parameters
    ----------
    X : array-like
        Feature matrix with one row per observation.
    y : array-like
        Target values, one per row of `X`.
    r : float, default 0.001
        Amount added to the diagonal of `X^T X`.

    Returns
    -------
    tuple
        `(w0, w)`: the intercept and the array of remaining weights, in the
        column order of `X`.
    """
    bias_column = np.ones(X.shape[0])
    design = np.column_stack([bias_column, X])

    gram = np.dot(design.T, design)
    gram = gram + r * np.eye(gram.shape[0])

    gram_inverse = np.linalg.inv(gram)
    weights = np.dot(np.dot(gram_inverse, design.T), y)

    intercept, coefficients = weights[0], weights[1:]
    return intercept, coefficients

In [15]:
# Regularisation strengths to try, from no regularisation up to 10.
reg_values = [
    0,
    1e-6, 1e-4, 1e-3, 1e-2, 1e-1,
    1, 5, 10,
]

In [16]:
# The 0-filled feature matrices do not depend on `r`, so build them once
# instead of on every iteration of the sweep.
X_train = prepare_X(df_train)
X_val = prepare_X(df_val)

# For each regularisation strength: fit on train, score on validation, and
# print the strength, the fitted intercept and the RMSE rounded to 2 decimals.
for r in reg_values:
    w0, w = train_linear_regression_reg(X_train, y_train, r=r)

    y_pred = w0 + X_val.dot(w)
    score = round(rmse(y_val, y_pred), 2)

    print(r, w0, score)

0 -11.686975244463337 0.33
1e-06 -11.686959178633675 0.33
0.0001 -11.685368867734486 0.33
0.001 -11.670931320420612 0.33
0.01 -11.52849358854661 0.33
0.1 -10.27450028430847 0.33
1 -4.920480898460246 0.33
5 -1.4820957457819053 0.34
10 -0.7899311832739603 0.34


In [17]:
# Shuffle seeds 0 through 9 used to check how stable the score is.
seeds = list(range(10))

In [18]:
# Re-split with every seed, fit the unregularised model on 0-filled features
# and collect the (unrounded) validation RMSE for each split.
scores = []
for s in seeds:
    (
        df_train, df_val, df_test,
        y_train, y_val, y_test,
    ) = split_dataset(df, seed=s)

    X_train = prepare_X(df_train)
    w0, w = train_linear_regression(X_train, y_train)

    X_val = prepare_X(df_val)
    y_pred = X_val.dot(w) + w0
    score = rmse(y_val, y_pred)
    scores.append(score)
    print(s, w0, round(score, 2))

# Spread of the validation RMSE across seeds (standard deviation, 3 decimals).
round(np.std(scores), 3)

0 -11.900382140318047 0.34
1 -11.732757375384645 0.34
2 -11.806729361829014 0.33
3 -11.58790034978238 0.34
4 -11.389470590750427 0.34
5 -11.447114274662594 0.34
6 -11.37051635366835 0.35
7 -12.473448922947721 0.34
8 -11.800287430496683 0.35
9 -11.459046833924818 0.34


0.004

In [19]:
# Fresh split with seed 9 for the final train+validation / test evaluation.
(
    df_train, df_val, df_test,
    y_train, y_val, y_test,
) = split_dataset(df, seed=9)

In [20]:
# Merge the train and validation parts into one training set; `ignore_index`
# gives the combined frame a fresh 0..n-1 index. Targets are concatenated in
# the same order so rows and targets stay aligned.
df_full_train = pd.concat([df_train, df_val], ignore_index=True)
y_full_train = np.concatenate((y_train, y_val))

In [21]:
# Final model: 0-filled features, r=0.001, trained on train+validation.
X_full_train = prepare_X(df_full_train, fill_type='0')
w0, w = train_linear_regression_reg(X_full_train, y_full_train, r=0.001)

# Score on the held-out test split (RMSE on the log1p-transformed target).
X_test = prepare_X(df_test, fill_type='0')
y_pred = X_test.dot(w) + w0
score = rmse(y_test, y_pred)
score

0.345316891435843