In [1]:
import pandas as pd
import numpy as np

In [2]:
# !pip install wget

In [3]:
# !python -m wget https://raw.githubusercontent.com/alexeygrigorev/datasets/master/housing.csv

In [4]:
# Columns kept for the analysis; `median_house_value` (last entry) is the
# column later used as the regression target.
column_list = [
    'latitude',
    'longitude',
    'housing_median_age',
    'total_rooms',
    'total_bedrooms',
    'population',
    'households',
    'median_income',
    'median_house_value',
]

In [5]:
# Load the housing dataset from `housing.csv` in the current working directory.
df = pd.read_csv('housing.csv')

In [6]:
# Restrict the frame to the columns named in `column_list`, in that order.
df = df.loc[:, column_list]

In [7]:
# Print per-column dtypes and non-null counts to spot columns with missing data.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   latitude            20640 non-null  float64
 1   longitude           20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
dtypes: float64(9)
memory usage: 1.4 MB


In [8]:
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,latitude,longitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,37.88,-122.23,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0
1,37.86,-122.22,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0
2,37.85,-122.24,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0
3,37.85,-122.25,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0
4,37.85,-122.25,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0


## Question 1

In [9]:
# Count missing values per column.
df.isna().sum()

latitude                0
longitude               0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
dtype: int64

The `total_bedrooms` variable is the only column with missing values: it has 207 nulls.

## Question 2

In [10]:
# Median of the `population` column.
df.population.median()

1166.0

The median of the population variable is 1166.

## Question 3

In [11]:
# Number of rows in the dataset.
len(df)

20640

## Question 4

In [12]:
# Shuffle the rows reproducibly, then cut the shuffled frame into
# train / validation / test partitions (validation and test are 20% each,
# train takes whatever remains).
np.random.seed(42)

n = len(df)
n_val = n_test = int(0.2 * n)
n_train = n - n_val - n_test

# Shuffle row positions in place; the fixed seed above makes this repeatable.
idx = np.arange(n)
np.random.shuffle(idx)

df_shuffled = df.iloc[idx]

# Consecutive, non-overlapping slices of the shuffled frame.
df_train, df_val, df_test = (
    df_shuffled.iloc[:n_train].copy(),
    df_shuffled.iloc[n_train:n_train + n_val].copy(),
    df_shuffled.iloc[n_train + n_val:].copy(),
)

In [13]:
# Imputation value taken from the training partition only, so the
# validation and test rows do not influence it.
mean_total_bedrooms = df_train['total_bedrooms'].mean()
mean_total_bedrooms

533.4803317730147

In [14]:
# Variant of the shuffled data with missing `total_bedrooms` replaced by the
# training-set mean.
df_mean = df_shuffled.copy()
# Assign the filled column back explicitly. Calling `fillna(..., inplace=True)`
# on `df_mean['total_bedrooms']` operates on an intermediate Series and is not
# guaranteed to update `df_mean` (it is a no-op under pandas Copy-on-Write).
df_mean['total_bedrooms'] = df_mean['total_bedrooms'].fillna(mean_total_bedrooms)
df_mean.isna().sum()

latitude              0
longitude             0
housing_median_age    0
total_rooms           0
total_bedrooms        0
population            0
households            0
median_income         0
median_house_value    0
dtype: int64

In [15]:
# Cut the mean-imputed frame at the same boundaries as the original split.
df_train_mean = df_mean.iloc[:n_train].copy()
df_val_mean = df_mean.iloc[n_train:n_train + n_val].copy()
df_test_mean = df_mean.iloc[n_train + n_val:].copy()

# Pop the target out of each partition (this removes the column in place)
# and keep it on the log1p scale.
y_train_mean = np.log1p(df_train_mean.pop('median_house_value').values)
y_val_mean = np.log1p(df_val_mean.pop('median_house_value').values)
y_test_mean = np.log1p(df_test_mean.pop('median_house_value').values)

In [16]:
# Variant of the shuffled data with missing `total_bedrooms` replaced by 0.
df_zero = df_shuffled.copy()
# Assign the filled column back explicitly. Calling `fillna(..., inplace=True)`
# on `df_zero['total_bedrooms']` operates on an intermediate Series and is not
# guaranteed to update `df_zero` (it is a no-op under pandas Copy-on-Write).
df_zero['total_bedrooms'] = df_zero['total_bedrooms'].fillna(0)
df_zero.isna().sum()

latitude              0
longitude             0
housing_median_age    0
total_rooms           0
total_bedrooms        0
population            0
households            0
median_income         0
median_house_value    0
dtype: int64

In [17]:
# Cut the zero-imputed frame at the same boundaries as the original split.
df_train_zero = df_zero.iloc[:n_train].copy()
df_val_zero = df_zero.iloc[n_train:n_train + n_val].copy()
df_test_zero = df_zero.iloc[n_train + n_val:].copy()

# Pop the target out of each partition (this removes the column in place)
# and keep it on the log1p scale.
y_train_zero = np.log1p(df_train_zero.pop('median_house_value').values)
y_val_zero = np.log1p(df_val_zero.pop('median_house_value').values)
y_test_zero = np.log1p(df_test_zero.pop('median_house_value').values)

In [18]:
def train_linear_regression(X, y):
    """Fit a linear model with the normal equation.

    A column of ones is prepended to ``X`` so that the first fitted weight
    plays the role of the intercept.

    Args:
        X: 2-D feature array with one row per observation.
        y: Target values, one per row of ``X``.

    Returns:
        Tuple ``(w0, w)``: the intercept and the array of feature weights.
    """
    design = np.column_stack([np.ones(X.shape[0]), X])

    # w = (X^T X)^-1 X^T y, evaluated left to right.
    gram_inverse = np.linalg.inv(design.T @ design)
    weights = (gram_inverse @ design.T) @ y

    return weights[0], weights[1:]

In [19]:
def prepare_X(df, features=None):
    """Build the feature matrix passed to the regression helpers.

    Args:
        df: DataFrame containing at least the requested feature columns.
        features: Column names to select, in order. When omitted, the
            notebook-level ``base`` list is used (looked up at call time, so
            ``base`` must be defined before the first call without
            ``features``).

    Returns:
        2-D NumPy array with one column per selected feature; missing values
        are replaced by 0. ``df`` itself is not modified.
    """
    if features is None:
        features = base
    return df[list(features)].fillna(0).values

In [20]:
# Feature columns fed to the model (the target column is not included).
base = [
    'latitude',
    'longitude',
    'housing_median_age',
    'total_rooms',
    'total_bedrooms',
    'population',
    'households',
    'median_income',
]

In [21]:
# Fit the unregularised model on the mean-imputed training features.
X_train_mean = prepare_X(df=df_train_mean)
w_0_mean, w_mean = train_linear_regression(X=X_train_mean, y=y_train_mean)

In [22]:
# Fit the unregularised model on the zero-imputed training features.
X_train_zero = prepare_X(df=df_train_zero)
w_0_zero, w_zero = train_linear_regression(X=X_train_zero, y=y_train_zero)

In [23]:
# Predictions of each model on its own training matrix.
# NOTE(review): these are in-sample predictions, so the RMSE values computed
# from them score the training split rather than the validation split;
# confirm that is the intended comparison.
y_pred_mean = X_train_mean @ w_mean + w_0_mean
y_pred_zero = X_train_zero @ w_zero + w_0_zero

In [24]:
def rmse(y, y_pred):
    """Return the root mean squared error between targets and predictions.

    Args:
        y: Observed values.
        y_pred: Predicted values, aligned element-wise with ``y``.

    Returns:
        Square root of the mean of the squared differences.
    """
    squared_residuals = (y_pred - y) ** 2
    return np.sqrt(squared_residuals.mean())

In [25]:
# RMSE of the mean-imputed model on the training split.
rmse(y=y_train_mean, y_pred=y_pred_mean)

0.34104161810328065

In [26]:
# RMSE of the zero-imputed model on the training split.
rmse(y=y_train_zero, y_pred=y_pred_zero)

0.3413135910156676

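Both options are equally good: filling with the mean and filling with 0 each give an RMSE of 0.34 when rounded to two decimals.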

## Question 4

In [27]:
# Candidate regularisation strengths, from none (0) up to 10.
r_list = [0, 1e-6, 1e-4, 1e-3, 1e-2, 1e-1, 1, 5, 10]

In [28]:
def train_linear_regression_reg(X, y, r=0.0):
    """Fit a linear model with the normal equation plus a diagonal penalty.

    A column of ones is prepended to ``X`` for the intercept, and ``r`` is
    added to every diagonal entry of ``X^T X`` (the intercept entry included)
    before the matrix is inverted.

    Args:
        X: 2-D feature array with one row per observation.
        y: Target values, one per row of ``X``.
        r: Amount added to the diagonal of ``X^T X``; 0 means no penalty.

    Returns:
        Tuple ``(w0, w)``: the intercept and the array of feature weights.
    """
    design = np.column_stack([np.ones(X.shape[0]), X])

    gram = design.T @ design
    gram = gram + r * np.eye(gram.shape[0])

    weights = (np.linalg.inv(gram) @ design.T) @ y

    return weights[0], weights[1:]

In [29]:
# Evaluate each regularisation strength on the validation split, using the
# zero-imputed data.
X_train_zero = prepare_X(df_train_zero)
X_val_zero = prepare_X(df_val_zero)

# Iterate over the shared `r_list` grid instead of repeating the literal, so
# the candidate values are defined in exactly one place.
for r in r_list:
    w_0, w = train_linear_regression_reg(X_train_zero, y_train_zero, r=r)
    # Use a dedicated name: `y_pred_zero` already holds the training-set
    # predictions used by the earlier RMSE cell and must not be overwritten
    # with validation-length predictions.
    y_pred_val_zero = w_0 + X_val_zero.dot(w)
    print('%6s' %r, '%.2f' %rmse(y_val_zero, y_pred_val_zero))

     0 0.33
 1e-06 0.33
0.0001 0.33
 0.001 0.33
  0.01 0.33
   0.1 0.33
     1 0.33
     5 0.34
    10 0.34


Several values of `r` tie for the best validation RMSE (0.33, rounded to two decimals); the smallest of them is `r = 0`.

## Question 5

In [30]:
# Seeds 0 through 9, one shuffle/split per seed.
seed_list = list(range(10))

In [31]:
def shuffle_and_split(df, s):
    """Shuffle ``df`` with seed ``s`` and split it into train/val/test parts.

    Missing ``total_bedrooms`` values are replaced by 0, the shuffled rows get
    a fresh 0..n-1 index, and the ``median_house_value`` column is removed
    from each part and returned separately on the log1p scale.

    Args:
        df: DataFrame with at least ``total_bedrooms`` and
            ``median_house_value`` columns. It is not modified.
        s: Seed passed to ``np.random.seed``; note that this resets NumPy's
            global random state.

    Returns:
        Tuple ``(x_train, x_val, x_test, y_train, y_val, y_test)``. The ``x_*``
        entries are DataFrames without the target column; the ``y_*`` entries
        are NumPy arrays of ``log1p(median_house_value)``. Validation and test
        each hold ``int(0.2 * len(df))`` rows and train holds the rest.
    """
    np.random.seed(s)

    n = len(df)

    n_val = int(0.2 * n)
    n_test = int(0.2 * n)
    n_train = n - (n_val + n_test)

    idx = np.arange(n)
    np.random.shuffle(idx)

    # Work on a new, re-indexed frame so the caller's `df` is left untouched
    # and the parameter name is not rebound.
    shuffled = df.iloc[idx].reset_index(drop=True)

    # Assign the filled column back explicitly: `fillna(..., inplace=True)` on
    # a column selection acts on an intermediate Series and is not guaranteed
    # to update the frame (it is a no-op under pandas Copy-on-Write).
    shuffled['total_bedrooms'] = shuffled['total_bedrooms'].fillna(0)

    x_train = shuffled.iloc[:n_train].copy()
    x_val = shuffled.iloc[n_train:n_train + n_val].copy()
    x_test = shuffled.iloc[n_train + n_val:].copy()

    # `pop` removes the target column from each part and hands it back.
    y_train = np.log1p(x_train.pop('median_house_value').values)
    y_val = np.log1p(x_val.pop('median_house_value').values)
    y_test = np.log1p(x_test.pop('median_house_value').values)

    return x_train, x_val, x_test, y_train, y_val, y_test

In [32]:
# Repeat the split / fit / validate cycle once per seed and collect the
# validation RMSE of each run.
rmse_list = []
for s in seed_list:
    x_train, x_val, x_test, y_train, y_val, y_test = shuffle_and_split(df, s)
    X_train, X_val = prepare_X(x_train), prepare_X(x_val)

    # Unregularised fit on the training part, scored on the validation part.
    w_0, w = train_linear_regression(X_train, y_train)
    y_pred = X_val @ w + w_0

    rmse_list.append(rmse(y_val, y_pred))

In [33]:
# Spread of the validation RMSE across seeds, rounded to three decimals.
round(np.std(rmse_list), 3)

0.004

## Question 6

In [56]:
# Final model: split with seed 9, train on train + validation combined, and
# evaluate once on the held-out test part.
s = 9
x_train, x_val, x_test, y_train, y_val, y_test = shuffle_and_split(df, s)

# `ignore_index=True` gives the combined frame a fresh 0..n-1 index without a
# separate in-place reset.
full_x_train = pd.concat([x_train, x_val], ignore_index=True)
full_y_train = np.concatenate([y_train, y_val])

X_train = prepare_X(full_x_train)
w_0, w = train_linear_regression_reg(X_train, full_y_train, r=0.001)

# Build the test matrix with prepare_X, as every other evaluation in this
# notebook does, so the columns are selected by name in the same order the
# weights were fitted on instead of relying on the raw frame's column layout.
X_test = prepare_X(x_test)
y_pred = w_0 + X_test.dot(w)

rmse(y_test, y_pred)

0.3453168914380239