In [8]:
import pandas as pd
import numpy as np

In [77]:
data = pd.read_csv('housing.csv')

In [78]:
data

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY
...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,INLAND
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,INLAND
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0,INLAND
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,INLAND


In [79]:
features = ['latitude',
'longitude',
'housing_median_age',
'total_rooms',
'total_bedrooms',
'population',
'households',
'median_income']

## Question 1

In [11]:
data.isna().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64

## Question 2

In [12]:
data['population'].median()

1166.0

In [54]:
np.random.seed(42)

n = len(data)

n_val = int(0.2 * n)
n_test = int(0.2 * n)
n_train = n - (n_val + n_test)

idx = np.arange(n)
np.random.shuffle(idx)

data_shuffled = data.iloc[idx]

data_train = data_shuffled.iloc[:n_train].copy()
data_val = data_shuffled.iloc[n_train:n_train+n_val].copy()
data_test = data_shuffled.iloc[n_train+n_val:].copy()

In [15]:
y_train_orig = data_train.median_house_value.values
y_val_orig = data_val.median_house_value.values
y_test_orig = data_test.median_house_value.values

y_train = np.log1p(data_train.median_house_value.values)
y_val = np.log1p(data_val.median_house_value.values)
y_test = np.log1p(data_test.median_house_value.values)

## Question 3

In [16]:
def train_linear_regression(X, y):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    XTX_inv = np.linalg.inv(XTX)
    w = XTX_inv.dot(X.T).dot(y)
    
    return w[0], w[1:]

In [17]:
def rmse(y, y_pred):
    error = y_pred - y
    mse = (error ** 2).mean()
    return np.sqrt(mse)

In [55]:
data_filled_1 = data_shuffled.copy()
data_filled_2 = data_shuffled.copy()

data_filled_1['total_bedrooms'] = data_filled_1['total_bedrooms'].fillna(0)
data_filled_2['total_bedrooms'] = data_filled_2['total_bedrooms'].fillna(data_train['total_bedrooms'].mean())

In [27]:
X_train_1 = data_filled_1[features].iloc[:n_train]
X_train_2 = data_filled_2[features].iloc[:n_train]

### Fillna with 0

In [39]:
w_0, w_1 = train_linear_regression(X_train_1, y_train)

In [40]:
y_pred = w_0 + X_train_1.dot(w_1)

In [41]:
rmse(y_train, y_pred)

0.3413135910156669

In [42]:
X_val =  data_filled_1[features].iloc[n_train:n_train+n_val]
y_pred = w_0 + X_val.dot(w_1)

In [43]:
round(rmse(y_val, y_pred),2)

0.33

### Fillna with mean

In [44]:
w_0, w_1 = train_linear_regression(X_train_2, y_train)

In [45]:
y_pred = w_0 + X_train_2.dot(w_1)

In [46]:
rmse(y_train, y_pred)

0.3410416181032812

In [47]:
X_val = data_filled_2[features].iloc[n_train:n_train+n_val]
y_pred = w_0 + X_val.dot(w_1)

In [48]:
round(rmse(y_val, y_pred),2)

0.33

## Question 4

In [83]:
def train_linear_regression_reg(X, y, r=0.0):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    reg = r * np.eye(XTX.shape[0])
    XTX = XTX + reg

    XTX_inv = np.linalg.inv(XTX)
    w = XTX_inv.dot(X.T).dot(y)
    
    return w[0], w[1:]

In [57]:
X_train = data_filled_1[features].iloc[:n_train]
X_val = data_filled_1[features].iloc[n_train:n_train+n_val]
y_train = data_train['median_house_value']

for r in [0, 0.000001, 0.0001, 0.001, 0.01, 0.1, 1, 5, 10]:
    w_0, w = train_linear_regression_reg(X_train, y_train, r=r)
    y_pred = w_0 + X_val.dot(w)
    print('%6s' %r, rmse(y_val, y_pred))

     0 226285.51450739696
 1e-06 226285.50979091122
0.0001 226285.0430237888
 0.001 226280.80859623034
  0.01 226239.33167861842
   0.1 225897.65179384075
     1 224916.24516788416
     5 224694.61782650277
    10 224685.69939234183


## Question 5

In [68]:
rmse_list = []
for i in [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]:
    np.random.seed(i)

    n = len(data)

    n_val = int(0.2 * n)
    n_test = int(0.2 * n)
    n_train = n - (n_val + n_test)

    idx = np.arange(n)
    np.random.shuffle(idx)
    
    data_shuffled = data.iloc[idx]

    data_train = data_shuffled.iloc[:n_train].copy()
    data_val = data_shuffled.iloc[n_train:n_train+n_val].copy()
    data_test = data_shuffled.iloc[n_train+n_val:].copy()
    
    #data_cp = data_shuffled.copy()
    data_shuffled['total_bedrooms'] = data_shuffled['total_bedrooms'].fillna(0)
    X_train = data_shuffled[features].iloc[:n_train]
    y_train = data_train['median_house_value']
    y_val = data_val['median_house_value']
    w_0, w_1 = train_linear_regression(X_train, y_train)
    X_val = data_cp[features].iloc[n_train:n_train+n_val]
    y_pred = w_0 + X_val.dot(w_1)
    rmse_i = rmse(y_val, y_pred)
    rmse_list.append(rmse_i)

In [69]:
rmse_list

[67042.45571629998,
 68778.54165853202,
 68827.06735864774,
 69341.67786562562,
 68528.07371654388,
 64230.646211067404,
 75199.91443118553,
 69618.53684988635,
 70817.62892047862,
 66733.19790472598]

In [70]:
np.std(rmse_list)

2723.6552300961316

## Question 6

In [102]:
np.random.seed(9)

n = len(data)

n_val = int(0.2 * n)
n_test = int(0.2 * n)
n_train = n - (n_val + n_test)

idx = np.arange(n)
np.random.shuffle(idx)

data_shuffled = data.iloc[idx]

data_train = data_shuffled.iloc[:n_train].copy()
data_val = data_shuffled.iloc[n_train:n_train+n_val].copy()
data_test = data_shuffled.iloc[n_train+n_val:].copy()

#data_cp = data_shuffled.copy()
data_shuffled['total_bedrooms'] = data_shuffled['total_bedrooms'].fillna(0)
data_train_1 = pd.concat([data_train.reset_index(drop=True), data_val.reset_index(drop=True)]).reset_index(drop=True)
X_train = data_train_1[features]
y_train = data_train_1['median_house_value']
y_test = data_test['median_house_value']
w_0, w_1 = train_linear_regression_reg(X_train, y_train, r=0.001)
X_test = data_shuffled[features].iloc[n_train+n_val:]
y_pred = w_0 + X_test.dot(w_1)
rmse = rmse(y_test, y_pred)
print(rmse)

TypeError: 'numpy.float64' object is not callable

In [100]:
X_train

Unnamed: 0,latitude,longitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income
0,38.67,-121.80,10.0,2086.0,380.0,1073.0,378.0,4.5526
1,33.66,-117.97,22.0,3914.0,600.0,1871.0,607.0,5.8541
2,40.28,-124.25,32.0,1430.0,419.0,434.0,187.0,1.9417
3,32.69,-117.11,37.0,2395.0,627.0,2489.0,599.0,1.5933
4,34.21,-118.37,36.0,1392.0,326.0,1181.0,303.0,3.1563
...,...,...,...,...,...,...,...,...
16507,38.31,-122.30,34.0,1797.0,395.0,1162.0,407.0,3.4550
16508,34.27,-118.54,28.0,2309.0,300.0,931.0,302.0,6.7415
16509,33.93,-118.13,34.0,2122.0,517.0,1578.0,488.0,3.1496
16510,34.11,-118.37,42.0,5518.0,979.0,1863.0,957.0,8.5842
