# Homework №2

In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline


Download dataset from [Kaggle](https://www.kaggle.com/datasets/camnugent/california-housing-prices) and load into pandas DataFrame

In [4]:
df = pd.read_csv("housing.csv")

Remove unnecessary column

In [5]:
df.drop(df.columns[[-1]], axis=1, inplace=True)

**Question 1:**
Find a feature with missing values. How many missing values does it have?

In [6]:
df.isnull().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
dtype: int64

**Question 2**
What's the median (50% percentile) for variable 'population'?

In [7]:
df['population'].median()

1166.0

**Split the data**<br>Shuffle the initial dataset, use seed 42.<br>Split your data in train/val/test sets, with 60%/20%/20% distribution.


In [8]:
np.random.seed(42)

n = len(df)

n_val = int(0.2 * n)
n_test = int(0.2 * n)
n_train = n - (n_val + n_test)

idx = np.arange(n)
np.random.shuffle(idx)

df_shuffled = df.iloc[idx]

df_train = df_shuffled.iloc[:n_train].copy()
df_val = df_shuffled.iloc[n_train:n_train+n_val].copy()
df_test = df_shuffled.iloc[n_train+n_val:].copy()


Target value ('median_house_value') is not in dataframe.
Apply the log transformation to the median_house_value variable using the np.log1p() function

In [9]:
y_train_orig = df_train.median_house_value.values
y_val_orig = df_val.median_house_value.values
y_test_orig = df_test.median_house_value.values

y_train = np.log1p(df_train.median_house_value.values)
y_val = np.log1p(df_val.median_house_value.values)
y_test = np.log1p(df_test.median_house_value.values)

del df_train['median_house_value']
del df_val['median_house_value']
del df_test['median_house_value']


**Question 3**

In [10]:
base = ['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income']

Fill missing values with zeros

In [11]:
def prepare_X_zero(df):
    df_num = df[base]
    df_num = df_num.fillna(0)
    X = df_num.values
    return X

In [12]:
def train_linear_regression(X, y):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    XTX_inv = np.linalg.inv(XTX)
    w = XTX_inv.dot(X.T).dot(y)
    
    return w[0], w[1:]


In [13]:
X_train = prepare_X_zero(df_train)
w_0, w = train_linear_regression(X_train, y_train)
y_pred = w_0 + X_train.dot(w)

In [14]:
def rmse(y, y_pred):
    error = y_pred - y
    mse = (error ** 2).mean()
    return np.sqrt(mse)


In [15]:
rmse_zero = rmse(y_train, y_pred)
round(rmse_zero, 2)

0.34

In [16]:
X_val = prepare_X_zero(df_val)
y_pred = w_0 + X_val.dot(w)


In [17]:
rmse_zero = rmse(y_val, y_pred)
round(rmse_zero, 2)

0.33

Fill missing values with mean

In [18]:
df_train_mean = df_train.total_bedrooms.mean()

In [19]:
def prepare_X_mean(df):
    df_num = df[base]
    df_num = df_num.fillna(int(df_train_mean))
    X = df_num.values
    return X

In [20]:
X_train = prepare_X_mean(df_train)
w_0, w = train_linear_regression(X_train, y_train)
y_pred = w_0 + X_train.dot(w)

In [21]:
rmse_mean = rmse(y_train, y_pred)
round(rmse_mean,2)

0.34

In [22]:
X_val = prepare_X_mean(df_val)
y_pred = w_0 + X_val.dot(w)

In [23]:
rmse_mean = rmse(y_val, y_pred)
round(rmse_mean,2)

0.33

**Question 4**<br>Train a regularized linear regression.

In [24]:
def train_linear_regression_reg(X, y, r=0.0):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    reg = r * np.eye(XTX.shape[0])
    XTX = XTX + reg

    XTX_inv = np.linalg.inv(XTX)
    w = XTX_inv.dot(X.T).dot(y)
    
    return w[0], w[1:]


In [25]:
X_train = prepare_X_zero(df_train)
X_val = prepare_X_zero(df_val)

for r in [0.000001, 0.0001, 0.001, 0.01, 0.1, 1, 5, 10]:
    w_0, w = train_linear_regression_reg(X_train, y_train, r=r)
    y_pred = w_0 + X_val.dot(w)
    print('%6s, %.2f' % (r, rmse(y_val, y_pred)))

 1e-06, 0.33
0.0001, 0.33
 0.001, 0.33
  0.01, 0.33
   0.1, 0.33
     1, 0.33
     5, 0.34
    10, 0.34


**Question 5**

Seed 0

In [26]:
np.random.seed(0)

n = len(df)

n_val = int(0.2 * n)
n_test = int(0.2 * n)
n_train = n - (n_val + n_test)

idx = np.arange(n)
np.random.shuffle(idx)

df_shuffled = df.iloc[idx]

df_train = df_shuffled.iloc[:n_train].copy()
df_val = df_shuffled.iloc[n_train:n_train+n_val].copy()
df_test = df_shuffled.iloc[n_train+n_val:].copy()


In [27]:
y_train_orig = df_train.median_house_value.values
y_val_orig = df_val.median_house_value.values
y_test_orig = df_test.median_house_value.values

y_train = np.log1p(df_train.median_house_value.values)
y_val = np.log1p(df_val.median_house_value.values)
y_test = np.log1p(df_test.median_house_value.values)

del df_train['median_house_value']
del df_val['median_house_value']
del df_test['median_house_value']


In [28]:
X_train = prepare_X_zero(df_train)
w_0, w = train_linear_regression(X_train, y_train)
y_pred = w_0 + X_train.dot(w)

In [29]:
rmse(y_train, y_pred)

0.34289366609207744

In [30]:
X_val = prepare_X_zero(df_val)
y_pred = w_0 + X_val.dot(w)

In [31]:
rmse(y_val, y_pred)

0.33884304805306176

In the loop try different seed values: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9].

In [32]:
rmse_array = []
for i in range(10):
    np.random.seed(i)

    n = len(df)

    n_val = int(0.2 * n)
    n_test = int(0.2 * n)
    n_train = n - (n_val + n_test)

    idx = np.arange(n)
    np.random.shuffle(idx)

    df_shuffled = df.iloc[idx]

    df_train = df_shuffled.iloc[:n_train].copy()
    df_val = df_shuffled.iloc[n_train:n_train+n_val].copy()
    df_test = df_shuffled.iloc[n_train+n_val:].copy()

    y_train_orig = df_train.median_house_value.values
    y_val_orig = df_val.median_house_value.values
    y_test_orig = df_test.median_house_value.values

    y_train = np.log1p(df_train.median_house_value.values)
    y_val = np.log1p(df_val.median_house_value.values)
    y_test = np.log1p(df_test.median_house_value.values)

    del df_train['median_house_value']
    del df_val['median_house_value']
    del df_test['median_house_value']

    X_train = prepare_X_zero(df_train)
    w_0, w = train_linear_regression(X_train, y_train)

    X_val = prepare_X_zero(df_val)
    y_pred = w_0 + X_val.dot(w)
    rmse_array.append(rmse(y_val, y_pred))

print(rmse_array)


[0.33884304805306176, 0.3362387255955556, 0.3320912318832111, 0.34051536090332046, 0.3389024066570592, 0.3434866725716007, 0.34519809530968937, 0.3395989927408897, 0.3466230873188141, 0.3365926124142404]


Compute the standard deviation.
Round the result to 3 decimal digits (round(std, 3))

In [33]:
print(round(np.std(rmse_array),3))


0.004


**Question 6**<br>Split the dataset like previously, use seed 9.
Combine train and validation datasets

In [46]:
np.random.seed(9)

n = len(df)

n_val = int(0.2 * n)
n_test = int(0.2 * n)
n_train = n - (n_val + n_test)

idx = np.arange(n)
np.random.shuffle(idx)

df_shuffled = df.iloc[idx]

df_train = df_shuffled.iloc[:n_train].copy()
df_val = df_shuffled.iloc[n_train:n_train+n_val].copy()
df_test = df_shuffled.iloc[n_train+n_val:].copy()
df_train_and_val = df_shuffled.iloc[:n_train+n_val].copy() #combined train and validation datasets


y_train_orig = df_train.median_house_value.values
y_val_orig = df_val.median_house_value.values
y_test_orig = df_test.median_house_value.values
y_train_and_val_orig = df_train_and_val.median_house_value.values

y_train = np.log1p(df_train.median_house_value.values)
y_val = np.log1p(df_val.median_house_value.values)
y_test = np.log1p(df_test.median_house_value.values)
y_train_and_val = np.log1p(df_train_and_val.median_house_value.values)


del df_train['median_house_value']
del df_val['median_house_value']
del df_test['median_house_value']

X_train_and_val = prepare_X_zero(df_train_and_val)
X_val = prepare_X_zero(df_val)

w_0, w = train_linear_regression_reg(X_train_and_val, y_train_and_val, r=0.001)
y_pred = w_0 + X_train_and_val.dot(w)

round(rmse(y_train_and_val, y_pred),2)


0.34