# **Regression homework**


In [None]:
import numpy as np
import pandas as pd

In [None]:
# Source CSV for the car fuel-efficiency dataset.
url = "https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv"

# Load the file and keep only the four feature columns plus the target,
# in this exact order.
df = pd.read_csv(url).loc[
    :,
    [
        'engine_displacement',
        'horsepower',
        'vehicle_weight',
        'model_year',
        'fuel_efficiency_mpg',
    ],
]

# Preview the first ten rows.
df.head(10)

Unnamed: 0,engine_displacement,horsepower,vehicle_weight,model_year,fuel_efficiency_mpg
0,170,159.0,3413.433759,2003,13.231729
1,130,97.0,3149.664934,2007,13.688217
2,170,78.0,3079.038997,2018,14.246341
3,220,,2542.392402,2009,16.912736
4,210,140.0,3460.87099,2009,12.488369
5,190,,2484.883986,2008,17.271818
6,240,127.0,3006.542287,2012,13.210412
7,150,239.0,3638.65778,2020,12.848884
8,250,174.0,2714.21931,2016,16.823554
9,150,123.0,3509.036569,2005,12.298355


In [None]:
# Print the dtype and non-null count of every selected column.
df.info()

In [None]:
# Summary statistics with extra tail percentiles (1%, 5%, 95%, 99%);
# transposed so that each dataset column becomes one row of the table.
df.describe(include='all', percentiles=[.01,.05,.25,.5,.75,.95,.99]).T

### **<font color='red'>Question 1</font>**

There's one column with missing values. What is it?

- 'engine_displacement'
- <font color='green'>'horsepower'</font> ✅
- 'vehicle_weight'
- 'model_year'

In [None]:
# Names of the columns that contain at least one missing value.
df.columns[df.isna().any()].to_list()

['horsepower']

### **<font color='red'>Question 2</font>**

What's the median (50th percentile) for variable 'horsepower'?

- 49
- 99
- <font color='green'>149</font> ✅
- 199



In [None]:
# Median of 'horsepower'; pandas skips missing values by default.
df['horsepower'].median()

**Prepare and split the dataset**
- Shuffle the dataset (the filtered one you created above), use seed 42.
- Split your data in train/val/test sets, with 60%/20%/20% distribution.

Use the same code as in the lectures

In [None]:
def split_dataset(df, seed=42, val_ratio=0.2, test_ratio=0.2):
    """Shuffle ``df`` and split it into train/validation/test parts.

    The validation and test sizes are ``int(len(df) * ratio)``; the training
    part receives every remaining row.

    Args:
        df: DataFrame that has a ``fuel_efficiency_mpg`` column.
        seed: Value passed to ``np.random.seed`` before shuffling. Note that
            this reseeds NumPy's global random state.
        val_ratio: Fraction of rows assigned to the validation part.
        test_ratio: Fraction of rows assigned to the test part.

    Returns:
        ``(df_train, df_val, df_test, y_train, y_val, y_test)``: the three
        frames with a fresh 0..n-1 index (the target column is still present
        in them) and the matching ``fuel_efficiency_mpg`` value arrays.
    """
    total = len(df)
    val_size = int(total * val_ratio)
    test_size = int(total * test_ratio)
    train_size = total - val_size - test_size

    # Seed first, then shuffle the row positions in place.
    order = np.arange(total)
    np.random.seed(seed)
    np.random.shuffle(order)

    # Cut the shuffled positions at the train/val and val/test boundaries.
    boundaries = [train_size, train_size + val_size]
    parts = [
        df.iloc[positions].reset_index(drop=True)
        for positions in np.split(order, boundaries)
    ]
    targets = [part.fuel_efficiency_mpg.values for part in parts]

    return (*parts, *targets)

### **<font color='red'>Question 3</font>**
- We need to deal with missing values for the column from Q1.
- We have two options: fill it with 0 or with the mean of this variable.
- Try both options. For each, train a linear regression model without regularization using the code from the lessons.
- For computing the mean, use the training set only!
- Use the validation dataset to evaluate the models and compare the RMSE of each option.
- Round the RMSE scores to 2 decimal digits using round(score, 2)

Which option gives better RMSE?

Options:
- With 0
- <font color='green'>With mean</font> ✅
- Both are equally good

In [None]:
# Split with the function defaults: seed 42, 20% validation, 20% test, rest train.
df_train, df_val, df_test, y_train, y_val, y_test = split_dataset(df)

In [None]:
def train_linear_regression(X, y):
    """Fit an unregularized linear regression with the normal equation.

    A column of ones is placed in front of ``X`` so that the first solved
    weight acts as the intercept.

    Args:
        X: Feature matrix, one row per sample.
        y: Target values, one per row of ``X``.

    Returns:
        ``(w_0, w)``: the intercept and the array of feature weights.
    """
    n_rows = X.shape[0]
    design = np.column_stack([np.ones(n_rows), X])

    # Normal equation: w = (A^T A)^-1 A^T y, with A the augmented matrix.
    gram_inv = np.linalg.inv(design.T @ design)
    weights = (gram_inv @ design.T) @ y

    bias, coefs = weights[0], weights[1:]
    return bias, coefs

In [None]:
def rmse(y, y_pred):
    """Return the root mean squared error between ``y`` and ``y_pred``."""
    residual = y_pred - y
    return np.sqrt((residual * residual).mean())

In [None]:
# Feature columns fed to the models; the target 'fuel_efficiency_mpg' is not included.
base = ['engine_displacement', 'horsepower', 'vehicle_weight', 'model_year']

In [None]:
def X_fillna_zeros(df):
    """Return the ``base`` columns of ``df`` as an array, with missing values set to 0.

    ``df`` itself is not modified.
    """
    return df[base].fillna(0).values

In [None]:
def X_fillna_mean(df, mean=None):
    """Return the ``base`` columns of ``df`` as an array, imputing 'horsepower'.

    Missing 'horsepower' values are replaced by ``mean``. Only that column is
    imputed; the other feature columns are passed through unchanged. ``df``
    itself is not modified.

    Args:
        df: DataFrame containing the ``base`` columns.
        mean: Fill value for 'horsepower'. Pass the training-set mean
            explicitly to keep the result independent of notebook state.
            When omitted, it is computed from whatever the global
            ``df_train`` holds at call time (the original behavior); be
            aware that other cells in this notebook rebind ``df_train``.

    Returns:
        2-D NumPy array with one column per entry of ``base``.
    """
    if mean is None:
        # Backward-compatible fallback: read the global training frame.
        mean = df_train['horsepower'].mean()

    df_num = df[base].copy()
    df_num['horsepower'] = df_num['horsepower'].fillna(mean)
    return df_num.values

In [None]:
# Option 1: fill missing values with 0.
# Fit on the training split, then score on the validation split.
X_train = X_fillna_zeros(df_train)
w_0, w = train_linear_regression(X_train, y_train)

X_val = X_fillna_zeros(df_val)
y_pred = w_0 + X_val.dot(w)
# Validation RMSE, rounded to 2 decimals as the question asks.
np.round(rmse(y_val, y_pred), 2)

In [None]:
# Option 2: fill missing 'horsepower' values with the mean.
# X_fillna_mean takes the mean from df_train, so the validation rows are
# filled with the training mean rather than their own.
X_train = X_fillna_mean(df_train)
w_0, w = train_linear_regression(X_train, y_train)

X_val = X_fillna_mean(df_val)
y_pred = w_0 + X_val.dot(w)
# Validation RMSE, rounded to 2 decimals as the question asks.
np.round(rmse(y_val, y_pred), 2)

### **<font color='red'>Question 4</font>**
- Now let's train a regularized linear regression.
- For this question, fill the NAs with 0.
- Try different values of r from this list: [0, 0.01, 0.1, 1, 5, 10, 100].
- Use RMSE to evaluate the model on the validation dataset.
- Round the RMSE scores to 2 decimal digits.
- Which r gives the best RMSE?
- If there are multiple options, select the smallest r.

Options:
- 0
- <font color='green'>0.01</font> ✅
- 1
- 10
- 100


In [None]:
def train_linear_regression_reg(X, y, r=0.0):
    """Fit a linear regression with ridge-style regularization.

    ``r`` is added to every diagonal entry of the Gram matrix before it is
    inverted. Because the column of ones is part of that matrix, the
    intercept is regularized together with the feature weights.

    Args:
        X: Feature matrix, one row per sample.
        y: Target values, one per row of ``X``.
        r: Regularization strength; ``0.0`` leaves the Gram matrix unchanged.

    Returns:
        ``(w_0, w)``: the intercept and the array of feature weights.
    """
    n_rows = X.shape[0]
    design = np.column_stack([np.ones(n_rows), X])

    gram = design.T @ design
    gram = gram + r * np.eye(gram.shape[0])

    weights = (np.linalg.inv(gram) @ design.T) @ y
    return weights[0], weights[1:]

In [None]:
# Missing values are filled with 0 for this question.
X_train = X_fillna_zeros(df_train)
X_val = X_fillna_zeros(df_val)

# Sweep the regularization strength and print the validation RMSE for each
# value. Scores are printed with 4 decimals, i.e. more than the 2 the
# question asks for.
for r in (0, 0.01, 0.1, 1, 5, 10, 100):
    w_0, w = train_linear_regression_reg(X_train, y_train, r=r)
    y_pred = w_0 + X_val @ w
    print(str(r).rjust(6), np.round(rmse(y_val, y_pred), 4))

### **<font color='red'>Question 5</font>**
- We used seed 42 for splitting the data. Let's find out how selecting the seed influences our score.
- Try different seed values: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9].
- For each seed, do the train/validation/test split with 60%/20%/20% distribution.
- Fill the missing values with 0 and train a model without regularization.
- For each seed, evaluate the model on the validation dataset and collect the RMSE scores.
- What's the standard deviation of all the scores? To compute the standard deviation, use np.std.
- Round the result to 3 decimal digits (round(std, 3))

What's the value of std?

- 0.001
- <font color='green'>0.006</font> ✅ (closest option: the computed value is ≈ 0.00699, which rounds to 0.007)
- 0.060
- 0.600


*Note: Standard deviation shows how different the values are. If it's low, then all values are approximately the same. If it's high, the values are different. If standard deviation of scores is low, then our model is stable.*

In [None]:
# Validation RMSE for ten different shuffling seeds.
# NOTE: the loop rebinds the notebook-level df_train / df_val / y_train /
# y_val, so after this cell they hold the split made with the last seed (9).
rmse_scores = []
for seed in range(10):
    df_train, df_val, _, y_train, y_val, _ = split_dataset(df, seed=seed)

    X_train = X_fillna_zeros(df_train)
    X_val = X_fillna_zeros(df_val)

    w_0, w = train_linear_regression(X_train, y_train)
    y_pred = w_0 + X_val @ w

    rmse_score = rmse(y_val, y_pred)
    rmse_scores.append(rmse_score)

# Spread of the scores across seeds (np.std defaults to ddof=0).
np.round(np.std(rmse_scores), 10)

np.float64(0.0069894464)