In [40]:
import pandas as pd
import numpy as np

import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

In [3]:
!wget https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv

--2025-10-08 22:06:42--  https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 874188 (854K) [text/plain]
Saving to: ‘car_fuel_efficiency.csv’


2025-10-08 22:06:42 (39.5 MB/s) - ‘car_fuel_efficiency.csv’ saved [874188/874188]



In [41]:
# Prepared Functions
def prepare_X(df):
    df_num = df[base]
    df_num = df_num.fillna(0)
    X = df_num.values
    return X

def train_linear_regression(X, y):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    XTX_inv = np.linalg.inv(XTX)
    w = XTX_inv.dot(X.T).dot(y)
    
    return w[0], w[1:]

def train_linear_regression_reg(X, y, r=0.0):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    reg = r * np.eye(XTX.shape[0])
    XTX = XTX + reg

    XTX_inv = np.linalg.inv(XTX)
    w = XTX_inv.dot(X.T).dot(y)
    
    return w[0], w[1:]

def rmse(y, y_pred):
    error = y_pred - y
    mse = (error ** 2).mean()
    return np.sqrt(mse)

In [42]:
# read the csv and make sure it's there
df_full = pd.read_csv('car_fuel_efficiency.csv')
len(df_full)

9704

In [43]:
# select on the needed feature columns for df
base = ['engine_displacement','horsepower','vehicle_weight','model_year', 'fuel_efficiency_mpg']
df = df_full[keep_cols].copy()
len(df), df.head()

(9704,
    engine_displacement  horsepower  vehicle_weight  model_year  \
 0                  170       159.0     3413.433759        2003   
 1                  130        97.0     3149.664934        2007   
 2                  170        78.0     3079.038997        2018   
 3                  220         NaN     2542.392402        2009   
 4                  210       140.0     3460.870990        2009   
 
    fuel_efficiency_mpg  
 0            13.231729  
 1            13.688217  
 2            14.246341  
 3            16.912736  
 4            12.488369  )

# Question 1

In [44]:
na_counts = df.isna().sum()
na_counts

engine_displacement      0
horsepower             708
vehicle_weight           0
model_year               0
fuel_efficiency_mpg      0
dtype: int64

# Question 2

In [25]:
df['horsepower'].median()

np.float64(149.0)

# Question 3

In [26]:
seed = 42
df_shuffled = df.sample(frac=1.0, random_state=seed)
n = len(df_shuffled)

n_train = int(0.6 * n)
n_val = int(0.2 * n)

df_train = df_shuffled.iloc[:n_train].copy()
df_val = df_shuffled.iloc[n_train:n_train+n_val].copy()
df_test = df_shuffled.iloc[n_train+n_val:].copy()

len(df_train), len(df_val), len(df_test)

(5822, 1940, 1942)

In [29]:
# Identify the feature with missing values
na_feature = na_counts.idxmax()

# Prepare y vectors
y_train = df_train['fuel_efficiency_mpg'].values
y_val = df_val['fuel_efficiency_mpg'].values

# Option 1: fill with 0 
X_train_zero = prepare_X(df_train)
X_val_zero = prepare_X(df_val)
w0_z, w_z = train_linear_regression(X_train_zero, y_train)
y_pred_z = w0_z + X_val_zero.dot(w_z)
rmse_zero = rmse(y_val, y_pred_z)
rmse_zero

np.float64(2.1932345197329491e-10)

In [31]:
# Option 2: fill with mean 
train_mean = df_train[na_feature].mean()
df_train_mean = df_train.copy()
df_val_mean = df_val.copy()
df_train_mean[na_feature] = df_train_mean[na_feature].fillna(train_mean)
df_val_mean[na_feature] = df_val_mean[na_feature].fillna(train_mean)

X_train_mean = prepare_X(df_train_mean)
X_val_mean = prepare_X(df_val_mean)
w0_m, w_m = train_linear_regression(X_train_mean, y_train)
y_pred_m = w0_m + X_val_mean.dot(w_m)
rmse_mean = rmse(y_val, y_pred_m)
f'With zero: {rmse_zero}. With mean: {rmse_mean}' 

'With zero: 2.1932345197329491e-10. With mean: 5.073792192183386e-11'

# Question 4

In [36]:
r_list = [0, 0.01, 0.1, 1, 5, 10, 100]
scores = {}
for r in r_list:
    w0, w = train_linear_regression_reg(X_train_zero, y_train, r=r)
    y_pred = w0 + X_val_zero.dot(w)
    scores[r] = rmse(y_val, y_pred)
scores

{0: np.float64(2.1932345197329491e-10),
 0.01: np.float64(3.2644013317314747e-06),
 0.1: np.float64(3.2236575981105124e-05),
 1: np.float64(0.0003208855341629006),
 5: np.float64(0.001600053469139714),
 10: np.float64(0.003190244640006075),
 100: np.float64(0.03024096704220114)}

# Question 5

In [55]:
vals = []
for s in range(10):
    d = df.sample(frac=1.0, random_state=s+42)
    n = len(d)
    n_tr = int(0.6 * n)
    n_va = int(0.2 * n)
    d_tr = d.iloc[:n_tr].copy()
    d_va = d.iloc[n_tr:n_tr+n_va].copy()

    y_tr = d_tr['fuel_efficiency_mpg'].values
    y_va = d_va['fuel_efficiency_mpg'].values
    X_tr = prepare_X(d_tr)
    X_va = prepare_X(d_va)

    w0, w = train_linear_regression(X_tr, y_tr)
    y_p = w0 + X_va.dot(w)
    vals.append(rmse(y_va, y_p))

float(np.std(vals))

1.665296561747579e-10

# Question 6

In [38]:
d = df.sample(frac=1.0, random_state=9)
n = len(d)
n_tr = int(0.6 * n)
n_va = int(0.2 * n)
d_tr = d.iloc[:n_tr].copy()
d_va = d.iloc[n_tr:n_tr+n_va].copy()
d_te = d.iloc[n_tr+n_va:].copy()

d_trval = pd.concat([d_tr, d_va], ignore_index=True)
y_trval = d_trval['fuel_efficiency_mpg'].values
y_te = d_te['fuel_efficiency_mpg'].values
X_trval = prepare_X(d_trval)
X_te = prepare_X(d_te)
w0, w = train_linear_regression_reg(X_trval, y_trval, r=0.001)
y_pred_te = w0 + X_te.dot(w)
rmse(y_te, y_pred_te)

np.float64(2.457409262341462e-07)