In [40]:
import pandas as pd
import numpy as np

import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

In [3]:
!wget https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv

--2025-10-08 22:06:42--  https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 874188 (854K) [text/plain]
Saving to: ‘car_fuel_efficiency.csv’


2025-10-08 22:06:42 (39.5 MB/s) - ‘car_fuel_efficiency.csv’ saved [874188/874188]



In [100]:
# Prepared Functions
def prepare_X(df):
    df_num = df[base]
    df_num = df_num.fillna(0)
    X = df_num.values
    return X

def train_linear_regression(X, y):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    XTX_inv = np.linalg.inv(XTX)
    w = XTX_inv.dot(X.T).dot(y)
    
    return w[0], w[1:]

def train_linear_regression_reg(X, y, r=0.0):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    reg = r * np.eye(XTX.shape[0])
    XTX = XTX + reg

    XTX_inv = np.linalg.inv(XTX)
    w = XTX_inv.dot(X.T).dot(y)
    
    return w[0], w[1:]

def rmse(y, y_pred):
    error = y_pred - y
    mse = (error ** 2).mean()
    return np.sqrt(mse)

In [42]:
# read the csv and make sure it's there
df_full = pd.read_csv('car_fuel_efficiency.csv')
len(df_full)

9704

In [43]:
# select on the needed feature columns for df
base = ['engine_displacement','horsepower','vehicle_weight','model_year', 'fuel_efficiency_mpg']
df = df_full[keep_cols].copy()
len(df), df.head()

(9704,
    engine_displacement  horsepower  vehicle_weight  model_year  \
 0                  170       159.0     3413.433759        2003   
 1                  130        97.0     3149.664934        2007   
 2                  170        78.0     3079.038997        2018   
 3                  220         NaN     2542.392402        2009   
 4                  210       140.0     3460.870990        2009   
 
    fuel_efficiency_mpg  
 0            13.231729  
 1            13.688217  
 2            14.246341  
 3            16.912736  
 4            12.488369  )

# Question 1

In [44]:
na_counts = df.isna().sum()
na_counts

engine_displacement      0
horsepower             708
vehicle_weight           0
model_year               0
fuel_efficiency_mpg      0
dtype: int64

# Question 2

In [111]:
hp_median = df['horsepower'].median()
float(hp_median)

149.0

In [95]:
n = len(df)

n_val = int(n * 0.2)
n_test = int(n * 0.2)
n_train = n - n_val - n_test

n, n_val, n_test, n_train

(9704, 1940, 1940, 5824)

In [96]:
df_train = df.iloc[:n_train]
df_val = df.iloc[n_train:n_train+n_val]
df_test = df.iloc[n_train+n_val:]

In [97]:
idx = np.arange(n)

In [98]:
np.random.seed(42)
np.random.shuffle(idx)

In [99]:
df_train = df.iloc[idx[:n_train]]
df_val = df.iloc[idx[n_train:n_train+n_val]]
df_test = df.iloc[idx[n_train+n_val:]]

# should be shuffled if index is random looking
df_train.head()

Unnamed: 0,engine_displacement,horsepower,vehicle_weight,model_year,fuel_efficiency_mpg
483,220,144.0,2535.887591,2009,16.642943
7506,160,141.0,2741.170484,2019,16.298377
8795,230,155.0,2471.880237,2017,18.591822
1688,150,206.0,3748.164469,2015,11.818843
6217,300,111.0,2135.716359,2006,19.402209


# Question 3

In [71]:
# Prepare y vectors
y_train = df_train['fuel_efficiency_mpg'].values
y_val = df_val['fuel_efficiency_mpg'].values

# Option 1: fill with 0 see prepare_X() func
X_train_zero = prepare_X(df_train)
X_val_zero = prepare_X(df_val)
w0, w = train_linear_regression(X_train_zero, y_train)
y_pred = w0 + X_val_zero.dot(w)
rmse_zero = rmse(y_val, y_pred)

rmse_zero

np.float64(1.1802679318480694e-10)

In [91]:
# Option 2: fill with mean 
train_mean = df_train[na_feature].mean()
df_train_mean = df_train.copy()
df_val_mean = df_val.copy()
df_train_mean['horsepower'] = df_train_mean['horsepower'].fillna(train_mean)
df_val_mean['horsepower'] = df_val_mean['horsepower'].fillna(train_mean)

X_train_mean = prepare_X(df_train_mean)
X_val_mean = prepare_X(df_val_mean)
w0, w = train_linear_regression(X_train_mean, y_train)
y_pred = w0 + X_val_mean.dot(w)
rmse_mean = rmse(y_val, y_pred)

f'With zero: {rmse_zero}. With mean: {rmse_mean}'



'With zero: 1.1802679318480694e-10. With mean: 2.9358598219804703e-10'

# Question 4

In [101]:
r_list = [0, 0.01, 0.1, 1, 5, 10, 100]
scores = {}
for r in r_list:
    w0, w = train_linear_regression_reg(X_train_zero, y_train, r=r)
    y_pred = w0 + X_val_zero.dot(w)
    scores[r] = rmse(y_val, y_pred)

scores

{0: np.float64(1.1802679318480694e-10),
 0.01: np.float64(3.2654405592349402e-06),
 0.1: np.float64(3.224279380323949e-05),
 1: np.float64(0.00032091893538337645),
 5: np.float64(0.00160020032296472),
 10: np.float64(0.0031905329326770053),
 100: np.float64(0.030243772204872323)}

# Question 5

In [112]:
vals = [] 
for s in range(10):
    d = df.sample(frac=1.0, random_state=s) 
    n = len(d) 
    n_tr = int(0.6 * n) 
    n_va = int(0.2 * n) 
    d_tr = d.iloc[:n_tr].copy() 
    d_va = d.iloc[n_tr:n_tr+n_va].copy() 
    y_tr = d_tr['fuel_efficiency_mpg'].values 
    y_va = d_va['fuel_efficiency_mpg'].values 
    X_tr = prepare_X(d_tr) 
    X_va = prepare_X(d_va) 
    w0, w = train_linear_regression(X_tr, y_tr) 
    y_p = w0 + X_va.dot(w) 
    vals.append(rmse(y_va, y_p)) 

float(np.std(vals))

1.5258467355687043e-10

# Question 6

In [110]:
d = df.sample(frac=1.0, random_state=9)
n = len(d)
n_tr = int(0.6 * n)
n_va = int(0.2 * n)
d_tr = d.iloc[:n_tr].copy()
d_va = d.iloc[n_tr:n_tr+n_va].copy()
d_te = d.iloc[n_tr+n_va:].copy()

d_trval = pd.concat([d_tr, d_va], ignore_index=True)
y_trval = d_trval['fuel_efficiency_mpg'].values
y_te = d_te['fuel_efficiency_mpg'].values
X_trval = prepare_X(d_trval)
X_te = prepare_X(d_te)
w0, w = train_linear_regression_reg(X_trval, y_trval, r=0.001)
y_pred_te = w0 + X_te.dot(w)

float(rmse(y_te, y_pred_te))

2.457409262341462e-07