In [51]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_squared_error

In [24]:
data = 'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv'

In [25]:
df = pd.read_csv(data)

In [26]:
df = df[['engine_displacement','horsepower','vehicle_weight','model_year','fuel_efficiency_mpg']]

In [27]:
df['fuel_efficiency_mpg'].describe()

count    9704.000000
mean       14.985243
std         2.556468
min         6.200971
25%        13.267459
50%        15.006037
75%        16.707965
max        25.967222
Name: fuel_efficiency_mpg, dtype: float64

In [28]:
df.head()

Unnamed: 0,engine_displacement,horsepower,vehicle_weight,model_year,fuel_efficiency_mpg
0,170,159.0,3413.433759,2003,13.231729
1,130,97.0,3149.664934,2007,13.688217
2,170,78.0,3079.038997,2018,14.246341
3,220,,2542.392402,2009,16.912736
4,210,140.0,3460.87099,2009,12.488369


In [29]:
df.isnull().sum()

engine_displacement      0
horsepower             708
vehicle_weight           0
model_year               0
fuel_efficiency_mpg      0
dtype: int64

In [30]:
df['horsepower'].median()

np.float64(149.0)

In [31]:
df['horsepower'].fillna(df['horsepower'].mean(), inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['horsepower'].fillna(df['horsepower'].mean(), inplace=True)


## Prepare and split the dataset

In [32]:
n = len(df)

n_val = int(n * 0.2)
n_test = int(n * 0.2)
n_train = n - n_val - n_test

In [33]:
n

9704

In [34]:
n_val, n_test, n_train

(1940, 1940, 5824)

In [35]:
df_train = df.iloc[:n_train]
df_val = df.iloc[n_train:n_train+n_val]
df_test = df.iloc[n_train+n_val:]

In [36]:
idx = np.arange(n)

In [37]:
np.random.seed(2)
np.random.shuffle(idx)

In [38]:
df_train = df.iloc[idx[:n_train]]
df_val = df.iloc[idx[n_train:n_train+n_val]]
df_test = df.iloc[idx[n_train+n_val:]]

In [39]:
df_train.head()

Unnamed: 0,engine_displacement,horsepower,vehicle_weight,model_year,fuel_efficiency_mpg
246,170,164.0,2990.040917,2019,15.963019
8125,170,149.657292,2729.623741,2012,15.931964
1927,200,142.0,3126.513375,2019,14.284901
8235,200,148.0,3136.477901,2003,14.86521
424,230,141.0,3384.681613,2006,12.428822


In [40]:
len(df_train), len(df_val), len(df_test)

(5824, 1940, 1940)

In [41]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [42]:
df_train.head()

Unnamed: 0,engine_displacement,horsepower,vehicle_weight,model_year,fuel_efficiency_mpg
0,170,164.0,2990.040917,2019,15.963019
1,170,149.657292,2729.623741,2012,15.931964
2,200,142.0,3126.513375,2019,14.284901
3,200,148.0,3136.477901,2003,14.86521
4,230,141.0,3384.681613,2006,12.428822


In [43]:
y_train = np.log1p(df_train.fuel_efficiency_mpg.values)
y_val = np.log1p(df_val.fuel_efficiency_mpg.values)
y_test = np.log1p(df_test.fuel_efficiency_mpg.values)

In [44]:
del df_train['fuel_efficiency_mpg']
del df_val['fuel_efficiency_mpg']
del df_test['fuel_efficiency_mpg']

In [45]:
X_train = df_train
X_val = df_val
X_test = df_test

In [46]:
len(y_train)

5824

## Linear regression

In [54]:
def rmse(y_true, y_pred):
    return mean_squared_error(y_true, y_pred)


# --------------------
# Q3: Impute with 0 vs with mean (train mean only), LinearRegression (no regularization)
# --------------------
print("Q3: Imputation strategy comparison (0 vs mean)")

# a) with 0
X_train_0 = X_train.copy()
X_val_0   = X_val.copy()
for c in X_train_0.columns:
    X_train_0[c] = X_train_0[c].fillna(0)
    X_val_0[c]   = X_val_0[c].fillna(0)

lr0 = LinearRegression()
lr0.fit(X_train_0, y_train)
rmse_0 = rmse(y_val, lr0.predict(X_val_0))

# b) with mean (train only)
X_train_m = X_train.copy()
X_val_m   = X_val.copy()
means = X_train_m.mean(numeric_only=True)
for c in X_train_m.columns:
    X_train_m[c] = X_train_m[c].fillna(means[c])
    X_val_m[c]   = X_val_m[c].fillna(means[c])

lr_mean = LinearRegression()
lr_mean.fit(X_train_m, y_train)
rmse_mean = rmse(y_val, lr_mean.predict(X_val_m))

print(f"RMSE (fill 0):    {round(rmse_0, 2)}")
print(f"RMSE (fill mean): {round(rmse_mean, 2)}")

if abs(rmse_0 - rmse_mean) < 1e-9:
    q3_answer = "Both are equally good"
else:
    q3_answer = "With 0" if rmse_0 < rmse_mean else "With mean"
print("Q3 answer:", q3_answer)

# --------------------
# Q4: Regularized linear regression (Ridge) with NAs filled with 0
#     try r in [0, 0.01, 0.1, 1, 5, 10, 100]
#     choose the best on validation; if ties, smallest r
#     Then map to the closest among the allowed options: [0, 0.01, 1, 10, 100]
# --------------------
print("Q4: Ridge regularization sweep")

# prepare zero-imputed copies (reuse from above to keep logic clear)
X_train_r = X_train.copy().fillna(0)
X_val_r   = X_val.copy().fillna(0)

r_list = [0, 0.01, 0.1, 1, 5, 10, 100]
rmse_by_r = {}
for r in r_list:
    if r == 0:
        model = LinearRegression()
    else:
        model = Ridge(alpha=r, random_state=42)
    model.fit(X_train_r, y_train)
    y_pred = model.predict(X_val_r)
    rm = rmse(y_val, y_pred)
    rmse_by_r[r] = rm
    print(f"r={r:<5}: RMSE={round(rm, 2)}")

# pick best r (true best)
best_r_true = sorted(rmse_by_r.items(), key=lambda kv: (kv[1], kv[0]))[0][0]

# map to closest option set for the quiz
allowed_opts_q4 = np.array([0, 0.01, 1, 10, 100], dtype=float)
q4_choice = float(closest_option(best_r_true, allowed_opts_q4))
print(f"Best r (actual): {best_r_true}  -> Q4 choice (closest allowed): {q4_choice}")

# --------------------
# Q5: Stability across seeds 0..9
#     Split 60/20/20, fill NAs with 0, LinearRegression, get RMSE on validation, std of scores
# --------------------
print("Q5: Seed sensitivity (std of validation RMSE)")

seeds = list(range(10))
scores = []
for s in seeds:
    d = shuffle_df(df, seed=s)
    tr, va, te = split_60_20_20(d)
    Xtr = tr[FEATURES].fillna(0)
    ytr = tr[TARGET].values
    Xva = va[FEATURES].fillna(0)
    yva = va[TARGET].values

    m = LinearRegression()
    m.fit(Xtr, ytr)
    pred = m.predict(Xva)
    scores.append(rmse(yva, pred))

scores = np.array(scores, dtype=float)
std_scores = float(np.std(scores))
print("Validation RMSE per seed:", [round(s, 3) for s in scores])
print("STD of RMSE over seeds:", round(std_scores, 3))

# map to closest option
q5_choice = float(closest_option(std_scores, np.array([0.001, 0.006, 0.060, 0.600], dtype=float)))
print("Q5 choice (closest):", q5_choice)

# --------------------
# Q6: Seed 9, combine train+val, fill NAs with 0, train Ridge r=0.001, RMSE on test
# --------------------
print("Q6: Final model and test RMSE (seed=9, r=0.001, fill 0)")

seed = 9
d = shuffle_df(df, seed=seed)
tr, va, te = split_60_20_20(d)
tv = pd.concat([tr, va], axis=0).reset_index(drop=True)

X_tv  = tv[FEATURES].fillna(0)
y_tv  = tv[TARGET].values
X_tst = te[FEATURES].fillna(0)
y_tst = te[TARGET].values

final = Ridge(alpha=0.001, random_state=42)
final.fit(X_tv, y_tv)
rmse_test = rmse(y_tst, final.predict(X_tst))
print("Test RMSE:", round(rmse_test, 3))

# map to closest option
q6_choice = float(closest_option(rmse_test, np.array([0.15, 0.515, 5.15, 51.5])))
print("Q6 choice (closest):", q6_choice)

# --------------------
# Summary
# --------------------
print("Summary / Answers")
print(f"Q1: column with missing values -> {col_with_nas}")
# For Q2 pick closest among provided options [49, 99, 149, 199]
q2_choice = int(closest_option(median_hp, np.array([49, 99, 149, 199], dtype=float)))
print(f"Q2: horsepower median -> {median_hp} (closest option: {q2_choice})")
print(f"Q3: better imputation -> {q3_answer}")
print(f"Q4: best regularization r -> actual best {best_r_true} (closest option: {q4_choice})")
print(f"Q5: std of val RMSE across seeds -> {round(std_scores, 3)} (closest option: {q5_choice})")
print(f"Q6: test RMSE (seed=9, r=0.001) -> {round(rmse_test, 3)} (closest option: {q6_choice})")

Q3: Imputation strategy comparison (0 vs mean)
RMSE (fill 0):    0.0
RMSE (fill mean): 0.0
Q3 answer: Both are equally good
Q4: Ridge regularization sweep
r=0    : RMSE=0.0
r=0.01 : RMSE=0.0
r=0.1  : RMSE=0.0
r=1    : RMSE=0.0
r=5    : RMSE=0.0
r=10   : RMSE=0.0
r=100  : RMSE=0.0


NameError: name 'closest_option' is not defined