In [1]:
import os
import requests
import io
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_squared_error

In [2]:
# Source CSV for the car fuel-efficiency dataset.
url = "https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv"
print("Downloading dataset...")
# requests applies no timeout by default, so a stalled connection would hang
# the kernel forever; bound the wait and fail loudly instead.
# The result is named `response` (not `r`) because later cells use `r` for the
# regularization strength.
response = requests.get(url, timeout=30)
response.raise_for_status()  # raise on HTTP 4xx/5xx before attempting to parse
# Parse the downloaded text in memory; nothing is written to disk.
df = pd.read_csv(io.StringIO(response.text))


Downloading dataset...


In [3]:
# Columns used in this notebook: four numeric predictors plus the target.
cols = [
    'engine_displacement',
    'horsepower',
    'vehicle_weight',
    'model_year',
    'fuel_efficiency_mpg',
]
# Keep only those columns, as an independent copy of the loaded frame.
df = df.loc[:, cols].copy()

In [4]:
# Coerce every selected column to a numeric dtype; values that cannot be
# parsed become NaN (errors='coerce') rather than raising.
df = df.assign(
    **{name: pd.to_numeric(df[name], errors='coerce') for name in cols}
)

In [5]:
print("\n--- EDA ---")
print("fuel_efficiency_mpg summary:")
mpg = df['fuel_efficiency_mpg']
print(mpg.describe())

# Sample skewness of the target; |skew| > 1 is used below as the cut-off for
# calling the distribution long-tailed.
skewness = mpg.skew()
print(f"Skewness of fuel_efficiency_mpg: {skewness:.3f}")
tail_note = (
    "This suggests a long (right) tail." if skewness > 1
    else "This suggests a long left tail." if skewness < -1
    else "No very long tail indicated by skewness."
)
print(tail_note)


--- EDA ---
fuel_efficiency_mpg summary:
count    9704.000000
mean       14.985243
std         2.556468
min         6.200971
25%        13.267459
50%        15.006037
75%        16.707965
max        25.967222
Name: fuel_efficiency_mpg, dtype: float64
Skewness of fuel_efficiency_mpg: -0.012
No very long tail indicated by skewness.


In [6]:
print("\n--- Missing values per column ---")
# Number of NaN entries in each column.
na_counts = df.isna().sum()
print(na_counts)

# Q1: names of the columns that contain at least one missing value,
# in the frame's column order.
q1_col = [column for column, n_missing in na_counts.items() if n_missing > 0]
print("\nAnswer Q1: Column(s) with missing values:", q1_col)


--- Missing values per column ---
engine_displacement      0
horsepower             708
vehicle_weight           0
model_year               0
fuel_efficiency_mpg      0
dtype: int64

Answer Q1: Column(s) with missing values: ['horsepower']


In [7]:
# Report the Q1 answer, distinguishing zero, one, and several affected columns.
if not q1_col:
    # Without this guard an empty list would be reported under the
    # "(multiple)" label, which is misleading.
    print("Q1 final answer: no columns with missing values")
elif len(q1_col) == 1:
    print("Q1 final answer:", q1_col[0])
else:
    print("Q1 final answer (multiple):", q1_col)

Q1 final answer: horsepower


In [8]:
# Q2: median of the horsepower column (pandas skips NaN entries by default).
hp_series = df.loc[:, 'horsepower']
median_hp = hp_series.median()
print(f"\nAnswer Q2: median(horsepower) = {median_hp}")


Answer Q2: median(horsepower) = 149.0


In [9]:
def split_data(df_filtered, seed=42, train_frac=0.6, val_frac=0.2):
    """Shuffle rows reproducibly and split them into train/validation/test frames.

    Parameters
    ----------
    df_filtered : pandas.DataFrame
        Rows to split. It is only read positionally (``.iloc``) and is not
        modified.
    seed : int, default 42
        Seed for the ``np.random.RandomState`` that shuffles the row positions.
    train_frac : float, default 0.6
        Fraction of rows assigned to the training frame (rounded down).
    val_frac : float, default 0.2
        Fraction of rows assigned to the validation frame (rounded down).
        The test frame receives every remaining row.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_train, df_val, df_test)``, each with a fresh ``0..k-1`` index.

    Raises
    ------
    ValueError
        If a fraction is negative, or if the requested train and validation
        sizes together exceed the number of rows.
    """
    if train_frac < 0 or val_frac < 0:
        raise ValueError("train_frac and val_frac must be non-negative")

    n = len(df_filtered)
    n_train = int(train_frac * n)
    n_val = int(val_frac * n)
    if n_train + n_val > n:
        raise ValueError("train_frac + val_frac must not exceed 1")

    # Shuffle positions (not labels) so the split ignores the incoming index.
    idx = np.arange(n)
    rng = np.random.RandomState(seed)
    rng.shuffle(idx)

    train_idx = idx[:n_train]
    val_idx = idx[n_train:n_train + n_val]
    test_idx = idx[n_train + n_val:]  # remainder, so no row is dropped

    df_train = df_filtered.iloc[train_idx].reset_index(drop=True)
    df_val = df_filtered.iloc[val_idx].reset_index(drop=True)
    df_test = df_filtered.iloc[test_idx].reset_index(drop=True)

    return df_train, df_val, df_test

In [10]:
# Column the models in this notebook are trained to predict.
TARGET = 'fuel_efficiency_mpg'

# Predictor columns fed to the models, in design-matrix order.
FEATURES = [
    'engine_displacement',
    'horsepower',
    'vehicle_weight',
    'model_year',
]

In [16]:
def train_and_eval(df_train, df_val, fill_strategy='zero', reg=None):
    """Fit a linear model on ``df_train`` and return its RMSE on ``df_val``.

    Parameters
    ----------
    df_train, df_val : pandas.DataFrame
        Frames containing the ``FEATURES`` columns and the ``TARGET`` column.
    fill_strategy : {'zero', 'mean'}, default 'zero'
        How missing feature values are imputed. ``'zero'`` fills with 0;
        ``'mean'`` fills with the per-column means of the *training* features,
        and the same means are applied to the validation features.
    reg : float or None, default None
        ``None`` or ``0`` fits ``LinearRegression``; any other value is passed
        to ``Ridge`` as ``alpha``.

    Returns
    -------
    float
        Square root of ``mean_squared_error`` between the validation target
        and the model's predictions.

    Raises
    ------
    ValueError
        If ``fill_strategy`` is neither ``'zero'`` nor ``'mean'``.
    """
    features_train = df_train[FEATURES].copy()
    features_val = df_val[FEATURES].copy()
    target_train = df_train[TARGET].values
    target_val = df_val[TARGET].values

    # Choose the replacement for NaNs once, then apply it to both frames.
    if fill_strategy == 'zero':
        fill_value = 0
    elif fill_strategy == 'mean':
        # Means come from the training features only.
        fill_value = features_train.mean()
    else:
        raise ValueError("fill_strategy must be 'zero' or 'mean'")
    features_train = features_train.fillna(fill_value)
    features_val = features_val.fillna(fill_value)

    unregularized = reg is None or reg == 0
    model = LinearRegression() if unregularized else Ridge(alpha=reg, random_state=0)

    model.fit(features_train, target_train)
    val_predictions = model.predict(features_val)
    return mean_squared_error(target_val, val_predictions) ** 0.5

In [17]:
print("\n--- Question 3: missing-value strategy comparison ---")
seed = 42
df_train, df_val, df_test = split_data(df, seed=seed)

# Score the unregularized model once per imputation strategy.
rmse_zero, rmse_mean = (
    train_and_eval(df_train, df_val, fill_strategy=strategy, reg=None)
    for strategy in ('zero', 'mean')
)

# The strategies are compared at two-decimal precision.
rmse_zero_2dp = round(rmse_zero, 2)
rmse_mean_2dp = round(rmse_mean, 2)
print("RMSE (fill with 0):", rmse_zero_2dp)
print("RMSE (fill with mean):", rmse_mean_2dp)

q3_answer = (
    "With 0" if rmse_zero_2dp < rmse_mean_2dp
    else "With mean" if rmse_mean_2dp < rmse_zero_2dp
    else "Both are equally good"
)

print("Answer Q3:", q3_answer)


--- Question 3: missing-value strategy comparison ---
RMSE (fill with 0): 0.52
RMSE (fill with mean): 0.46
Answer Q3: With mean


In [18]:
print("\n--- Question 4: regularized linear regression (fill NA with 0) ---")
# Candidate regularization strengths; 0 means "no regularization".
r_list = [0, 0.01, 0.1, 1, 5, 10, 100]
rmse_by_r = {}
# df_train / df_val must already exist from an earlier split_data call.
for r in r_list:
    reg = r if r != 0 else None
    rmse = train_and_eval(df_train, df_val, fill_strategy='zero', reg=reg)
    rmse_by_r[r] = round(rmse, 2)
    print(f"r={r} -> RMSE={rmse_by_r[r]}")


def _rmse_then_r(candidate):
    """Sort key: lowest rounded RMSE first, ties broken by the smaller r."""
    return rmse_by_r[candidate], candidate


best_r = min(rmse_by_r, key=_rmse_then_r)
print("Answer Q4: best r =", best_r)


--- Question 4: regularized linear regression (fill NA with 0) ---
r=0 -> RMSE=0.52
r=0.01 -> RMSE=0.52
r=0.1 -> RMSE=0.52
r=1 -> RMSE=0.52
r=5 -> RMSE=0.52
r=10 -> RMSE=0.52
r=100 -> RMSE=0.52
Answer Q4: best r = 0


In [19]:
print("\n--- Question 5: effect of seed on RMSE ---")
seeds = list(range(10))
rmse_seeds = []
# Re-split with each seed and score the zero-filled, unregularized model on
# that split's validation frame (the test frame is not used here).
for s in seeds:
    tr, va, te = split_data(df, seed=s)
    rmse_s = train_and_eval(tr, va, fill_strategy='zero', reg=None)
    rmse_seeds += [rmse_s]
    print(f"seed={s} -> RMSE={rmse_s:.6f}")

# Population standard deviation (NumPy's default, ddof=0) of the scores.
std_rmse = float(np.asarray(rmse_seeds).std())
std_rmse_3dp = round(std_rmse, 3)
print("Standard deviation of RMSEs across seeds:", std_rmse_3dp)
print("Answer Q5:", std_rmse_3dp)


--- Question 5: effect of seed on RMSE ---
seed=0 -> RMSE=0.521099
seed=1 -> RMSE=0.521842
seed=2 -> RMSE=0.523038
seed=3 -> RMSE=0.516122
seed=4 -> RMSE=0.511187
seed=5 -> RMSE=0.528683
seed=6 -> RMSE=0.532242
seed=7 -> RMSE=0.509526
seed=8 -> RMSE=0.514908
seed=9 -> RMSE=0.513133
Standard deviation of RMSEs across seeds: 0.007
Answer Q5: 0.007


In [23]:
print("\n--- Question 6: final model and test RMSE ---")
seed = 9
r = 0.001
df_train, df_val, df_test = split_data(df, seed=seed)

# The final model is trained on train + validation and scored on test.
df_train_val = pd.concat([df_train, df_val], ignore_index=True)

# Missing feature values are filled with 0 in both frames.
X_train_val, X_test = (
    part[FEATURES].fillna(0) for part in (df_train_val, df_test)
)
y_train_val, y_test = (
    part[TARGET].values for part in (df_train_val, df_test)
)

model_final = Ridge(alpha=r, random_state=0)
model_final.fit(X_train_val, y_train_val)
preds_test = model_final.predict(X_test)

mse_test = mean_squared_error(y_test, preds_test)
rmse_test = mse_test ** 0.5
print(f"Test RMSE (seed=9, r=0.001, fill=0): {rmse_test:.6f}")
print("Answer Q6 (rounded where appropriate):", round(rmse_test, 3))


--- Question 6: final model and test RMSE ---
Test RMSE (seed=9, r=0.001, fill=0): 0.515492
Answer Q6 (rounded where appropriate): 0.515
