In [4]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_squared_error

Read Data

In [2]:
# Location of the raw car fuel-efficiency CSV (served from GitHub, so loading it needs network access).
url = "https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv"

In [5]:
# Read the whole CSV at `url` into a DataFrame.
df = pd.read_csv(url)
# Bare last expression: the notebook renders a preview of the frame.
df

Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,origin,fuel_type,drivetrain,num_doors,fuel_efficiency_mpg
0,170,3.0,159.0,3413.433759,17.7,2003,Europe,Gasoline,All-wheel drive,0.0,13.231729
1,130,5.0,97.0,3149.664934,17.8,2007,USA,Gasoline,Front-wheel drive,0.0,13.688217
2,170,,78.0,3079.038997,15.1,2018,Europe,Gasoline,Front-wheel drive,0.0,14.246341
3,220,4.0,,2542.392402,20.2,2009,USA,Diesel,All-wheel drive,2.0,16.912736
4,210,1.0,140.0,3460.870990,14.4,2009,Europe,Gasoline,All-wheel drive,2.0,12.488369
...,...,...,...,...,...,...,...,...,...,...,...
9699,140,5.0,164.0,2981.107371,17.3,2013,Europe,Diesel,Front-wheel drive,,15.101802
9700,180,,154.0,2439.525729,15.0,2004,USA,Gasoline,All-wheel drive,0.0,17.962326
9701,220,2.0,138.0,2583.471318,15.1,2008,USA,Diesel,All-wheel drive,-1.0,17.186587
9702,230,4.0,177.0,2905.527390,19.4,2011,USA,Diesel,Front-wheel drive,1.0,15.331551


In [6]:
# Columns kept for the rest of the notebook: four predictors plus the target.
columns = [
    'engine_displacement',
    'horsepower',
    'vehicle_weight',
    'model_year',
    'fuel_efficiency_mpg',
]
# .copy() gives an independent frame rather than a view of the loaded data.
df = df.loc[:, columns].copy()

def rmse(y_true, y_pred):
    """Return the root mean squared error between targets and predictions.

    Parameters
    ----------
    y_true : array-like
        Ground-truth target values.
    y_pred : array-like
        Predicted values; passed to ``mean_squared_error`` together with ``y_true``.

    Returns
    -------
    NumPy float scalar
        ``np.sqrt(mean_squared_error(y_true, y_pred))``.
    """
    # `mean_squared_error` is already imported in the notebook's imports cell,
    # so the per-call function-scope import was redundant and has been removed.
    return np.sqrt(mean_squared_error(y_true, y_pred))

#### Q1. There's one column with missing values. What is it?

In [12]:
# Boolean Series indexed by column name: True where the column holds any null.
missing_columns = df.isna().any()
flagged = missing_columns.index[missing_columns]

# First flagged column in frame order, or None when nothing is missing.
if len(flagged) > 0:
    col_nm = flagged[0]
else:
    col_nm = None

print(f"Column with missing values: {col_nm}")

Column with missing values: horsepower


#### Q2. What's the median (50th percentile) of the variable 'horsepower'?

In [13]:
# Series.median() ignores NaN entries by default, so no imputation is needed first.
horsepower_median = df['horsepower'].median()
print(f"The median of 'horsepower' : {horsepower_median}")

The median of 'horsepower' : 149.0


In [14]:
def split_df(base_df, seed=42, train_frac=0.6, val_frac=0.2):
    """Shuffle ``base_df`` and split it into train / validation / test frames.

    Parameters
    ----------
    base_df : pandas.DataFrame
        Frame to split; it is not modified.
    seed : int, default 42
        ``random_state`` passed to ``DataFrame.sample`` for the shuffle.
    train_frac : float, default 0.6
        Fraction of rows assigned to the training part.
    val_frac : float, default 0.2
        Fraction of rows assigned to the validation part. Whatever remains
        after train and validation becomes the test part.

    Returns
    -------
    tuple of (train, validation, test) DataFrames
        Independent copies. Row labels come from the shuffled frame after its
        index was reset, so they run 0..n-1 across the three parts.

    Raises
    ------
    ValueError
        If a fraction is outside [0, 1] or the two fractions sum to more than 1.
    """
    fractions_ok = (
        0 <= train_frac <= 1
        and 0 <= val_frac <= 1
        and train_frac + val_frac <= 1 + 1e-9  # tolerate float round-off in the sum
    )
    if not fractions_ok:
        raise ValueError(
            "train_frac and val_frac must each lie in [0, 1] and sum to at most 1; "
            f"got train_frac={train_frac}, val_frac={val_frac}"
        )

    shuffled = base_df.sample(frac=1.0, random_state=seed).reset_index(drop=True)
    n = len(shuffled)
    # int() truncates, so any rounding remainder ends up in the test part.
    n_train = int(train_frac * n)
    n_val = int(val_frac * n)
    val_end = n_train + n_val
    return (
        shuffled.iloc[:n_train].copy(),
        shuffled.iloc[n_train:val_end].copy(),
        shuffled.iloc[val_end:].copy(),
    )

# Predictor columns passed to the models, and the column being predicted.
features = ['engine_displacement','horsepower','vehicle_weight','model_year']
target = 'fuel_efficiency_mpg'

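#### Q3. Fill the missing values with 0 or with the training-set mean — which option gives the better validation RMSE?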

In [16]:
# Seed-42 shuffle and 60/20/20 split of the reduced frame.
dtr, dvl, dte = split_df(df, seed=42)

# Feature matrices (missing values still present) for train and validation.
Xtr = dtr[features].copy()
Xvl = dvl[features].copy()

# Targets as plain NumPy arrays.
ytr = dtr[target].values
yvl = dvl[target].values

In [19]:
# Option A: replace every missing value with 0.
# fillna() already returns a new frame, so no explicit .copy() is needed.
Xtr0, Xvl0 = Xtr.fillna(0), Xvl.fillna(0)
m0 = LinearRegression().fit(Xtr0, ytr)
rmse0 = round(rmse(yvl, m0.predict(Xvl0)), 2)

# Option B: replace missing values with the column mean, computed on the
# training split only and then reused for the validation split.
train_means = {col: Xtr[col].mean() for col in Xtr.columns}
# Every column is imputed from its training mean (not only columns that have
# NaN in train), so a column that is complete in train but has gaps in
# validation no longer reaches predict() with NaN.
Xtrm, Xvlm = Xtr.fillna(train_means), Xvl.fillna(train_means)
mm = LinearRegression().fit(Xtrm, ytr)
rmsem = round(rmse(yvl, mm.predict(Xvlm)), 2)

# The comparison is made on the scores rounded to 2 decimals.
if rmse0 < rmsem:
    better = "With 0"
elif rmsem < rmse0:
    better = "With mean"
else:
    better = "Both are equally good"
print(f"Q3 RMSE → fill-0: {rmse0} | fill-mean: {rmsem} | Answer: {better}")

Q3 RMSE → fill-0: 0.52 | fill-mean: 0.46 | Answer: With mean


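#### Q4. Fill the missing values with 0 and train a regularized regression — which `r` gives the best validation RMSE?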

In [20]:
# Candidate regularization strengths, listed in ascending order.
r_list = [0, 0.01, 0.1, 1, 5, 10, 100]

# Zero-imputed feature matrices and targets from the current train/validation split.
Xtr0 = dtr[features].fillna(0)
Xvl0 = dvl[features].fillna(0)
ytr = dtr[target].values
yvl = dvl[target].values

scores = []
for r in r_list:
    # r == 0 is fitted with plain LinearRegression instead of Ridge.
    if r == 0:
        model = LinearRegression()
    else:
        model = Ridge(alpha=r, random_state=42)
    model.fit(Xtr0, ytr)
    s = round(rmse(yvl, model.predict(Xvl0)), 2)
    scores.append(s)
    print(f"r={r:<6} RMSE={s}")

# min() returns the first minimal pair, so ties on the rounded RMSE resolve
# to the earliest entry of r_list.
best_r, best_rmse = min(zip(r_list, scores), key=lambda pair: pair[1])
print("Q4 best r:", best_r, "with RMSE:", best_rmse)

r=0      RMSE=0.52
r=0.01   RMSE=0.52
r=0.1    RMSE=0.52
r=1      RMSE=0.52
r=5      RMSE=0.52
r=10     RMSE=0.52
r=100    RMSE=0.52
Q4 best r: 0 with RMSE: 0.52


#### Q5. How much does the validation RMSE vary across split seeds 0–9 (standard deviation)?

In [21]:
# Q5: seeds 0–9, split 60/20/20, fill 0, no regularization, std of val RMSE
# Seed-local names are used so the loop does not overwrite dtr/dvl/Xtr/Xvl/ytr/yvl,
# which the Q3 and Q4 cells read; re-running those cells after this one stays correct.
vals = []
for s in range(10):
    train_s, val_s, _ = split_df(df, seed=s)
    X_train_s = train_s[features].fillna(0)
    X_val_s = val_s[features].fillna(0)
    m = LinearRegression().fit(X_train_s, train_s[target].values)
    # Unrounded validation RMSE for this seed.
    vals.append(rmse(val_s[target].values, m.predict(X_val_s)))
# np.std defaults to the population standard deviation (ddof=0).
std = round(np.std(vals), 3)
print("Q5 std across seeds 0–9:", std)

Q5 std across seeds 0–9: 0.007


#### Q6. Using seed 9, train on train + validation with `r=0.001` — what is the RMSE on the test set?

In [25]:
# Q6: seed=9, train on train+val, fill 0, r=0.001, test RMSE
dtr, dvl, dte = split_df(df, seed=9)

# Stack the train and validation rows into a single fitting set, then zero-fill.
train_part = dtr[features]
val_part = dvl[features]
Xtr_full = pd.concat([train_part, val_part]).fillna(0)
ytr_full = np.concatenate((dtr[target].values, dvl[target].values))

# Held-out test split, imputed the same way.
Xte = dte[features].fillna(0)
yte = dte[target].values

ridge = Ridge(alpha=0.001, random_state=9)
ridge.fit(Xtr_full, ytr_full)
rmse_test = rmse(yte, ridge.predict(Xte))
print("Q6 test RMSE:", rmse_test, " rounded:", round(rmse_test, 3))

Q6 test RMSE: 0.5154915324831211  rounded: 0.515
