In [83]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_squared_error
from math import sqrt

In [84]:
data = 'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv'

In [85]:
df = pd.read_csv(data)

In [86]:
df = df[['engine_displacement','horsepower','vehicle_weight','model_year','fuel_efficiency_mpg']]

In [87]:
df['fuel_efficiency_mpg'].describe()

count    9704.000000
mean       14.985243
std         2.556468
min         6.200971
25%        13.267459
50%        15.006037
75%        16.707965
max        25.967222
Name: fuel_efficiency_mpg, dtype: float64

In [88]:
df.head()

Unnamed: 0,engine_displacement,horsepower,vehicle_weight,model_year,fuel_efficiency_mpg
0,170,159.0,3413.433759,2003,13.231729
1,130,97.0,3149.664934,2007,13.688217
2,170,78.0,3079.038997,2018,14.246341
3,220,,2542.392402,2009,16.912736
4,210,140.0,3460.87099,2009,12.488369


In [89]:
df.isnull().sum()

engine_displacement      0
horsepower             708
vehicle_weight           0
model_year               0
fuel_efficiency_mpg      0
dtype: int64

In [90]:
df['horsepower'].median()

np.float64(149.0)

In [91]:
# df['horsepower'].fillna(df['horsepower'].mean(), inplace=True)


## Prepare and split the dataset

In [92]:
n = len(df)

n_val = int(n * 0.2)
n_test = int(n * 0.2)
n_train = n - n_val - n_test

In [93]:
n

9704

In [94]:
n_val, n_test, n_train

(1940, 1940, 5824)

In [95]:
df_train = df.iloc[:n_train]
df_val = df.iloc[n_train:n_train+n_val]
df_test = df.iloc[n_train+n_val:]

In [96]:
idx = np.arange(n)

In [97]:
np.random.seed(2)
np.random.shuffle(idx)

In [98]:
df_train = df.iloc[idx[:n_train]]
df_val = df.iloc[idx[n_train:n_train+n_val]]
df_test = df.iloc[idx[n_train+n_val:]]

In [99]:
df_train.head()

Unnamed: 0,engine_displacement,horsepower,vehicle_weight,model_year,fuel_efficiency_mpg
246,170,164.0,2990.040917,2019,15.963019
8125,170,,2729.623741,2012,15.931964
1927,200,142.0,3126.513375,2019,14.284901
8235,200,148.0,3136.477901,2003,14.86521
424,230,141.0,3384.681613,2006,12.428822


In [100]:
len(df_train), len(df_val), len(df_test)

(5824, 1940, 1940)

In [101]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [102]:
df_train.head()

Unnamed: 0,engine_displacement,horsepower,vehicle_weight,model_year,fuel_efficiency_mpg
0,170,164.0,2990.040917,2019,15.963019
1,170,,2729.623741,2012,15.931964
2,200,142.0,3126.513375,2019,14.284901
3,200,148.0,3136.477901,2003,14.86521
4,230,141.0,3384.681613,2006,12.428822


In [103]:
y_train = np.log1p(df_train.fuel_efficiency_mpg.values)
y_val = np.log1p(df_val.fuel_efficiency_mpg.values)
y_test = np.log1p(df_test.fuel_efficiency_mpg.values)

In [104]:
del df_train['fuel_efficiency_mpg']
del df_val['fuel_efficiency_mpg']
del df_test['fuel_efficiency_mpg']

In [105]:
X_train = df_train
X_val = df_val
X_test = df_test

In [106]:
len(y_train)

5824

## Linear regression

In [110]:
# =========================
# Q3: Imputación (0 vs media-TRAIN) + LinearRegression
# =========================

# a) Imputar con 0
X_train_0 = X_train.copy().fillna(0)
X_val_0   = X_val.copy().fillna(0)
lr0 = LinearRegression()
lr0.fit(X_train_0, y_train)
rmse_0 = sqrt(mean_squared_error(y_val, lr0.predict(X_val_0)))

# b) Imputar con la media calculada SOLO en train
X_train_m = X_train.copy()
X_val_m   = X_val.copy()
means = X_train_m.mean(numeric_only=True)
X_train_m = X_train_m.fillna(means)
X_val_m   = X_val_m.fillna(means)
lr_mean = LinearRegression()
lr_mean.fit(X_train_m, y_train)
rmse_mean = sqrt(mean_squared_error(y_val, lr_mean.predict(X_val_m)))

print(f"Q3 RMSE (fill 0):    {round(rmse_0, 2)}")
print(f"Q3 RMSE (fill mean): {round(rmse_mean, 2)}")
if abs(rmse_0 - rmse_mean) < 1e-12:
    print("Q3 -> Both are equally good")
else:
    print("Q3 ->", "With 0" if rmse_0 < rmse_mean else "With mean")

# =========================
# Q4: Ridge con NAs=0, barrer r y elegir mejor por RMSE (val)
# =========================
r_list = [0, 0.01, 0.1, 1, 5, 10, 100]
X_train_r = X_train.copy().fillna(0)
X_val_r   = X_val.copy().fillna(0)

rmse_by_r = {}
for r in r_list:
    model = LinearRegression() if r == 0 else Ridge(alpha=r)
    model.fit(X_train_r, y_train)
    pred = model.predict(X_val_r)
    rm = sqrt(mean_squared_error(y_val, pred))
    rmse_by_r[r] = rm
    print(f"Q4 r={r:<6} -> RMSE={round(rm, 2)}")

# mejor r real (si empate, el más pequeño)
best_r = sorted(rmse_by_r.items(), key=lambda kv: (kv[1], kv[0]))[0][0]
print("Q4 best r (real):", best_r)

# mapear a opciones permitidas del enunciado
allowed_q4 = np.array([0, 0.01, 1, 10, 100], dtype=float)
closest = allowed_q4[np.argmin(np.abs(allowed_q4 - float(best_r)))]
print("Q4 choice (closest option):", float(closest))


features = ['engine_displacement','horsepower','vehicle_weight','model_year']
target   = 'fuel_efficiency_mpg'
# ======================
# Q5: calcular std de RMSE en validación con seeds 0..9
# ======================
rmses = []

for seed in range(10):
    # barajar con semilla
    df_shuffled = df.sample(frac=1, random_state=seed).reset_index(drop=True)
    
    # split 60/20/20
    n = len(df_shuffled)
    n_train = int(0.6*n)
    n_val   = int(0.2*n)
    
    df_train = df_shuffled.iloc[:n_train].copy()
    df_val   = df_shuffled.iloc[n_train:n_train+n_val].copy()
    df_test  = df_shuffled.iloc[n_train+n_val:].copy()
    
    X_train, y_train = df_train[features].fillna(0), df_train[target].values
    X_val,   y_val   = df_val[features].fillna(0),   df_val[target].values

    model = LinearRegression()
    model.fit(X_train, y_train)
    pred = model.predict(X_val)
    rmse = sqrt(mean_squared_error(y_val, pred))
    rmses.append(rmse)

std_rmse = np.std(rmses)
print("Q5 -> STD of validation RMSE:", round(std_rmse, 3))

# ======================
# Q6: seed=9, entrenar con train+val y evaluar en test con Ridge r=0.001
# ======================
seed = 9
df_shuffled = df.sample(frac=1, random_state=seed).reset_index(drop=True)

n = len(df_shuffled)
n_train = int(0.6*n)
n_val   = int(0.2*n)

df_train = df_shuffled.iloc[:n_train].copy()
df_val   = df_shuffled.iloc[n_train:n_train+n_val].copy()
df_test  = df_shuffled.iloc[n_train+n_val:].copy()

# combinar train+val
df_trainval = pd.concat([df_train, df_val]).reset_index(drop=True)

X_trainval, y_trainval = df_trainval[features].fillna(0), df_trainval[target].values
X_test,     y_test     = df_test[features].fillna(0),     df_test[target].values

final_model = Ridge(alpha=0.001)
final_model.fit(X_trainval, y_trainval)
pred_test = final_model.predict(X_test)
rmse_test = sqrt(mean_squared_error(y_test, pred_test))

print("Q6 -> Test RMSE:", round(rmse_test, 3))

Q3 RMSE (fill 0):    0.04
Q3 RMSE (fill mean): 0.04
Q3 -> With mean
Q4 r=0      -> RMSE=0.04
Q4 r=0.01   -> RMSE=0.04
Q4 r=0.1    -> RMSE=0.04
Q4 r=1      -> RMSE=0.04
Q4 r=5      -> RMSE=0.04
Q4 r=10     -> RMSE=0.04
Q4 r=100    -> RMSE=0.04
Q4 best r (real): 100
Q4 choice (closest option): 100.0
Q5 -> STD of validation RMSE: 0.007
Q6 -> Test RMSE: 0.515
