In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('car_fuel_efficiency.csv')
print(df.shape)

(9704, 11)


In [3]:
colscheck = ['engine_displacement',
'horsepower',
'vehicle_weight',
'model_year',
'fuel_efficiency_mpg']

In [4]:
df[colscheck].isnull().sum()

engine_displacement      0
horsepower             708
vehicle_weight           0
model_year               0
fuel_efficiency_mpg      0
dtype: int64

In [5]:
df['horsepower'].median()

149.0

In [6]:
df.head(1)

Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,origin,fuel_type,drivetrain,num_doors,fuel_efficiency_mpg
0,170,3.0,159.0,3413.433759,17.7,2003,Europe,Gasoline,All-wheel drive,0.0,13.231729


In [8]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

np.random.seed(42)
df = df.sample(frac=1, random_state=42)

n = len(df)
n_train = int(0.6 * n)
n_val = int(0.2 * n)
n_test = n - n_train - n_val

df_train = df.iloc[:n_train].copy()
df_val = df.iloc[n_train:n_train+n_val].copy()
df_test = df.iloc[n_train+n_val:].copy()

feature = colscheck
target = "fuel_efficiency_mpg"

In [11]:
df_val_mean.isnull().sum()

engine_displacement    0
horsepower             0
vehicle_weight         0
model_year             0
fuel_efficiency_mpg    0
dtype: int64

In [15]:
df_train_0 = df_train[feature+[target]].copy()
df_val_0 = df_val[feature+[target]].copy()
df_train_0['horsepower'] = df_train_0['horsepower'].fillna(0)
df_val_0['horsepower'] = df_val_0['horsepower'].fillna(0)

mean_value = df_train['horsepower'].mean()
df_train_mean = df_train[feature+[target]].copy()
df_val_mean = df_val[feature+[target]].copy()
df_train_mean['horsepower'] = df_train_mean['horsepower'].fillna(mean_value)
df_val_mean['horsepower'] = df_val_mean['horsepower'].fillna(mean_value)

def train_and_eval(df_tr, df_vl):
    model = LinearRegression()
    X_train = df_tr[feature].values
    y_train = df_tr[target].values
    X_val = df_vl[feature].values
    y_val = df_vl[target].values

    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    rmse = mean_squared_error(y_val, y_pred, squared=False)
    return round(rmse, 2)

In [16]:
rmse_0 = train_and_eval(df_train_0, df_val_0)
rmse_mean = train_and_eval(df_train_mean, df_val_mean)

print(f"RMSE (fill with 0): {rmse_0}")
print(f"RMSE (fill with mean): {rmse_mean}")

if rmse_0 < rmse_mean:
    print("👉 Better option: fill with 0")
elif rmse_mean < rmse_0:
    print("👉 Better option: fill with mean")
else:
    print("👉 Both are equally good")


RMSE (fill with 0): 0.0
RMSE (fill with mean): 0.0
👉 Both are equally good




In [18]:
from sklearn.linear_model import Ridge

df_train = df_train_0.copy()
df_val = df_val_0.copy()

X_train = df_train[feature].values
y_train = df_train[target].values
X_val = df_val[feature].values
y_val = df_val[target].values

r_list = [0, 0.01, 0.1, 1, 5, 10, 100]
rmse_scores = {}

for r in r_list:
    model = Ridge(alpha=r)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    rmse = mean_squared_error(y_val, y_pred, squared=False)
    rmse_scores[r] = round(rmse, 2)

for r, rmse in rmse_scores.items():
    print(f"r={r}: RMSE={rmse}")

best_r = min(rmse_scores, key=lambda x: (rmse_scores[x], x))
print(f"Best RMSE: {rmse_scores[best_r]} with r={best_r}")

r=0: RMSE=0.0
r=0.01: RMSE=0.0
r=0.1: RMSE=0.0
r=1: RMSE=0.0
r=5: RMSE=0.0
r=10: RMSE=0.0
r=100: RMSE=0.02
Best RMSE: 0.0 with r=0




In [20]:
rmse_scores = []

for seed in range(10):
    df_shuffled = df.sample(frac=1, random_state=seed)
    n = len(df_shuffled)
    n_train = int(0.6 * n)
    n_val = int(0.2 * n)

    df_train = df_shuffled.iloc[:n_train].copy()
    df_val = df_shuffled.iloc[n_train:n_train+n_val].copy()

    df_train[feature] = df_train[feature].fillna(0)
    df_val[feature] = df_val[feature].fillna(0)

    model = LinearRegression()
    model.fit(df_train[feature], df_train[target])

    y_pred = model.predict(df_val[feature])
    rmse = mean_squared_error(df_val[target], y_pred, squared=False)
    rmse_scores.append(rmse)

std_rmse = np.std(rmse_scores)
print("RMSE per seed:", [round(x, 3) for x in rmse_scores])
print("Standard Deviation:", round(std_rmse, 3))

RMSE per seed: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
Standard Deviation: 0.0




In [21]:
np.random.seed(9)
df = df.sample(frac=1, random_state=9)

n = len(df)
n_train = int(0.6 * n)
n_val = int(0.2 * n)
n_test = n - n_train - n_val

df_train = df.iloc[:n_train].copy()
df_val = df.iloc[n_train:n_train+n_val].copy()
df_test = df.iloc[n_train+n_val:].copy()

df_full_train = pd.concat([df_train, df_val]).reset_index(drop=True)

df_full_train[feature] = df_full_train[feature].fillna(0)
df_test[feature] = df_test[feature].fillna(0)

model = Ridge(alpha=0.001)
model.fit(df_full_train[feature], df_full_train[target])

y_pred = model.predict(df_test[feature])
rmse = mean_squared_error(df_test[target], y_pred, squared=False)

print("RMSE on test:", round(rmse, 3))

RMSE on test: 0.0


