In [2]:
import pandas as pd


In [3]:
import pandas as pd

df = pd.read_csv("https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv")
df.head()


Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,origin,fuel_type,drivetrain,num_doors,fuel_efficiency_mpg
0,170,3.0,159.0,3413.433759,17.7,2003,Europe,Gasoline,All-wheel drive,0.0,13.231729
1,130,5.0,97.0,3149.664934,17.8,2007,USA,Gasoline,Front-wheel drive,0.0,13.688217
2,170,,78.0,3079.038997,15.1,2018,Europe,Gasoline,Front-wheel drive,0.0,14.246341
3,220,4.0,,2542.392402,20.2,2009,USA,Diesel,All-wheel drive,2.0,16.912736
4,210,1.0,140.0,3460.87099,14.4,2009,Europe,Gasoline,All-wheel drive,2.0,12.488369


In [4]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error


In [5]:
cols = ['engine_displacement', 'horsepower', 'vehicle_weight', 'model_year', 'fuel_efficiency_mpg']
df = df[cols]


In [6]:
print("Dataset sample:")
print(df.head(), "\n")

Dataset sample:
   engine_displacement  horsepower  vehicle_weight  model_year  \
0                  170       159.0     3413.433759        2003   
1                  130        97.0     3149.664934        2007   
2                  170        78.0     3079.038997        2018   
3                  220         NaN     2542.392402        2009   
4                  210       140.0     3460.870990        2009   

   fuel_efficiency_mpg  
0            13.231729  
1            13.688217  
2            14.246341  
3            16.912736  
4            12.488369   



In [8]:
missing_col = df.isnull().sum().idxmax()
print(f"{missing_col}\n")


horsepower



In [21]:
median_hp = df['horsepower'].median()
print(f"Median horsepower: {median_hp}\n")

Median horsepower: 149.0



In [10]:
df_clean = df.copy()

In [11]:
np.random.seed(42)
df_clean = df_clean.sample(frac=1, random_state=42).reset_index(drop=True)


In [12]:
n = len(df_clean)
n_train = int(0.6 * n)
n_val = int(0.2 * n)
n_test = n - n_train - n_val

df_train = df_clean.iloc[:n_train]
df_val = df_clean.iloc[n_train:n_train+n_val]
df_test = df_clean.iloc[n_train+n_val:]

y_train = df_train.fuel_efficiency_mpg.values
y_val = df_val.fuel_efficiency_mpg.values
y_test = df_test.fuel_efficiency_mpg.values

X_train = df_train.drop('fuel_efficiency_mpg', axis=1)
X_val = df_val.drop('fuel_efficiency_mpg', axis=1)
X_test = df_test.drop('fuel_efficiency_mpg', axis=1)

In [13]:
mean_hp = X_train['horsepower'].mean()


In [14]:
X_train_0 = X_train.fillna(0)
X_val_0 = X_val.fillna(0)


In [15]:
X_train_mean = X_train.fillna(mean_hp)
X_val_mean = X_val.fillna(mean_hp)

In [17]:
lr0 = LinearRegression()
lr0.fit(X_train_0, y_train)
pred_val_0 = lr0.predict(X_val_0)
rmse_0 = np.sqrt(mean_squared_error(y_val, pred_val_0))

lr_mean = LinearRegression()
lr_mean.fit(X_train_mean, y_train)
pred_val_mean = lr_mean.predict(X_val_mean)
rmse_mean = np.sqrt(mean_squared_error(y_val, pred_val_mean))

print(f"RMSE with 0: {round(rmse_0, 2)}, RMSE with mean: {round(rmse_mean, 2)}")
if rmse_mean < rmse_0:
    print("Better option: Fill with mean\n")
else:
    print("Better option: Fill with 0\n")

RMSE with 0: 0.52, RMSE with mean: 0.46
Better option: Fill with mean



In [18]:
X_train_0 = X_train.fillna(0)
X_val_0 = X_val.fillna(0)

for r in [0, 0.01, 0.1, 1, 5, 10, 100]:
    model = Ridge(alpha=r)
    model.fit(X_train_0, y_train)
    pred_val = model.predict(X_val_0)
    rmse = np.sqrt(mean_squared_error(y_val, pred_val))
    print(f"r={r}: RMSE={round(rmse, 2)}")

print("\n → Best regularization (r) is 0.01\n")


r=0: RMSE=0.52
r=0.01: RMSE=0.52
r=0.1: RMSE=0.52
r=1: RMSE=0.52
r=5: RMSE=0.52
r=10: RMSE=0.52
r=100: RMSE=0.52

 → Best regularization (r) is 0.01



In [19]:
rmses = []
for seed in range(10):
    df_temp = df_clean.sample(frac=1, random_state=seed).reset_index(drop=True)
    n = len(df_temp)
    n_train = int(0.6 * n)
    n_val = int(0.2 * n)
    df_train = df_temp.iloc[:n_train]
    df_val = df_temp.iloc[n_train:n_train+n_val]

    y_train = df_train.fuel_efficiency_mpg.values
    y_val = df_val.fuel_efficiency_mpg.values
    X_train = df_train.drop('fuel_efficiency_mpg', axis=1).fillna(0)
    X_val = df_val.drop('fuel_efficiency_mpg', axis=1).fillna(0)

    model = LinearRegression()
    model.fit(X_train, y_train)
    pred_val = model.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, pred_val))
    rmses.append(rmse)

std = np.std(rmses)
print(f"RMSE standard deviation: {round(std, 3)}\n")

RMSE standard deviation: 0.008



In [20]:
df_temp = df_clean.sample(frac=1, random_state=9).reset_index(drop=True)
n_train = int(0.6 * len(df_temp))
n_val = int(0.2 * len(df_temp))
df_train = df_temp.iloc[:n_train + n_val]
df_test = df_temp.iloc[n_train + n_val:]

y_train_full = df_train.fuel_efficiency_mpg.values
y_test = df_test.fuel_efficiency_mpg.values
X_train_full = df_train.drop('fuel_efficiency_mpg', axis=1).fillna(0)
X_test = df_test.drop('fuel_efficiency_mpg', axis=1).fillna(0)

model = Ridge(alpha=0.001)
model.fit(X_train_full, y_train_full)
pred_test = model.predict(X_test)
rmse_test = np.sqrt(mean_squared_error(y_test, pred_test))

print(f"RMSE on test set: {round(rmse_test, 3)}\n")

RMSE on test set: 0.529

