In [37]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

In [5]:
# Read the raw records from the CSV (path is relative to the working directory)
csv_path = 'car_fuel_efficiency.csv'
df = pd.read_csv(csv_path)

In [6]:
# Count NaN entries per column, then report only the columns that have any
missing_values = df.isna().sum()
has_missing = missing_values > 0
print("Columns with missing values:")
print(missing_values[has_missing])

Columns with missing values:
num_cylinders    482
horsepower       708
acceleration     930
num_doors        502
dtype: int64


In [11]:
# Median horsepower. The column is coerced to numeric on the fly because the
# dedicated coercion cell only runs after this one: any non-numeric entry
# becomes NaN and is skipped by median() instead of raising.
horsepower_median = pd.to_numeric(df['horsepower'], errors='coerce').median()
print(f"The median of horsepower is {horsepower_median}")

The median of horsepower is 149.0


In [13]:
# Coerce horsepower to a numeric dtype; entries that cannot be parsed become NaN
horsepower_numeric = pd.to_numeric(df['horsepower'], errors='coerce')
df['horsepower'] = horsepower_numeric

In [14]:
# Mean-impute every float64/int64 column, one column at a time (no inplace=True).
# NOTE(review): this runs on the whole frame before any train/validation/test
# split, so the fill values are computed from all rows. It also fills
# horsepower, which means the later "fill horsepower with 0 vs. mean"
# experiment presumably has no NaNs left to fill -- confirm the intended order.
numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns
for column_name in numeric_columns:
    column = df[column_name]
    df[column_name] = column.fillna(column.mean())
    

In [15]:
# Replace each object-dtype column with indicator columns, dropping the first
# level of every encoded column (drop_first=True)
categorical_columns = df.select_dtypes(include=['object']).columns
df = pd.get_dummies(df, columns=categorical_columns, drop_first=True)


In [22]:
# Separate the regression target from the predictor columns
target_column = 'fuel_efficiency_mpg'
y = df[target_column]
X = df.drop(columns=[target_column])

In [18]:
# Display the current state of df (the notebook shows only the first and last rows)
df


Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,num_doors,fuel_efficiency_mpg,origin_Europe,origin_USA,fuel_type_Gasoline,drivetrain_Front-wheel drive
0,170,3.000000,159.000000,3413.433759,17.7,2003,0.000000,13.231729,True,False,True,False
1,130,5.000000,97.000000,3149.664934,17.8,2007,0.000000,13.688217,False,True,True,True
2,170,3.962481,78.000000,3079.038997,15.1,2018,0.000000,14.246341,True,False,True,True
3,220,4.000000,149.657292,2542.392402,20.2,2009,2.000000,16.912736,False,True,False,False
4,210,1.000000,140.000000,3460.870990,14.4,2009,2.000000,12.488369,True,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...
9699,140,5.000000,164.000000,2981.107371,17.3,2013,-0.006412,15.101802,True,False,False,True
9700,180,3.962481,154.000000,2439.525729,15.0,2004,0.000000,17.962326,False,True,True,False
9701,220,2.000000,138.000000,2583.471318,15.1,2008,-1.000000,17.186587,False,True,False,False
9702,230,4.000000,177.000000,2905.527390,19.4,2011,1.000000,15.331551,False,True,False,True


In [20]:
# Inspect the row index of df
df.index

RangeIndex(start=0, stop=9704, step=1)

In [21]:
# Preview the first rows of df
df.head()

Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,num_doors,fuel_efficiency_mpg,origin_Europe,origin_USA,fuel_type_Gasoline,drivetrain_Front-wheel drive
0,170,3.0,159.0,3413.433759,17.7,2003,0.0,13.231729,True,False,True,False
1,130,5.0,97.0,3149.664934,17.8,2007,0.0,13.688217,False,True,True,True
2,170,3.962481,78.0,3079.038997,15.1,2018,0.0,14.246341,True,False,True,True
3,220,4.0,149.657292,2542.392402,20.2,2009,2.0,16.912736,False,True,False,False
4,210,1.0,140.0,3460.87099,14.4,2009,2.0,12.488369,True,False,True,False


In [25]:
# Two-stage split: hold out 20% as the test set, then take 25% of the
# remaining 80% as validation -> 60% train / 20% validation / 20% test
split_seed = 42
X_full, X_test, y_full, y_test = train_test_split(
    X, y, test_size=0.2, random_state=split_seed
)
X_train, X_val, y_train, y_val = train_test_split(
    X_full, y_full, test_size=0.25, random_state=split_seed
)

In [26]:
# Option 1: copies of the train/validation features with horsepower NaNs set to 0.
# NOTE(review): the earlier mean-imputation cell fills every float64/int64
# column of df before the split, so there are presumably no NaNs left in
# horsepower here and this fill changes nothing -- confirm.
X_train_zero = X_train.assign(horsepower=X_train['horsepower'].fillna(0))
X_val_zero = X_val.assign(horsepower=X_val['horsepower'].fillna(0))

In [27]:
# Standardize features; the scaler is fitted on the training split only and
# then applied to both training and validation data
scaler_zero = StandardScaler()
scaler_zero.fit(X_train_zero)
X_train_zero_scaled = scaler_zero.transform(X_train_zero)
X_val_zero_scaled = scaler_zero.transform(X_val_zero)

In [28]:
# Fit an ordinary least-squares model on the zero-fill, scaled training data
model_zero = LinearRegression()
model_zero.fit(X=X_train_zero_scaled, y=y_train)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [29]:
# Validation RMSE for the zero-fill variant, reported to two decimals
y_pred_zero = model_zero.predict(X_val_zero_scaled)
mse_zero = mean_squared_error(y_val, y_pred_zero)
rmse_zero = np.sqrt(mse_zero)
rmse_zero_rounded = round(rmse_zero, 2)
print(f"RMSE (fill with 0): {rmse_zero_rounded}")

RMSE (fill with 0): 0.39


In [30]:
# Option 2: copies of the train/validation features with horsepower NaNs set
# to the horsepower mean of the training split.
# NOTE(review): the earlier mean-imputation cell fills every float64/int64
# column of df before the split, so there are presumably no NaNs left in
# horsepower here and this fill changes nothing -- confirm.
horsepower_mean = X_train['horsepower'].mean()
X_train_mean = X_train.assign(horsepower=X_train['horsepower'].fillna(horsepower_mean))
X_val_mean = X_val.assign(horsepower=X_val['horsepower'].fillna(horsepower_mean))

In [31]:
# Standardize features; the scaler is fitted on the training split only and
# then applied to both training and validation data
scaler_mean = StandardScaler()
scaler_mean.fit(X_train_mean)
X_train_mean_scaled = scaler_mean.transform(X_train_mean)
X_val_mean_scaled = scaler_mean.transform(X_val_mean)

In [32]:
# Fit an ordinary least-squares model on the mean-fill, scaled training data
model_mean = LinearRegression()
model_mean.fit(X=X_train_mean_scaled, y=y_train)


0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [33]:
# Validation RMSE for the mean-fill variant, reported to two decimals
y_pred_mean = model_mean.predict(X_val_mean_scaled)
mse_mean = mean_squared_error(y_val, y_pred_mean)
rmse_mean = np.sqrt(mse_mean)
rmse_mean_rounded = round(rmse_mean, 2)
print(f"RMSE (fill with mean): {rmse_mean_rounded}")

RMSE (fill with mean): 0.39


In [34]:
# Decide which fill strategy wins, comparing the RMSEs rounded to two decimals
if rmse_zero_rounded < rmse_mean_rounded:
    verdict = "Filling missing values with 0 gives better RMSE."
elif rmse_zero_rounded > rmse_mean_rounded:
    verdict = "Filling missing values with the mean gives better RMSE."
else:
    verdict = "Both options give the same RMSE."
print(verdict)

Both options give the same RMSE.


In [35]:
# Candidate regularization strengths r (used as the Ridge alpha)
r_values = [0, 0.01, 0.1, 1, 5, 10, 100]

# Maps each r to its rounded validation RMSE; filled in by the evaluation loop
rmse_results = dict()



In [39]:
# Train one linear model per regularization strength r on the mean-fill,
# scaled training data and record its validation RMSE rounded to two decimals.
for r in r_values:
    # scikit-learn advises against Ridge(alpha=0) for numerical reasons and
    # recommends LinearRegression for the unregularized case, so r == 0 is
    # routed to ordinary least squares.
    model = LinearRegression() if r == 0 else Ridge(alpha=r)
    model.fit(X_train_mean_scaled, y_train)

    # Predict on validation set
    y_pred = model.predict(X_val_mean_scaled)

    # Compute RMSE
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    rmse_rounded = round(rmse, 2)
    rmse_results[r] = rmse_rounded
    print(f"RMSE (r={r}): {rmse_rounded}")


RMSE (r=0): 0.39
RMSE (r=0.01): 0.39
RMSE (r=0.1): 0.39
RMSE (r=1): 0.39
RMSE (r=5): 0.39
RMSE (r=10): 0.39
RMSE (r=100): 0.39


In [40]:
# Pick the lowest rounded RMSE; when several r values tie, keep the smallest r
best_rmse = min(rmse_results.values())
tied_r_values = [
    candidate for candidate, score in rmse_results.items() if score == best_rmse
]
best_r = min(tied_r_values)
print(f"\nBest r: {best_r} with RMSE: {rmse_results[best_r]}")
print(f"Answer: {best_r}")


Best r: 0 with RMSE: 0.39
Answer: 0


In [41]:
# Random seeds 0 through 9 used to repeat the split-and-fit experiment
seeds = list(range(10))

In [42]:
# Accumulator for the unrounded validation RMSE obtained with each seed
rmse_scores = list()

In [43]:
# Reset the accumulator inside this cell so that re-running it does not append
# a second batch of scores and skew the standard deviation computed afterwards.
rmse_scores = []

# For every seed: re-split the data, standardize, fit an unregularized linear
# model and collect the (unrounded) validation RMSE.
for seed in seeds:
    # Split the data: 60% train, 20% validation, 20% test
    X_full, X_test, y_full, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)
    X_train, X_val, y_train, y_val = train_test_split(X_full, y_full, test_size=0.25, random_state=seed)

    # Scale features (scaler fitted on the training split only)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)

    # Train linear regression model (no regularization)
    model = LinearRegression()
    model.fit(X_train_scaled, y_train)

    # Predict on validation set
    y_pred = model.predict(X_val_scaled)

    # Compute RMSE; the raw value is stored, rounding is for display only
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    rmse_scores.append(rmse)
    print(f"RMSE (seed={seed}): {round(rmse, 2)}")

RMSE (seed=0): 0.4
RMSE (seed=1): 0.4
RMSE (seed=2): 0.4
RMSE (seed=3): 0.39
RMSE (seed=4): 0.4
RMSE (seed=5): 0.4
RMSE (seed=6): 0.38
RMSE (seed=7): 0.4
RMSE (seed=8): 0.4
RMSE (seed=9): 0.39


In [44]:
# Spread of the per-seed validation RMSEs (NumPy's default std), to 3 decimals
score_array = np.asarray(rmse_scores)
std_rmse = score_array.std()
std_rmse_rounded = round(std_rmse, 3)
print(f"\nStandard deviation of RMSE scores: {std_rmse_rounded}")
print(f"Answer: {std_rmse_rounded}")


Standard deviation of RMSE scores: 0.006
Answer: 0.006


In [54]:
# Rebuild the modelling frame from the raw CSV. Earlier cells already
# mean-imputed every float64/int64 column of df (horsepower included), so
# without a fresh load the zero-fill below would find no NaNs and do nothing.
df = pd.read_csv('car_fuel_efficiency.csv')

# Coerce horsepower to numeric (non-numeric entries become NaN), then fill
# its missing values with 0
df['horsepower'] = pd.to_numeric(df['horsepower'], errors='coerce').fillna(0)

# Handle missing values for other numerical columns with mean
# NOTE(review): these means are computed over all rows, before the split below.
for col in df.select_dtypes(include=['float64', 'int64']).columns:
    if col != 'horsepower':
        df[col] = df[col].fillna(df[col].mean())

# One-hot encode categorical variables. Emptiness is tested explicitly:
# Index.any() evaluates the truthiness of the column labels rather than
# whether any object-dtype columns exist.
categorical_cols = df.select_dtypes(include=['object']).columns
if not categorical_cols.empty:
    df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

# Debug: Check final columns
print("Final columns after preprocessing:", df.columns.tolist())

# Define features and target
X = df.drop('fuel_efficiency_mpg', axis=1)
y = df['fuel_efficiency_mpg']

# Split the data: 60% train, 20% validation, 20% test with seed 9
X_full, X_test, y_full, y_test = train_test_split(X, y, test_size=0.2, random_state=9)
X_train, X_val, y_train, y_val = train_test_split(X_full, y_full, test_size=0.25, random_state=9)


Final columns after preprocessing: ['engine_displacement', 'num_cylinders', 'horsepower', 'vehicle_weight', 'acceleration', 'model_year', 'num_doors', 'fuel_efficiency_mpg', 'origin_Europe', 'origin_USA', 'fuel_type_Gasoline', 'drivetrain_Front-wheel drive']


In [55]:
# Stack the training and validation rows back into a single fitting set
X_train_val = pd.concat([X_train, X_val])
y_train_val = pd.concat([y_train, y_val])

In [56]:
# Standardize features; the scaler is fitted on train+validation and then
# applied unchanged to the test set
scaler = StandardScaler()
scaler.fit(X_train_val)
X_train_val_scaled = scaler.transform(X_train_val)
X_test_scaled = scaler.transform(X_test)

In [57]:
# --- Ridge Regression with r=0.001 (as specified), fitted on train+validation ---
ridge_alpha = 0.001
model_ridge = Ridge(alpha=ridge_alpha)
model_ridge.fit(X_train_val_scaled, y_train_val)

0,1,2
,alpha,0.001
,fit_intercept,True
,copy_X,True
,max_iter,
,tol,0.0001
,solver,'auto'
,positive,False
,random_state,


In [58]:
# Ridge predictions for the scaled test features
y_pred_ridge = model_ridge.predict(X=X_test_scaled)

In [59]:
# Test-set RMSE of the Ridge model, reported to two decimals
mse_ridge = mean_squared_error(y_test, y_pred_ridge)
rmse_ridge = np.sqrt(mse_ridge)
rmse_ridge_rounded = round(rmse_ridge, 2)
print(f"RMSE on test set (Ridge, r=0.001): {rmse_ridge_rounded}")
print(f"Answer: {rmse_ridge_rounded}")

RMSE on test set (Ridge, r=0.001): 0.4
Answer: 0.4


In [60]:
# --- Unregularized Linear Regression on the same train+validation data, for comparison ---
model_linear = LinearRegression()
model_linear.fit(X=X_train_val_scaled, y=y_train_val)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [61]:
# Linear-regression predictions for the scaled test features
y_pred_linear = model_linear.predict(X=X_test_scaled)

In [62]:
# Test-set RMSE of the linear-regression model, reported to two decimals
mse_linear = mean_squared_error(y_test, y_pred_linear)
rmse_linear = np.sqrt(mse_linear)
rmse_linear_rounded = round(rmse_linear, 2)
print(f"RMSE on test set (Linear Regression): {rmse_linear_rounded}")

RMSE on test set (Linear Regression): 0.4
