In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import xgboost as xgb
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Load the car fuel-efficiency table from a path relative to the notebook.
# (`url` holds a local file path here, not a remote address.)
url = './hm6/car_fuel_efficiency.csv'
df = pd.read_csv(filepath_or_buffer=url)

# Sanity report: (rows, columns) followed by the column names.
print(
    f"Dataset shape: {df.shape}",
    f"Columns: {list(df.columns)}",
    sep="\n",
)

Dataset shape: (9704, 11)
Columns: ['engine_displacement', 'num_cylinders', 'horsepower', 'vehicle_weight', 'acceleration', 'model_year', 'origin', 'fuel_type', 'drivetrain', 'num_doors', 'fuel_efficiency_mpg']


In [4]:
# Replace every missing value in the frame with 0 before modelling.
df = df.fillna(value=0)

In [5]:
# Target vector and predictor frame.
X = df.drop(columns=['fuel_efficiency_mpg'])
y = df.loc[:, 'fuel_efficiency_mpg'].values

# Two-stage split with a fixed seed:
#   1) hold out 40% of the rows,
#   2) cut that hold-out in half -> validation and test.
X_train_full, X_temp, y_train_full, y_temp = train_test_split(X, y, test_size=0.4, random_state=1)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=1)

# Report the size of each partition.
print(
    f"\nTrain: {len(X_train_full)} samples",
    f"Validation: {len(X_val)} samples",
    f"Test: {len(X_test)} samples",
    sep="\n",
)


Train: 5822 samples
Validation: 1941 samples
Test: 1941 samples


In [6]:
# Turn each partition into a list of {column: value} records, the input
# format DictVectorizer expects.
train_dicts, val_dicts, test_dicts = (
    frame.to_dict(orient='records')
    for frame in (X_train_full, X_val, X_test)
)

# Learn the feature mapping on the training records only, then apply the
# same mapping to validation and test.
# NOTE: X_val and X_test are rebound here from DataFrames to encoded
# matrices, so re-running this cell requires re-running the split cell first.
dv = DictVectorizer(sparse=True)
X_train = dv.fit_transform(train_dicts)
X_val, X_test = (dv.transform(records) for records in (val_dicts, test_dicts))

print(
    f"\nAfter DictVectorizer:",
    f"  X_train shape: {X_train.shape}",
    f"  Features: {len(dv.feature_names_)}",
    sep="\n",
)


After DictVectorizer:
  X_train shape: (5822, 14)
  Features: 14


In [11]:
# Q1: fit a depth-1 regression tree (at most one split) and report the
# feature chosen at the root.
dt = DecisionTreeRegressor(max_depth=1, random_state=1)
dt.fit(X_train, y_train_full)

# Node 0 is the root. scikit-learn stores a negative sentinel in
# tree_.feature for leaf nodes; if the root itself is a leaf (no split was
# made), indexing feature_names_ with that negative value would silently
# return an unrelated feature, so fail loudly instead.
feature_idx = dt.tree_.feature[0]
if feature_idx < 0:
    raise ValueError(
        "Decision tree root is a leaf: no feature was used for splitting"
    )
feature_name = dv.feature_names_[feature_idx]

print(f"\nFeature used for splitting: {feature_name}")
print(f"\n Q1 ANSWER: {feature_name}")



Feature used for splitting: vehicle_weight

 Q1 ANSWER: vehicle_weight


In [13]:
# Q2: baseline forest of 10 trees, scored by RMSE on the validation split.
rf = RandomForestRegressor(n_estimators=10, random_state=1, n_jobs=-1)
rf.fit(X_train, y_train_full)

y_pred = rf.predict(X_val)
rmse = np.sqrt(mean_squared_error(y_val, y_pred))

print(f"\nRMSE on validation: {rmse:.6f}")

# Pick the multiple-choice option with the smallest absolute distance to
# the measured RMSE (first option wins on ties).
options_q2 = [0.045, 0.45, 4.5, 45.0]
distances_q2 = [abs(option - rmse) for option in options_q2]
closest_q2 = options_q2[int(np.argmin(distances_q2))]

print(f"\n Q2 ANSWER: {closest_q2}")
print(f"  (Actual RMSE: {rmse:.3f})")


RMSE on validation: 0.461065

 Q2 ANSWER: 0.45
  (Actual RMSE: 0.461)


In [14]:
# Q3: track validation RMSE as the forest grows from 10 to 200 trees.
n_estimators_range = range(10, 201, 10)
rmse_scores = []

# Grow a single forest incrementally instead of refitting from scratch at
# every size: with warm_start=True each fit() call keeps the existing trees
# and only trains the additional ones, so the sweep trains 200 trees in
# total rather than 10 + 20 + ... + 200.
# NOTE(review): with a fixed random_state scikit-learn is designed to draw
# the same per-tree seeds as a fresh fit, so the scores should match the
# refit-from-scratch version - confirm on the installed version.
rf = RandomForestRegressor(random_state=1, n_jobs=-1, warm_start=True)

print("\nTesting different n_estimators:")
for n in n_estimators_range:
    rf.set_params(n_estimators=n)
    rf.fit(X_train, y_train_full)
    y_pred = rf.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    rmse_scores.append(rmse)
    print(f"  n_estimators={n:3d}: RMSE = {rmse:.6f}")

# Compare scores at 3 decimal places and take the first forest size that
# reaches the lowest rounded RMSE.
rmse_rounded = [round(r, 3) for r in rmse_scores]
min_rmse = min(rmse_rounded)
min_idx = rmse_rounded.index(min_rmse)
best_n = n_estimators_range[min_idx]  # range objects support direct indexing

print(f"\nBest RMSE: {min_rmse:.3f}")
print(f"First achieved at n_estimators: {best_n}")

print(f"\n Q3 ANSWER: {best_n}")


Testing different n_estimators:
  n_estimators= 10: RMSE = 0.461065
  n_estimators= 20: RMSE = 0.446893
  n_estimators= 30: RMSE = 0.441077
  n_estimators= 40: RMSE = 0.439747
  n_estimators= 50: RMSE = 0.438051
  n_estimators= 60: RMSE = 0.436498
  n_estimators= 70: RMSE = 0.436911
  n_estimators= 80: RMSE = 0.437080
  n_estimators= 90: RMSE = 0.436289
  n_estimators=100: RMSE = 0.436142
  n_estimators=110: RMSE = 0.435730
  n_estimators=120: RMSE = 0.435976
  n_estimators=130: RMSE = 0.435617
  n_estimators=140: RMSE = 0.435807
  n_estimators=150: RMSE = 0.435826
  n_estimators=160: RMSE = 0.435816
  n_estimators=170: RMSE = 0.435758
  n_estimators=180: RMSE = 0.435665
  n_estimators=190: RMSE = 0.435824
  n_estimators=200: RMSE = 0.435400

Best RMSE: 0.435
First achieved at n_estimators: 200

 Q3 ANSWER: 200


In [None]:
# Q4: for each max_depth, average the validation RMSE over forest sizes
# 10, 20, ..., 200 and pick the depth with the lowest mean.
max_depth_values = [10, 15, 20, 25]
results_q4 = {}

for max_d in max_depth_values:
    print(f"\nTesting max_depth={max_d}:")
    rmse_list = []

    # One forest per depth, grown incrementally: warm_start=True makes each
    # fit() keep the trees already built and train only the new ones, so a
    # depth costs 200 trees instead of 10 + 20 + ... + 200.
    # NOTE(review): with a fixed random_state scikit-learn is designed to
    # draw the same per-tree seeds as a fresh fit - confirm on the
    # installed version.
    rf = RandomForestRegressor(
        max_depth=max_d,
        random_state=1,
        n_jobs=-1,
        warm_start=True,
    )

    for n in range(10, 201, 10):
        rf.set_params(n_estimators=n)
        rf.fit(X_train, y_train_full)
        y_pred = rf.predict(X_val)
        rmse = np.sqrt(mean_squared_error(y_val, y_pred))
        rmse_list.append(rmse)

    mean_rmse = np.mean(rmse_list)
    results_q4[max_d] = mean_rmse
    print(f"  Mean RMSE: {mean_rmse:.6f}")

# Depth whose mean RMSE is smallest (first one wins on ties).
best_max_depth = min(results_q4, key=results_q4.get)

print(f"\n Q4 ANSWER: {best_max_depth}")
print(f"  Mean RMSE: {results_q4[best_max_depth]:.6f}")



Testing max_depth=10:
  Mean RMSE: 0.436293

Testing max_depth=15:
  Mean RMSE: 0.437634

Testing max_depth=20:
