In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import export_text
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import xgboost as xgb

In [4]:
import urllib.request, os

In [5]:
url = "https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv"
filename = "car_fuel_efficiency.csv"

In [6]:
if not os.path.exists(filename):
    print(f"Downloading {url} -> {filename}")
    urllib.request.urlretrieve(url, filename)
else:
    print(f"{filename} already present")

car_fuel_efficiency.csv already present


In [7]:
df = pd.read_csv(filename)

In [8]:
print("\nFirst 5 rows:")
print(df.head())
print("\nInfo:")
print(df.info())


First 5 rows:
   engine_displacement  num_cylinders  horsepower  vehicle_weight  \
0                  170            3.0       159.0     3413.433759   
1                  130            5.0        97.0     3149.664934   
2                  170            NaN        78.0     3079.038997   
3                  220            4.0         NaN     2542.392402   
4                  210            1.0       140.0     3460.870990   

   acceleration  model_year  origin fuel_type         drivetrain  num_doors  \
0          17.7        2003  Europe  Gasoline    All-wheel drive        0.0   
1          17.8        2007     USA  Gasoline  Front-wheel drive        0.0   
2          15.1        2018  Europe  Gasoline  Front-wheel drive        0.0   
3          20.2        2009     USA    Diesel    All-wheel drive        2.0   
4          14.4        2009  Europe  Gasoline    All-wheel drive        2.0   

   fuel_efficiency_mpg  
0            13.231729  
1            13.688217  
2            14.2463

In [9]:
df = df.fillna(0)
print("\nMissing values after fillna(0):")
print(df.isnull().sum().sum())   


Missing values after fillna(0):
0


In [10]:
target = "fuel_efficiency_mpg"
y = df[target].values
X = df.drop(columns=[target])

In [11]:
df_full_train, df_test, y_full_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=1
)

In [12]:
df_train, df_val, y_train, y_val = train_test_split(
    df_full_train, y_full_train, test_size=0.25, random_state=1
)

In [13]:
print("\nSplit sizes:")
print(f"Train   : {len(df_train)} rows  ({len(df_train)/len(df):.0%})")
print(f"Val     : {len(df_val)} rows  ({len(df_val)/len(df):.0%})")
print(f"Test    : {len(df_test)} rows  ({len(df_test)/len(df):.0%})")


Split sizes:
Train   : 5822 rows  (60%)
Val     : 1941 rows  (20%)
Test    : 1941 rows  (20%)


In [14]:
dv = DictVectorizer(sparse=True)

In [15]:
X_train_dict = df_train.to_dict(orient="records")
X_train_sp = dv.fit_transform(X_train_dict)

In [16]:
X_val_dict = df_val.to_dict(orient="records")
X_val_sp   = dv.transform(X_val_dict)

In [17]:
X_test_dict = df_test.to_dict(orient="records")
X_test_sp   = dv.transform(X_test_dict)

In [18]:
print("\nSparse matrix shapes:")
print(f"X_train_sp : {X_train_sp.shape}   (type: {type(X_train_sp)})")
print(f"X_val_sp   : {X_val_sp.shape}")
print(f"X_test_sp  : {X_test_sp.shape}")


Sparse matrix shapes:
X_train_sp : (5822, 14)   (type: <class 'scipy.sparse._csr.csr_matrix'>)
X_val_sp   : (1941, 14)
X_test_sp  : (1941, 14)


In [19]:
print("\nTarget vectors:")
print(f"y_train    : {y_train.shape}   dtype={y_train.dtype}")
print(f"y_val      : {y_val.shape}")
print(f"y_test     : {y_test.shape}")


Target vectors:
y_train    : (5822,)   dtype=float64
y_val      : (1941,)
y_test     : (1941,)


In [20]:
dt = DecisionTreeRegressor(max_depth=1, random_state=1)
dt.fit(X_train_sp, y_train)

0,1,2
,criterion,'squared_error'
,splitter,'best'
,max_depth,1
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,1
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [21]:
tree_rules = export_text(dt, feature_names=dv.get_feature_names_out())
print("Decision Tree Structure (max_depth=1):")
print(tree_rules)

Decision Tree Structure (max_depth=1):
|--- vehicle_weight <= 3022.11
|   |--- value: [16.88]
|--- vehicle_weight >  3022.11
|   |--- value: [12.94]



In [22]:
root_feature_idx = dt.tree_.feature[0]  
root_feature_name = dv.get_feature_names_out()[root_feature_idx]
print(f"\nRoot split feature: {root_feature_name}")


Root split feature: vehicle_weight


In [23]:
print(df.columns.tolist())
# Output: ['displacement', 'horsepower', 'weight', 'acceleration', 'model_year', 'origin', 'fuel_efficiency_mpg']

['engine_displacement', 'num_cylinders', 'horsepower', 'vehicle_weight', 'acceleration', 'model_year', 'origin', 'fuel_type', 'drivetrain', 'num_doors', 'fuel_efficiency_mpg']


In [24]:
rf = RandomForestRegressor(
    n_estimators=10,
    random_state=1,
    n_jobs=-1
)
rf.fit(X_train_sp, y_train)

0,1,2
,n_estimators,10
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [25]:
y_pred = rf.predict(X_val_sp)

In [26]:
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
print(f"RMSE on validation set: {rmse:.3f}")

RMSE on validation set: 0.459


In [27]:
results = []
for n in range(10, 201, 10):                     # 10 to 200 step 10
    rf = RandomForestRegressor(
        n_estimators=n,
        random_state=1,
        n_jobs=-1
    )
    rf.fit(X_train_sp, y_train)
    y_pred = rf.predict(X_val_sp)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    rmse_3 = round(rmse, 3)                     # 3 decimal places
    results.append((n, rmse, rmse_3))
    print(f"n_estimators={n:3d}  RMSE={rmse:.5f}  (rounded to {rmse_3})")

n_estimators= 10  RMSE=0.45866  (rounded to 0.459)
n_estimators= 20  RMSE=0.45368  (rounded to 0.454)
n_estimators= 30  RMSE=0.45117  (rounded to 0.451)
n_estimators= 40  RMSE=0.44836  (rounded to 0.448)
n_estimators= 50  RMSE=0.44618  (rounded to 0.446)
n_estimators= 60  RMSE=0.44530  (rounded to 0.445)
n_estimators= 70  RMSE=0.44467  (rounded to 0.445)
n_estimators= 80  RMSE=0.44499  (rounded to 0.445)
n_estimators= 90  RMSE=0.44520  (rounded to 0.445)
n_estimators=100  RMSE=0.44490  (rounded to 0.445)
n_estimators=110  RMSE=0.44372  (rounded to 0.444)
n_estimators=120  RMSE=0.44410  (rounded to 0.444)
n_estimators=130  RMSE=0.44377  (rounded to 0.444)
n_estimators=140  RMSE=0.44350  (rounded to 0.444)
n_estimators=150  RMSE=0.44302  (rounded to 0.443)
n_estimators=160  RMSE=0.44279  (rounded to 0.443)
n_estimators=170  RMSE=0.44289  (rounded to 0.443)
n_estimators=180  RMSE=0.44255  (rounded to 0.443)
n_estimators=190  RMSE=0.44261  (rounded to 0.443)
n_estimators=200  RMSE=0.44252 

In [29]:
best_n = None
best_rmse = np.inf

print("\n--- RMSE changes (rounded to 3 decimals) ---")
for n, _, rmse_3 in results:
    print(f"{n:3d}: {rmse_3}")
    if rmse_3 < best_rmse:
        best_rmse = rmse_3
        best_n = n


--- RMSE changes (rounded to 3 decimals) ---
 10: 0.459
 20: 0.454
 30: 0.451
 40: 0.448
 50: 0.446
 60: 0.445
 70: 0.445
 80: 0.445
 90: 0.445
100: 0.445
110: 0.444
120: 0.444
130: 0.444
140: 0.444
150: 0.443
160: 0.443
170: 0.443
180: 0.443
190: 0.443
200: 0.443


In [30]:
improvement_stop = None
prev_rmse_3 = None
for n, _, rmse_3 in results:
    if prev_rmse_3 is not None and rmse_3 >= prev_rmse_3:
        improvement_stop = n - 10          # the previous step was the last improvement
        break
    prev_rmse_3 = rmse_3

if improvement_stop is None:                 # never stopped
    improvement_stop = 200

print("\nBest rounded RMSE :", best_rmse, "at n_estimators =", best_n)
print("RMSE stops improving (3-decimal) after n_estimators =", improvement_stop)


Best rounded RMSE : 0.443 at n_estimators = 150
RMSE stops improving (3-decimal) after n_estimators = 60


In [31]:
depths      = [10, 15, 20, 25]
estimators  = list(range(10, 201, 10))   # 10,20,...,200
results = []                             # (depth, n_est, rmse)

In [32]:
for depth in depths:
    rmse_list = []
    print(f"\n=== max_depth = {depth} ===")
    for n in estimators:
        rf = RandomForestRegressor(
            n_estimators=n,
            max_depth=depth,
            random_state=1,
            n_jobs=-1
        )
        rf.fit(X_train_sp, y_train)
        y_pred = rf.predict(X_val_sp)
        rmse = np.sqrt(mean_squared_error(y_val, y_pred))
        rmse_list.append(rmse)
        print(f"  n_estimators={n:3d}  RMSE={rmse:.5f}")

    mean_rmse = np.mean(rmse_list)
    results.append((depth, mean_rmse, rmse_list))
    print(f"  ---> mean RMSE = {mean_rmse:.5f}")


=== max_depth = 10 ===
  n_estimators= 10  RMSE=0.45063
  n_estimators= 20  RMSE=0.44711
  n_estimators= 30  RMSE=0.44562
  n_estimators= 40  RMSE=0.44324
  n_estimators= 50  RMSE=0.44207
  n_estimators= 60  RMSE=0.44174
  n_estimators= 70  RMSE=0.44141
  n_estimators= 80  RMSE=0.44150
  n_estimators= 90  RMSE=0.44155
  n_estimators=100  RMSE=0.44126
  n_estimators=110  RMSE=0.44052
  n_estimators=120  RMSE=0.44077
  n_estimators=130  RMSE=0.44063
  n_estimators=140  RMSE=0.44033
  n_estimators=150  RMSE=0.43995
  n_estimators=160  RMSE=0.43980
  n_estimators=170  RMSE=0.44000
  n_estimators=180  RMSE=0.43975
  n_estimators=190  RMSE=0.43987
  n_estimators=200  RMSE=0.43984
  ---> mean RMSE = 0.44188

=== max_depth = 15 ===
  n_estimators= 10  RMSE=0.45796
  n_estimators= 20  RMSE=0.45344
  n_estimators= 30  RMSE=0.45109
  n_estimators= 40  RMSE=0.44817
  n_estimators= 50  RMSE=0.44628
  n_estimators= 60  RMSE=0.44538
  n_estimators= 70  RMSE=0.44491
  n_estimators= 80  RMSE=0.44486
 

In [33]:
best_depth, best_mean = min(results, key=lambda x: x[1])[:2]
print("\n" + "="*50)
print(f"BEST max_depth = {best_depth}")
print(f"Mean RMSE across n_estimators = {best_mean:.5f}")
print("="*50)


BEST max_depth = 10
Mean RMSE across n_estimators = 0.44188


In [34]:
importances = rf.feature_importances_
feature_names = dv.get_feature_names_out()

In [35]:
imp_df = pd.DataFrame({
    "feature": feature_names,
    "importance": importances
}).sort_values("importance", ascending=False)

print("\n=== Top 10 most important features ===")
print(imp_df.head(10).to_string(index=False))


=== Top 10 most important features ===
            feature  importance
     vehicle_weight    0.959328
         horsepower    0.015883
       acceleration    0.011637
         model_year    0.003307
engine_displacement    0.003275
      num_cylinders    0.002219
          num_doors    0.001559
         origin=USA    0.000538
      origin=Europe    0.000479
        origin=Asia    0.000454


In [36]:
options = ["vehicle_weight", "horsepower", "acceleration", "engine_displacement"]

In [37]:
mapping = {
    "weight": "vehicle_weight",
    "horsepower": "horsepower",
    "acceleration": "acceleration",
    "displacement": "engine_displacement"
}

In [38]:
option_imp = {}
for col in mapping:
    # find the exact feature name that contains this column
    matches = [f for f in feature_names if col in f]
    # for numeric columns there is only one match
    imp = imp_df[imp_df["feature"].isin(matches)]["importance"].sum()
    option_imp[mapping[col]] = imp

In [39]:
for name, imp in option_imp.items():
    print(f"{name:18} : {imp:.6f}")

best_option = max(option_imp, key=option_imp.get)
print(f"\nMost important feature among the four: **{best_option}**")

vehicle_weight     : 0.959328
horsepower         : 0.015883
acceleration       : 0.011637
engine_displacement : 0.003275

Most important feature among the four: **vehicle_weight**


In [40]:
X_train_sp = dv.fit_transform(df_train.to_dict(orient="records"))
X_val_sp   = dv.transform(df_val.to_dict(orient="records"))

dtrain = xgb.DMatrix(X_train_sp, label=y_train)
dval   = xgb.DMatrix(X_val_sp,   label=y_val)

In [41]:
watchlist = [(dtrain, 'train'), (dval, 'val')]

In [42]:
xgb_params_03 = {
    'eta': 0.3,
    'max_depth': 6,
    'min_child_weight': 1,
    'objective': 'reg:squarederror',
    'nthread': 8,
    'seed': 1,
    'verbosity': 1,
}

In [43]:
print("\n=== Training with eta = 0.3 ===")
model_03 = xgb.train(
    params=xgb_params_03,
    dtrain=dtrain,
    num_boost_round=100,
    evals=watchlist,
    early_stopping_rounds=10,
    verbose_eval=10
)


=== Training with eta = 0.3 ===
[0]	train-rmse:1.81393	val-rmse:1.85444
[10]	train-rmse:0.37115	val-rmse:0.43896
[20]	train-rmse:0.33553	val-rmse:0.43376
[23]	train-rmse:0.32815	val-rmse:0.43601


In [44]:
best_iter_03 = model_03.best_iteration
rmse_03 = model_03.best_score
print(f"eta=0.3  best iteration = {best_iter_03}  val-RMSE = {rmse_03:.5f}")

eta=0.3  best iteration = 14  val-RMSE = 0.43349


In [45]:
xgb_params_01 = xgb_params_03.copy()
xgb_params_01['eta'] = 0.1

print("\n=== Training with eta = 0.1 ===")
model_01 = xgb.train(
    params=xgb_params_01,
    dtrain=dtrain,
    num_boost_round=100,
    evals=watchlist,
    early_stopping_rounds=10,
    verbose_eval=10
)


=== Training with eta = 0.1 ===
[0]	train-rmse:2.28944	val-rmse:2.34561
[10]	train-rmse:0.91008	val-rmse:0.94062
[20]	train-rmse:0.48983	val-rmse:0.53064
[30]	train-rmse:0.38342	val-rmse:0.44289
[40]	train-rmse:0.35343	val-rmse:0.42746
[50]	train-rmse:0.33998	val-rmse:0.42498
[60]	train-rmse:0.33054	val-rmse:0.42456
[65]	train-rmse:0.32602	val-rmse:0.42493


In [46]:
best_iter_01 = model_01.best_iteration
rmse_01 = model_01.best_score
print(f"eta=0.1  best iteration = {best_iter_01}  val-RMSE = {rmse_01:.5f}")

eta=0.1  best iteration = 56  val-RMSE = 0.42426


In [47]:
print("\n" + "="*50)
if rmse_01 < rmse_03:
    winner = "0.1"
    print(f"eta = {winner} gives the LOWER RMSE ({rmse_01:.5f} < {rmse_03:.5f})")
elif rmse_03 < rmse_01:
    winner = "0.3"
    print(f"eta = {winner} gives the LOWER RMSE ({rmse_03:.5f} < {rmse_01:.5f})")
else:
    winner = "both equal"
    print(f"Both eta give the SAME RMSE = {rmse_03:.5f}")
print("="*50)


eta = 0.1 gives the LOWER RMSE (0.42426 < 0.43349)
