In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import xgboost as xgb

In [4]:
# Load the car fuel-efficiency table directly from its hosted CSV file.
df = pd.read_csv(
    filepath_or_buffer="https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv"
)

In [5]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,origin,fuel_type,drivetrain,num_doors,fuel_efficiency_mpg
0,170,3.0,159.0,3413.433759,17.7,2003,Europe,Gasoline,All-wheel drive,0.0,13.231729
1,130,5.0,97.0,3149.664934,17.8,2007,USA,Gasoline,Front-wheel drive,0.0,13.688217
2,170,,78.0,3079.038997,15.1,2018,Europe,Gasoline,Front-wheel drive,0.0,14.246341
3,220,4.0,,2542.392402,20.2,2009,USA,Diesel,All-wheel drive,2.0,16.912736
4,210,1.0,140.0,3460.87099,14.4,2009,Europe,Gasoline,All-wheel drive,2.0,12.488369


In [6]:
# Replace every missing value with 0; the filled frame is rebound to `df`,
# so the frame with NaNs is no longer reachable after this cell.
df = df.fillna(value=0)

In [7]:
# Same five-row preview, now after the missing values were filled with 0.
df.head(n=5)

Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,origin,fuel_type,drivetrain,num_doors,fuel_efficiency_mpg
0,170,3.0,159.0,3413.433759,17.7,2003,Europe,Gasoline,All-wheel drive,0.0,13.231729
1,130,5.0,97.0,3149.664934,17.8,2007,USA,Gasoline,Front-wheel drive,0.0,13.688217
2,170,0.0,78.0,3079.038997,15.1,2018,Europe,Gasoline,Front-wheel drive,0.0,14.246341
3,220,4.0,0.0,2542.392402,20.2,2009,USA,Diesel,All-wheel drive,2.0,16.912736
4,210,1.0,140.0,3460.87099,14.4,2009,Europe,Gasoline,All-wheel drive,2.0,12.488369


# Preparing the dataset: train/validation/test split and DictVectorizer encoding

In [8]:
# Stage 1: set aside 40% of the rows as a temporary pool; the other 60%
# becomes the training set.
train_df, temp_df = train_test_split(
    df,
    test_size=0.4,
    random_state=1,
)

# Stage 2: cut the temporary pool in half, giving the validation and test sets.
valid_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,
    random_state=1,
)

In [9]:
target = 'fuel_efficiency_mpg'

# Labels: the target column of each split.
y_train = train_df[target]
y_valid = valid_df[target]
y_test = test_df[target]

# Features: every remaining column once the target has been removed.
X_train = train_df.drop(columns=[target])
X_valid = valid_df.drop(columns=[target])
X_test = test_df.drop(columns=[target])

In [10]:
# Convert each feature frame into a list with one {column: value} dict per row,
# which is the input format DictVectorizer consumes in the next cell.
X_train_dict, X_valid_dict, X_test_dict = (
    frame.to_dict(orient='records')
    for frame in (X_train, X_valid, X_test)
)

In [11]:
# Learn the feature layout from the training records only, then reuse the
# fitted vectorizer unchanged for the validation and test records.
dv = DictVectorizer(sparse=True)
X_train_matrix = dv.fit_transform(X_train_dict)
X_valid_matrix, X_test_matrix = (
    dv.transform(records) for records in (X_valid_dict, X_test_dict)
)

# Q1: Which feature does a depth-1 decision tree split on?

In [12]:
# A depth-1 tree has a single split, so fitting it exposes the one feature
# the regressor picks for that split. `fit` returns the estimator itself.
dt = DecisionTreeRegressor(max_depth=1).fit(X_train_matrix, y_train)
dt

0,1,2
,criterion,'squared_error'
,splitter,'best'
,max_depth,1
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [13]:
# Entry 0 of `tree_.feature` belongs to the root node: it is the column index
# (in the vectorized matrix) that the tree's first split tests.
split_feature_index = dt.tree_.feature[0]

# Map that column index back to the feature name recorded by the vectorizer.
split_feature_name = dv.feature_names_[split_feature_index]
split_feature_name

'vehicle_weight'

# Q2: Validation RMSE of a random forest with 10 trees

In [14]:
# Fit a seeded 10-tree forest on the training matrix, using all available cores.
rf = RandomForestRegressor(n_estimators=10, random_state=1, n_jobs=-1)
rf.fit(X_train_matrix, y_train)

# Score it on the validation split: RMSE is the square root of the MSE.
y_pred = rf.predict(X_valid_matrix)
rmse = np.sqrt(mean_squared_error(y_valid, y_pred))
rmse

np.float64(0.4602815367032658)

# Q3: How validation RMSE changes with n_estimators

In [15]:
# Quick check on three forest sizes before running the longer sweep.
test_n_list = [10, 20, 30]
for n in test_n_list:
    print("Training with n_estimators =", n)
    rf = RandomForestRegressor(
        n_estimators=n,
        random_state=1,
        n_jobs=-1,
    )
    rf.fit(X_train_matrix, y_train)
    # Validation RMSE for this forest size, reported to three decimals.
    y_pred = rf.predict(X_valid_matrix)
    rmse = np.sqrt(mean_squared_error(y_valid, y_pred))
    print(f"RMSE: {round(rmse, 3)}")

Training with n_estimators = 10
RMSE: 0.46
Training with n_estimators = 20
RMSE: 0.446
Training with n_estimators = 30
RMSE: 0.44


In [16]:
# Sweep forest sizes 10, 20, ..., 100 (the upper bound 101 is exclusive).
n_list = range(10, 101, 10)
for n in n_list:
    print("Training with n_estimators =", n)
    rf = RandomForestRegressor(
        n_estimators=n,
        random_state=1,
        n_jobs=-1,
    )
    rf.fit(X_train_matrix, y_train)
    # Validation RMSE for this forest size, reported to three decimals.
    y_pred = rf.predict(X_valid_matrix)
    rmse = np.sqrt(mean_squared_error(y_valid, y_pred))
    print(f"RMSE: {round(rmse, 3)}")

Training with n_estimators = 10
RMSE: 0.46
Training with n_estimators = 20
RMSE: 0.446
Training with n_estimators = 30
RMSE: 0.44
Training with n_estimators = 40
RMSE: 0.438
Training with n_estimators = 50
RMSE: 0.437
Training with n_estimators = 60
RMSE: 0.436
Training with n_estimators = 70
RMSE: 0.436
Training with n_estimators = 80
RMSE: 0.436
Training with n_estimators = 90
RMSE: 0.435
Training with n_estimators = 100
RMSE: 0.435


# Q4: Mean validation RMSE for each max_depth

In [17]:
# Grid: every tree depth is combined with every forest size.
max_depth_values = [10, 15, 20, 25]
n_estimators_values = [10, 50, 100]

# One entry per depth, in the order of `max_depth_values`: the RMSE averaged
# over all forest sizes tried at that depth.
mean_rmse_per_depth = []

for max_depth in max_depth_values:
    # Validation RMSEs collected for the current depth only.
    rmse_list = []
    for n in n_estimators_values:
        # n_jobs=1: each fit runs on a single worker (author's note: this
        # avoids overloading the CPU during the grid).
        rf = RandomForestRegressor(
            n_estimators=n,
            max_depth=max_depth,
            random_state=1,
            n_jobs=1,
        )
        rf.fit(X_train_matrix, y_train)
        y_pred = rf.predict(X_valid_matrix)
        rmse = np.sqrt(mean_squared_error(y_valid, y_pred))
        print(f"max_depth={max_depth}, n_estimators={n}, RMSE={round(rmse, 3)}")
        rmse_list.append(rmse)

    # Summarise this depth by its average RMSE across forest sizes.
    mean_rmse = np.mean(rmse_list)
    mean_rmse_per_depth.append(mean_rmse)
    print(f"Mean RMSE for max_depth={max_depth}: {round(mean_rmse, 3)}")

max_depth=10, n_estimators=10, RMSE=0.451
max_depth=10, n_estimators=50, RMSE=0.436
max_depth=10, n_estimators=100, RMSE=0.435
Mean RMSE for max_depth=10: 0.441
max_depth=15, n_estimators=10, RMSE=0.462
max_depth=15, n_estimators=50, RMSE=0.437
max_depth=15, n_estimators=100, RMSE=0.435
Mean RMSE for max_depth=15: 0.445
max_depth=20, n_estimators=10, RMSE=0.46
max_depth=20, n_estimators=50, RMSE=0.437
max_depth=20, n_estimators=100, RMSE=0.435
Mean RMSE for max_depth=20: 0.444
max_depth=25, n_estimators=10, RMSE=0.461
max_depth=25, n_estimators=50, RMSE=0.437
max_depth=25, n_estimators=100, RMSE=0.435
Mean RMSE for max_depth=25: 0.444


# Q5: Feature importances of the random forest

In [18]:
# Forest whose feature importances are inspected in the next cell:
# 10 trees, depth capped at 20, seeded, all cores (only one fit happens here).
rf = RandomForestRegressor(n_estimators=10, max_depth=20, random_state=1, n_jobs=-1)
rf.fit(X_train_matrix, y_train)

0,1,2
,n_estimators,10
,criterion,'squared_error'
,max_depth,20
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [19]:
# Importance scores from the fitted forest, one per column of the matrix.
importances = rf.feature_importances_

# Column names in the vectorizer's order, which is the order of the matrix columns.
feature_names = dv.feature_names_

# Walk both sequences in lockstep and print one "name: score" line per feature.
for name, importance in zip(feature_names, importances):
    print(f"{name}: {importance}")

acceleration: 0.011442313735237557
drivetrain=All-wheel drive: 0.0003428361850955144
drivetrain=Front-wheel drive: 0.00027678432674605027
engine_displacement: 0.003159424030350312
fuel_type=Diesel: 0.00034918888161679113
fuel_type=Gasoline: 0.0003071871419763815
horsepower: 0.015933481489766168
model_year: 0.003066056772754424
num_cylinders: 0.0023231449140431217
num_doors: 0.0015756291753787894
origin=Asia: 0.00043094762405633503
origin=Europe: 0.00041885303929771223
origin=USA: 0.0004959383688367859
vehicle_weight: 0.9598782143148441


# Q6: XGBoost with eta=0.3 vs eta=0.1

In [20]:
!pip install xgboost


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m


In [21]:
# Booster settings for the first run (learning rate eta = 0.3).
xgb_params = {
    'eta': 0.3,
    'max_depth': 6,
    'min_child_weight': 1,
    'objective': 'reg:squarederror',
    'nthread': 8,
    'seed': 1,
    'verbosity': 1
}

# Wrap the vectorized matrices and their labels in XGBoost's DMatrix container.
dtrain = xgb.DMatrix(X_train_matrix, label=y_train)
dvalid = xgb.DMatrix(X_valid_matrix, label=y_valid)

# Datasets evaluated after every boosting round; the validation set comes last.
watchlist = [(dtrain, 'train'), (dvalid, 'valid')]

# Train for at most 100 rounds, with early stopping configured at 10 rounds.
model_03 = xgb.train(
    params=xgb_params,
    dtrain=dtrain,
    num_boost_round=100,
    evals=watchlist,
    early_stopping_rounds=10,
)

# Best evaluation score recorded during training.
model_03.best_score

[0]	train-rmse:1.83282	valid-rmse:1.82567
[1]	train-rmse:1.33231	valid-rmse:1.32771
[2]	train-rmse:0.99034	valid-rmse:0.99257
[3]	train-rmse:0.76090	valid-rmse:0.76897
[4]	train-rmse:0.61110	valid-rmse:0.62742
[5]	train-rmse:0.51643	valid-rmse:0.54010
[6]	train-rmse:0.45800	valid-rmse:0.48954
[7]	train-rmse:0.42172	valid-rmse:0.46026
[8]	train-rmse:0.39836	valid-rmse:0.44332
[9]	train-rmse:0.38494	valid-rmse:0.43456
[10]	train-rmse:0.37400	valid-rmse:0.43004
[11]	train-rmse:0.36596	valid-rmse:0.42696
[12]	train-rmse:0.36050	valid-rmse:0.42569
[13]	train-rmse:0.35549	valid-rmse:0.42519
[14]	train-rmse:0.35143	valid-rmse:0.42455
[15]	train-rmse:0.34792	valid-rmse:0.42450
[16]	train-rmse:0.34533	valid-rmse:0.42478
[17]	train-rmse:0.34356	valid-rmse:0.42472
[18]	train-rmse:0.34129	valid-rmse:0.42472
[19]	train-rmse:0.33846	valid-rmse:0.42502
[20]	train-rmse:0.33724	valid-rmse:0.42509
[21]	train-rmse:0.33463	valid-rmse:0.42538
[22]	train-rmse:0.33260	valid-rmse:0.42543
[23]	train-rmse:0.330

0.4244959963107532

In [23]:
# Lower the learning rate to 0.1. This edits the shared `xgb_params` dict in
# place, so from here on it no longer holds the eta=0.3 configuration.
xgb_params.update({'eta': 0.1})

# Same training setup as the eta=0.3 run, differing only in the learning rate.
model_01 = xgb.train(
    params=xgb_params,
    dtrain=dtrain,
    num_boost_round=100,
    evals=watchlist,
    early_stopping_rounds=10,
)

# Best evaluation score recorded during training.
model_01.best_score

[0]	train-rmse:2.31334	valid-rmse:2.30592
[1]	train-rmse:2.09552	valid-rmse:2.08865
[2]	train-rmse:1.90001	valid-rmse:1.89221
[3]	train-rmse:1.72438	valid-rmse:1.71766
[4]	train-rmse:1.56719	valid-rmse:1.56150
[5]	train-rmse:1.42645	valid-rmse:1.42157
[6]	train-rmse:1.30047	valid-rmse:1.29580
[7]	train-rmse:1.18786	valid-rmse:1.18468
[8]	train-rmse:1.08744	valid-rmse:1.08657
[9]	train-rmse:0.99801	valid-rmse:0.99964
[10]	train-rmse:0.91846	valid-rmse:0.92183
[11]	train-rmse:0.84797	valid-rmse:0.85324
[12]	train-rmse:0.78540	valid-rmse:0.79241
[13]	train-rmse:0.73026	valid-rmse:0.73968
[14]	train-rmse:0.68164	valid-rmse:0.69327
[15]	train-rmse:0.63889	valid-rmse:0.65351
[16]	train-rmse:0.60130	valid-rmse:0.61854
[17]	train-rmse:0.56852	valid-rmse:0.58847
[18]	train-rmse:0.53982	valid-rmse:0.56232
[19]	train-rmse:0.51488	valid-rmse:0.53952
[20]	train-rmse:0.49316	valid-rmse:0.52039
[21]	train-rmse:0.47428	valid-rmse:0.50442
[22]	train-rmse:0.45775	valid-rmse:0.49005
[23]	train-rmse:0.443

0.4158992988437659