In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer


In [4]:
!wget -q https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv

In [5]:
# Read the downloaded CSV into a DataFrame and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='car_fuel_efficiency.csv')
df.head(n=5)

Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,origin,fuel_type,drivetrain,num_doors,fuel_efficiency_mpg
0,170,3.0,159.0,3413.433759,17.7,2003,Europe,Gasoline,All-wheel drive,0.0,13.231729
1,130,5.0,97.0,3149.664934,17.8,2007,USA,Gasoline,Front-wheel drive,0.0,13.688217
2,170,,78.0,3079.038997,15.1,2018,Europe,Gasoline,Front-wheel drive,0.0,14.246341
3,220,4.0,,2542.392402,20.2,2009,USA,Diesel,All-wheel drive,2.0,16.912736
4,210,1.0,140.0,3460.87099,14.4,2009,Europe,Gasoline,All-wheel drive,2.0,12.488369


In [10]:
# Two-stage 60/20/20 split, both stages seeded with random_state=1:
# first hold out 20% of all rows for testing, then take 25% of the remaining
# 80% (i.e. 20% of the total) for validation.
df_full_train, df_test = train_test_split(df, test_size=0.20, random_state=1)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)

# Discard the shuffled row labels and give each subset a fresh 0..n-1 index.
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

len(df_train), len(df_val), len(df_test)

(5822, 1941, 1941)

In [11]:
target = 'fuel_efficiency_mpg'

# Separate the target from the features.
#
# The guard keeps this cell safe to re-run: once the target column has been
# removed from the frames, executing the cell again would otherwise fail with a
# KeyError on df_train[target]. On a re-run the y_* arrays and feature frames
# produced by the first execution are left untouched.
if target in df_train.columns:
    y_train = df_train[target].to_numpy()
    y_val = df_val[target].to_numpy()
    y_test = df_test[target].to_numpy()

    # Rebind to new frames without the target (no in-place mutation) so the
    # label cannot leak into the feature matrix built from these frames.
    df_train = df_train.drop(columns=[target])
    df_val = df_val.drop(columns=[target])
    df_test = df_test.drop(columns=[target])


In [13]:
# Replace missing values with 0 and turn every row into a {column: value} dict,
# the input format DictVectorizer expects.
train_dicts, val_dicts, test_dicts = (
    frame.fillna(0).to_dict(orient='records')
    for frame in (df_train, df_val, df_test)
)

# Learn the feature mapping on the training rows only, then apply that same
# mapping to the validation and test rows.
dv = DictVectorizer(sparse=True)
X_train = dv.fit_transform(train_dicts)
X_val, X_test = (dv.transform(records) for records in (val_dicts, test_dicts))

# Matrix shapes plus the number of encoded feature columns.
(
    X_train.shape,
    X_val.shape,
    X_test.shape,
    len(dv.get_feature_names_out()),
)


((5822, 14), (1941, 14), (1941, 14), 14)

# Question 1

In [15]:
from sklearn.tree import DecisionTreeRegressor, export_text

# A depth-1 tree (a single split), fitted on the training matrix.
# fit() returns the estimator itself, so the name is bound to the fitted model.
dt_reg = DecisionTreeRegressor(random_state=1, max_depth=1).fit(X_train, y_train)
dt_reg

0,1,2
,criterion,'squared_error'
,splitter,'best'
,max_depth,1
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,1
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [16]:
# Label the tree's split with the vectorizer's column names and print the rules.
feature_names = dv.get_feature_names_out().tolist()
tree_rules = export_text(dt_reg, feature_names=feature_names)
print(tree_rules)

|--- vehicle_weight <= 3022.11
|   |--- value: [16.88]
|--- vehicle_weight >  3022.11
|   |--- value: [12.94]



# Question 2

In [21]:
from sklearn.ensemble import RandomForestRegressor
import numpy as np

In [23]:
# Baseline random forest: 10 trees, seeded, using all available cores.
rf = RandomForestRegressor(n_estimators=10, random_state=1, n_jobs=-1)
rf.fit(X_train, y_train)

# Validation RMSE: square root of the mean squared prediction error.
y_pred_val = rf.predict(X_val)
rmse = np.sqrt(np.mean((y_val - y_pred_val) ** 2))
rmse

np.float64(0.4595777223092726)

# Question 3

In [24]:
# Validation RMSE for n_estimators = 10, 20, ..., 200, collected as (n, rmse).
scores = []

# Grow a single forest incrementally instead of refitting a new one for every n.
# With warm_start=True each fit() keeps the trees already built and only trains
# the ones needed to reach the new n_estimators, so the sweep builds 200 trees
# in total rather than 10 + 20 + ... + 200 = 2100.
# NOTE(review): with the fixed random_state the scores are expected to match a
# from-scratch fit for the same n -- confirm against previously recorded values
# if exact reproduction matters.
rf = RandomForestRegressor(warm_start=True, random_state=1, n_jobs=-1)

for n in range(10, 201, 10):
    rf.set_params(n_estimators=n)
    rf.fit(X_train, y_train)

    y_pred_val = rf.predict(X_val)

    rmse = np.sqrt(((y_val - y_pred_val) ** 2).mean())
    scores.append((n, rmse))

scores

[]

# Question 4

In [25]:
# For each max_depth, average the validation RMSE over
# n_estimators = 10, 20, ..., 200 and tabulate the result.
max_depth_values = [10, 15, 20, 25]

scores = []

for d in max_depth_values:
    # One forest per depth, grown incrementally: warm_start=True keeps the trees
    # from the previous fit() and only adds the missing ones, so each depth
    # trains 200 trees in total instead of 2100.
    # NOTE(review): with the fixed random_state the scores are expected to match
    # a from-scratch fit for the same (max_depth, n) -- confirm if exact
    # reproduction of earlier results matters.
    rf = RandomForestRegressor(
        max_depth=d,
        warm_start=True,
        random_state=1,
        n_jobs=-1,
    )

    rmses = []
    for n in range(10, 201, 10):
        rf.set_params(n_estimators=n)
        rf.fit(X_train, y_train)
        y_pred_val = rf.predict(X_val)

        rmse = np.sqrt(((y_val - y_pred_val) ** 2).mean())
        rmses.append(rmse)

    scores.append((d, np.mean(rmses)))

scores_df = pd.DataFrame(scores, columns=['max_depth', 'mean_rmse'])
scores_df

Unnamed: 0,max_depth,mean_rmse
0,10,0.441808
1,15,0.445417
2,20,0.446253
3,25,0.44591


# Question 5

In [26]:
# Refit a 10-tree forest capped at depth 20 to see which features it relies on.
rf = RandomForestRegressor(n_estimators=10, max_depth=20, random_state=1, n_jobs=-1)
rf.fit(X_train, y_train)

# Pair each vectorizer column with the importance the forest assigned to it.
feature_names = dv.get_feature_names_out()
importances = rf.feature_importances_

fi_df = pd.DataFrame({'feature': feature_names, 'importance': importances})

# Highest importance first; show the top five.
fi_df_sorted = fi_df.sort_values('importance', ascending=False)
fi_df_sorted.head()

Unnamed: 0,feature,importance
13,vehicle_weight,0.95915
6,horsepower,0.015998
0,acceleration,0.01148
3,engine_displacement,0.003273
7,model_year,0.003212


# Question 6



In [27]:
import xgboost as xgb

In [28]:
# Wrap the train/validation matrices in XGBoost DMatrix objects, labelled with
# the vectorizer's column names, and register both for evaluation during training.
features = dv.get_feature_names_out().tolist()

dtrain = xgb.DMatrix(X_train, label=y_train, feature_names=features)
dval = xgb.DMatrix(X_val, label=y_val, feature_names=features)

watchlist = [
    (dtrain, 'train'),
    (dval, 'val'),
]

In [31]:
# XGBoost run with learning rate eta=0.3, 100 boosting rounds.
xgb_params_03 = dict(
    eta=0.3,
    max_depth=6,
    min_child_weight=1,
    objective='reg:squarederror',
    nthread=8,
    seed=1,
    verbosity=1,
)

# verbose_eval=False: the watchlist is still evaluated, but nothing is printed per round.
model_03 = xgb.train(
    xgb_params_03,
    dtrain,
    num_boost_round=100,
    evals=watchlist,
    verbose_eval=False,
)

# Validation RMSE as a plain Python float.
y_pred_val_03 = model_03.predict(dval)
rmse_03 = float(np.sqrt(np.mean((y_val - y_pred_val_03) ** 2)))
print("eta=0.3 RMSE (val):", rmse_03)


eta=0.3 RMSE (val): 0.45017755678087246


In [33]:
# XGBoost run with learning rate eta=0.1, 100 boosting rounds.
xgb_params_01 = dict(
    eta=0.1,
    max_depth=6,
    min_child_weight=1,
    objective='reg:squarederror',
    nthread=8,
    seed=1,
    verbosity=1,
)

# verbose_eval=False: the watchlist is still evaluated, but nothing is printed per round.
model_01 = xgb.train(
    xgb_params_01,
    dtrain,
    num_boost_round=100,
    evals=watchlist,
    verbose_eval=False,
)

# Validation RMSE as a plain Python float.
y_pred_val_01 = model_01.predict(dval)
rmse_01 = float(np.sqrt(np.mean((y_val - y_pred_val_01) ** 2)))
print("eta=0.1 RMSE (val):", rmse_01)

eta=0.1 RMSE (val): 0.42622800553359225
