In [27]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import root_mean_squared_error

In [3]:
df= pd.read_csv("https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv")

In [4]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9704 entries, 0 to 9703
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   engine_displacement  9704 non-null   int64  
 1   num_cylinders        9222 non-null   float64
 2   horsepower           8996 non-null   float64
 3   vehicle_weight       9704 non-null   float64
 4   acceleration         8774 non-null   float64
 5   model_year           9704 non-null   int64  
 6   origin               9704 non-null   object 
 7   fuel_type            9704 non-null   object 
 8   drivetrain           9704 non-null   object 
 9   num_doors            9202 non-null   float64
 10  fuel_efficiency_mpg  9704 non-null   float64
dtypes: float64(6), int64(2), object(3)
memory usage: 834.1+ KB


In [5]:
df.head()

Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,origin,fuel_type,drivetrain,num_doors,fuel_efficiency_mpg
0,170,3.0,159.0,3413.433759,17.7,2003,Europe,Gasoline,All-wheel drive,0.0,13.231729
1,130,5.0,97.0,3149.664934,17.8,2007,USA,Gasoline,Front-wheel drive,0.0,13.688217
2,170,,78.0,3079.038997,15.1,2018,Europe,Gasoline,Front-wheel drive,0.0,14.246341
3,220,4.0,,2542.392402,20.2,2009,USA,Diesel,All-wheel drive,2.0,16.912736
4,210,1.0,140.0,3460.87099,14.4,2009,Europe,Gasoline,All-wheel drive,2.0,12.488369


In [6]:
df.isnull().sum()

engine_displacement      0
num_cylinders          482
horsepower             708
vehicle_weight           0
acceleration           930
model_year               0
origin                   0
fuel_type                0
drivetrain               0
num_doors              502
fuel_efficiency_mpg      0
dtype: int64

In [9]:
for col in df.columns:
    if df[col].dtype == 'float64':
        df[col] = df[col].fillna(0.0)
    elif df[col].dtype == 'int64':
        df[col] = df[col].fillna(0.0)
    else:
        df[col] = df[col].fillna('NA') 
df


Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,origin,fuel_type,drivetrain,num_doors,fuel_efficiency_mpg
0,170,3.0,159.0,3413.433759,17.7,2003,Europe,Gasoline,All-wheel drive,0.0,13.231729
1,130,5.0,97.0,3149.664934,17.8,2007,USA,Gasoline,Front-wheel drive,0.0,13.688217
2,170,0.0,78.0,3079.038997,15.1,2018,Europe,Gasoline,Front-wheel drive,0.0,14.246341
3,220,4.0,0.0,2542.392402,20.2,2009,USA,Diesel,All-wheel drive,2.0,16.912736
4,210,1.0,140.0,3460.870990,14.4,2009,Europe,Gasoline,All-wheel drive,2.0,12.488369
...,...,...,...,...,...,...,...,...,...,...,...
9699,140,5.0,164.0,2981.107371,17.3,2013,Europe,Diesel,Front-wheel drive,0.0,15.101802
9700,180,0.0,154.0,2439.525729,15.0,2004,USA,Gasoline,All-wheel drive,0.0,17.962326
9701,220,2.0,138.0,2583.471318,15.1,2008,USA,Diesel,All-wheel drive,-1.0,17.186587
9702,230,4.0,177.0,2905.527390,19.4,2011,USA,Diesel,Front-wheel drive,1.0,15.331551


In [10]:
df.isnull().sum()

engine_displacement    0
num_cylinders          0
horsepower             0
vehicle_weight         0
acceleration           0
model_year             0
origin                 0
fuel_type              0
drivetrain             0
num_doors              0
fuel_efficiency_mpg    0
dtype: int64

# Split the data

In [11]:
df_full_train, df_test = train_test_split(df.drop(columns='fuel_efficiency_mpg'), test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_full_train, test_size=0.2, random_state=1)

numerical_features = df_train.select_dtypes(include="number").columns.tolist()
categorical_features = df_train.select_dtypes(exclude="number").columns.tolist()
feature_columns = categorical_features + numerical_features

train_records = df_train[feature_columns].to_dict(orient='records')
validation_records = df_val[feature_columns].to_dict(orient='records')
test_records = df_test[feature_columns].to_dict(orient='records')

dv= DictVectorizer(sparse= False)
X_train = dv.fit_transform(train_records)
X_val = dv.transform(validation_records)
X_test= dv.transform(test_records)
print(len(X_train))

y_train = df.fuel_efficiency_mpg.loc[df_train.index].values
y_val = df.fuel_efficiency_mpg.loc[df_val.index].values
y_test = df.fuel_efficiency_mpg.loc[df_test.index].values


model = DecisionTreeRegressor(max_depth=1, random_state=1)
model.fit(X_train, y_train) 


6210


0,1,2
,criterion,'squared_error'
,splitter,'best'
,max_depth,1
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,1
,max_leaf_nodes,
,min_impurity_decrease,0.0


# Question 1

In [12]:
feature_importance = model.feature_importances_
print(feature_importance)
important_feature_index = feature_importance.argmax()
important_feature = dv.feature_names_[important_feature_index]

leaf_values = model.apply(X_train) 
print(len(leaf_values))

unique_leaf_indices = np.unique(leaf_values)

print(f'The feature used for splitting the data is: {important_feature}\n')
print('Leaf node values:')
for index in unique_leaf_indices:
    leaf_value = y_train[leaf_values == index].mean()
    count_in_leaf = np.sum(leaf_values == index)
    print(f'Value at leaf node {index}: {leaf_value:.2f}, Count: {count_in_leaf}')

print("\nFeature Importances:")
for name, importance in zip(dv.feature_names_, feature_importance):
    print(f"Feature: {name}, Importance: {importance:.2f}")

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
6210
The feature used for splitting the data is: vehicle_weight

Leaf node values:
Value at leaf node 1: 16.87, Count: 3248
Value at leaf node 2: 12.92, Count: 2962

Feature Importances:
Feature: acceleration, Importance: 0.00
Feature: drivetrain=All-wheel drive, Importance: 0.00
Feature: drivetrain=Front-wheel drive, Importance: 0.00
Feature: engine_displacement, Importance: 0.00
Feature: fuel_type=Diesel, Importance: 0.00
Feature: fuel_type=Gasoline, Importance: 0.00
Feature: horsepower, Importance: 0.00
Feature: model_year, Importance: 0.00
Feature: num_cylinders, Importance: 0.00
Feature: num_doors, Importance: 0.00
Feature: origin=Asia, Importance: 0.00
Feature: origin=Europe, Importance: 0.00
Feature: origin=USA, Importance: 0.00
Feature: vehicle_weight, Importance: 1.00


# Question 2

In [13]:
model = RandomForestRegressor(random_state=1, n_jobs=-1, n_estimators=10)
model.fit(X_train, y_train) 

y_predict = model.predict(X_val)

rmse= root_mean_squared_error(y_val, y_predict)
print('The value of rmse:', rmse)

The value of rmse: 0.46421145335023656


# Question 3

In [14]:
n_estimator= range(10,201, 10)
rmse_values=[]

for estimator in n_estimator:
    model = RandomForestRegressor(estimator, random_state=1, n_jobs=-1 )
    model.fit(X_train, y_train)
    y_predict = model.predict(X_val)
    rmse= root_mean_squared_error(y_val, y_predict)
    rmse_values.append(rmse)

    print(f"RMSE value for each estimator {estimator}: {rmse:.3f}")
    
    

RMSE value for each estimator 10: 0.464
RMSE value for each estimator 20: 0.453
RMSE value for each estimator 30: 0.450
RMSE value for each estimator 40: 0.446
RMSE value for each estimator 50: 0.444
RMSE value for each estimator 60: 0.443
RMSE value for each estimator 70: 0.442
RMSE value for each estimator 80: 0.442
RMSE value for each estimator 90: 0.441
RMSE value for each estimator 100: 0.441
RMSE value for each estimator 110: 0.441
RMSE value for each estimator 120: 0.440
RMSE value for each estimator 130: 0.440
RMSE value for each estimator 140: 0.440
RMSE value for each estimator 150: 0.440
RMSE value for each estimator 160: 0.440
RMSE value for each estimator 170: 0.440
RMSE value for each estimator 180: 0.440
RMSE value for each estimator 190: 0.440
RMSE value for each estimator 200: 0.440


# Question 4

In [31]:
n_estimator= range(10,201, 10)
max_depth_values=[10, 15, 20, 25]

for depth in max_depth_values:
    rmse_values=[]
    for estimator in n_estimator:
        model = RandomForestRegressor(estimator, max_depth=depth, random_state=1, n_jobs=-1 )
        model.fit(X_train, y_train)
        y_predict = model.predict(X_val)
        rmse= root_mean_squared_error(y_val, y_predict)
        rmse_values.append(rmse)
        print(f"RMSE value for each depth {depth} and estimator {estimator}: {rmse:.3f}")

    mean_value= np.mean(rmse_values)
    print(f"Mean value with depth {depth}:{mean_value}")

RMSE value for each depth 10 and estimator 10: 0.465
RMSE value for each depth 10 and estimator 20: 0.457
RMSE value for each depth 10 and estimator 30: 0.456
RMSE value for each depth 10 and estimator 40: 0.454
RMSE value for each depth 10 and estimator 50: 0.453
RMSE value for each depth 10 and estimator 60: 0.453
RMSE value for each depth 10 and estimator 70: 0.452
RMSE value for each depth 10 and estimator 80: 0.452
RMSE value for each depth 10 and estimator 90: 0.451
RMSE value for each depth 10 and estimator 100: 0.451
RMSE value for each depth 10 and estimator 110: 0.451
RMSE value for each depth 10 and estimator 120: 0.452
RMSE value for each depth 10 and estimator 130: 0.451
RMSE value for each depth 10 and estimator 140: 0.452
RMSE value for each depth 10 and estimator 150: 0.452
RMSE value for each depth 10 and estimator 160: 0.451
RMSE value for each depth 10 and estimator 170: 0.451
RMSE value for each depth 10 and estimator 180: 0.451
RMSE value for each depth 10 and esti

In [28]:
model = RandomForestRegressor(max_depth=20, random_state=1, n_estimators=15, n_jobs=-1)
model.fit(X_train, y_train)

feature_importance = model.feature_importances_

leaf_values = model.apply(X_train) 

print(leaf_values.shape)

for name, importance in zip(dv.feature_names_, feature_importance):
    print(f"Feature: {name}, Importance: {importance:.4f}")

most_important_feature = dv.feature_names_[feature_importance.argmax()]
print(f"\nMost important feature: {most_important_feature}")



(6210, 15)
Feature: acceleration, Importance: 0.0115
Feature: drivetrain=All-wheel drive, Importance: 0.0003
Feature: drivetrain=Front-wheel drive, Importance: 0.0003
Feature: engine_displacement, Importance: 0.0034
Feature: fuel_type=Diesel, Importance: 0.0004
Feature: fuel_type=Gasoline, Importance: 0.0003
Feature: horsepower, Importance: 0.0158
Feature: model_year, Importance: 0.0033
Feature: num_cylinders, Importance: 0.0022
Feature: num_doors, Importance: 0.0015
Feature: origin=Asia, Importance: 0.0005
Feature: origin=Europe, Importance: 0.0005
Feature: origin=USA, Importance: 0.0005
Feature: vehicle_weight, Importance: 0.9594

Most important feature: vehicle_weight


# Question 6

In [30]:
X_train, X_val, y_train, y_val = train_test_split( X_train, y_train, test_size=0.2, random_state=1)

dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)

xgb_params = {
    'eta': 0.3, 
    'max_depth': 6,
    'min_child_weight': 1,
    
    'objective': 'reg:squarederror',
    'nthread': 8,
    
    'seed': 1,
    'verbosity': 1,
}


for eta in [0.3, 0.1]:
    params = xgb_params.copy()
    params['eta'] = eta

    print(f"\nTraining model with eta = {eta}")
    model = xgb.train(params=params, dtrain=dtrain, num_boost_round=100, evals=[(dval, 'val')], verbose_eval=False )
    y_pred = model.predict(dval)

    rmse = root_mean_squared_error(y_val, y_pred)
    print(f"Validation RMSE with eta={eta}: {rmse:.4f}")   




Training model with eta = 0.3
Validation RMSE with eta=0.3: 0.4694

Training model with eta = 0.1
Validation RMSE with eta=0.1: 0.4421
