In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [2]:
# Read the car fuel-efficiency CSV directly from its public GitHub URL.
df = pd.read_csv(
    filepath_or_buffer="https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv"
)

In [3]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,origin,fuel_type,drivetrain,num_doors,fuel_efficiency_mpg
0,170,3.0,159.0,3413.433759,17.7,2003,Europe,Gasoline,All-wheel drive,0.0,13.231729
1,130,5.0,97.0,3149.664934,17.8,2007,USA,Gasoline,Front-wheel drive,0.0,13.688217
2,170,,78.0,3079.038997,15.1,2018,Europe,Gasoline,Front-wheel drive,0.0,14.246341
3,220,4.0,,2542.392402,20.2,2009,USA,Diesel,All-wheel drive,2.0,16.912736
4,210,1.0,140.0,3460.87099,14.4,2009,Europe,Gasoline,All-wheel drive,2.0,12.488369


In [4]:
# Replace every missing value in the frame with 0.
df = df.fillna(value=0)

In [5]:
# Preview the first five rows again to confirm the NaN cells now hold 0.
df.head(n=5)

Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,origin,fuel_type,drivetrain,num_doors,fuel_efficiency_mpg
0,170,3.0,159.0,3413.433759,17.7,2003,Europe,Gasoline,All-wheel drive,0.0,13.231729
1,130,5.0,97.0,3149.664934,17.8,2007,USA,Gasoline,Front-wheel drive,0.0,13.688217
2,170,0.0,78.0,3079.038997,15.1,2018,Europe,Gasoline,Front-wheel drive,0.0,14.246341
3,220,4.0,0.0,2542.392402,20.2,2009,USA,Diesel,All-wheel drive,2.0,16.912736
4,210,1.0,140.0,3460.87099,14.4,2009,Europe,Gasoline,All-wheel drive,2.0,12.488369


# Preparing the data set

In [6]:
# Stage 1: hold out 40% of the rows (temp_df); the remaining 60% is the training set.
train_df, temp_df = train_test_split(
    df,
    random_state=1,
    test_size=0.4,
)
# Stage 2: cut the held-out rows in half to obtain the validation and test sets.
valid_df, test_df = train_test_split(
    temp_df,
    random_state=1,
    test_size=0.5,
)

In [7]:
# Name of the column the models are trained to predict.
target = 'fuel_efficiency_mpg'

# For each split: the features are every column except the target,
# and the labels are the target column itself.
X_train, y_train = train_df.drop(columns=target), train_df[target]
X_valid, y_valid = valid_df.drop(columns=target), valid_df[target]
X_test, y_test = test_df.drop(columns=target), test_df[target]

In [8]:
# Convert each feature frame into a list of per-row dicts (one dict per record),
# which is the input format DictVectorizer consumes.
X_train_dict, X_valid_dict, X_test_dict = (
    frame.to_dict(orient='records') for frame in (X_train, X_valid, X_test)
)

In [9]:
# Learn the feature mapping from the training records only, then reuse that same
# fitted vectorizer to encode the validation and test records.
dv = DictVectorizer(sparse=True)
X_train_matrix = dv.fit_transform(X_train_dict)
X_valid_matrix, X_test_matrix = map(dv.transform, (X_valid_dict, X_test_dict))

# Q1

In [10]:
# Fit a depth-1 regression tree on the training matrix; fit() returns the
# estimator itself, so it can be bound and displayed in one go.
dt = DecisionTreeRegressor(max_depth=1).fit(X_train_matrix, y_train)
dt

0,1,2
,criterion,'squared_error'
,splitter,'best'
,max_depth,1
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [11]:
# Node 0 is the root of the fitted tree; tree_.feature holds, per node, the
# column index of the feature that node splits on.
split_feature_index = dt.tree_.feature[0]

# scikit-learn marks leaf nodes with a negative feature index. A negative value
# would otherwise silently index dv.feature_names_ from the end and report an
# unrelated feature, so fail loudly if the root never split.
if split_feature_index < 0:
    raise ValueError("The tree's root node is a leaf, so there is no split feature to report.")

# Look up the name DictVectorizer assigned to that column index.
split_feature_name = dv.feature_names_[split_feature_index]
split_feature_name

'vehicle_weight'

# Q2

In [12]:
# Train a 10-tree random forest with a fixed seed on the training matrix.
rf = RandomForestRegressor(n_jobs=-1, random_state=1, n_estimators=10).fit(
    X_train_matrix, y_train
)

# Score on the validation split: RMSE is the square root of the mean squared error.
y_pred = rf.predict(X_valid_matrix)
rmse = np.sqrt(mean_squared_error(y_true=y_valid, y_pred=y_pred))
rmse

np.float64(0.4602815367032658)

# Q3

In [13]:
# Quick sweep over a few forest sizes, reporting validation RMSE for each.
test_n_list = [10, 20, 30]
for n in test_n_list:
    print("Training with n_estimators =", n)
    rf = RandomForestRegressor(n_jobs=-1, random_state=1, n_estimators=n).fit(
        X_train_matrix, y_train
    )
    y_pred = rf.predict(X_valid_matrix)
    rmse = np.sqrt(mean_squared_error(y_true=y_valid, y_pred=y_pred))
    print(f"RMSE: {round(rmse, 3)}")

Training with n_estimators = 10
RMSE: 0.46
Training with n_estimators = 20
RMSE: 0.446
Training with n_estimators = 30
RMSE: 0.44


In [14]:
# Wider sweep: forest sizes 10, 20, ..., 100, each retrained from scratch with
# the same seed and scored by validation RMSE (rounded to 3 decimals for display).
n_list = range(10, 101, 10)
for n in n_list:
    print("Training with n_estimators =", n)
    rf = RandomForestRegressor(n_jobs=-1, random_state=1, n_estimators=n).fit(
        X_train_matrix, y_train
    )
    y_pred = rf.predict(X_valid_matrix)
    rmse = np.sqrt(mean_squared_error(y_true=y_valid, y_pred=y_pred))
    print(f"RMSE: {round(rmse, 3)}")

Training with n_estimators = 10
RMSE: 0.46
Training with n_estimators = 20
RMSE: 0.446
Training with n_estimators = 30
RMSE: 0.44
Training with n_estimators = 40
RMSE: 0.438
Training with n_estimators = 50
RMSE: 0.437
Training with n_estimators = 60
RMSE: 0.436
Training with n_estimators = 70
RMSE: 0.436
Training with n_estimators = 80
RMSE: 0.436
Training with n_estimators = 90
RMSE: 0.435
Training with n_estimators = 100
RMSE: 0.435


# Q4