In [1]:
import wget
import pandas as pd
import numpy as np
import xgboost as xgb

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [3]:
wget.download('https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv')

'car_fuel_efficiency (1).csv'

In [5]:
df = pd.read_csv('car_fuel_efficiency.csv')
df.head()

Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,origin,fuel_type,drivetrain,num_doors,fuel_efficiency_mpg
0,170,3.0,159.0,3413.433759,17.7,2003,Europe,Gasoline,All-wheel drive,0.0,13.231729
1,130,5.0,97.0,3149.664934,17.8,2007,USA,Gasoline,Front-wheel drive,0.0,13.688217
2,170,,78.0,3079.038997,15.1,2018,Europe,Gasoline,Front-wheel drive,0.0,14.246341
3,220,4.0,,2542.392402,20.2,2009,USA,Diesel,All-wheel drive,2.0,16.912736
4,210,1.0,140.0,3460.87099,14.4,2009,Europe,Gasoline,All-wheel drive,2.0,12.488369


#### Preparing the dataset

In [6]:
df.isnull().sum()

engine_displacement      0
num_cylinders          482
horsepower             708
vehicle_weight           0
acceleration           930
model_year               0
origin                   0
fuel_type                0
drivetrain               0
num_doors              502
fuel_efficiency_mpg      0
dtype: int64

In [7]:
df = df.fillna(0)

In [8]:
df.dtypes

engine_displacement      int64
num_cylinders          float64
horsepower             float64
vehicle_weight         float64
acceleration           float64
model_year               int64
origin                  object
fuel_type               object
drivetrain              object
num_doors              float64
fuel_efficiency_mpg    float64
dtype: object

In [9]:
categorical = ['origin', 
               'fuel_type', 
               'drivetrain'
]
numerical = ['engine_displacement', 
             'num_cylinders', 
             'horsepower', 
             'vehicle_weight', 
             'acceleration', 
             'model_year', 
             'num_doors']

In [10]:
df_full_train, df_test = train_test_split(df, test_size= 0.2, random_state = 1)
df_train, df_val = train_test_split(df_full_train, test_size = 0.25, random_state = 1)

In [11]:
y_train = df_train.fuel_efficiency_mpg.values
y_val = df_val.fuel_efficiency_mpg.values
y_test = df_test.fuel_efficiency_mpg.values

In [12]:
del df_train['fuel_efficiency_mpg']
del df_val['fuel_efficiency_mpg']
del df_test['fuel_efficiency_mpg']

In [13]:
dv = DictVectorizer(sparse=True)

#### Question 1. Most important featute

In [14]:
train_dicts = df_train.to_dict(orient='records')
val_dicts = df_val.to_dict(orient='records')

X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)

In [15]:
dt = DecisionTreeRegressor(max_depth=1)
dt.fit(X_train, y_train)

# get feature_importance with dv sparse=True
feature_names = dv.feature_names_   
pd.Series(dt.feature_importances_, index=feature_names)

acceleration                    0.0
drivetrain=All-wheel drive      0.0
drivetrain=Front-wheel drive    0.0
engine_displacement             0.0
fuel_type=Diesel                0.0
fuel_type=Gasoline              0.0
horsepower                      0.0
model_year                      0.0
num_cylinders                   0.0
num_doors                       0.0
origin=Asia                     0.0
origin=Europe                   0.0
origin=USA                      0.0
vehicle_weight                  1.0
dtype: float64

#### Question 2. RMSE on validation

In [16]:
rf = RandomForestRegressor(
    n_estimators=10,
    random_state=1,
    n_jobs=-1
)

rf.fit(X_train, y_train)

pred = rf.predict(X_val)
rmse = np.sqrt(mean_squared_error(y_val, pred))
rmse.round(3)

np.float64(0.46)

#### Question 3. Number of estimators

In [17]:
rmses = []

for n in range(10, 201, 10):
    rf = RandomForestRegressor(
        n_estimators=n,
        random_state=1,
        n_jobs=-1
    )

    rf.fit(X_train, y_train)

    pred = rf.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, pred))

    rmses.append((n, rmse))

rmses_rounded = [(n, round(rmse, 3)) for n, rmse in rmses]

for item in rmses_rounded:
    print(item)

(10, np.float64(0.46))
(20, np.float64(0.454))
(30, np.float64(0.452))
(40, np.float64(0.449))
(50, np.float64(0.447))
(60, np.float64(0.445))
(70, np.float64(0.445))
(80, np.float64(0.445))
(90, np.float64(0.445))
(100, np.float64(0.445))
(110, np.float64(0.444))
(120, np.float64(0.444))
(130, np.float64(0.444))
(140, np.float64(0.443))
(150, np.float64(0.443))
(160, np.float64(0.443))
(170, np.float64(0.443))
(180, np.float64(0.442))
(190, np.float64(0.442))
(200, np.float64(0.442))


#### Question 4. Best max_depth

In [18]:
depths = [10, 15, 20, 25]

In [19]:
results ={}

for depth in depths:
    rmses = []

    for n in range(10, 201, 10):
        rf = RandomForestRegressor(
            n_estimators=n,
            max_depth=depth,
            random_state=1,
            n_jobs=-1
        )

        rf.fit(X_train, y_train)
        pred = rf.predict(X_val)

        rmse = np.sqrt(mean_squared_error(y_val, pred))
        rmses.append(rmse)
    
    mean_rmse = np.mean(rmses)
    results[depth] = mean_rmse

results

{10: np.float64(0.44180786093233565),
 15: np.float64(0.44541664456381075),
 20: np.float64(0.44625292424422536),
 25: np.float64(0.44590993626161624)}

#### Question 5. Most important feature

In [20]:
rf = RandomForestRegressor(
    n_estimators=10,
    max_depth=20,
    random_state=1,
    n_jobs=-1
)

rf.fit(X_train, y_train)

pd.Series(rf.feature_importances_, index=dv.feature_names_)


acceleration                    0.011480
drivetrain=All-wheel drive      0.000357
drivetrain=Front-wheel drive    0.000345
engine_displacement             0.003273
fuel_type=Diesel                0.000325
fuel_type=Gasoline              0.000360
horsepower                      0.015998
model_year                      0.003212
num_cylinders                   0.002343
num_doors                       0.001635
origin=Asia                     0.000462
origin=Europe                   0.000519
origin=USA                      0.000540
vehicle_weight                  0.959150
dtype: float64

#### Question 6. XGBoost eta

In [21]:
dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)

In [22]:
watchlist = [(dtrain, 'train'), (dval, 'val')]

In [23]:
# train eta = 0.3
model_03 = xgb.train(
    {
        'eta': 0.3,
        'max_depth': 6,
        'min_child_weight': 1,
        'objective': 'reg:squarederror',
        'nthread': 8,
        'seed': 1,
        'verbosity': 1
    },
    dtrain,
    num_boost_round=100,
    evals=watchlist
)

[0]	train-rmse:1.81393	val-rmse:1.85444
[1]	train-rmse:1.31919	val-rmse:1.35353
[2]	train-rmse:0.98120	val-rmse:1.01316


[3]	train-rmse:0.75443	val-rmse:0.78667
[4]	train-rmse:0.60680	val-rmse:0.64318
[5]	train-rmse:0.51381	val-rmse:0.55664
[6]	train-rmse:0.45470	val-rmse:0.50321
[7]	train-rmse:0.41881	val-rmse:0.47254
[8]	train-rmse:0.39534	val-rmse:0.45509
[9]	train-rmse:0.38038	val-rmse:0.44564
[10]	train-rmse:0.37115	val-rmse:0.43896
[11]	train-rmse:0.36361	val-rmse:0.43594
[12]	train-rmse:0.35850	val-rmse:0.43558
[13]	train-rmse:0.35365	val-rmse:0.43394
[14]	train-rmse:0.35025	val-rmse:0.43349
[15]	train-rmse:0.34666	val-rmse:0.43362
[16]	train-rmse:0.34459	val-rmse:0.43378
[17]	train-rmse:0.34128	val-rmse:0.43405
[18]	train-rmse:0.33822	val-rmse:0.43391
[19]	train-rmse:0.33709	val-rmse:0.43374
[20]	train-rmse:0.33553	val-rmse:0.43376
[21]	train-rmse:0.33243	val-rmse:0.43453
[22]	train-rmse:0.33031	val-rmse:0.43510
[23]	train-rmse:0.32815	val-rmse:0.43601
[24]	train-rmse:0.32670	val-rmse:0.43592
[25]	train-rmse:0.32268	val-rmse:0.43683
[26]	train-rmse:0.32085	val-rmse:0.43678
[27]	train-rmse:0.32035

In [24]:
model_01 = xgb.train(
    {
        'eta': 0.1,
        'max_depth': 6,
        'min_child_weight': 1,
        'objective': 'reg:squarederror',
        'nthread': 8,
        'seed': 1,
        'verbosity': 1
    },
    dtrain,
    num_boost_round=100,
    evals=watchlist
)


[0]	train-rmse:2.28944	val-rmse:2.34561
[1]	train-rmse:2.07396	val-rmse:2.12434
[2]	train-rmse:1.88066	val-rmse:1.92597
[3]	train-rmse:1.70730	val-rmse:1.74987
[4]	train-rmse:1.55163	val-rmse:1.59059
[5]	train-rmse:1.41247	val-rmse:1.44988
[6]	train-rmse:1.28796	val-rmse:1.32329
[7]	train-rmse:1.17660	val-rmse:1.20930
[8]	train-rmse:1.07736	val-rmse:1.10830
[9]	train-rmse:0.98883	val-rmse:1.02009
[10]	train-rmse:0.91008	val-rmse:0.94062
[11]	train-rmse:0.84030	val-rmse:0.87100
[12]	train-rmse:0.77874	val-rmse:0.80916
[13]	train-rmse:0.72417	val-rmse:0.75465
[14]	train-rmse:0.67626	val-rmse:0.70780
[15]	train-rmse:0.63402	val-rmse:0.66672
[16]	train-rmse:0.59690	val-rmse:0.63062
[17]	train-rmse:0.56447	val-rmse:0.60016
[18]	train-rmse:0.53619	val-rmse:0.57383
[19]	train-rmse:0.51138	val-rmse:0.55044
[20]	train-rmse:0.48983	val-rmse:0.53064
[21]	train-rmse:0.47135	val-rmse:0.51451
[22]	train-rmse:0.45501	val-rmse:0.49998
[23]	train-rmse:0.44120	val-rmse:0.48790
[24]	train-rmse:0.42929	va