In [1]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor

from sklearn.metrics import mean_absolute_percentage_error
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection  import SelectKBest, f_regression, mutual_info_regression

import pandas as pd
import numpy as np

import eli5
from tqdm import tqdm
import joblib
import matplotlib as plt

In [2]:
# Read dataa.csv into the working frame; fields in the file are separated by ';'.
df = pd.read_csv('dataa.csv', sep=';')

# Print the column names, non-null counts and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 262 entries, 0 to 261
Data columns (total 55 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   CLIENT_VAT           148 non-null    object 
 1   TIMO_ID              35 non-null     float64
 2   AT_KM                6 non-null      float64
 3   CZ_KM                119 non-null    float64
 4   DE_KM                55 non-null     float64
 5   EE_KM                24 non-null     float64
 6   HU_KM                33 non-null     float64
 7   LT_KM                87 non-null     float64
 8   LV_KM                43 non-null     float64
 9   PL_KM                209 non-null    float64
 10  SE_KM                32 non-null     float64
 11  SK_KM                77 non-null     float64
 12  COMPANY_LP           60 non-null     object 
 13  ADRESS_LP            55 non-null     object 
 14  COD_LP               262 non-null    object 
 15  CITY_LP              77 non-null     obj

In [3]:
# List the distinct VEHICLE_TYPE values; the output shows the same label in
# different letter cases ('Articulated truck' vs 'ARTICULATED TRUCK').
df["VEHICLE_TYPE"].unique()

array(['Rigid truck, Articulated truck', 'Articulated truck',
       'ARTICULATED TRUCK', 'Rigid truck'], dtype=object)

In [3]:
# Remove these seven columns from df in place, one at a time.
for dropped_col in (
    "LATITUDE_LP",
    "LONGTITUDE_LP",
    "LATITUDE_DP",
    "LONGTITUDE_DP",
    "M3",
    "HEIGHT",
    "WIDTH",
):
    del df[dropped_col]

# Show the remaining columns with their non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 262 entries, 0 to 261
Data columns (total 48 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   CLIENT_VAT           148 non-null    object 
 1   TIMO_ID              35 non-null     float64
 2   AT_KM                6 non-null      float64
 3   CZ_KM                119 non-null    float64
 4   DE_KM                55 non-null     float64
 5   EE_KM                24 non-null     float64
 6   HU_KM                33 non-null     float64
 7   LT_KM                87 non-null     float64
 8   LV_KM                43 non-null     float64
 9   PL_KM                209 non-null    float64
 10  SE_KM                32 non-null     float64
 11  SK_KM                77 non-null     float64
 12  COMPANY_LP           60 non-null     object 
 13  ADRESS_LP            55 non-null     object 
 14  COD_LP               262 non-null    object 
 15  CITY_LP              77 non-null     obj

In [4]:
# Value Complement: fill missing values and integer-encode the text columns.
#
# Every result is assigned back with ``df[col] = ...`` rather than calling
# ``df[col].fillna(..., inplace=True)``. The in-place call on a selected column
# is chained assignment: it emits a FutureWarning on pandas 2.x and leaves
# ``df`` unchanged once Copy-on-Write is active.

# Text columns: upper-case first so labels that differ only in letter case get
# the same code, label missing entries 'OTHER', then replace each distinct
# label with an integer code (codes follow order of first appearance).
categorical_cols = [
    'CLIENT_VAT',
    'VEHICLE_TYPE',
    'BODY_TYPE',
    'LOAD_UNLOAD_METHOD',
    'GOODS_TYPE',
    'REQUIREMENTS',
    'SOURCE',
]
for col in categorical_cols:
    df[col] = pd.factorize(df[col].str.upper().fillna('OTHER'))[0]

# Numeric columns whose gaps are filled with the column mean; int() truncates
# the mean toward zero before it is used as the fill value.
mean_filled_cols = ['TIMO_ID', 'TEMP_MIN', 'TEMP_MAX', 'PAYMENT TERM', 'CARGO_VALUE_EURO']
for col in mean_filled_cols:
    df[col] = df[col].fillna(int(df[col].mean()))

# Columns where a missing value is treated as 0: the ten *_KM columns plus
# EPALE, OTHER_COSTS and CUSTOMS.
# CUSTOMS: 0 - no customs, 1 - customs on export or import, 2 - customs on both export and import.
zero_filled_cols = [
    'AT_KM', 'CZ_KM', 'DE_KM', 'EE_KM', 'HU_KM',
    'LT_KM', 'LV_KM', 'PL_KM', 'SE_KM', 'SK_KM',
    'EPALE',
    'OTHER_COSTS',
    'CUSTOMS',
]
for col in zero_filled_cols:
    df[col] = df[col].fillna(0)

# Columns where a missing value is treated as 1.
one_filled_cols = ['QTY_LOADS', 'QTY_DELIVERIES']
for col in one_filled_cols:
    df[col] = df[col].fillna(1)

In [5]:
# Convert the four load/delivery date columns to datetimes, reading the day
# before the month (dayfirst=True).
for date_col in ("START_LOAD_DATA", "END_LOAD_DATA", "START_DELIVERY_DATA", "END_DELIVERY_DATA"):
    df[date_col] = pd.to_datetime(df[date_col], dayfirst=True)

In [6]:
# Expand each datetime column into six calendar-part columns named
# <column>_<PART>. Columns are created in the same order as before
# (per date column: YEAR, MONTH, WEEK_OF_YEAR, WEEKDAY, DAY, DAY_OF_YEAR),
# because the feature order downstream follows the column order of df.
for date_col in ('START_LOAD_DATA', 'END_LOAD_DATA', 'START_DELIVERY_DATA', 'END_DELIVERY_DATA'):
    stamps = df[date_col].dt
    df[date_col + '_YEAR'] = stamps.year
    df[date_col + '_MONTH'] = stamps.month
    df[date_col + '_WEEK_OF_YEAR'] = stamps.isocalendar().week
    df[date_col + '_WEEKDAY'] = stamps.weekday
    df[date_col + '_DAY'] = stamps.day
    df[date_col + '_DAY_OF_YEAR'] = stamps.dayofyear

In [7]:
#feature_engineering
# Total distance: add the per-country distance columns left to right.
total_km = df['AT_KM']
for km_col in ('CZ_KM', 'DE_KM', 'EE_KM', 'HU_KM', 'LT_KM', 'LV_KM', 'PL_KM', 'SE_KM', 'SK_KM'):
    total_km = total_km + df[km_col]
df["KM"] = total_km

# Price per kilometre.
df["EUROforKM"] = df['EURO'] / df['KM']

# The first two characters of COD_LP / COD_DP are used as the load / delivery
# country key.
load_prefix = df["COD_LP"].str[:2]
delivery_prefix = df["COD_DP"].str[:2]

# Integer-encode each country key, then build the "<load> to <delivery>" label.
df["COUNTRY_LOAD_PLACE"] = pd.factorize(load_prefix)[0]
df["COUNTRY_DELIVERY_PLACE"] = pd.factorize(delivery_prefix)[0]
df["RELATION"] = load_prefix + ' to ' + delivery_prefix

In [8]:
# One entry per aggregate feature pair:
#   (group-by keys, order of the two statistics, mean column name, median column name)
# The statistic order decides the order of the two new columns and therefore
# the feature order; it is kept exactly as before (median first for the first
# three entries, mean first afterwards). Column names are kept verbatim,
# including the double space and the repeated "start", so existing outputs
# keep the same labels.
_PRICE_STAT_SPECS = [
    # single grouping key
    (["RELATION"], ["median", "mean"],
     'Mean Price in Relation', 'Median Price in Relation'),
    (["COUNTRY_LOAD_PLACE"], ["median", "mean"],
     'Mean Price from Country', 'Median Price from Country'),
    (["COUNTRY_DELIVERY_PLACE"], ["median", "mean"],
     'Mean Price to Country', 'Median Price to Country'),
    (["VEHICLE_TYPE"], ["mean", "median"],
     'Mean Price for Vehicle Type', 'Median Price for Vehicle Type'),
    (["BODY_TYPE"], ["mean", "median"],
     'Mean Price for Body Type', 'Median Price for Body Type'),
    (["LOAD_UNLOAD_METHOD"], ["mean", "median"],
     'Mean Price  for load/unload method', 'Median Price for load/unload method'),
    # date
    (["START_LOAD_DATA_DAY"], ["mean", "median"],
     'Mean Price in start load data day', 'Median Price in start load data day'),
    # two grouping keys
    (["VEHICLE_TYPE", "RELATION"], ["mean", "median"],
     'Mean Price for Vehicle Type in Relation', 'Median Price for Vehicle Type in Relation'),
    (["START_LOAD_DATA_DAY", "RELATION"], ["mean", "median"],
     'Mean Price for start load data day in Relation',
     'Median Price for start load data day in Relation'),
    (["START_DELIVERY_DATA_WEEKDAY", "RELATION"], ["mean", "median"],
     'Mean Price for start delivery weekday in Relation',
     'Median Price for start start delivery weekday in Relation'),
    (["START_LOAD_DATA_DAY", "COUNTRY_DELIVERY_PLACE"], ["mean", "median"],
     'Mean Price for start load day to Country', 'Median Price start load day to Country'),
]


def _add_price_stats(frame, keys, stat_order, mean_name, median_name):
    """Left-join the per-group mean and median of EUROforKM onto ``frame``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain ``EUROforKM`` and every column listed in ``keys``.
    keys : list of str
        Columns to group by and to merge on.
    stat_order : list of str
        ``"mean"`` and ``"median"`` in the order the new columns should appear.
    mean_name, median_name : str
        Names given to the new mean / median columns.

    Returns
    -------
    pandas.DataFrame
        A new frame with the rows of ``frame`` in their original order plus
        the two statistic columns; ``frame`` itself is not modified.
    """
    stats = (
        frame.groupby(keys)["EUROforKM"]
        .agg(stat_order)
        .rename(columns={'mean': mean_name, 'median': median_name})
        .reset_index()
    )
    return pd.merge(frame, stats, on=keys, how="left")


def feature_engineering(df):
    """Append group-level price-per-km statistics as new feature columns.

    For each entry of ``_PRICE_STAT_SPECS`` the mean and median of
    ``EUROforKM`` are computed per group and merged back onto every row,
    giving 22 additional columns.

    NOTE(review): the statistics are computed over every row that is passed
    in, and ``EUROforKM`` is derived from the ``EURO`` target. When the output
    is cross-validated afterwards, validation rows have contributed to their
    own features, so the reported scores are probably optimistic. Confirm, and
    consider computing these statistics inside each training fold.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``EUROforKM`` and all grouping columns named in
        ``_PRICE_STAT_SPECS``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the added columns; the argument is left untouched.
    """
    for keys, stat_order, mean_name, median_name in _PRICE_STAT_SPECS:
        df = _add_price_stats(df, keys, stat_order, mean_name, median_name)

    return df

In [9]:
# Give the total-distance column its display name; the importance tables
# further down show it under this label.
df = df.rename(columns={'KM': 'Distance [km]'})

In [10]:
def get_feats(df, black_list=['EURO', 'EUROforKM']):
    """Return the names of the numeric and boolean columns usable as features.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to pick feature columns from.
    black_list : list of str
        Column names to leave out (by default the target ``EURO`` and the
        target-derived ``EUROforKM``).

    Returns
    -------
    list
        Column names in the order they appear in ``df``.
    """
    candidate_cols = df.select_dtypes(["number", "bool"]).columns
    selected = []
    for col in candidate_cols:
        if col in black_list:
            continue
        selected.append(col)
    return selected

In [11]:
def get_X_y(df, fe=feature_engineering, verbose=False):
    """Build the feature matrix and target vector for modelling.

    Parameters
    ----------
    df : pandas.DataFrame
        Prepared frame; must contain the ``EURO`` target column.
    fe : callable
        Feature-engineering function applied to a copy of ``df``.
    verbose : bool
        When true, print the list of selected feature names.

    Returns
    -------
    tuple
        ``(X, y)``: ``X`` holds the values of the selected feature columns of
        the engineered frame, ``y`` the ``EURO`` values of the input frame.
    """
    engineered = fe(df.copy())
    feats = get_feats(engineered, black_list=['EURO','EUROforKM'])

    if verbose:
        print("feats: ", feats)

    return engineered[feats].values, df["EURO"].values

In [12]:
def train_and_valid(model, X, y, cv=5, scoring="neg_mean_absolute_error"):
    """Cross-validate ``model`` and summarise the per-fold scores.

    Parameters
    ----------
    model : estimator
        Estimator handed to ``cross_val_score``.
    X, y : array-like
        Feature matrix and target.
    cv : int
        Cross-validation setting forwarded to ``cross_val_score``.
    scoring : str
        Scorer name forwarded to ``cross_val_score``.

    Returns
    -------
    tuple
        ``(mean, standard deviation)`` of the fold scores.
    """
    fold_scores = cross_val_score(model, X, y, scoring=scoring, cv=cv)
    score_mean = np.mean(fold_scores)
    score_std = np.std(fold_scores)
    return score_mean, score_std

In [13]:
def e(model, cv=5, verbose=False):
    """Evaluate ``model`` on the notebook-level ``df`` with cross-validation.

    Builds X and y from the global ``df`` via ``get_X_y`` and returns what
    ``train_and_valid`` returns: the mean and standard deviation of the fold
    scores.
    """
    features, target = get_X_y(df, verbose=verbose)
    summary = train_and_valid(model, features, target, cv=cv)
    return summary

In [14]:
# Inspect the frame after cleaning and feature creation (columns, non-null counts, dtypes).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 262 entries, 0 to 261
Data columns (total 77 columns):
 #   Column                            Non-Null Count  Dtype         
---  ------                            --------------  -----         
 0   CLIENT_VAT                        262 non-null    int64         
 1   TIMO_ID                           262 non-null    float64       
 2   AT_KM                             262 non-null    float64       
 3   CZ_KM                             262 non-null    float64       
 4   DE_KM                             262 non-null    float64       
 5   EE_KM                             262 non-null    float64       
 6   HU_KM                             262 non-null    float64       
 7   LT_KM                             262 non-null    float64       
 8   LV_KM                             262 non-null    float64       
 9   PL_KM                             262 non-null    float64       
 10  SE_KM                             262 non-null    

In [15]:
# Cross-validated score (mean, std) of a single depth-limited decision tree.
tree_model = DecisionTreeRegressor(max_depth=10, random_state=0)
e(tree_model, verbose=True)

feats:  ['CLIENT_VAT', 'TIMO_ID', 'AT_KM', 'CZ_KM', 'DE_KM', 'EE_KM', 'HU_KM', 'LT_KM', 'LV_KM', 'PL_KM', 'SE_KM', 'SK_KM', 'VEHICLE_TYPE', 'BODY_TYPE', 'LOAD_UNLOAD_METHOD', 'EPALE', 'GOODS_TYPE', 'TEMP_MIN', 'TEMP_MAX', 'LDM', 'TONS', 'REQUIREMENTS', 'OTHER_COSTS', 'QTY_LOADS', 'QTY_DELIVERIES', 'SOURCE', 'PAYMENT TERM', 'CARGO_VALUE_EURO', 'CUSTOMS', 'START_LOAD_DATA_YEAR', 'START_LOAD_DATA_MONTH', 'START_LOAD_DATA_WEEK_OF_YEAR', 'START_LOAD_DATA_WEEKDAY', 'START_LOAD_DATA_DAY', 'START_LOAD_DATA_DAY_OF_YEAR', 'END_LOAD_DATA_YEAR', 'END_LOAD_DATA_MONTH', 'END_LOAD_DATA_WEEK_OF_YEAR', 'END_LOAD_DATA_WEEKDAY', 'END_LOAD_DATA_DAY', 'END_LOAD_DATA_DAY_OF_YEAR', 'START_DELIVERY_DATA_YEAR', 'START_DELIVERY_DATA_MONTH', 'START_DELIVERY_DATA_WEEK_OF_YEAR', 'START_DELIVERY_DATA_WEEKDAY', 'START_DELIVERY_DATA_DAY', 'START_DELIVERY_DATA_DAY_OF_YEAR', 'END_DELIVERY_DATA_YEAR', 'END_DELIVERY_DATA_MONTH', 'END_DELIVERY_DATA_WEEK_OF_YEAR', 'END_DELIVERY_DATA_WEEKDAY', 'END_DELIVERY_DATA_DAY', 'END_

(-175.20478797440643, 117.62429657513884)

In [16]:
# Cross-validated score (mean, std) of a 100-tree random forest.
forest_model = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=0)
e(forest_model, verbose=True)

feats:  ['CLIENT_VAT', 'TIMO_ID', 'AT_KM', 'CZ_KM', 'DE_KM', 'EE_KM', 'HU_KM', 'LT_KM', 'LV_KM', 'PL_KM', 'SE_KM', 'SK_KM', 'VEHICLE_TYPE', 'BODY_TYPE', 'LOAD_UNLOAD_METHOD', 'EPALE', 'GOODS_TYPE', 'TEMP_MIN', 'TEMP_MAX', 'LDM', 'TONS', 'REQUIREMENTS', 'OTHER_COSTS', 'QTY_LOADS', 'QTY_DELIVERIES', 'SOURCE', 'PAYMENT TERM', 'CARGO_VALUE_EURO', 'CUSTOMS', 'START_LOAD_DATA_YEAR', 'START_LOAD_DATA_MONTH', 'START_LOAD_DATA_WEEK_OF_YEAR', 'START_LOAD_DATA_WEEKDAY', 'START_LOAD_DATA_DAY', 'START_LOAD_DATA_DAY_OF_YEAR', 'END_LOAD_DATA_YEAR', 'END_LOAD_DATA_MONTH', 'END_LOAD_DATA_WEEK_OF_YEAR', 'END_LOAD_DATA_WEEKDAY', 'END_LOAD_DATA_DAY', 'END_LOAD_DATA_DAY_OF_YEAR', 'START_DELIVERY_DATA_YEAR', 'START_DELIVERY_DATA_MONTH', 'START_DELIVERY_DATA_WEEK_OF_YEAR', 'START_DELIVERY_DATA_WEEKDAY', 'START_DELIVERY_DATA_DAY', 'START_DELIVERY_DATA_DAY_OF_YEAR', 'END_DELIVERY_DATA_YEAR', 'END_DELIVERY_DATA_MONTH', 'END_DELIVERY_DATA_WEEK_OF_YEAR', 'END_DELIVERY_DATA_WEEKDAY', 'END_DELIVERY_DATA_DAY', 'END_

(-132.31738361268796, 102.80995416311973)

In [17]:
# Cross-validated score (mean, std) of a 100-tree extra-trees ensemble.
extra_trees_model = ExtraTreesRegressor(n_estimators=100, max_depth=10, random_state=0)
e(extra_trees_model, verbose=True)

feats:  ['CLIENT_VAT', 'TIMO_ID', 'AT_KM', 'CZ_KM', 'DE_KM', 'EE_KM', 'HU_KM', 'LT_KM', 'LV_KM', 'PL_KM', 'SE_KM', 'SK_KM', 'VEHICLE_TYPE', 'BODY_TYPE', 'LOAD_UNLOAD_METHOD', 'EPALE', 'GOODS_TYPE', 'TEMP_MIN', 'TEMP_MAX', 'LDM', 'TONS', 'REQUIREMENTS', 'OTHER_COSTS', 'QTY_LOADS', 'QTY_DELIVERIES', 'SOURCE', 'PAYMENT TERM', 'CARGO_VALUE_EURO', 'CUSTOMS', 'START_LOAD_DATA_YEAR', 'START_LOAD_DATA_MONTH', 'START_LOAD_DATA_WEEK_OF_YEAR', 'START_LOAD_DATA_WEEKDAY', 'START_LOAD_DATA_DAY', 'START_LOAD_DATA_DAY_OF_YEAR', 'END_LOAD_DATA_YEAR', 'END_LOAD_DATA_MONTH', 'END_LOAD_DATA_WEEK_OF_YEAR', 'END_LOAD_DATA_WEEKDAY', 'END_LOAD_DATA_DAY', 'END_LOAD_DATA_DAY_OF_YEAR', 'START_DELIVERY_DATA_YEAR', 'START_DELIVERY_DATA_MONTH', 'START_DELIVERY_DATA_WEEK_OF_YEAR', 'START_DELIVERY_DATA_WEEKDAY', 'START_DELIVERY_DATA_DAY', 'START_DELIVERY_DATA_DAY_OF_YEAR', 'END_DELIVERY_DATA_YEAR', 'END_DELIVERY_DATA_MONTH', 'END_DELIVERY_DATA_WEEK_OF_YEAR', 'END_DELIVERY_DATA_WEEKDAY', 'END_DELIVERY_DATA_DAY', 'END_

(-132.7522581060204, 104.39785075733631)

In [18]:
# Cross-validated score (mean, std) of gradient boosting with 100 stages.
boosting_model = GradientBoostingRegressor(n_estimators=100, max_depth=10, random_state=0)
e(boosting_model, verbose=True)

feats:  ['CLIENT_VAT', 'TIMO_ID', 'AT_KM', 'CZ_KM', 'DE_KM', 'EE_KM', 'HU_KM', 'LT_KM', 'LV_KM', 'PL_KM', 'SE_KM', 'SK_KM', 'VEHICLE_TYPE', 'BODY_TYPE', 'LOAD_UNLOAD_METHOD', 'EPALE', 'GOODS_TYPE', 'TEMP_MIN', 'TEMP_MAX', 'LDM', 'TONS', 'REQUIREMENTS', 'OTHER_COSTS', 'QTY_LOADS', 'QTY_DELIVERIES', 'SOURCE', 'PAYMENT TERM', 'CARGO_VALUE_EURO', 'CUSTOMS', 'START_LOAD_DATA_YEAR', 'START_LOAD_DATA_MONTH', 'START_LOAD_DATA_WEEK_OF_YEAR', 'START_LOAD_DATA_WEEKDAY', 'START_LOAD_DATA_DAY', 'START_LOAD_DATA_DAY_OF_YEAR', 'END_LOAD_DATA_YEAR', 'END_LOAD_DATA_MONTH', 'END_LOAD_DATA_WEEK_OF_YEAR', 'END_LOAD_DATA_WEEKDAY', 'END_LOAD_DATA_DAY', 'END_LOAD_DATA_DAY_OF_YEAR', 'START_DELIVERY_DATA_YEAR', 'START_DELIVERY_DATA_MONTH', 'START_DELIVERY_DATA_WEEK_OF_YEAR', 'START_DELIVERY_DATA_WEEKDAY', 'START_DELIVERY_DATA_DAY', 'START_DELIVERY_DATA_DAY_OF_YEAR', 'END_DELIVERY_DATA_YEAR', 'END_DELIVERY_DATA_MONTH', 'END_DELIVERY_DATA_WEEK_OF_YEAR', 'END_DELIVERY_DATA_WEEKDAY', 'END_DELIVERY_DATA_DAY', 'END_

(-174.11556861123103, 123.93129437736748)

In [19]:
def im(model_cls, model_params, verbose=False):
    """Fit a model on all rows of the global ``df`` and display its feature weights.

    Parameters
    ----------
    model_cls : type
        Estimator class; instantiated as ``model_cls(**model_params)``.
    model_params : dict
        Keyword arguments for the estimator constructor.
    verbose : bool
        When true, print the feature names and the model parameters.

    Returns
    -------
    The object produced by ``eli5.show_weights`` for the fitted model.
    """
    engineered = feature_engineering(df.copy())
    feats = get_feats(engineered)

    if verbose:
        print("feats: ", feats)
        print("model params: ", model_params)

    # Features come from the engineered copy, the target from the original frame.
    train_matrix = engineered[feats].values
    target = df['EURO']

    estimator = model_cls(**model_params)
    estimator.fit(train_matrix, target)

    return eli5.show_weights(estimator, feature_names=feats)

In [20]:
# Feature weights of a decision tree fitted on the full data set.
im(DecisionTreeRegressor, {"random_state": 0, "max_depth": 10}, verbose=False)

Weight,Feature
0.5713,Distance [km]
0.1863,Median Price in Relation
0.0866,Mean Price in Relation
0.0440,Mean Price for start load data day in Relation
0.0374,Median Price for Vehicle Type in Relation
0.0349,Mean Price for Vehicle Type in Relation
0.0097,SK_KM
0.0061,Median Price for start start delivery weekday in Relation
0.0046,Mean Price for start delivery weekday in Relation
0.0044,CZ_KM


In [21]:
# Feature weights of a random forest fitted on the full data set.
im(RandomForestRegressor, {"random_state": 0, "max_depth": 10, "n_estimators": 100}, verbose=False)

Weight,Feature
0.5339  ± 0.2172,Distance [km]
0.0485  ± 0.2194,Median Price to Country
0.0476  ± 0.1278,Mean Price for start load data day in Relation
0.0335  ± 0.1461,Mean Price in Relation
0.0296  ± 0.1269,Mean Price for start delivery weekday in Relation
0.0295  ± 0.1297,Median Price for start start delivery weekday in Relation
0.0294  ± 0.1589,Mean Price for start load day to Country
0.0293  ± 0.1114,Median Price for Vehicle Type in Relation
0.0286  ± 0.1107,Mean Price for Vehicle Type in Relation
0.0277  ± 0.1208,Median Price in Relation


In [22]:
# Feature weights of an extra-trees ensemble fitted on the full data set.
im(ExtraTreesRegressor, {"random_state": 0, "max_depth": 10, "n_estimators": 100}, verbose=False)

Weight,Feature
0.4125  ± 0.2128,Distance [km]
0.0897  ± 0.2006,Median Price to Country
0.0472  ± 0.1640,Mean Price to Country
0.0322  ± 0.1366,Mean Price in Relation
0.0301  ± 0.0898,Mean Price for start load data day in Relation
0.0285  ± 0.1121,Median Price for start start delivery weekday in Relation
0.0273  ± 0.0814,Median Price for start load data day in Relation
0.0245  ± 0.1057,Median Price for Vehicle Type in Relation
0.0240  ± 0.1007,Mean Price for Vehicle Type in Relation
0.0235  ± 0.0885,Mean Price for start delivery weekday in Relation


In [23]:
# Feature weights of a gradient-boosting model fitted on the full data set.
im(GradientBoostingRegressor, {"random_state": 0, "max_depth": 10, "n_estimators": 100}, verbose=False)

Weight,Feature
0.5798  ± 0.0375,Distance [km]
0.1103  ± 0.1993,Median Price in Relation
0.0949  ± 0.1898,Mean Price in Relation
0.0411  ± 0.0523,Mean Price for start load data day in Relation
0.0223  ± 0.0303,Median Price for Vehicle Type in Relation
0.0166  ± 0.0429,Median Price for start start delivery weekday in Relation
0.0165  ± 0.0436,SE_KM
0.0133  ± 0.0335,Median Price for Vehicle Type
0.0126  ± 0.0120,BODY_TYPE
0.0124  ± 0.0401,Mean Price for start delivery weekday in Relation
