In [2]:
import eli5
import pandas as pd
from eli5.sklearn import PermutationImportance
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [3]:
# Read only the first 50,000 rows of the training CSV.
data = pd.read_csv(
    'data/new-york-taxi-fare-prediction/train.csv',
    nrows=50000,
)

In [4]:
# Keep only trips whose pickup and dropoff coordinates lie strictly inside the
# latitude (40.7, 40.8) / longitude (-74, -73.9) box and whose fare is positive.
# Adjacent string literals are concatenated into a single query expression.
data = data.query(
    'pickup_latitude > 40.7 and pickup_latitude < 40.8 and '
    'dropoff_latitude > 40.7 and dropoff_latitude < 40.8 and '
    'pickup_longitude > -74 and pickup_longitude < -73.9 and '
    'dropoff_longitude > -74 and dropoff_longitude < -73.9 and '
    'fare_amount > 0'
)

In [5]:
# Display the first two rows of the filtered frame as a quick sanity check.
data.head(2)

Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
2,2011-08-18 00:35:00.00000049,5.7,2011-08-18 00:35:00 UTC,-73.982738,40.76127,-73.991242,40.750562,2
3,2012-04-21 04:30:42.0000001,7.7,2012-04-21 04:30:42 UTC,-73.98713,40.733143,-73.991567,40.758092,1


In [6]:
# Target: the fare charged for each trip.
y = data.fare_amount

# Predictors for the baseline model: the four raw trip coordinates plus the
# passenger count.
base_features = [
    'pickup_longitude', 'pickup_latitude',
    'dropoff_longitude', 'dropoff_latitude',
    'passenger_count',
]
X = data[base_features]

# A fixed random_state keeps the train/validation partition reproducible.
train_X, val_X, train_y, val_y = train_test_split(
    X, y, random_state=1
)

In [7]:
# Baseline model: a 30-tree random forest with a fixed seed, fitted on the
# training split.
first_model = (
    RandomForestRegressor(n_estimators=30, random_state=1)
    .fit(train_X, train_y)
)

In [8]:
# Summary statistics for every feature column of the training split.
train_X.describe()

Unnamed: 0,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
count,23466.0,23466.0,23466.0,23466.0,23466.0
mean,-73.976827,40.756931,-73.975359,40.757434,1.66232
std,0.014625,0.018206,0.01593,0.018659,1.290729
min,-73.999999,40.700013,-73.999999,40.70002,0.0
25%,-73.987964,40.744901,-73.987143,40.745756,1.0
50%,-73.979629,40.758076,-73.978588,40.758542,1.0
75%,-73.967797,40.769602,-73.966459,40.770406,2.0
max,-73.900062,40.799952,-73.900062,40.799999,6.0


In [9]:
# Summary statistics for the training-split target (fare_amount).
train_y.describe()

count    23466.000000
mean         8.472539
std          4.609747
min          0.010000
25%          5.500000
50%          7.500000
75%         10.100000
max        165.000000
Name: fare_amount, dtype: float64

In [10]:
# Permutation importance of each baseline feature, computed on the held-out
# validation split. `eli5` and `PermutationImportance` are imported in the
# notebook's top import cell so that all dependencies are declared in one place.
perm = PermutationImportance(first_model, random_state=1).fit(val_X, val_y)
eli5.show_weights(perm, feature_names=val_X.columns.tolist())

Weight,Feature
0.8474  ± 0.0185,dropoff_latitude
0.8267  ± 0.0263,pickup_latitude
0.6229  ± 0.0559,pickup_longitude
0.5384  ± 0.0305,dropoff_longitude
-0.0029  ± 0.0020,passenger_count


In [11]:
# Engineered features: absolute pickup-to-dropoff change in each coordinate.
data['abs_lon_change'] = (data['dropoff_longitude'] - data['pickup_longitude']).abs()
data['abs_lat_change'] = (data['dropoff_latitude'] - data['pickup_latitude']).abs()

# Raw coordinates plus the two engineered deltas (passenger_count is dropped).
features_2 = [
    'pickup_longitude', 'pickup_latitude',
    'dropoff_longitude', 'dropoff_latitude',
    'abs_lat_change', 'abs_lon_change',
]

X = data[features_2]

# `y` is the fare_amount target defined in the baseline cell; the seed matches
# the one used for the baseline split.
new_train_X, new_val_X, new_train_y, new_val_y = train_test_split(
    X, y, random_state=1
)
second_model = (
    RandomForestRegressor(n_estimators=30, random_state=1)
    .fit(new_train_X, new_train_y)
)

In [12]:
# Permutation importance for the model that includes the engineered deltas,
# computed on its validation split.
perm2 = PermutationImportance(second_model, random_state=1).fit(
    new_val_X, new_val_y
)
eli5.show_weights(
    perm2,
    feature_names=new_val_X.columns.tolist(),
)

Weight,Feature
0.5783  ± 0.0295,abs_lat_change
0.4467  ± 0.0509,abs_lon_change
0.0858  ± 0.0333,pickup_latitude
0.0735  ± 0.0101,dropoff_longitude
0.0733  ± 0.0113,dropoff_latitude
0.0613  ± 0.0063,pickup_longitude


### Question
A colleague observes that the values for abs_lon_change and abs_lat_change are pretty small (all values are between -0.1 and 0.1), whereas other variables have larger values. Do you think this could explain why those coordinates had larger permutation importance values in this case?

Consider an alternative where you created a version of these features that was 100X as large, and used that larger feature for training and importance calculations. Would this change the resulting permutation importance values?

Why or why not?

In [13]:
# Rescaled copy of the latitude delta: identical information, 100x the magnitude.
data['large_lat_change'] = data['abs_lat_change'] * 100

# NOTE(review): both abs_lat_change and its rescaled copy are passed to the
# model, so the forest is free to split on either one -- confirm this is the
# intended comparison rather than replacing the original column.
features_3 = [
    'pickup_longitude', 'pickup_latitude',
    'dropoff_longitude', 'dropoff_latitude',
    'abs_lat_change', 'abs_lon_change',
    'large_lat_change',
]

X = data[features_3]

# Rebinds the new_* split variables to a split that includes the extra column.
new_train_X, new_val_X, new_train_y, new_val_y = train_test_split(
    X, y, random_state=1
)
third_model = (
    RandomForestRegressor(n_estimators=30, random_state=1)
    .fit(new_train_X, new_train_y)
)

In [14]:
# Permutation importance for the model that also sees the rescaled latitude
# delta, computed on its validation split.
perm3 = PermutationImportance(third_model, random_state=1).fit(
    new_val_X, new_val_y
)
eli5.show_weights(
    perm3,
    feature_names=new_val_X.columns.tolist(),
)

Weight,Feature
0.4291  ± 0.0849,abs_lon_change
0.3122  ± 0.0060,large_lat_change
0.2519  ± 0.0283,abs_lat_change
0.0722  ± 0.0259,pickup_latitude
0.0643  ± 0.0211,dropoff_longitude
0.0577  ± 0.0073,dropoff_latitude
0.0458  ± 0.0160,pickup_longitude


### Solution
The scale of features does not affect permutation importance per se. The only reason that rescaling a feature would affect PI is indirectly, if rescaling helped or hurt the ability of the particular learning method we're using to make use of that feature. That won't happen with tree-based models, like the Random Forest used here. If you are familiar with Ridge Regression, you might be able to think of how that would be affected. (In the experiment above, `abs_lat_change` and `large_lat_change` carry identical information, so the forest can split on either one and the importance is shared between them: 0.25 + 0.31 is roughly the 0.58 that `abs_lat_change` had on its own.) That said, the absolute change features have high importance because they capture total distance traveled, which is the primary determinant of taxi fares. It is not an artifact of the feature magnitude.