## Random Forest Feature Importance

In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# re-imported before each cell executes and edits to library code are picked
# up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import sys
sys.path.append('../../')
%matplotlib inline

from fastai.imports import *
from fastai.structured import *
from pandas_summary import DataFrameSummary
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from IPython.display import display
from sklearn import metrics

### Load the data

In [3]:
# Directory holding the bulldozers dataset, relative to this notebook.
PATH = "../../data/bulldozers/"

# Read the training CSV (timed). low_memory=False makes pandas parse the file
# in one pass instead of chunks, avoiding mixed-dtype inference per column.
%time df_raw = pd.read_csv(f'{PATH}Train.csv', low_memory=False)
# Replace SalePrice with its natural log; every error computed later in the
# notebook is therefore measured on the log scale.
df_raw.SalePrice = np.log(df_raw.SalePrice)
# fastai helper, called for its side effect on df_raw (return value unused).
# NOTE(review): presumably converts string columns to pandas categoricals in place - confirm.
train_cats(df_raw)
# fastai helper: splits off the 'SalePrice' target and returns the feature
# frame, the target and a third value kept as `nas`.
# NOTE(review): presumably `nas` records the NA-fill values used - confirm against fastai.structured.
df_trn, y_trn, nas = proc_df(df_raw, 'SalePrice')

CPU times: user 3.81 s, sys: 547 ms, total: 4.36 s
Wall time: 3.79 s


In [4]:
def split_vals(a, n):
    """Split the sliceable `a` at position `n`.

    Returns a 2-tuple: the first `n` items of `a`, then everything after them.
    """
    head = a[:n]
    tail = a[n:]
    return head, tail


# Hold out the last `n_valid` rows for validation; all earlier rows are training data.
n_valid = 12000
n_trn = len(df_trn) - n_valid

# Apply the same cut point to the processed features, the target and the raw frame
# so the three train/valid pairs stay row-aligned.
(X_train, X_valid), (y_train, y_valid), (raw_train, raw_valid) = (
    split_vals(data, n_trn) for data in (df_trn, y_trn, df_raw)
)

In [5]:
def rmse(x, y):
    """Root-mean-squared error between `x` and `y`.

    Both arguments must support element-wise subtraction and the result must
    expose `.mean()` (e.g. numpy arrays or pandas Series). Returns a float.
    """
    squared_error = (x - y) ** 2
    return math.sqrt(squared_error.mean())

def print_score(m):
    """Print a list of scores for the fitted model `m`.

    The list holds, in order: training RMSE, validation RMSE, `m.score` on the
    training set, `m.score` on the validation set and, when the model exposes
    an `oob_score_` attribute, that value as a fifth entry. Reads the notebook
    globals X_train, y_train, X_valid and y_valid.
    """
    train_rmse = rmse(m.predict(X_train), y_train)
    valid_rmse = rmse(m.predict(X_valid), y_valid)
    train_score = m.score(X_train, y_train)
    valid_score = m.score(X_valid, y_valid)

    scores = [train_rmse, valid_rmse, train_score, valid_score]
    if hasattr(m, 'oob_score_'):
        scores.append(m.oob_score_)
    print(scores)

### Fit the model 

In [6]:
set_rf_samples(50000)
m = RandomForestRegressor(n_estimators=40, min_samples_leaf=3, max_features=0.5, n_jobs=-1, oob_score=True)
%time m.fit(X_train, y_train)
print_score(m)

CPU times: user 26.6 s, sys: 998 ms, total: 27.6 s
Wall time: 9.32 s
[0.25577257039647905, 0.2539062738843095, 0.8632766276265883, 0.884868320947082, 0.8390609884931507]


### Check Feature Importance

In [7]:
%time fi = rf_feat_importance(m, df_trn)
fi[:10]

CPU times: user 24.3 ms, sys: 3.52 ms, total: 27.8 ms
Wall time: 105 ms


Unnamed: 0,cols,imp
5,YearMade,0.1639
38,Coupler_System,0.100386
14,ProductSize,0.09322
15,fiProductClassDesc,0.08849
40,Hydraulics_Flow,0.061825
2,ModelID,0.060568
0,SalesID,0.048067
11,fiSecondaryDesc,0.045267
20,Enclosure,0.044668
39,Grouser_Tracks,0.042797


## Implementing Feature Importance from Scratch

In [8]:
# Baseline: validation RMSE of the fitted model with every column left intact.
# The per-column scores computed next are reported as increases over this value.
base = rmse(m.predict(X_valid), y_valid)

In [9]:
%%time
scores={}
for col in X_valid.columns:
    X = X_valid.copy()
    X[col] = np.random.choice(X[col], len(X))
    scores[col]=rmse(m.predict(X), y_valid)-base

CPU times: user 6.7 s, sys: 274 ms, total: 6.97 s
Wall time: 7.9 s


In [10]:
# Rank (column, RMSE increase) pairs from largest to smallest increase and
# display the ten highest-ranked features as a table.
fi = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)
pd.DataFrame(fi, columns=['feature', 'importance']).head(10)

Unnamed: 0,feature,importance
0,YearMade,0.227399
1,ProductSize,0.219038
2,Coupler_System,0.171358
3,fiProductClassDesc,0.124646
4,Hydraulics_Flow,0.062298
5,fiSecondaryDesc,0.048131
6,Enclosure,0.038132
7,Grouser_Tracks,0.036796
8,ModelID,0.03078
9,fiModelDesc,0.019429
