# Check the output of weighted random forest

In [19]:
%matplotlib inline
import matplotlib.pyplot as plt

from sklearn.datasets import load_breast_cancer
import numpy as np
from functools import reduce

# Import our custom utilities.
# NOTE: the `imp` module is deprecated and was removed in Python 3.12;
# `importlib.reload` is the supported replacement with the same call signature.
from importlib import reload
from utils import irf_jupyter_utils
from utils import irf_utils

# Reload both helper modules so the kernel uses their current on-disk source
# rather than a previously imported copy.
reload(irf_jupyter_utils)
reload(irf_utils)

<module 'utils.irf_utils' from '/home/yu/github/scikit-learn-sandbox/jupyter/utils/irf_utils.py'>

## When feature_weight = None, the output should match Random Forest.

The original RF result is stored in `correct_feature_importance` below.

In [41]:
# No feature weights for this run; per the section heading, the result is
# expected to match a plain random forest.
feature_weight0 = None

In [42]:
# Build the example forest (1000 trees) with feature_weight0 and unpack the
# five returned values into the train/test variables and the forest `rf`.
X_train, X_test, y_train, y_test, rf = irf_jupyter_utils.generate_rf_example(
    n_estimators=1000,
    feature_weight=feature_weight0,
)

In [43]:
# Gather the forest-level data for `rf`; the result is indexed by string keys
# (e.g. 'feature_importances') in the next cell.
all_rf_tree_data = irf_utils.get_rf_tree_data(
    rf=rf,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)
# all_rf_tree_data  # uncomment to inspect the full structure

In [44]:
# Keep the importance values and their ranking for the feature_weight0 run,
# then print the importance values.
feature_importances0 = all_rf_tree_data['feature_importances']
feature_importances_rank_idx0 = all_rf_tree_data['feature_importances_rank_idx']

print(feature_importances0)

[ 0.04633261  0.01279795  0.04649524  0.05372707  0.00613899  0.01410021
  0.04617194  0.08346155  0.00365378  0.00379191  0.01370514  0.00441408
  0.01102853  0.03730455  0.00456819  0.00397247  0.00564707  0.00500305
  0.00441179  0.00429459  0.11649582  0.01827778  0.14157085  0.11806595
  0.01195991  0.01511598  0.03314478  0.11802327  0.00953361  0.00679134]


In [35]:
# Reference importance vector (30 values, one per feature); it is reused
# further down as the non-uniform feature weights.
# NOTE(review): presumably copied from an earlier plain random forest run --
# confirm provenance.
correct_feature_importance = [
    0.04153319, 0.0136872, 0.05287382, 0.05537257, 0.00571718,   # features 0-4
    0.01101297, 0.04525511, 0.08925701, 0.00407582, 0.00337926,  # features 5-9
    0.01301454, 0.00396505, 0.01022279, 0.03255195, 0.00498767,  # features 10-14
    0.00438016, 0.00771317, 0.00459407, 0.0037973, 0.00448982,   # features 15-19
    0.10938616, 0.01690837, 0.14415417, 0.1204331, 0.01276175,   # features 20-24
    0.01472586, 0.03019196, 0.12449026, 0.00858072, 0.00648698,  # features 25-29
]

## When feature_weight is uniform, it should give the same feature importance.

In [45]:
# Uniform weights: the same weight (1) for each of the 30 features.
feature_weight1 = [1] * 30

In [46]:
# Build the example forest (1000 trees) with the uniform feature_weight1 and
# unpack the five returned values, overwriting the earlier run's variables.
X_train, X_test, y_train, y_test, rf = irf_jupyter_utils.generate_rf_example(
    n_estimators=1000,
    feature_weight=feature_weight1,
)

In [47]:
# Gather the forest-level data for the uniform-weight `rf`; this rebinds
# all_rf_tree_data, replacing the earlier run's result.
all_rf_tree_data = irf_utils.get_rf_tree_data(
    rf=rf,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)
# all_rf_tree_data  # uncomment to inspect the full structure

In [48]:
# Keep the importance values and their ranking for the uniform-weight run,
# then print the importance values.
feature_importances1 = all_rf_tree_data['feature_importances']
feature_importances_rank_idx1 = all_rf_tree_data['feature_importances_rank_idx']

print(feature_importances1)

[ 0.0415338   0.01366645  0.05287091  0.05537852  0.00572123  0.01101832
  0.04525511  0.08928869  0.00409541  0.00339203  0.01302611  0.0039895
  0.01021391  0.03256535  0.00498974  0.00435811  0.00778621  0.00450661
  0.00379767  0.00445639  0.10939255  0.01688089  0.14412127  0.12044069
  0.01279928  0.01472586  0.03016603  0.12449986  0.00858419  0.00647933]


## When feature_weight is weighted, it should give roughly the same feature ranking.

In [56]:
# Use the stored reference importances as non-uniform feature weights.
# Note this binds the same list object as correct_feature_importance (no copy).
feature_weight2 = correct_feature_importance

In [57]:
# Build the example forest (1000 trees) with the non-uniform feature_weight2
# and unpack the five returned values, overwriting the earlier run's variables.
X_train, X_test, y_train, y_test, rf = irf_jupyter_utils.generate_rf_example(
    n_estimators=1000,
    feature_weight=feature_weight2,
)

In [58]:
# Gather the forest-level data for the weighted `rf`; this rebinds
# all_rf_tree_data, replacing the earlier run's result.
all_rf_tree_data = irf_utils.get_rf_tree_data(
    rf=rf,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)
# all_rf_tree_data  # uncomment to inspect the full structure

In [59]:
# Side-by-side ranking: uniform-weight run (left) vs. weighted run (right).
feature_importances2 = all_rf_tree_data['feature_importances']
feature_importances_rank_idx2 = all_rf_tree_data['feature_importances_rank_idx']

row_template = "%2d. feature %2d (%10.9f) and feature %2d (%10.9f)"
for f in range(X_train.shape[1]):
    # Feature index found at position f in each run's ranking.
    idx_uniform = feature_importances_rank_idx1[f]
    idx_weighted = feature_importances_rank_idx2[f]
    print(row_template % (f + 1,
                          idx_uniform, feature_importances1[idx_uniform],
                          idx_weighted, feature_importances2[idx_weighted]))

 1. feature 22 (0.144121272) and feature 22 (0.362967269)
 2. feature 27 (0.124499864) and feature 27 (0.188293202)
 3. feature 23 (0.120440687) and feature 23 (0.150729617)
 4. feature 20 (0.109392549) and feature  7 (0.099945902)
 5. feature  7 (0.089288686) and feature 20 (0.093353878)
 6. feature  3 (0.055378517) and feature 26 (0.012864982)
 7. feature  2 (0.052870910) and feature  6 (0.012603343)
 8. feature  6 (0.045255112) and feature  3 (0.012566934)
 9. feature  0 (0.041533803) and feature  2 (0.010233424)
10. feature 13 (0.032565353) and feature 21 (0.009941866)
11. feature 26 (0.030166028) and feature 13 (0.009454457)
12. feature 21 (0.016880889) and feature  1 (0.006836309)
13. feature 25 (0.014725862) and feature  0 (0.006545199)
14. feature  1 (0.013666447) and feature 24 (0.003600213)
15. feature 10 (0.013026106) and feature 25 (0.003404557)
16. feature 24 (0.012799277) and feature 10 (0.002887115)
17. feature  5 (0.011018324) and feature 28 (0.001976389)
18. feature 12