# Predict calculated fields

Identify the input features that matter most for predicting each calculated field, using random forest feature importances.

## Setup

### Imports

In [1]:
import pandas as pd
import numpy as np
from sklearn import ensemble

### Columns

In [5]:
# Predictor columns: the feature set every random forest in this notebook is
# fit on. Order matters, because feature importances are reported in this order.
# NOTE(review): these look like IRS PUF variable codes — confirm against the
# data dictionary.
COLS = [
    'DSI',
    'E00200', 'E00300', 'E00400', 'E00600', 'E00650', 'E00700', 'E00800',
    'E00900', 'E01100', 'E01200', 'E01400', 'E01500', 'E01700', 'E02000',
    'E02100', 'E02300', 'E02400', 'E03150', 'E03210', 'E03220', 'E03230',
    'E03240', 'E03270', 'E03290', 'E03300', 'E03400', 'E03500', 'E07240',
    'E07260', 'E07300', 'E07400', 'E07600', 'E09700', 'E09800', 'E09900',
    'E11200', 'E17500', 'E18400', 'E18500', 'E19200', 'E19800', 'E20100',
    'E20400', 'E24515', 'E24518', 'E26270', 'E27200', 'E32800', 'E58990',
    'E62900', 'E87521', 'E87530',
    'EIC', 'F2441', 'F6251', 'FDED', 'MARS', 'MIDR', 'N24',
    'P08000', 'P22250', 'P23250',
    'S006', 'XTOT',
]

In [6]:
# Target columns: one random forest is fit per entry, using COLS as predictors.
CALCULATED_COLS = [
    'E00100', 'E04600', 'P04470',
    'E04800', 'E62100', 'E05800',
    'E08800', 'E59560', 'E26190',
]

In [18]:
# RECID values that are filtered out of the data right after it is loaded.
# NOTE(review): the name suggests these rows are aggregate records — confirm.
AGG_RECIDS = list(range(999996, 1000000))

### Load

In [19]:
# Read only the predictor, target and record-id columns, drop the rows whose
# RECID is listed in AGG_RECIDS, then discard RECID itself so that it can never
# be used as a feature.
puf = (
    pd.read_csv('~/puf2011.csv', usecols=COLS + CALCULATED_COLS + ['RECID'])
    .loc[lambda frame: ~frame['RECID'].isin(AGG_RECIDS)]
    .drop(columns='RECID')
)

## Predict

In [30]:
# Shared regressor; it is refit once per calculated column further down.
# random_state is pinned so that the bootstrap samples and split choices, and
# therefore the reported feature importances, are reproducible between runs.
rf = ensemble.RandomForestRegressor(n_estimators=20, random_state=0)

In [32]:
# Accumulator for the per-target importance tables: one DataFrame is appended
# for each calculated column.
importances_list: list = []

In [37]:
# Start from an empty list every time this cell runs. Appending to a list that
# was created in an earlier cell would duplicate every row whenever this cell
# is re-executed.
importances_list = []
for col in CALCULATED_COLS:
    print('Analyzing ' + col + '...')
    # Reusing one estimator is safe: with the default warm_start=False, fit()
    # rebuilds the forest from scratch for each target.
    rf.fit(puf[COLS], puf[col])
    importance = pd.DataFrame({
        'x': COLS,  # predictor name
        'y': col,  # target name, broadcast to every row
        'importance': rf.feature_importances_,  # same order as COLS
    })
    importances_list.append(importance)

In [38]:
# Stack the per-target tables row-wise into one long (x, y, importance) table.
# Row labels are kept as-is, so each label 0..len(COLS)-1 repeats once per target.
importances = pd.concat(importances_list, axis=0)

In [40]:
# Display the strongest predictor/target pairs first. This returns a sorted
# copy; `importances` itself is left in its original order.
importances.sort_values(by='importance', ascending=False)

Unnamed: 0,x,y,importance
64,XTOT,E04600,9.491970e-01
53,EIC,E59560,7.295009e-01
46,E26270,E26190,5.625608e-01
38,E18400,P04470,4.581230e-01
62,P23250,E04800,4.232409e-01
62,P23250,E62100,3.961944e-01
62,P23250,E00100,3.252834e-01
38,E18400,E08800,3.159273e-01
38,E18400,E05800,3.034495e-01
38,E18400,E00100,2.543323e-01
