# Decision Trees & Random Forests

For imbalanced datasets such as this one, accuracy is not an appropriate metric. Instead, we use Reciever Operating Characteristic Area Under the Curve (ROC AUC).

In [25]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import precision_score, recall_score, roc_auc_score, roc_curve

In [11]:
SEED = 42

In [2]:
df = pd.read_csv("assets/2015.csv")
df.head()

Unnamed: 0,_STATE,FMONTH,IDATE,IMONTH,IDAY,IYEAR,DISPCODE,SEQNO,_PSU,CTELENUM,...,_PAREC1,_PASTAE1,_LMTACT1,_LMTWRK1,_LMTSCL1,_RFSEAT2,_RFSEAT3,_FLSHOT6,_PNEUMO2,_AIDTST3
0,1.0,1.0,b'01292015',b'01',b'29',b'2015',1200.0,2015000000.0,2015000000.0,1.0,...,4.0,2.0,1.0,1.0,1.0,1.0,1.0,,,1.0
1,1.0,1.0,b'01202015',b'01',b'20',b'2015',1100.0,2015000000.0,2015000000.0,1.0,...,2.0,2.0,3.0,3.0,4.0,2.0,2.0,,,2.0
2,1.0,1.0,b'02012015',b'02',b'01',b'2015',1200.0,2015000000.0,2015000000.0,1.0,...,9.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0,
3,1.0,1.0,b'01142015',b'01',b'14',b'2015',1100.0,2015000000.0,2015000000.0,1.0,...,4.0,2.0,1.0,1.0,1.0,1.0,1.0,,,9.0
4,1.0,1.0,b'01142015',b'01',b'14',b'2015',1100.0,2015000000.0,2015000000.0,1.0,...,4.0,2.0,1.0,1.0,1.0,1.0,1.0,,,1.0


In [3]:
df.shape

(441456, 330)

In [4]:
len(df.columns.values)

330

In [5]:
df = df.select_dtypes("number")
len(df.columns.values)

323

#### Re-assign target to 1 or 0, drop other values

In [6]:
target = '_RFHLTH'
df[target].value_counts()

1.0    358072
2.0     82137
9.0      1247
Name: _RFHLTH, dtype: int64

In [7]:
df[target] = df[target].replace({2: 0})
df[target].value_counts()

1.0    358072
0.0     82137
9.0      1247
Name: _RFHLTH, dtype: int64

In [8]:
df = df[df[target] <= 1]
df[target].value_counts()

1.0    358072
0.0     82137
Name: _RFHLTH, dtype: int64

#### Drop columns that are different versions of our target variable

In [9]:
df = df.drop(columns = ['POORHLTH', 'PHYSHLTH', 'GENHLTH', 'PAINACT2', 
                        'QLMENTL2', 'QLSTRES2', 'QLHLTH2', 'HLTHPLN1', 'MENTHLTH'])

#### Deal with missing values

In [10]:
df.isna().sum()

_STATE           0
FMONTH           0
DISPCODE         0
SEQNO            0
_PSU             0
CTELENUM    186394
PVTRESD1    186394
COLGHOUS    440165
STATERES    186396
CELLFON3    186393
LADULT      440165
NUMADULT    186419
NUMMEN      186536
NUMWOMEN    186537
CTELNUM1    253815
CELLFON2    253815
CADULT      253816
PVTRESD2    253815
CCLGHOUS    439142
CSTATE      253814
LANDLINE    254558
HHADULT     258989
PERSDOC2         0
MEDCOST          1
CHECKUP1         1
BPHIGH4          1
BPMEDS      262632
BLOODCHO         0
CHOLCHK      58915
TOLDHI2      58915
             ...  
PADUR1_     153041
PADUR2_     248194
PAFREQ1_    149625
PAFREQ2_    245837
_MINAC11    154814
_MINAC21    157112
STRFREQ_     43780
PAMISS1_         0
PAMIN11_    157736
PAMIN21_    160004
PA1MIN_     151753
PAVIG11_    152990
PAVIG21_    157349
PA1VIGM_    150319
_PACAT1          0
_PAINDX1         0
_PA150R2         0
_PA300R2         0
_PA30021         0
_PASTRNG         0
_PAREC1          0
_PASTAE1    

In [15]:
# Extract the labels
labels = np.array(df.pop(target))

# 30% examples in test data
X_train, X_test, y_train, y_test = train_test_split(df, labels, 
                                                          stratify = labels,
                                                          test_size = 0.3, 
                                                          random_state = SEED)

In [16]:
X_train = X_train.fillna(X_train.mean())
X_test = X_test.fillna(X_test.mean())

# Features for feature importances
features = list(X_train.columns)

In [17]:
X_train.shape, X_test.shape

((308146, 313), (132063, 313))

#### Fit a decision tree

In [20]:
tree = DecisionTreeClassifier(random_state=SEED)
tree.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=42,
            splitter='best')

In [21]:
print("Decision Tree has {} nodes with max depth {}".format(tree.tree_.node_count, tree.tree_.max_depth))

Decision Tree has 54429 nodes with max depth 77


#### Assess performance for overfitting

In [22]:
train_probs = tree.predict_proba(X_train)[:, 1]
test_probs = tree.predict_proba(X_test)[:, 1]

train_predictions = tree.predict(X_train)
test_predictions = tree.predict(X_test)

In [26]:
print("Train ROC AUC Score: {}".format(roc_auc_score(y_train, train_predictions)))
print("Test ROC AUC Score: {}".format(roc_auc_score(y_test, test_predictions)))

Train ROC AUC Score: 1.0
Test ROC AUC Score: 0.6802190222196389


In [29]:
guess = [1 for p in range(len(y_test))]
print("Baseline ROC AUC Score: {}".format(roc_auc_score(y_test, guess)))

Baseline ROC AUC Score: 0.5
