In [1]:
import numpy as np
import pandas as pd
import sys, pickle
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.model_selection import train_test_split, GridSearchCV

Remove features that are irrelevant or that repeat information available elsewhere (for example, age groups when age is also available as a continuous variable). Recode missing values and "don't know"/"refused" answers to `123456789`, and recode "none" answers (for example, to "how many drinks did you have in the last 30 days?") to 0.

In [2]:
# Load the BRFSS 2014 extract; the first CSV column becomes the row index.
brfss = pd.read_csv("brfss2014.csv", index_col = 0, encoding='cp1252')

# Drop columns that are not wanted as features, so that only the recalculated
# or imputed variables are kept.
brfss = brfss.drop(['fmonth','imonth','iday','idate','seqno','x.psu','colghous','ladult','flshtmy2','dlyother',
                    'alcday5','rcsbirth','x.ststr','x.strwt','x.rawrake','x.wt2rake','x.dualcor','x.llcpwt2',
                    'x.llcpwt','x.ageg5yr','x.age65yr','x.age.g','htm4','x.bmi5','x.rfbmi5','x.drnkdy4','x.smoker3',
                   'x.rfdrhv4','x.rfdrmn4','x.rfdrwm4','x.rfseat2','x.mam502y','x.mam5021','x.rfblds3','x.col10yr',
                   'x.cllcpwt','x.casthm1','x.ltasth1','x.mrace1'],axis=1)

# Recode missing values to the sentinel 123456789.
# replace('NaN', ...) only matches cells holding the literal string 'NaN';
# real NaN cells (what read_csv produces for empty/NA fields) are not matched
# by it, so fillna is needed to recode those as well.
brfss = brfss.replace('NaN',123456789).fillna(123456789)

# Recode the 77/777/7777 and 99/999/9999 answer codes to the same sentinel,
# and the 88/888/8888 codes to 0.
# NOTE(review): these replacements are applied to every column, so a genuine
# measurement that happens to equal one of these codes (e.g. a value of 77 in
# a continuous column such as x.age80) is rewritten too -- confirm against the
# codebook and restrict the recode to the columns that actually use the codes.
brfss = brfss.replace([77,777,7777],123456789)
brfss =  brfss.replace([99,999,9999], 123456789)
brfss = brfss.replace([88,888,8888], 0)


  interactivity=interactivity, compiler=compiler, result=result)


Choose AIDS testing status (`x.aidtst3`) as the label. Its values are: 1 = tested, 2 = not tested; 7, 9 and 123456789 stand for refused, don't know and missing, respectively. Rows with any of the last three values are removed.

In [7]:
label_var = 'x.aidtst3'

# Label codes without a usable tested / not-tested answer; rows carrying any
# of them are discarded in a single pass.
unusable_label_codes = [9, 7, 123456789]
has_usable_label = ~brfss[label_var].isin(unusable_label_codes)
brfss = brfss.loc[has_usable_label]

label = brfss[label_var]
print(brfss.shape)
# Size of the filtered frame in MiB (getsizeof reports bytes).
sys.getsizeof(brfss) / 1024 ** 2


(418626, 240)


769.7209854125977

In [8]:
# Class balance of the label after the unusable answers were filtered out.
label.value_counts()

2.0    301619
1.0    117007
Name: x.aidtst3, dtype: int64

Although the `categorical_variables` data frame does not look very large, one-hot encoding all of the categorical variables was too much for my 32 GB machine: the result did not fit in memory. (It probably would if the variables were encoded one at a time in a loop and the one-hot-encoded data were saved as sparse matrices.) Instead, we choose just a few of the variables for further analysis.

In [9]:
# Features treated as interval/ratio-scale: used as-is, without encoding.
ratio_scale_variable_labels = [
    'sleptim1', 'x.age80', 'x.impnph', 'htin4', 'wtkg3', 'drocdy3.', 'x.drnkmo4',
]
ratio_scale_variables = brfss.loc[:, ratio_scale_variable_labels]

# Features treated as categorical: cast to int64 so they can be one-hot encoded.
categorical_variable_labels = [
    'x.imprace', 'x.impeduc', 'x.impmrtl', 'x.impcsex', 'x.asthms1', 'x.incomg',
    'x.rfseat3', 'x.flshot6', 'x.pneumo2', 'x.bmi5cat', 'x.rfmam2y', 'x.denvst2',
    'x.rfsmok3',
]
categorical_variables = brfss.loc[:, categorical_variable_labels].astype('int64')
print(categorical_variables.shape)

# Column order matters downstream: categorical columns first, ratio-scale last.
variables = pd.concat([categorical_variables, ratio_scale_variables], axis=1)
print(variables.shape)

(418626, 13)
(418626, 20)


The selected subset should now be small enough for further processing.

In [10]:
# Size of the categorical subset in MiB (getsizeof reports bytes).
sys.getsizeof(categorical_variables)/1024**2

44.71410369873047

The first 13 features are categorical, the remaining 7 are on interval/ratio scale.

In [11]:
# One-hot encode the leading categorical columns of `variables`; the remaining
# ratio-scale columns are passed through unchanged.
# The number of categorical columns is derived from the label list instead of
# being hard-coded, so the encoder stays correct if that list is edited.
n_categorical = len(categorical_variable_labels)
# Guard the layout assumption: the categorical columns must come first.
assert list(variables.columns[:n_categorical]) == list(categorical_variable_labels)
ohe = OneHotEncoder(categorical_features = list(range(n_categorical)))
ohe_variables = ohe.fit_transform(variables)

Get back the feature names for plotting the tree

In [12]:
# Count every observed value per categorical column, then flatten the table to
# a Series indexed by (column, value); pairs that never occur come out as NaN
# and are discarded.
value_counts_per_column = categorical_variables.apply(lambda column: column.value_counts())
uniq_vals = value_counts_per_column.unstack().dropna()

# One "<column>_<value>" label per remaining (column, value) pair.
categorical_ohe_feature_labels = [
    '{}_{}'.format(column_name, value) for column_name, value in uniq_vals.index
]

# Encoded categorical labels first, followed by the ratio-scale labels.
feature_names = np.concatenate([categorical_ohe_feature_labels, ratio_scale_variable_labels])

Split the data into a training set (0.67 of the rows) and a test set (0.33 of the rows).

In [13]:
# Hold out 33% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    ohe_variables,
    label,
    test_size=0.33,
    random_state=1234
)

## Run: Fit a 5-level-deep decision tree

In [14]:
# Shallow (depth-5) tree that is plotted below.
# Only the non-default settings are passed. The original call was a pasted
# estimator repr that also spelled out library defaults, including
# `min_impurity_split` and `presort`, which later scikit-learn releases no
# longer accept.
# NOTE(review): max_features=61 is tied to the number of columns produced by
# the one-hot encoding -- confirm it still fits if the feature set changes.
clf = DecisionTreeClassifier(max_depth=5,
                             max_features=61,
                             min_samples_leaf=25,
                             min_samples_split=100,
                             random_state=123)

In [15]:
clf.fit(X_train, y_train)
export_graphviz(clf, out_file = 'tree.dot', feature_names = feature_names, class_names = ['tested','not-tested'],
               rounded =  True, filled = True)
!dot -Tpng tree.dot -o tree.png  

In [17]:
# Persist the fitted tree. The context manager guarantees the file is flushed
# and closed even if pickling fails; the original left the handle open.
with open('decision_tree_AIDS_5lvl.pickle','wb') as model_file:
    pickle.dump(clf, model_file)

## Not run: find optimal tree parameters by cross-validation

In [None]:
# Base estimator for the search; note that this rebinds `clf`.
clf = DecisionTreeClassifier(random_state=123)

# Search grid: 6 * 5 * 4 * 5 * 2 = 1200 parameter combinations.
params = {
    'max_depth': [3,4,5,10,12,15],
    'min_samples_split': [2,5,10,50,100],
    'min_samples_leaf': [1,5,10,25],
    'max_features': [5,10,20,30,61],
    'class_weight': [None,'balanced'],
}

# 10-fold cross-validation over the grid, run on 4 worker processes.
gd = GridSearchCV(clf, params, cv=10, verbose=1, n_jobs=4)
gd.fit(X_train, y_train)

Fitting 10 folds for each of 1200 candidates, totalling 12000 fits


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    8.1s
Process ForkPoolWorker-4:
Traceback (most recent call last):
  File "/home/adam/anaconda3/lib/python3.5/multiprocessing/process.py", line 252, in _bootstrap
    self.run()
  File "/home/adam/anaconda3/lib/python3.5/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/home/adam/anaconda3/lib/python3.5/multiprocessing/pool.py", line 108, in worker
    task = get()


In [10]:
# Estimator with the best cross-validated parameter combination.
gd.best_estimator_

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=12,
            max_features=61, max_leaf_nodes=None, min_impurity_split=1e-07,
            min_samples_leaf=25, min_samples_split=100,
            min_weight_fraction_leaf=0.0, presort=False, random_state=123,
            splitter='best')

In [11]:
# Mean cross-validated score of the best parameter combination.
gd.best_score_

0.85416975163222719

In [12]:
# GridSearchCV was created with its default refit behaviour, so
# best_estimator_ has already been refitted on the full (X_train, y_train)
# passed to gd.fit. Fitting it again on the same data with the same
# random_state would only repeat that work.
clf_best = gd.best_estimator_
clf_best

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=12,
            max_features=61, max_leaf_nodes=None, min_impurity_split=1e-07,
            min_samples_leaf=25, min_samples_split=100,
            min_weight_fraction_leaf=0.0, presort=False, random_state=123,
            splitter='best')