In [1]:
import os
import sys

# Put the parent of the current working directory on the import path
# (presumably where the local `datazoo` package lives — TODO confirm).
# The membership guard keeps re-running this cell from adding duplicates.
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

In [2]:
import nltk

# Fetch the NLTK stopwords corpus; the recorded run reports it as already
# up-to-date, so this is cheap when the data is present locally.
# Presumably required by the datazoo featurizer's stopword features — TODO confirm.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/chenyanhui/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
import pandas as pd
import joblib

from datazoo.model import ZooRandomForest
from datazoo.featurize import ZooFeaturizer

import warnings
# NOTE(review): this suppresses every warning for the rest of the session,
# so problems raised by the libraries used below will not be shown.
warnings.filterwarnings('ignore')

# Enable IPython's autoreload extension; mode 2 reloads imported modules
# before each cell runs, so edits to the project package are picked up.
%load_ext autoreload
%autoreload 2

# 1. Prepare Data

In [4]:
# Integer class id for each semantic column type, numbered in listing order.
dict_label = {
    label: code
    for code, label in enumerate([
        'numeric',
        'categorical',
        'datetime',
        'sentence',
        'url',
        'embedded-number',
        'list',
        'not-generalizable',
        'context-specific',
    ])
}

# Load both splits. Only the training rows are shuffled (fixed seed, so the
# order is reproducible) and then re-indexed from 0.
x_train = (
    pd.read_csv('../data/ml/data_train.csv')
    .sample(frac=1, random_state=100)
    .reset_index(drop=True)
)
x_test = pd.read_csv('../data/ml/data_test.csv')
print(len(x_train))

# Single-column target frames whose string labels in `y_act` are replaced by
# their integer ids; a label missing from `dict_label` raises KeyError.
# NOTE(review): x_train / x_test keep their `y_act` column — confirm the
# model drops it before fitting.
y_train = x_train[['y_act']].assign(
    y_act=[dict_label[label] for label in x_train['y_act']]
)
y_test = x_test[['y_act']].assign(
    y_act=[dict_label[label] for label in x_test['y_act']]
)

7936


# 2. Train and Test Model

In [5]:
# Instantiate the project's random-forest wrapper with no arguments
# (i.e. whatever defaults ZooRandomForest defines).
model = ZooRandomForest()

In [6]:
# Fit on the shuffled training frame. In the recorded run this first fits the
# vectorizer, then runs a 5-fold search over 16 candidates (80 fits) and logs
# the best parameters.
model.fit(x_train, y_train)

[W 220324 23:48:29 base:44] vectorizer not fitted yet
[I 220324 23:48:29 base:28] vectorizer not fitted. Doing fit and transform
[I 220324 23:48:29 base:36] total length of from feature extraction: 7936


Fitting 5 folds for each of 16 candidates, totalling 80 fits


[I 220324 23:49:19 random_forest:28] best params: {'bootstrap': True, 'max_depth': 100, 'n_estimators': 500}


In [7]:
# Score the fitted model on the held-out test split.
# NOTE(review): `training=True` is passed while scoring *test* data — confirm
# what this flag controls in ZooRandomForest.score.
model.score(x_test, y_test, training=True)

[I 220324 23:49:19 base:25] vectorizer already fitted. Doing transform
[I 220324 23:49:19 base:36] total length of from feature extraction: 1985


0.9329974811083124

In [8]:
# Serialize the fitted model wrapper to the artifacts directory.
joblib.dump(model, '../artifacts/model_rf.joblib')

['../artifacts/model_rf.joblib']

# 3. Test the Model and Featurizer on Raw Input Files

In [9]:
# Build a featurizer around the vectorizer held by the trained model, so raw
# tables are transformed with the same fitted vectorizer used in training.
fe = ZooFeaturizer(model.vectorizer)

In [10]:
# Serialize the featurizer next to the model artifact.
joblib.dump(fe, '../artifacts/featurizer.joblib')

['../artifacts/featurizer.joblib']

### 3.1 Raw Data Feature Engineering

In [11]:
# Raw CSV whose column types will be predicted. The path is relative to the
# notebook, like every other data path here, rather than an absolute
# per-user location, so the notebook can run on another machine.
# Place the file under ../data/datasets/ (other samples used with this
# notebook: accident.csv, bbc.csv) or point RAW_DATA_PATH at another CSV.
RAW_DATA_PATH = '../data/datasets/heart_2020_cleaned.csv'

test_raw_data = pd.read_csv(RAW_DATA_PATH)
test_raw_data

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,No,16.60,Yes,No,No,3.0,30.0,No,Female,55-59,White,Yes,Yes,Very good,5.0,Yes,No,Yes
1,No,20.34,No,No,Yes,0.0,0.0,No,Female,80 or older,White,No,Yes,Very good,7.0,No,No,No
2,No,26.58,Yes,No,No,20.0,30.0,No,Male,65-69,White,Yes,Yes,Fair,8.0,Yes,No,No
3,No,24.21,No,No,No,0.0,0.0,No,Female,75-79,White,No,No,Good,6.0,No,No,Yes
4,No,23.71,No,No,No,28.0,0.0,Yes,Female,40-44,White,No,Yes,Very good,8.0,No,No,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
319790,Yes,27.41,Yes,No,No,7.0,0.0,Yes,Male,60-64,Hispanic,Yes,No,Fair,6.0,Yes,No,No
319791,No,29.84,Yes,No,No,0.0,0.0,No,Male,35-39,Hispanic,No,Yes,Very good,5.0,Yes,No,No
319792,No,24.24,No,No,No,0.0,0.0,No,Female,45-49,Hispanic,No,Yes,Good,6.0,No,No,No
319793,No,32.81,No,No,No,0.0,0.0,No,Female,25-29,Hispanic,No,No,Good,12.0,No,No,No


In [12]:
# Featurize the raw table. In the recorded run the log reports 18 extracted
# feature rows, matching the 18 columns of the raw table (one row per column).
test_data_featurized = fe.featurize(test_raw_data)
test_data_featurized

[I 220324 23:49:21 base:25] vectorizer already fitted. Doing transform
[I 220324 23:49:21 base:36] total length of from feature extraction: 18


Unnamed: 0,total_vals,num_nans,%_nans,num_of_dist_val,%_dist_val,mean,std_dev,min_val,max_val,has_delimiters,...,1443,1444,1445,1446,1447,1448,1449,1450,1451,1452
0,319795,0,0.000625,2,0.000625,0.0,0.0,0.0,0.0,False,...,0,0,0,0,0,0,0,0,0,0
1,319795,0,1.126972,3604,1.126972,28.325399,6.35609,12.02,94.85,False,...,0,0,0,0,0,0,0,0,0,0
2,319795,0,0.000625,2,0.000625,0.0,0.0,0.0,0.0,False,...,0,0,0,0,0,0,0,0,0,0
3,319795,0,0.000625,2,0.000625,0.0,0.0,0.0,0.0,False,...,0,0,0,0,0,0,0,0,0,0
4,319795,0,0.000625,2,0.000625,0.0,0.0,0.0,0.0,False,...,0,0,0,0,0,0,0,0,0,0
5,319795,0,0.009694,31,0.009694,3.37171,7.950838,0.0,30.0,False,...,0,0,0,0,0,0,0,0,0,0
6,319795,0,0.009694,31,0.009694,3.898366,7.955223,0.0,30.0,False,...,0,0,0,0,0,0,0,0,0,0
7,319795,0,0.000625,2,0.000625,0.0,0.0,0.0,0.0,False,...,0,0,0,0,0,0,0,0,0,0
8,319795,0,0.000625,2,0.000625,0.0,0.0,0.0,0.0,False,...,0,0,0,0,0,0,0,0,0,0
9,319795,0,0.004065,13,0.004065,0.0,0.0,0.0,0.0,False,...,0,0,0,0,0,0,0,0,0,0


### 3.2 Predict Test Data

In [13]:
# Predict an integer class id for each featurized row and convert the result
# to a plain Python list for display and lookup.
y_rf = model.predict(test_data_featurized).tolist()
y_rf

[1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

In [14]:
# Reverse lookup (integer class id -> label name), derived from `dict_label`
# so the two mappings cannot drift apart.
dict_int_label = {code: label for label, code in dict_label.items()}

# One prediction is expected per raw column; fail loudly rather than let
# zip() silently drop unmatched columns or predictions.
assert len(test_raw_data.columns) == len(y_rf), (
    f'{len(test_raw_data.columns)} columns but {len(y_rf)} predictions'
)

# Report the predicted type next to each raw column name.
for name, int_label in zip(test_raw_data.columns, y_rf):
    print(f'Column:  {name}\nType: {dict_int_label[int_label]}\n')

Column:  HeartDisease
Type: categorical

Column:  BMI
Type: numeric

Column:  Smoking
Type: categorical

Column:  AlcoholDrinking
Type: categorical

Column:  Stroke
Type: categorical

Column:  PhysicalHealth
Type: categorical

Column:  MentalHealth
Type: categorical

Column:  DiffWalking
Type: categorical

Column:  Sex
Type: categorical

Column:  AgeCategory
Type: categorical

Column:  Race
Type: categorical

Column:  Diabetic
Type: categorical

Column:  PhysicalActivity
Type: categorical

Column:  GenHealth
Type: categorical

Column:  SleepTime
Type: categorical

Column:  Asthma
Type: categorical

Column:  KidneyDisease
Type: categorical

Column:  SkinCancer
Type: categorical



In [15]:
# Class probabilities for each featurized row, as nested Python lists; in the
# recorded output every row holds 9 values, matching the 9 label ids.
y_rf_prob = model.predict_proba(test_data_featurized).tolist()
y_rf_prob

[[0.036, 0.758, 0.014, 0.0, 0.006, 0.006, 0.0, 0.132, 0.048],
 [0.566, 0.208, 0.068, 0.0, 0.0, 0.0, 0.0, 0.064, 0.094],
 [0.064, 0.756, 0.008, 0.0, 0.006, 0.008, 0.002, 0.116, 0.04],
 [0.052,
  0.7066666666666668,
  0.004,
  0.006,
  0.034,
  0.024,
  0.0,
  0.134,
  0.03933333333333334],
 [0.022, 0.818, 0.002, 0.0, 0.002, 0.002, 0.0, 0.12, 0.034],
 [0.328, 0.392, 0.034, 0.0, 0.0, 0.006, 0.0, 0.078, 0.162],
 [0.332, 0.402, 0.032, 0.0, 0.002, 0.004, 0.0, 0.086, 0.142],
 [0.086, 0.704, 0.004, 0.002, 0.006, 0.018, 0.002, 0.126, 0.052],
 [0.01, 0.8372, 0.0, 0.0, 0.002, 0.01, 0.0, 0.102, 0.038799999999999994],
 [0.052,
  0.6566666666666667,
  0.032,
  0.0,
  0.008,
  0.026,
  0.0,
  0.20733333333333331,
  0.018],
 [0.026,
  0.7536666666666667,
  0.002,
  0.002,
  0.0,
  0.022,
  0.002,
  0.11533333333333334,
  0.077],
 [0.024, 0.532, 0.008, 0.08, 0.002, 0.064, 0.12, 0.03, 0.14],
 [0.068,
  0.6766666666666667,
  0.012,
  0.0,
  0.004,
  0.018,
  0.008,
  0.15,
  0.06333333333333334],
 [0.03,

In [16]:
# The first 25 columns are the named descriptive features
# (total_vals ... is_long_sentence in the recorded output).
test_data_featurized.columns[:25]

Index(['total_vals', 'num_nans', '%_nans', 'num_of_dist_val', '%_dist_val',
       'mean', 'std_dev', 'min_val', 'max_val', 'has_delimiters', 'has_url',
       'has_email', 'has_date', 'mean_word_count', 'stdev_word_count',
       'mean_stopword_total', 'stdev_stopword_total', 'mean_char_count',
       'stdev_char_count', 'mean_whitespace_count', 'stdev_whitespace_count',
       'mean_delim_count', 'stdev_delim_count', 'is_list', 'is_long_sentence'],
      dtype='object')

In [17]:
# The remaining columns are integer-named (0..1452, 1453 in total in the
# recorded output); presumably the vectorizer's output features — TODO confirm.
test_data_featurized.columns[25:]

Index([   0,    1,    2,    3,    4,    5,    6,    7,    8,    9,
       ...
       1443, 1444, 1445, 1446, 1447, 1448, 1449, 1450, 1451, 1452],
      dtype='object', length=1453)

# 4. SHAP Values

In [18]:
import shap

# NOTE(review): `_best_model` is a private attribute of ZooRandomForest;
# presumably the estimator chosen by the parameter search — confirm, and
# prefer a public accessor if the class offers one.
my_model = model._best_model
# Create the tree explainer object that can calculate SHAP values for it
explainer = shap.TreeExplainer(my_model)

In [19]:
%%time
# Calculate SHAP values for the first featurized row only (the row for the
# first raw column).
# NOTE(review): check_additivity=False disables SHAP's additivity check, and
# the recorded values reach magnitudes of roughly 70-160 — confirm the input
# shape/dtype and that these attributions are meaningful.
shap_values = explainer.shap_values(test_data_featurized.iloc[0], check_additivity=False)

CPU times: user 1.88 s, sys: 20.6 ms, total: 1.9 s
Wall time: 1.89 s


In [20]:
# Show the SHAP result and its length; the recorded output is a list of
# arrays (presumably one per class — the output is truncated).
shap_values, len(shap_values)

([array([-7.04239329e+01, -7.05131132e+01, -8.25348181e+01, ...,
          8.73520076e-06,  1.11699605e-05,  6.23623430e-06]),
  array([ 5.57896313e+01,  4.69791451e+01, -1.60892320e+02, ...,
          5.53204177e-05,  3.89727773e-04, -4.16009867e-06]),
  array([-2.58581880e-03,  1.60853526e-03,  2.96169539e-03, ...,
          5.84809346e-06,  8.68059086e-07,  1.81884688e-06]),
  array([-1.48812245e-02,  3.46177187e-02, -1.64850051e-02, ...,
          7.61050977e-08,  6.27008038e-09,  4.37399128e-08]),
  array([-6.30319363e-03, -4.46914950e-04, -4.74618737e-04, ...,
         -8.52529461e-05,  1.24887352e-06,  2.31332876e-07]),
  array([-1.78366846e-02,  3.06751460e-03, -2.81395638e-03, ...,
          1.21985324e-06,  7.27985014e-07,  3.40550651e-07]),
  array([-5.06333400e-03,  8.64619846e-04,  2.61510156e-04, ...,
          1.71037494e-07,  1.12808441e-07,  1.49160266e-08]),
  array([-1.05731158e+00, -1.29097951e+00, -1.10243312e+00, ...,
         -6.06028415e-06,  1.17589047e-04,  2.

In [21]:
# Re-display the predicted class ids for reference next to the SHAP values.
y_rf

[1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

In [22]:
# First array of the SHAP result (presumably class id 0, 'numeric') and its
# length; the recorded 1478 equals the 25 named + 1453 integer-named features.
shap_values[0], len(shap_values[0])

(array([-7.04239329e+01, -7.05131132e+01, -8.25348181e+01, ...,
         8.73520076e-06,  1.11699605e-05,  6.23623430e-06]),
 1478)