In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import tree,ensemble,metrics

from rule import Rule
from rule_extraction import rule_extract,draw_tree,rules_vote

##  1. Data Preparation

In [2]:
# Load only the columns used in this demo; `Survived` is the target.
# Records with missing values are dropped, since most tree algorithms cannot
# handle them. `dropna()` returns a new frame instead of mutating in place.
data = pd.read_csv(
    './dataset/titanic.csv',
    usecols=['Age', 'Fare', 'Sex', 'Pclass', 'Survived'],
).dropna()

# Split into training/test sets. `Survived` is deliberately kept inside the
# feature frames: the mean encoding in section 2 groups on it, and section 3
# selects only the model input columns before any model is fit.
X_train, X_test, y_train, y_test = train_test_split(
    data[['Age', 'Fare', 'Sex', 'Pclass', 'Survived']],
    data.Survived,
    test_size=0.2,
    random_state=0,
)

# The split frames get new columns assigned later on. Taking explicit copies
# makes them independent of `data`, so those assignments do not raise
# SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

# dataset shape
print(X_train.shape, X_test.shape)
print(X_train.head(5))

(571, 5) (143, 5)
      Age     Fare     Sex  Pclass  Survived
387  36.0  13.0000  female       2         1
685  25.0  41.5792    male       2         0
20   35.0  26.0000    male       2         0
331  45.5  28.5000    male       1         0
396  31.0   7.8542  female       3         0


## 2. Categorical Variable encoding using Mean Encoding

In [3]:
# Mean encoding: each category of `Pclass` and `Sex` is replaced by the mean of
# `Survived` for that category, computed on the training set only. The same
# mapping is then applied to the test set.
for column in ['Pclass', 'Sex']:
    # category value -> mean of Survived in the training data
    ordered_labels = X_train.groupby([column])['Survived'].mean().to_dict()

    encoded_name = column + '_ordered'
    X_train[encoded_name] = X_train[column].map(ordered_labels)
    X_test[encoded_name] = X_test[column].map(ordered_labels)


## 3. Final training data

In [4]:
# Keep only the model inputs: the two numeric columns plus the two
# mean-encoded columns (this leaves out the raw categoricals and `Survived`).
model_features = ['Age', 'Fare', 'Sex_ordered', 'Pclass_ordered']
X_train_proceeded = X_train[model_features]
X_test_proceeded = X_test[model_features]
print(X_train_proceeded.head())

      Age     Fare  Sex_ordered  Pclass_ordered
387  36.0  13.0000     0.740196        0.460432
685  25.0  41.5792     0.204360        0.460432
20   35.0  26.0000     0.204360        0.460432
331  45.5  28.5000     0.204360        0.652482
396  31.0   7.8542     0.740196        0.240550


## 4. Training a Single Decision Tree

In [5]:
# API reference:
# http://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html

# A single Gini tree limited to depth 3.
model_tree_clf = tree.DecisionTreeClassifier(max_depth=3, criterion='gini')
model_tree_clf.fit(X=X_train_proceeded, y=y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=3,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [6]:
# Model performance on the training set
# (confusion matrix rows: true class, columns: predicted class).
y_pred = model_tree_clf.predict(X=X_train_proceeded)
train_confusion = metrics.confusion_matrix(y_true=y_train, y_pred=y_pred)
print(train_confusion)

[[311  34]
 [ 67 159]]


##  4.1 Extract all rules without filtering
8 rules in total, corresponding to 8 paths from root to leaf node

In [7]:
# No evaluation data or thresholds are passed, so no filtering is applied;
# the second return value is not needed here.
rule, _ = rule_extract(feature_names=X_train_proceeded.columns,
                       model=model_tree_clf)
for extracted_rule in rule:
    print(extracted_rule)

Sex_ordered > 0.4722778648138046 and Pclass_ordered > 0.3504907488822937 and Fare > 26.125
Sex_ordered <= 0.4722778648138046 and Age > 13.0 and Pclass_ordered <= 0.5564569681882858
Sex_ordered <= 0.4722778648138046 and Age <= 13.0 and Pclass_ordered <= 0.3504907488822937
Sex_ordered > 0.4722778648138046 and Pclass_ordered <= 0.3504907488822937 and Fare <= 20.800000190734863
Sex_ordered <= 0.4722778648138046 and Age > 13.0 and Pclass_ordered > 0.5564569681882858
Sex_ordered <= 0.4722778648138046 and Age <= 13.0 and Pclass_ordered > 0.3504907488822937
Sex_ordered > 0.4722778648138046 and Pclass_ordered > 0.3504907488822937 and Fare <= 26.125
Sex_ordered > 0.4722778648138046 and Pclass_ordered <= 0.3504907488822937 and Fare > 20.800000190734863


## 4.2 Output the tree structure
compared with the confusion matrix on training data:

 | pred=0 | pred=1
- | :-: | -: 
true=0 | 311 | 34
true=1|  67 |  159

The results shown in the graph match this confusion matrix perfectly.

In [8]:
# How to read the rendered tree:
#  - blue nodes (class=1) predict class 1, orange nodes (class=0) predict class 0
#  - the darker the colour, the purer the node
#  - `value` is the absolute number of labelled samples in the node;
#    e.g. a leaf showing [12, 7] holds 12 class-0 samples and 7 class-1 samples
draw_tree(
    model=model_tree_clf,
    feature_names=X_train_proceeded.columns,
    class_names=['0', '1'],
    proportion=False,
    outdir='./images/DecisionTree/',
)

![title](images/DecisionTree/DecisionTree.jpeg) 

##  5. Extract rule with filtering
Each entry of `rule_dict` has the form: (rule, (recall on class 1, precision on class 1, recall on class 0, precision on class 0, nb))


In [9]:
# Filter rules by their performance on the test set: only the class-0
# thresholds are active here (recall >= 0.9, precision >= 0.6); the class-1
# thresholds are set to 0.
# NOTE(review): the meaning of sort_key=0 is defined in rule_extraction — confirm there.
rules, rule_dict = rule_extract(
    model=model_tree_clf,
    feature_names=X_train_proceeded.columns,
    x_test=X_test_proceeded,
    y_test=y_test,
    sort_key=0,
    recall_min_c1=0.,
    precision_min_c1=0.,
    recall_min_c0=0.9,
    precision_min_c0=0.6,
)
for scored_rule in rule_dict:
    print(scored_rule)

('Fare > 26.125 and Pclass_ordered > 0.3504907488822937 and Sex_ordered > 0.4722778648138046', (0.328125, 0.9130434782608695, 0.9746835443037974, 0.6416666666666667, 1))
('Fare <= 26.125 and Pclass_ordered > 0.3504907488822937 and Sex_ordered > 0.4722778648138046', (0.21875, 0.875, 0.9746835443037974, 0.6062992125984252, 1))


### 5.1 Random Forest

In [10]:
# A small forest: 3 trees, each limited to depth 3, with a fixed seed.
model_RF_clf = ensemble.RandomForestClassifier(n_estimators=3, max_depth=3, random_state=9)
model_RF_clf.fit(X=X_train_proceeded, y=y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=3, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=3, n_jobs=None,
            oob_score=False, random_state=9, verbose=0, warm_start=False)

In [11]:
# Model performance on the test set
# (confusion matrix rows: true class, columns: predicted class).
y_pred_test = model_RF_clf.predict(X=X_test_proceeded)
rf_confusion = metrics.confusion_matrix(y_true=y_test, y_pred=y_pred_test)
print(rf_confusion)

[[72  7]
 [23 41]]


In [12]:
# Extract the rules of the random forest without any filtering.
rules, _ = rule_extract(feature_names=X_train_proceeded.columns,
                        model=model_RF_clf)
for extracted_rule in rules:
    print(extracted_rule)

Fare > 15.64585018157959 and Age > 53.5 and Fare > 133.36874771118164
Fare <= 15.64585018157959 and Sex_ordered <= 0.4722778648138046 and Pclass_ordered > 0.3504907488822937
Fare > 15.64585018157959 and Age > 53.5 and Fare <= 133.36874771118164
Fare <= 15.64585018157959 and Sex_ordered > 0.4722778648138046 and Fare > 10.481249809265137
Fare > 15.64585018157959 and Age <= 53.5 and Pclass_ordered <= 0.3504907488822937
Fare <= 15.64585018157959 and Sex_ordered > 0.4722778648138046 and Fare <= 10.481249809265137
Fare <= 15.64585018157959 and Sex_ordered <= 0.4722778648138046 and Pclass_ordered <= 0.3504907488822937
Fare > 15.64585018157959 and Age <= 53.5 and Pclass_ordered > 0.3504907488822937
Pclass_ordered > 0.3504907488822937 and Age > 17.5 and Sex_ordered > 0.4722778648138046
Pclass_ordered <= 0.3504907488822937 and Sex_ordered <= 0.4722778648138046 and Age <= 9.5
Pclass_ordered > 0.3504907488822937 and Age <= 17.5
Pclass_ordered <= 0.3504907488822937 and Sex_ordered > 0.4722778648138

### 5.2 Output the tree structure

In [13]:
# Render the trees of the forest into ./images/RandomForest/
# (the images are embedded below as Tree 1-3).
draw_tree(
    model=model_RF_clf,
    feature_names=X_train_proceeded.columns,
    class_names=['0', '1'],
    proportion=False,
    outdir='./images/RandomForest/',
)

### Tree 1
![title](images/RandomForest/EnsembleTrees_No1.jpeg) 

### Tree2
![title](images/RandomForest/EnsembleTrees_No2.jpeg) 

### Tree3
![title](images/RandomForest/EnsembleTrees_No3.jpeg) 

### 5.3 BaggingClassifier

In [14]:
# Bagging of 2 depth-3 decision trees, trained in parallel on all cores.
# NOTE(review): newer scikit-learn releases rename `base_estimator` to
# `estimator`; confirm against the installed version before upgrading.
bagged_tree = tree.DecisionTreeClassifier(max_depth=3)
model_bagging_clf = ensemble.BaggingClassifier(
    base_estimator=bagged_tree,
    n_estimators=2,
    random_state=0,
    n_jobs=-1,
)
model_bagging_clf.fit(X=X_train_proceeded, y=y_train)

BaggingClassifier(base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=3,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
         bootstrap=True, bootstrap_features=False, max_features=1.0,
         max_samples=1.0, n_estimators=2, n_jobs=-1, oob_score=False,
         random_state=0, verbose=0, warm_start=False)

In [15]:
# Model performance on the test set
# (confusion matrix rows: true class, columns: predicted class).
y_pred_test = model_bagging_clf.predict(X=X_test_proceeded)
bagging_confusion = metrics.confusion_matrix(y_true=y_test, y_pred=y_pred_test)
print(bagging_confusion)

[[75  4]
 [27 37]]


In [16]:
# Extract the rules of the bagging ensemble without any filtering.
rules, _ = rule_extract(feature_names=X_train_proceeded.columns,
                        model=model_bagging_clf)
for extracted_rule in rules:
    print(extracted_rule)

Sex_ordered <= 0.4722778648138046 and Fare <= 56.19790077209473 and Age <= 9.5
Sex_ordered > 0.4722778648138046 and Pclass_ordered > 0.3504907488822937 and Fare > 22.0
Sex_ordered > 0.4722778648138046 and Pclass_ordered <= 0.3504907488822937 and Age > 35.0
Sex_ordered <= 0.4722778648138046 and Fare > 56.19790077209473 and Fare > 59.08749961853027
Sex_ordered <= 0.4722778648138046 and Fare > 56.19790077209473 and Fare <= 59.08749961853027
Sex_ordered <= 0.4722778648138046 and Fare <= 56.19790077209473 and Age > 9.5
Sex_ordered > 0.4722778648138046 and Pclass_ordered > 0.3504907488822937 and Fare <= 22.0
Sex_ordered > 0.4722778648138046 and Pclass_ordered <= 0.3504907488822937 and Age <= 35.0
Sex_ordered <= 0.4722778648138046 and Age <= 5.5 and Pclass_ordered <= 0.3504907488822937
Sex_ordered <= 0.4722778648138046 and Age > 5.5 and Pclass_ordered > 0.5564569681882858
Sex_ordered > 0.4722778648138046 and Pclass_ordered > 0.3504907488822937 and Fare > 13.75
Sex_ordered <= 0.472277864813804

### 5.4 ExtraTree 

In [17]:
# Extremely randomized trees: 2 trees, each limited to depth 3, with a fixed seed.
model_extratree_clf = ensemble.ExtraTreesClassifier(n_estimators=2, max_depth=3, random_state=0)
model_extratree_clf.fit(X=X_train_proceeded, y=y_train)

ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=3, max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=2, n_jobs=None,
           oob_score=False, random_state=0, verbose=0, warm_start=False)

In [18]:
# Model performance on the test set
# (confusion matrix rows: true class, columns: predicted class).
y_pred_test = model_extratree_clf.predict(X=X_test_proceeded)
extratree_confusion = metrics.confusion_matrix(y_true=y_test, y_pred=y_pred_test)
print(extratree_confusion)

[[68 11]
 [18 46]]


In [19]:
# Extract the rules of the extra-trees ensemble without any filtering.
rules, _ = rule_extract(feature_names=X_train_proceeded.columns,
                        model=model_extratree_clf)
for extracted_rule in rules:
    print(extracted_rule)

Sex_ordered > 0.5859209424696681 and Pclass_ordered > 0.506024637348139 and Age > 42.70739467800939
Sex_ordered <= 0.5859209424696681 and Pclass_ordered > 0.6300343487048754 and Fare <= 487.86732759797127
Sex_ordered > 0.5859209424696681 and Pclass_ordered > 0.506024637348139 and Age <= 42.70739467800939
Sex_ordered <= 0.5859209424696681 and Pclass_ordered <= 0.6300343487048754 and Fare > 19.768909903834068
Sex_ordered > 0.5859209424696681 and Pclass_ordered <= 0.506024637348139 and Age <= 27.126815941243812
Sex_ordered <= 0.5859209424696681 and Pclass_ordered <= 0.6300343487048754 and Fare <= 19.768909903834068
Sex_ordered <= 0.5859209424696681 and Pclass_ordered > 0.6300343487048754 and Fare > 487.86732759797127
Sex_ordered > 0.5859209424696681 and Pclass_ordered <= 0.506024637348139 and Age > 27.126815941243812
Pclass_ordered > 0.4895107363075585 and Sex_ordered > 0.3002771553876842 and Age <= 16.99989479733626
Pclass_ordered > 0.4895107363075585 and Sex_ordered > 0.3002771553876842

## 6. Rule Voting

### 6.1 Testing our filtering method

In section 5, we obtained a rule with the following performance on the test set:

('Fare > 26.125 and Pclass_ordered > 0.3504907488822937 and Sex_ordered > 0.4722778648138046', 

recall on 1-class, prec on 1-class, recall on 0-class, prec on 0-class, nb   
(0.328125, 0.9130434782608695, 0.9746835443037974, 0.6416666666666667, 1))

Let's check whether these figures are correct.

In [20]:
# Apply the single rule reported in section 5 to the test set and turn its
# votes into a class prediction.
one_rule = ['Fare > 26.125 and Pclass_ordered > 0.3504907488822937 and Sex_ordered > 0.4722778648138046']

# NOTE(review): the index is reset before voting — presumably rules_vote
# expects a 0..n-1 index; confirm in rule_extraction.
X_test_proceeded_reindex = X_test_proceeded.reset_index(drop=True)
score = pd.DataFrame(rules_vote(X=X_test_proceeded_reindex, rules=one_rule))
print(score[0].value_counts())

# Column 0 holds the vote count per sample; a sample voted for by the rule is
# predicted as class 1. `.loc` assigns on `score` itself: the former chained
# form `score['predict'][mask] = 1` raises SettingWithCopyWarning and has no
# effect under pandas Copy-on-Write.
score['predict'] = score[0]
score.loc[score[0] == 1, 'predict'] = 1
# this single rule has predicted 23 cases to be positive in test data

0.0    120
1.0     23
Name: 0, dtype: int64


In [21]:
# Recompute the rule's four metrics directly from the confusion matrix rather
# than from hand-copied numbers, so the check stays valid if the data or the
# rule changes.
rule_confusion = metrics.confusion_matrix(y_test, score['predict'])
print(rule_confusion)

# For binary labels, ravel() yields the cells in the order tn, fp, fn, tp
# (rows: true class, columns: predicted class).
tn, fp, fn, tp = rule_confusion.ravel()
print('recall in 1-class: ', tp/(tp+fn))
print('prec   in 1-class: ', tp/(tp+fp))
print('recall in 0-class: ', tn/(tn+fp))
print('prec   in 0-class: ', tn/(tn+fn))

[[77  2]
 [43 21]]
recall in 1-class:  0.328125
prec   in 1-class:  0.9130434782608695
recall in 0-class:  0.9746835443037974
prec   in 0-class:  0.6416666666666667


### 6.2 Random Forest

In [22]:
# Train a random forest (same settings as in section 5.1: 3 trees, depth 3, seed 9).
model_RF_clf = ensemble.RandomForestClassifier(n_estimators=3, max_depth=3, random_state=9)
model_RF_clf.fit(X=X_train_proceeded, y=y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=3, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=3, n_jobs=None,
            oob_score=False, random_state=9, verbose=0, warm_start=False)

In [23]:
# Model performance on the test set
# (confusion matrix rows: true class, columns: predicted class).
y_pred_test = model_RF_clf.predict(X=X_test_proceeded)
rf_confusion = metrics.confusion_matrix(y_true=y_test, y_pred=y_pred_test)
print(rf_confusion)

[[72  7]
 [23 41]]


In [24]:
# Extract rules from the ensemble with filtering. The rules are scored on the
# TRAINING data here (x_test/y_test receive the training frames).
rules, rule_dict = rule_extract(
    model=model_RF_clf,
    feature_names=X_train_proceeded.columns,
    x_test=X_train_proceeded,
    y_test=y_train,
    sort_key=0,
    recall_min_c1=0.1,
    precision_min_c1=0.6,
    recall_min_c0=0.1,
    precision_min_c0=0.5,
)
print(len(rules))

# 5 rules have precision on class 1 > 0.6 and recall > 0.1

5


In [25]:
# Use the 5 rules above to make predictions on the test set.
# NOTE(review): the index is reset before voting — presumably rules_vote
# expects a 0..n-1 index; confirm in rule_extraction.
X_test_proceeded_reindex = X_test_proceeded.reset_index(drop=True)
score = pd.DataFrame(rules_vote(X=X_test_proceeded_reindex, rules=rules))
print(score[0].value_counts())

# 6 cases received 3 votes; they can be assigned to class 1 with greater confidence

0.0    72
1.0    36
2.0    29
3.0     6
Name: 0, dtype: int64


In [26]:
# Predict class 1 for every case that received at least one vote.
# To be more conservative, predict class 1 only for cases that received
# more votes (raise the threshold below).
#
# `.loc` assigns on `score` itself: the former chained form
# `score['predict'][mask] = 1` raises SettingWithCopyWarning and has no
# effect under pandas Copy-on-Write.
score['predict'] = score[0]
score.loc[score[0] > 0, 'predict'] = 1
score.head()

Unnamed: 0,0,predict
0,1.0,1.0
1,2.0,1.0
2,3.0,1.0
3,0.0,0.0
4,1.0,1.0


In [27]:
# Compare the rule-voting result with the confusion matrix of the RF model itself.
# The 5 rules recover more class-1 samples (50 vs 41 of 64 in the recorded run),
# at the cost of more class-0 samples being predicted as class 1 (21 vs 7).
rf_confusion = metrics.confusion_matrix(y_true=y_test, y_pred=y_pred_test)
rules_confusion = metrics.confusion_matrix(y_true=y_test, y_pred=score['predict'])

print('confusion matrix of RF model')
print(rf_confusion)
print('confusion matrix of the 5 rules')
print(rules_confusion)

confusion matrix of RF model
[[72  7]
 [23 41]]
confusion matrix of the 5 rules
[[58 21]
 [14 50]]
