In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import tree,ensemble,metrics

from rule import Rule
from rule_extraction import rule_extract,draw_tree

## 数据准备

In [2]:
# fetch dataset
data = pd.read_csv('./dataset/titanic.csv', usecols = ['Age', 'Fare','Sex','Pclass','Survived'])

# drop NA records since most Tree algorithm cannot handle
data.dropna(inplace=True)

# split training/test sets
X_train, X_test, y_train, y_test = train_test_split(data[['Age', 'Fare','Sex','Pclass','Survived']], 
                                                    data.Survived, test_size=0.2, random_state=0)

# dataset shape
print(X_train.shape, X_test.shape)
print(X_train.head(5))

(571, 5) (143, 5)
      Age     Fare     Sex  Pclass  Survived
387  36.0  13.0000  female       2         1
685  25.0  41.5792    male       2         0
20   35.0  26.0000    male       2         0
331  45.5  28.5000    male       1         0
396  31.0   7.8542  female       3         0


## 类别变量处理，采用 Mean Encoding 方法

In [3]:
# Pclass
X_train.groupby(['Pclass'])['Survived'].mean()
ordered_labels = X_train.groupby(['Pclass'])['Survived'].mean().to_dict()
ordered_labels

# Mean Encoding
X_train['Pclass_ordered'] = X_train.Pclass.map(ordered_labels)
X_test['Pclass_ordered'] = X_test.Pclass.map(ordered_labels)

# Sex
X_train.groupby(['Sex'])['Survived'].mean()
ordered_labels = X_train.groupby(['Sex'])['Survived'].mean().to_dict()
ordered_labels

# Mean Encoding
X_train['Sex_ordered'] = X_train.Sex.map(ordered_labels)
X_test['Sex_ordered'] = X_test.Sex.map(ordered_labels)


## 获得最终的训练数据集

In [4]:
X_train_proceeded = X_train[['Age', 'Fare','Sex_ordered','Pclass_ordered']]
X_test_proceeded = X_test[['Age', 'Fare','Sex_ordered','Pclass_ordered']]
print(X_train_proceeded.head())

      Age     Fare  Sex_ordered  Pclass_ordered
387  36.0  13.0000     0.740196        0.460432
685  25.0  41.5792     0.204360        0.460432
20   35.0  26.0000     0.204360        0.460432
331  45.5  28.5000     0.204360        0.652482
396  31.0   7.8542     0.740196        0.240550


## 单颗决策树

In [5]:
# 单颗决策树模型
# 模型API参考 http://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html

model_tree_clf = tree.DecisionTreeClassifier(criterion='gini',max_depth=3)
model_tree_clf.fit(X_train_proceeded,y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=3,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

In [6]:
# model performance on training set
y_pred = model_tree_clf.predict(X_train_proceeded)
print(metrics.confusion_matrix(y_train,y_pred))

[[311  34]
 [ 67 159]]


##  输出所有规则，不加筛选条件

共8条规则，对应8条从root到叶子节点的完整路径

In [7]:
rule_extract(model=model_tree_clf,feature_names=X_train_proceeded.columns)

['Sex_ordered > 0.4722778797149658 and Pclass_ordered <= 0.3504907488822937 and Fare <= 20.799999237060547',
 'Sex_ordered > 0.4722778797149658 and Pclass_ordered > 0.3504907488822937 and Fare > 26.125',
 'Sex_ordered > 0.4722778797149658 and Pclass_ordered > 0.3504907488822937 and Fare <= 26.125',
 'Sex_ordered <= 0.4722778797149658 and Age <= 13.0 and Pclass_ordered > 0.3504907488822937',
 'Sex_ordered > 0.4722778797149658 and Pclass_ordered <= 0.3504907488822937 and Fare > 20.799999237060547',
 'Sex_ordered <= 0.4722778797149658 and Age <= 13.0 and Pclass_ordered <= 0.3504907488822937',
 'Sex_ordered <= 0.4722778797149658 and Age > 13.0 and Pclass_ordered <= 0.556456983089447',
 'Sex_ordered <= 0.4722778797149658 and Age > 13.0 and Pclass_ordered > 0.556456983089447']

## 输出决策树的结构
与训练集的混淆矩阵

 | 预测=0 | 预测=1
- | :-: | -: 
真实=0 | 311 | 34
真实=1|  67 |  159

 比较，结果完全吻合

In [8]:
draw_tree(model=model_tree_clf,
          outdir='./images/DecisionTree/',
          feature_names=X_train_proceeded.columns,
          proportion=False,
          class_names=['0','1'])

![title](images/DecisionTree/DecisionTree.jpeg) 

##  输出筛选后的规则
返回  [rule, recall on 0-class, prec on 0-class, recall on 0-class, prec on 0-class, nb]


In [9]:
# 筛选规则，要求对1类样本的召回率在0.1以上，准确率在0.5以上
rule_extract(model=model_tree_clf,
             feature_names=X_train_proceeded.columns,
             x_test=X_test,
             y_test=y_test,
             sort_key=0,
             recall_min_c1=0.1,
             precision_min_c1=0.5,
             recall_min_c0=0,
             precision_min_c0=0)

[('Fare > 26.125 and Pclass_ordered > 0.3504907488822937 and Sex_ordered > 0.4722778797149658',
  (0.328125, 0.9130434782608695, 0.9746835443037974, 0.6311475409836066, 1)),
 ('Fare <= 26.125 and Pclass_ordered > 0.3504907488822937 and Sex_ordered > 0.4722778797149658',
  (0.21875, 0.875, 0.9746835443037974, 0.5968992248062015, 1)),
 ('Fare <= 20.799999237060547 and Pclass_ordered <= 0.3504907488822937 and Sex_ordered > 0.4722778797149658',
  (0.171875, 0.6470588235294118, 0.9240506329113924, 0.553030303030303, 1))]

## 随机森林

In [10]:
model_RF_clf = ensemble.RandomForestClassifier(max_depth=3,n_estimators=3)
model_RF_clf.fit(X_train_proceeded,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=3, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=3, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [11]:
# model performance on test set
y_pred_test = model_RF_clf.predict(X_test_proceeded)
print(metrics.confusion_matrix(y_test,y_pred_test))

[[69 10]
 [27 37]]


In [22]:
rule_extract(model=model_RF_clf,feature_names=X_train_proceeded.columns)

['Sex_ordered > 0.4722778797149658 and Fare <= 31.331249237060547 and Age > 18.5',
 'Sex_ordered <= 0.4722778797149658 and Age <= 8.5 and Fare <= 39.34375',
 'Sex_ordered <= 0.4722778797149658 and Age > 8.5 and Fare > 25.075000762939453',
 'Sex_ordered > 0.4722778797149658 and Fare > 31.331249237060547 and Age <= 27.0',
 'Sex_ordered <= 0.4722778797149658 and Age > 8.5 and Fare <= 25.075000762939453',
 'Sex_ordered > 0.4722778797149658 and Fare <= 31.331249237060547 and Age <= 18.5',
 'Sex_ordered <= 0.4722778797149658 and Age <= 8.5 and Fare > 39.34375',
 'Sex_ordered > 0.4722778797149658 and Fare > 31.331249237060547 and Age > 27.0',
 'Pclass_ordered <= 0.3504907488822937 and Age > 32.5 and Fare <= 31.331249237060547',
 'Pclass_ordered > 0.3504907488822937 and Fare <= 13.64585018157959 and Sex_ordered <= 0.4722778797149658',
 'Pclass_ordered > 0.3504907488822937 and Fare <= 13.64585018157959 and Sex_ordered > 0.4722778797149658',
 'Pclass_ordered <= 0.3504907488822937 and Age <= 32.5

## 输出决策树的结构

In [13]:
draw_tree(model=model_RF_clf,
          outdir='./images/RandomForest/',
          feature_names=X_train_proceeded.columns,
          proportion=False,
          class_names=['0','1'])

### Tree 1
![title](images/RandomForest/EnsembleTrees_No1.jpeg) 

### Tree2
![title](images/RandomForest/EnsembleTrees_No2.jpeg) 

### Tree3
![title](images/RandomForest/EnsembleTrees_No3.jpeg) 

## BaggingClassifier

In [14]:
model_bagging_clf = ensemble.BaggingClassifier(
                base_estimator=tree.DecisionTreeClassifier(max_depth=3),
                n_estimators=2,
                n_jobs=-1,
                random_state=0)
model_bagging_clf.fit(X_train_proceeded,y_train)

BaggingClassifier(base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=3,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best'),
         bootstrap=True, bootstrap_features=False, max_features=1.0,
         max_samples=1.0, n_estimators=2, n_jobs=-1, oob_score=False,
         random_state=0, verbose=0, warm_start=False)

In [15]:
# model performance on test set
y_pred_test = model_bagging_clf.predict(X_test_proceeded)
print(metrics.confusion_matrix(y_test,y_pred_test))

[[75  4]
 [27 37]]


In [16]:
rule_extract(model=model_bagging_clf,feature_names=X_train_proceeded.columns)

['Sex_ordered > 0.4722778797149658 and Pclass_ordered <= 0.3504907488822937 and Age <= 35.0',
 'Sex_ordered > 0.4722778797149658 and Pclass_ordered > 0.3504907488822937 and Fare <= 22.0',
 'Sex_ordered <= 0.4722778797149658 and Fare > 56.197898864746094 and Fare <= 59.087501525878906',
 'Sex_ordered <= 0.4722778797149658 and Fare > 56.197898864746094 and Fare > 59.087501525878906',
 'Sex_ordered <= 0.4722778797149658 and Fare <= 56.197898864746094 and Age > 9.5',
 'Sex_ordered > 0.4722778797149658 and Pclass_ordered <= 0.3504907488822937 and Age > 35.0',
 'Sex_ordered > 0.4722778797149658 and Pclass_ordered > 0.3504907488822937 and Fare > 22.0',
 'Sex_ordered <= 0.4722778797149658 and Fare <= 56.197898864746094 and Age <= 9.5',
 'Sex_ordered <= 0.4722778797149658 and Age > 5.5 and Pclass_ordered > 0.556456983089447',
 'Sex_ordered > 0.4722778797149658 and Pclass_ordered <= 0.3504907488822937 and Fare <= 20.799999237060547',
 'Sex_ordered > 0.4722778797149658 and Pclass_ordered > 0.3504

## 极端随机树

In [17]:
model_extratree_clf = ensemble.ExtraTreesClassifier(max_depth=3,n_estimators=2)
model_extratree_clf.fit(X_train_proceeded,y_train)

ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=3, max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=2, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)

In [18]:
# model performance on test set
y_pred_test = model_extratree_clf.predict(X_test_proceeded)
print(metrics.confusion_matrix(y_test,y_pred_test))

[[67 12]
 [22 42]]


In [19]:
rule_extract(model=model_extratree_clf,feature_names=X_train_proceeded.columns)

['Fare > 379.34570757649794',
 'Fare <= 379.34570757649794 and Sex_ordered > 0.6927238932803257 and Pclass_ordered > 0.3221824863733518',
 'Fare <= 379.34570757649794 and Sex_ordered > 0.6927238932803257 and Pclass_ordered <= 0.3221824863733518',
 'Fare <= 379.34570757649794 and Sex_ordered <= 0.6927238932803257 and Age > 65.99065063150593',
 'Fare <= 379.34570757649794 and Sex_ordered <= 0.6927238932803257 and Age <= 65.99065063150593',
 'Pclass_ordered <= 0.5742380434968545 and Sex_ordered <= 0.6736380032889797 and Age > 31.090236635227853',
 'Pclass_ordered <= 0.5742380434968545 and Sex_ordered > 0.6736380032889797 and Age <= 12.726500393695432',
 'Pclass_ordered > 0.5742380434968545 and Age <= 22.50804470596278 and Fare > 155.1217262992275',
 'Pclass_ordered > 0.5742380434968545 and Age > 22.50804470596278 and Sex_ordered > 0.35653855811173185',
 'Pclass_ordered <= 0.5742380434968545 and Sex_ordered > 0.6736380032889797 and Age > 12.726500393695432',
 'Pclass_ordered <= 0.574238043