In [27]:
from sklearn.datasets import load_breast_cancer
from sklearn.tree import DecisionTreeClassifier, export_graphviz

In [4]:
cancer_data = load_breast_cancer()

In [5]:
X, y = cancer_data.data, cancer_data.target

In [8]:
print(cancer_data)

{'data': array([[  1.79900000e+01,   1.03800000e+01,   1.22800000e+02, ...,
          2.65400000e-01,   4.60100000e-01,   1.18900000e-01],
       [  2.05700000e+01,   1.77700000e+01,   1.32900000e+02, ...,
          1.86000000e-01,   2.75000000e-01,   8.90200000e-02],
       [  1.96900000e+01,   2.12500000e+01,   1.30000000e+02, ...,
          2.43000000e-01,   3.61300000e-01,   8.75800000e-02],
       ..., 
       [  1.66000000e+01,   2.80800000e+01,   1.08300000e+02, ...,
          1.41800000e-01,   2.21800000e-01,   7.82000000e-02],
       [  2.06000000e+01,   2.93300000e+01,   1.40100000e+02, ...,
          2.65000000e-01,   4.08700000e-01,   1.24000000e-01],
       [  7.76000000e+00,   2.45400000e+01,   4.79200000e+01, ...,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1,
       1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0,
       1, 1, 0, 0, 1, 1, 1, 0, 0,

In [9]:
cancer_data.feature_names

array(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean smoothness', 'mean compactness', 'mean concavity',
       'mean concave points', 'mean symmetry', 'mean fractal dimension',
       'radius error', 'texture error', 'perimeter error', 'area error',
       'smoothness error', 'compactness error', 'concavity error',
       'concave points error', 'symmetry error', 'fractal dimension error',
       'worst radius', 'worst texture', 'worst perimeter', 'worst area',
       'worst smoothness', 'worst compactness', 'worst concavity',
       'worst concave points', 'worst symmetry', 'worst fractal dimension'], 
      dtype='<U23')

In [11]:
X.shape, y.shape

((569, 30), (569,))

In [12]:
tree = DecisionTreeClassifier()

In [13]:
DecisionTreeClassifier?

In [14]:
X_train, X_test = X[:400, :], X[400:, :]

(400, 30)

In [15]:
from sklearn.cross_validation import train_test_split

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [19]:
X_train.shape, X_test.shape

((398, 30), (171, 30))

In [20]:
%%time
tree.fit(X_train, y_train)

CPU times: user 6.99 ms, sys: 1.17 ms, total: 8.17 ms
Wall time: 7.1 ms


DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

In [22]:
test_prediction = tree.predict(X_test)

In [25]:
from sklearn.metrics import accuracy_score

In [26]:
accuracy_score(y_test, test_prediction)

0.90058479532163738

In [None]:
!conda install pydot
# graphviz

In [28]:
export_graphviz(tree, 'cancer_tree.dot')

In [29]:
!dot -T png cancer_tree.dot -o cancer_data.png

<img src='cancer_data.png'>

In [30]:
from sklearn.grid_search import GridSearchCV

In [32]:
params = {'min_samples_leaf': [1, 3, 5, 7],
         'max_depth': [2, 3, 4]}

best_tree = GridSearchCV(DecisionTreeClassifier(),
                        params, cv=5, n_jobs=-1, verbose=True)

best_tree.fit(X_train, y_train)




Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    0.2s finished


GridSearchCV(cv=5, error_score='raise',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best'),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'max_depth': [2, 3, 4], 'min_samples_leaf': [1, 3, 5, 7]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=True)

In [33]:
best_tree.best_params_

{'max_depth': 3, 'min_samples_leaf': 5}

In [34]:
best_tree.best_score_

0.9271356783919598

In [35]:
best_tree.best_estimator_

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=3,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=5,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

In [36]:
new_pred = best_tree.best_estimator_.predict(X_test)

In [38]:
accuracy_score(y_test, new_pred)

0.95321637426900585

In [41]:
export_graphviz(best_tree.best_estimator_,
                'cancer_tree_tuned.dot',
               feature_names=cancer_data.feature_names)

In [42]:
!dot -T png cancer_tree_tuned.dot -o cancer_data_tuned.png