In [1]:
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Load the breast-cancer dataset bundled with scikit-learn and pull out the
# feature matrix (X) and the integer class labels (y).
data = load_breast_cancer()
X, y = data.data, data.target

In [3]:
# Sanity check on the feature matrix dimensions: (n_samples, n_features).
X.shape

(569, 30)

In [4]:
# Names of the measured features; one entry per column of X.
data.feature_names

array(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean smoothness', 'mean compactness', 'mean concavity',
       'mean concave points', 'mean symmetry', 'mean fractal dimension',
       'radius error', 'texture error', 'perimeter error', 'area error',
       'smoothness error', 'compactness error', 'concavity error',
       'concave points error', 'symmetry error',
       'fractal dimension error', 'worst radius', 'worst texture',
       'worst perimeter', 'worst area', 'worst smoothness',
       'worst compactness', 'worst concavity', 'worst concave points',
       'worst symmetry', 'worst fractal dimension'], dtype='<U23')

In [5]:
# Human-readable class names; position k in this array names target value k in y.
data.target_names

array(['malignant', 'benign'], dtype='<U9')

In [6]:
# Hold out 20% of the samples for evaluation. The fixed random_state makes the
# partition reproducible across runs.
# (train_test_split is imported in the notebook's import cell.)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0)
 

In [7]:
# Baseline model: a shallow decision tree (at most 3 levels of splits).
# random_state is pinned because scikit-learn permutes the candidate features
# at every split; without a seed, ties between equally good splits can resolve
# differently from run to run and the reported scores are not reproducible.
DTC = DecisionTreeClassifier(max_depth=3, random_state=0)
DTC.fit(X_train, y_train)

DecisionTreeClassifier(max_depth=3)

In [8]:
# Class labels the fitted tree learned from y_train.
DTC.classes_

array([0, 1])

In [9]:
# Predicted class label for every held-out sample.
y_pred = DTC.predict(X_test)

In [10]:
# Share of held-out samples classified correctly, as a percentage.
# accuracy_score takes (y_true, y_pred): ground truth first, predictions second.
accuracy = accuracy_score(y_test, y_pred) * 100
accuracy.round(2)


96.49

In [11]:
# Confusion matrix of the baseline tree on the held-out set
# (scikit-learn convention: rows are true classes, columns are predicted classes).
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[44  3]
 [ 1 66]]


In [12]:
# Exhaustive search over split size, split seed and tree depth, keeping the
# combination with the highest test-set accuracy.
#
# NOTE(review): the winner is picked by its score on the very test set it is
# evaluated on, and the split itself (test_size, random_state) is part of the
# search. max_score is therefore an optimistic, cherry-picked figure rather
# than an estimate of generalisation. For real model selection, tune max_depth
# with cross-validation on a fixed training set and score the held-out test
# set once at the end.
max_score = 0
paramters = []  # [test_size, split random_state, max_depth] of the best run so far
test_sizes = [0.2, 0.25, 0.3, 0.33, 0.4]
for s in test_sizes:
    for i in range(43):
        # One split per (test_size, seed); it is reused for every depth below.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=s, random_state=i)
        for d in range(1, 31):
            # Pin the tree's random_state: scikit-learn permutes features at each
            # split, so an unseeded tree can give a different "best" combination
            # on every run of this cell.
            DTC = DecisionTreeClassifier(max_depth=d, random_state=0)
            DTC.fit(X_train, y_train)
            y_pred = DTC.predict(X_test)
            # accuracy_score takes (y_true, y_pred).
            score = accuracy_score(y_test, y_pred) * 100
            # Strict '>' keeps the first combination that reaches the maximum.
            if score > max_score:
                max_score = score
                paramters = [s, i, d]

In [13]:
# Report the best score found by the search and the
# (test_size, random_state, max_depth) triple that produced it.
best_test_size, best_seed, best_depth = paramters
# round() works for both NumPy and built-in numbers, unlike the .round method.
print(f'Highest accuracy obtained: {round(max_score, 2)}%')
print(f'Using parameters test_size = {best_test_size}, random_state = {best_seed}, max_depth = {best_depth}')

Highest accuarcy obtained: 98.6%
Using paramters test_size = 0.25, random_state = 11, max_depth = 7


In [14]:
# Partition used for the final model: 25% of the samples held out, split seed 41.
# NOTE(review): both values are literals here, not read from `paramters`.
(X_train, X_test,
 y_train, y_test) = train_test_split(X, y, test_size=0.25, random_state=41)

In [15]:
# Final model: decision tree limited to 4 levels of splits.
# NOTE(review): max_depth=4 is a literal, not read from `paramters`.
# random_state is pinned because scikit-learn permutes the candidate features
# at every split; an unseeded tree can differ between runs on identical data.
DTC = DecisionTreeClassifier(max_depth=4, random_state=0)
DTC.fit(X_train, y_train)

DecisionTreeClassifier(max_depth=4)

In [16]:
# Accuracy (in percent, two decimals) on the data the tree was fitted on and on
# the held-out data, printed side by side to expose over- or under-fitting.
train_pct = (DTC.score(X_train, y_train) * 100).round(2)
test_pct = (DTC.score(X_test, y_test) * 100).round(2)
print(f'Train set score: {train_pct}%')
print(f'Test set score: {test_pct}%')

Train set score: 97.89%
Test set score: 98.6%


In [17]:
# Predicted class label for every held-out sample, using the final tree.
y_pred = DTC.predict(X_test)

In [18]:
# Share of held-out samples the final tree classifies correctly, as a percentage.
# accuracy_score takes (y_true, y_pred): ground truth first, predictions second.
accuracy = accuracy_score(y_test, y_pred) * 100
accuracy.round(2)

98.6

In [19]:
# Confusion matrix of the final tree on the held-out set
# (scikit-learn convention: rows are true classes, columns are predicted classes).
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[53  0]
 [ 2 88]]


In [20]:
# Per-class precision, recall and F1 for the final tree on the held-out set.
# (classification_report is imported in the notebook's import cell.)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.96      1.00      0.98        53
           1       1.00      0.98      0.99        90

    accuracy                           0.99       143
   macro avg       0.98      0.99      0.99       143
weighted avg       0.99      0.99      0.99       143

