In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import load_breast_cancer

In [2]:
data_ = load_breast_cancer()
df = pd.DataFrame(data=np.c_[data_.data,data_.target],columns=[list(data_.feature_names)+['target']])
df.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,radius error,texture error,perimeter error,area error,smoothness error,compactness error,concavity error,concave points error,symmetry error,fractal dimension error,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,1.095,0.9053,8.589,153.4,0.006399,0.04904,0.05373,0.01587,0.03003,0.006193,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0.0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,0.5435,0.7339,3.398,74.08,0.005225,0.01308,0.0186,0.0134,0.01389,0.003532,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0.0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,0.7456,0.7869,4.585,94.03,0.00615,0.04006,0.03832,0.02058,0.0225,0.004571,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0.0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,0.4956,1.156,3.445,27.23,0.00911,0.07458,0.05661,0.01867,0.05963,0.009208,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0.0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,0.7572,0.7813,5.438,94.44,0.01149,0.02461,0.05688,0.01885,0.01756,0.005115,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0.0


In [3]:
data_.feature_names

array(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean smoothness', 'mean compactness', 'mean concavity',
       'mean concave points', 'mean symmetry', 'mean fractal dimension',
       'radius error', 'texture error', 'perimeter error', 'area error',
       'smoothness error', 'compactness error', 'concavity error',
       'concave points error', 'symmetry error',
       'fractal dimension error', 'worst radius', 'worst texture',
       'worst perimeter', 'worst area', 'worst smoothness',
       'worst compactness', 'worst concavity', 'worst concave points',
       'worst symmetry', 'worst fractal dimension'], dtype='<U23')

In [4]:
data_.target_names

array(['malignant', 'benign'], dtype='<U9')

In [5]:
df.shape

(569, 31)

## Split the features and levels

In [6]:
feature = df.iloc[:,:-1]
level = df.iloc[:,-1]

In [7]:
feature.shape

(569, 30)

In [8]:
xtrain,xtest,ytrain,ytest = train_test_split(feature,level,test_size = 0.25,random_state = 2020)

In [9]:
print("Shape of xtrain: ",xtrain.shape)
print("Shape of ytrain: ",ytrain.shape)
print("Shape of xtest: ",xtest.shape)
print("Shape of ytest: ",ytest.shape)

Shape of xtrain:  (426, 30)
Shape of ytrain:  (426,)
Shape of xtest:  (143, 30)
Shape of ytest:  (143,)


## Create Model

In [10]:
classifier_model = DecisionTreeClassifier(criterion="gini") # Just for understanding purpose
classifier_model.fit(xtrain,ytrain)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [11]:
print("Accuracy: ", round(classifier_model.score(X=xtest,y=ytest),ndigits=3)*100,"%")

Accuracy:  95.1 %


In [12]:
classifier_model2 = DecisionTreeClassifier(criterion="entropy") # Just for understanding purpose
classifier_model2.fit(xtrain,ytrain)
print("Accuracy: ", round(classifier_model2.score(X=xtest,y=ytest),ndigits=3)*100,"%")

Accuracy:  94.39999999999999 %


In [13]:
pred = classifier_model2.predict(X=xtest)
pred

array([0., 1., 1., 0., 1., 1., 0., 0., 0., 0., 1., 1., 0., 1., 1., 0., 1.,
       1., 0., 0., 0., 0., 0., 1., 1., 1., 0., 1., 1., 0., 0., 0., 1., 1.,
       0., 1., 0., 0., 1., 1., 0., 0., 1., 1., 0., 1., 1., 0., 0., 1., 0.,
       1., 1., 0., 0., 1., 0., 1., 0., 0., 0., 1., 1., 1., 1., 0., 1., 1.,
       1., 0., 1., 1., 0., 1., 1., 1., 1., 1., 0., 1., 0., 1., 1., 0., 1.,
       0., 1., 1., 1., 1., 0., 0., 0., 1., 0., 1., 1., 0., 0., 1., 1., 1.,
       1., 0., 1., 1., 1., 1., 1., 0., 0., 0., 0., 1., 0., 0., 1., 1., 1.,
       1., 1., 1., 0., 1., 1., 1., 0., 1., 0., 0., 0., 0., 1., 1., 1., 0.,
       0., 0., 1., 0., 1., 1., 0.])

In [14]:
predict_one = xtest.iloc[0,:].values
predict_one

array([2.321e+01, 2.697e+01, 1.535e+02, 1.670e+03, 9.509e-02, 1.682e-01,
       1.950e-01, 1.237e-01, 1.909e-01, 6.309e-02, 1.058e+00, 9.635e-01,
       7.247e+00, 1.558e+02, 6.428e-03, 2.863e-02, 4.497e-02, 1.716e-02,
       1.590e-02, 3.053e-03, 3.101e+01, 3.451e+01, 2.060e+02, 2.944e+03,
       1.481e-01, 4.126e-01, 5.820e-01, 2.593e-01, 3.103e-01, 8.677e-02])

In [15]:
result = classifier_model2.predict([predict_one])

In [16]:
print("Patient has Cancer (malignant tumor)") if result==0 else print("Patient has no Cancer (malignant benign)")

Patient has Cancer (malignant tumor)


## Apply k-fold cross **validition** 

In [17]:
from sklearn.model_selection import cross_val_score

In [18]:
# if we want apply cross-validition then we have to use feature & level
cvs = cross_val_score(estimator=classifier_model2,X=feature,y=level,cv=10)

In [19]:
cvs

array([0.92982456, 0.85964912, 0.96491228, 0.87719298, 0.98245614,
       0.94736842, 0.92982456, 0.92982456, 0.94736842, 0.92857143])

## Maximum & Minimum Accuracy

In [20]:
cvs.max(),cvs.min()

(0.9824561403508771, 0.8596491228070176)

In [21]:
# See the average accuracy
cvs.mean()

0.9296992481203008

# Save and load the model

In [22]:
import joblib

In [29]:
joblib.dump(classifier_model2,"DesicionTree_Model2")


['DesicionTree_Model2']

In [30]:
my_model = joblib.load("DesicionTree_Model2")

In [31]:
my_model.predict([predict_one])

array([0.])