# 1 Get Data

In [None]:
# data download link: https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data
import pandas as pd
# Read the raw file. It has no header row, so header=None makes pandas assign
# default integer column labels instead of consuming the first record.
data = pd.read_csv('wdbc.data', header=None)
# Quick sanity check: dimensions, column labels and the first rows, one per line.
print(data.shape, data.columns, data.head(), sep='\n')

In [None]:
# The commands in this cell only work on the Windows operating system.
# `attrib` lists the file attributes of wdbc.data.
! attrib wdbc.data
# `type` prints the raw contents of wdbc.data to the cell output.
! type wdbc.data

# 2 Set Column Names

In [None]:
# Column labels for wdbc.data: record id, diagnosis, then 30 measurements.
#
# The measurements are grouped by STATISTIC, not by feature: the file stores the
# ten feature means first, then the ten standard errors, then the ten "worst"
# values (per the dataset description, field 3 is mean radius, field 13 is
# radius SE and field 23 is worst radius). The names are therefore generated
# statistic-major so that every label lines up with the column it describes.
#
# The 'nucleus' prefix labels the radius measurement; the name is kept unchanged
# so that existing references to 'nucleus_*' columns keep working.
_feature_names = ['nucleus', 'texture', 'perimeter', 'area', 'smoothness',
                  'compactness', 'concavity', 'concave_pts', 'symmetry',
                  'fractal_dim']
_statistics = ['mean', 'se', 'worst']
column_names = ['id', 'malignant'] + [
    '{0}_{1}'.format(feature, statistic)
    for statistic in _statistics   # outer loop: one group of ten per statistic
    for feature in _feature_names  # inner loop: features in file order
]
# Attach the descriptive labels to the frame, then print its dimensions, its
# column index and the first rows to confirm the rename took effect.
data.columns = column_names
print(data.shape, data.columns, data.head(), sep='\n')

In [None]:
# Show the last ten rows of the frame as the cell's output.
data.iloc[-10:]

# 3 Make Data All Numerical

In [None]:
# Encode the diagnosis label as an integer: benign ('B') -> 0, anything else
# (i.e. malignant) -> 1.
# An already-encoded 0 is also treated as benign so that re-running this cell is
# harmless. Comparing only against 'B' would turn every label into 1 on a second
# run, because 0 != 'B'.
data['malignant'] = (~data['malignant'].isin(['B', 0])).astype('int64')
# Show the last ten rows to confirm the column is now numeric.
data.tail(10)

# 4 Split Data Into Train and Test Sets

In [6]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Features: every column except the target and the record id. The id is an
# arbitrary identifier, not a measurement, so it must not be fed to the model.
X = data.drop(columns=['id', 'malignant']).values
# Target: the 0/1 diagnosis column.
y = data['malignant'].values

# Split BEFORE scaling. random_state pins the shuffle so the same rows land in
# the test set on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 2018)

# Fit the scaler on the training rows only and reuse its statistics for the test
# rows. Fitting on the full dataset would leak test-set mean/variance into the
# training data. Note that X itself is left unscaled.
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)

# 5 Build ML/DL Model
Pick one model from below.

In [7]:
from sklearn.tree import DecisionTreeClassifier
# A fixed random_state makes the fitted tree (and therefore the reported score)
# reproducible across runs; the same seed is used as for the train/test split.
model = DecisionTreeClassifier(random_state=2018)

# Alternative models: uncomment exactly one pair below (and comment out the
# pair above) to try a different classifier.

# from sklearn.ensemble import RandomForestClassifier
# model = RandomForestClassifier(random_state=2018)

# from sklearn.neighbors import KNeighborsClassifier
# model = KNeighborsClassifier()

# from sklearn.svm import SVC
# model = SVC()

# 6 Training ML/DL Model

In [None]:
# Train the selected classifier on the training split; the fitted estimator is
# the cell's output.
model.fit(X=X_train, y=y_train)

# 7 Testing ML/DL Model

In [9]:
# Predict a class label for every row of the held-out test split.
pred = model.predict(X=X_test)

# 8 Evaluate the Result

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.svm import SVC
import matplotlib.pyplot as plt
# `plot_confusion_matrix` is intentionally not imported: it was never used in
# this cell and no longer exists in recent scikit-learn releases, where the
# import alone would raise ImportError.

print('pred = ' + str(pred))
print('y_test = ' + str(y_test))

# Element-wise comparison of predictions and ground truth. `.all()` must be
# CALLED on the comparison result; comparing the bound methods `pred.all` and
# `y_test.all` is always False.
is_correct = pred == y_test
if is_correct.all():
    print('Prediction successful, pred = y_test!')
else:
    # printing
    n_total = len(y_test)
    n_correct = int(is_correct.sum())
    print('Prediction rate: {0}/{1} = {2}%'.format(n_correct, n_total, n_correct/n_total*100))
    # labels=[0, 1] pins the matrix to 2x2 even if one class is absent.
    # For binary labels, ravel() yields tn, fp, fn, tp (NOT tp first), so unpack
    # by name and print in the order the message announces.
    tn, fp, fn, tp = confusion_matrix(y_test, pred, labels=[0, 1]).ravel()
    print('[TruePositive, TrueNegative, FalsePositive, FalseNegative] = ' + str([int(tp), int(tn), int(fp), int(fn)]))
    # plotting
    # display_labels follow the order of `labels`: 0 is benign, 1 is malignant.
    ConfusionMatrixDisplay.from_predictions(y_test, pred, labels=[0, 1], display_labels=['Benign', 'Malignant'])
    plt.show()
print('Finished Execution!')