# TASK
1. Load data breast cancer Database. Buatlah dataframe dari data tersebut.
2. Coba describe beberapa dari featurenya.
3. Apakah terdapat missing value? buktikan!
4. Buatlah sebuah model, dengan mengimplementasikan processing data (gunakan scaler, encoding, imputer jika perlu).
5. Gunakan random state 1000 untuk splitting data.
6. Bandingkan SVC (tentukan kernel terbaik terlebih dahulu), KNN (tentukan besk K nya terlebih dahulu), dan DT.
7. Model mana yang terbaik jika dibandingkan dengan menggunakan Recall Score

1 means the cancer is malignant and 0 means benign

In [2]:
# Library

#Standard
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from scipy import stats

#Data Pre-Processing
import category_encoders as ce
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, RobustScaler, StandardScaler

#Data Modeling
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

#Data Output
from sklearn.metrics import accuracy_score,recall_score, confusion_matrix
from sklearn.pipeline import Pipeline #menggabungkan proces

warnings.filterwarnings('ignore')

In [3]:
#Defining Function

def dataDescription(df):
    tempList = []
    for col in df.columns:
        tempList.append(
            [col,
            df[col].dtype,
            df[col].isna().sum(),
            round(df[col].isna().sum()/len(df)*100,2),
            df[col].nunique(),
            #list(df[col].drop_duplicates().sample(5,replace=True).values)
            list(df[col].drop_duplicates().sort_values().values)
            ]
        )

    descData = pd.DataFrame(data = tempList,
                            columns = ['Col','Data Type','Missing Value', 'Pct Missing Value','Num Unique','Unique Sample']
                            )
    display(descData)

def normalCheckShapiro(data):

    _, p_value = stats.shapiro(data)

    alpha = 0.05
    if p_value > alpha:
        print("The data is normally distributed.")
    else:
        print("The data is not normally distributed.")


In [4]:
#IMPORT DATA
import sklearn.datasets
raw = sklearn.datasets.load_breast_cancer()
df = pd.DataFrame(
    data=raw.data,
    columns=raw.feature_names
    )
df['target'] = raw.target

In [5]:
#Descriptive Analysis
display(df.info(),df.describe(),df.isnull().sum(),df.head(),dataDescription(df))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 31 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   mean radius              569 non-null    float64
 1   mean texture             569 non-null    float64
 2   mean perimeter           569 non-null    float64
 3   mean area                569 non-null    float64
 4   mean smoothness          569 non-null    float64
 5   mean compactness         569 non-null    float64
 6   mean concavity           569 non-null    float64
 7   mean concave points      569 non-null    float64
 8   mean symmetry            569 non-null    float64
 9   mean fractal dimension   569 non-null    float64
 10  radius error             569 non-null    float64
 11  texture error            569 non-null    float64
 12  perimeter error          569 non-null    float64
 13  area error               569 non-null    float64
 14  smoothness error         5

Unnamed: 0,Col,Data Type,Missing Value,Pct Missing Value,Num Unique,Unique Sample
0,mean radius,float64,0,0.0,456,"[6.981, 7.691, 7.729, 7.76, 8.196, 8.219, 8.57..."
1,mean texture,float64,0,0.0,479,"[9.71, 10.38, 10.72, 10.82, 10.89, 10.91, 10.9..."
2,mean perimeter,float64,0,0.0,522,"[43.79, 47.92, 47.98, 48.34, 51.71, 53.27, 54...."
3,mean area,float64,0,0.0,539,"[143.5, 170.4, 178.8, 181.0, 201.9, 203.9, 221..."
4,mean smoothness,float64,0,0.0,474,"[0.05263, 0.06251, 0.06429, 0.06576, 0.06613, ..."
5,mean compactness,float64,0,0.0,537,"[0.01938, 0.02344, 0.0265, 0.02675, 0.03116, 0..."
6,mean concavity,float64,0,0.0,537,"[0.0, 0.000692, 0.0009737, 0.001194, 0.001461,..."
7,mean concave points,float64,0,0.0,542,"[0.0, 0.001852, 0.002404, 0.002924, 0.002941, ..."
8,mean symmetry,float64,0,0.0,432,"[0.106, 0.1167, 0.1203, 0.1215, 0.122, 0.1274,..."
9,mean fractal dimension,float64,0,0.0,499,"[0.04996, 0.05024, 0.05025, 0.05044, 0.05054, ..."


None

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
count,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,...,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0
mean,14.127292,19.289649,91.969033,654.889104,0.09636,0.104341,0.088799,0.048919,0.181162,0.062798,...,25.677223,107.261213,880.583128,0.132369,0.254265,0.272188,0.114606,0.290076,0.083946,0.627417
std,3.524049,4.301036,24.298981,351.914129,0.014064,0.052813,0.07972,0.038803,0.027414,0.00706,...,6.146258,33.602542,569.356993,0.022832,0.157336,0.208624,0.065732,0.061867,0.018061,0.483918
min,6.981,9.71,43.79,143.5,0.05263,0.01938,0.0,0.0,0.106,0.04996,...,12.02,50.41,185.2,0.07117,0.02729,0.0,0.0,0.1565,0.05504,0.0
25%,11.7,16.17,75.17,420.3,0.08637,0.06492,0.02956,0.02031,0.1619,0.0577,...,21.08,84.11,515.3,0.1166,0.1472,0.1145,0.06493,0.2504,0.07146,0.0
50%,13.37,18.84,86.24,551.1,0.09587,0.09263,0.06154,0.0335,0.1792,0.06154,...,25.41,97.66,686.5,0.1313,0.2119,0.2267,0.09993,0.2822,0.08004,1.0
75%,15.78,21.8,104.1,782.7,0.1053,0.1304,0.1307,0.074,0.1957,0.06612,...,29.72,125.4,1084.0,0.146,0.3391,0.3829,0.1614,0.3179,0.09208,1.0
max,28.11,39.28,188.5,2501.0,0.1634,0.3454,0.4268,0.2012,0.304,0.09744,...,49.54,251.2,4254.0,0.2226,1.058,1.252,0.291,0.6638,0.2075,1.0


mean radius                0
mean texture               0
mean perimeter             0
mean area                  0
mean smoothness            0
mean compactness           0
mean concavity             0
mean concave points        0
mean symmetry              0
mean fractal dimension     0
radius error               0
texture error              0
perimeter error            0
area error                 0
smoothness error           0
compactness error          0
concavity error            0
concave points error       0
symmetry error             0
fractal dimension error    0
worst radius               0
worst texture              0
worst perimeter            0
worst area                 0
worst smoothness           0
worst compactness          0
worst concavity            0
worst concave points       0
worst symmetry             0
worst fractal dimension    0
target                     0
dtype: int64

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0


None

In [6]:
for i in df:
    normalCheckShapiro(df[i])

The data is not normally distributed.
The data is not normally distributed.
The data is not normally distributed.
The data is not normally distributed.
The data is not normally distributed.
The data is not normally distributed.
The data is not normally distributed.
The data is not normally distributed.
The data is not normally distributed.
The data is not normally distributed.
The data is not normally distributed.
The data is not normally distributed.
The data is not normally distributed.
The data is not normally distributed.
The data is not normally distributed.
The data is not normally distributed.
The data is not normally distributed.
The data is not normally distributed.
The data is not normally distributed.
The data is not normally distributed.
The data is not normally distributed.
The data is not normally distributed.
The data is not normally distributed.
The data is not normally distributed.
The data is not normally distributed.
The data is not normally distributed.
The data is 

In [7]:
df.columns

Index(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean smoothness', 'mean compactness', 'mean concavity',
       'mean concave points', 'mean symmetry', 'mean fractal dimension',
       'radius error', 'texture error', 'perimeter error', 'area error',
       'smoothness error', 'compactness error', 'concavity error',
       'concave points error', 'symmetry error', 'fractal dimension error',
       'worst radius', 'worst texture', 'worst perimeter', 'worst area',
       'worst smoothness', 'worst compactness', 'worst concavity',
       'worst concave points', 'worst symmetry', 'worst fractal dimension',
       'target'],
      dtype='object')

In [8]:
transformer = ColumnTransformer(
    [
        ('RobustScaler', RobustScaler(), (['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean smoothness', 'mean compactness', 'mean concavity',
       'mean concave points', 'mean symmetry', 'mean fractal dimension',
       'radius error', 'texture error', 'perimeter error', 'area error',
       'smoothness error', 'compactness error', 'concavity error',
       'concave points error', 'symmetry error', 'fractal dimension error',
       'worst radius', 'worst texture', 'worst perimeter', 'worst area',
       'worst smoothness', 'worst compactness', 'worst concavity',
       'worst concave points', 'worst symmetry', 'worst fractal dimension']))
    ],
    remainder='passthrough' #LEWATKAN YANG GAK DIMENTION
)

In [9]:
transformer

In [10]:
# Data Spliting

x = df.drop(columns=['target'])
y = df['target']

xtrain, xtest, ytrain, ytest = train_test_split(
    x,
    y,
    stratify= y,
    random_state=1000,
    test_size= 0.2
)

In [11]:
# Transforming Data

xtrain_prepros = transformer.fit_transform(xtrain)
xtest_prepros = transformer.transform(xtest)
xtrain_prepros

array([[ 0.18937644,  0.85839161,  0.21939477, ...,  0.80873977,
         2.71184023,  1.10164903],
       [ 1.7482679 ,  0.81468531,  1.8022696 , ...,  1.15356736,
         0.43223966, -0.08515875],
       [-0.073903  , -0.0541958 , -0.04814305, ...,  0.27648338,
         0.41369472,  0.1452129 ],
       ...,
       [ 1.18244804, -0.24125874,  1.19704264, ...,  0.69586828,
        -0.44650499,  0.70046763],
       [ 1.4595843 ,  0.41083916,  1.49621733, ...,  1.4901108 ,
         1.12696148,  0.36229387],
       [-0.33025404, -0.11538462, -0.37070151, ..., -0.37423631,
        -0.05420827, -0.30223972]])

In [12]:
xtrain_prepros = pd.DataFrame(xtrain_prepros)
xtest_prepros = pd.DataFrame(xtest_prepros)
xtrain_prepros

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
0,0.189376,0.858392,0.219395,0.158891,-0.099794,0.589152,0.487035,0.574935,1.310550,0.282965,...,0.330508,1.051570,0.422419,0.219815,0.795302,1.027317,0.552459,0.808740,2.711840,1.101649
1,1.748268,0.814685,1.802270,2.167155,0.212449,1.052079,2.043868,1.809223,1.221397,-0.322581,...,1.808475,0.171525,1.642848,2.297824,-0.372483,0.552223,1.766972,1.153567,0.432240,-0.085159
2,-0.073903,-0.054196,-0.048143,-0.104239,0.598251,0.586066,0.336225,0.415210,0.716196,0.359932,...,-0.122034,-0.069507,-0.081766,-0.161740,0.070470,0.276915,0.114457,0.276483,0.413695,0.145213
3,-0.242494,-1.138112,-0.262724,-0.231938,0.320473,-0.348893,-0.211250,0.061361,0.490342,-0.258065,...,-0.238983,-1.102018,-0.257673,-0.234723,0.211409,-0.468666,-0.383393,-0.053743,0.005706,-0.615801
4,-0.660508,0.732517,-0.675378,-0.578246,0.253601,-0.377286,-0.356697,-0.277984,-0.294205,0.422184,...,-0.674576,-0.140135,-0.666990,-0.557857,-0.600671,-0.685378,-0.692426,-0.702599,-0.850214,-0.612848
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
450,-0.390300,-0.477273,-0.378267,-0.348174,0.834877,0.003240,-0.182784,-0.044254,0.175334,0.246746,...,-0.269492,-0.434978,-0.267378,-0.233510,0.684564,-0.355651,-0.290630,-0.009837,-0.027104,0.000984
451,-0.854965,-0.139860,-0.874484,-0.705945,-0.692387,-0.588535,-0.455742,-0.519357,-0.338782,0.259196,...,-0.637288,-0.095291,-0.642485,-0.527520,0.221477,-0.474558,-0.608229,-0.767319,0.333809,-0.072360
452,1.182448,-0.241259,1.197043,1.378033,0.238169,0.606126,0.853578,1.080885,0.112927,0.581777,...,1.328814,-0.290359,1.184278,1.587068,0.302013,0.507766,0.473537,0.695868,-0.446505,0.700468
453,1.459584,0.410839,1.496217,1.737937,0.670267,1.041278,1.324430,1.751581,0.817236,-0.208263,...,1.467797,0.006726,1.332282,1.776025,0.409396,1.126942,0.836731,1.490111,1.126961,0.362294


In [13]:
feature = list(transformer.transformers_[0][1].get_feature_names_out())

xtrain_prepros.columns = feature
xtest_prepros.columns = feature

xtrain_prepros

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,0.189376,0.858392,0.219395,0.158891,-0.099794,0.589152,0.487035,0.574935,1.310550,0.282965,...,0.330508,1.051570,0.422419,0.219815,0.795302,1.027317,0.552459,0.808740,2.711840,1.101649
1,1.748268,0.814685,1.802270,2.167155,0.212449,1.052079,2.043868,1.809223,1.221397,-0.322581,...,1.808475,0.171525,1.642848,2.297824,-0.372483,0.552223,1.766972,1.153567,0.432240,-0.085159
2,-0.073903,-0.054196,-0.048143,-0.104239,0.598251,0.586066,0.336225,0.415210,0.716196,0.359932,...,-0.122034,-0.069507,-0.081766,-0.161740,0.070470,0.276915,0.114457,0.276483,0.413695,0.145213
3,-0.242494,-1.138112,-0.262724,-0.231938,0.320473,-0.348893,-0.211250,0.061361,0.490342,-0.258065,...,-0.238983,-1.102018,-0.257673,-0.234723,0.211409,-0.468666,-0.383393,-0.053743,0.005706,-0.615801
4,-0.660508,0.732517,-0.675378,-0.578246,0.253601,-0.377286,-0.356697,-0.277984,-0.294205,0.422184,...,-0.674576,-0.140135,-0.666990,-0.557857,-0.600671,-0.685378,-0.692426,-0.702599,-0.850214,-0.612848
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
450,-0.390300,-0.477273,-0.378267,-0.348174,0.834877,0.003240,-0.182784,-0.044254,0.175334,0.246746,...,-0.269492,-0.434978,-0.267378,-0.233510,0.684564,-0.355651,-0.290630,-0.009837,-0.027104,0.000984
451,-0.854965,-0.139860,-0.874484,-0.705945,-0.692387,-0.588535,-0.455742,-0.519357,-0.338782,0.259196,...,-0.637288,-0.095291,-0.642485,-0.527520,0.221477,-0.474558,-0.608229,-0.767319,0.333809,-0.072360
452,1.182448,-0.241259,1.197043,1.378033,0.238169,0.606126,0.853578,1.080885,0.112927,0.581777,...,1.328814,-0.290359,1.184278,1.587068,0.302013,0.507766,0.473537,0.695868,-0.446505,0.700468
453,1.459584,0.410839,1.496217,1.737937,0.670267,1.041278,1.324430,1.751581,0.817236,-0.208263,...,1.467797,0.006726,1.332282,1.776025,0.409396,1.126942,0.836731,1.490111,1.126961,0.362294


### KNN

In [14]:
k = range(3,50,2)

train_score = []
test_score= []
recall_scoreKNN = []
scoreKNN = 0

for i in k:
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(xtrain_prepros,ytrain)

    ypred_train = knn.predict(xtrain_prepros)
    train_score.append(accuracy_score(ytrain,ypred_train))

    ypred_test = knn.predict(xtest_prepros)
    acc_score = accuracy_score(ytest,ypred_test)
    test_score.append(acc_score)

    if scoreKNN < acc_score:
        scoreKNN = acc_score
        best_k = i
    

knn = KNeighborsClassifier(n_neighbors=best_k)
knn.fit(xtrain_prepros,ytrain)

ypred_train = knn.predict(xtrain_prepros)

ypred_test = knn.predict(xtest_prepros) 
recall_scoreKNN.append(recall_score(ytest,ypred_test))

print (f'Best K: {best_k}\nAcc Score: {scoreKNN*100}%\nRecall Score: {recall_scoreKNN}')


Best K: 3
Acc Score: 96.49122807017544%
Recall Score: [1.0]


### DECISSION TREE

In [15]:
depths = range(3,50)
testing_accuracies = []
training_accuracies = []
recall_scoreDT = []
scoreDT = 0

for j in ['entropy','gini']:
  for i in depths:
    tree = DecisionTreeClassifier(criterion = j, max_depth = i)
    tree.fit(xtrain_prepros,ytrain)

    y_predict_train = tree.predict(xtrain_prepros)
    training_accuracies.append(accuracy_score(ytrain,y_predict_train))

    y_predict_test = tree.predict(xtest_prepros)
    acc_score = accuracy_score(ytest,y_predict_test)
    testing_accuracies.append(acc_score)

    if scoreDT < acc_score:
      best_depth = i
      best_criterion = j
      scoreDT = acc_score

tree = DecisionTreeClassifier(criterion = best_criterion, max_depth = best_depth)
tree.fit(xtrain_prepros,ytrain)

y_predict_train = tree.predict(xtrain_prepros)

y_predict_test = tree.predict(xtest_prepros)
recall_scoreDT.append(recall_score(ytest,y_predict_test))

print (f'Best Depth: {best_depth}\nBest criterion: {best_criterion}\nScore: {scoreDT*100}%\nRecall Score: {recall_scoreDT}')

Best Depth: 34
Best criterion: gini
Score: 96.49122807017544%
Recall Score: [0.9583333333333334]


### SVC

In [16]:
kernel = ['sigmoid', 'rbf', 'poly', 'linear']
testing_accuracies = []
training_accuracies = []
recall_scoreSVM =[]

scoreSVC = 0

for i in kernel:
  SVM = SVC(kernel=i)
  SVM.fit(xtrain_prepros,ytrain)

  y_predict_train = SVM.predict(xtrain_prepros)
  training_accuracies.append(accuracy_score(ytrain,y_predict_train))

  y_predict_test = SVM.predict(xtest_prepros)
  acc_score = accuracy_score(ytest,y_predict_test)
  testing_accuracies.append(acc_score)

  if scoreSVC < acc_score:
    best_kernel = i
    scoreSVC = acc_score

SVM = SVC(kernel=best_kernel)
SVM.fit(xtrain_prepros,ytrain)

y_predict_train = SVM.predict(xtrain_prepros)

y_predict_test = SVM.predict(xtest_prepros)
recall_scoreSVM.append(recall_score(ytest,y_predict_test))

print (f'Best Kernel: {best_kernel}\nScore: {scoreSVC*100}%\nRecall Score: {recall_scoreSVM}')

Best Kernel: linear
Score: 98.24561403508771%
Recall Score: [0.9861111111111112]


In [17]:
pd.DataFrame({
    'Model' : ['KNN', 'Decission Tree', 'SVC'],
    'Accuracy Score' : [scoreKNN,scoreDT,scoreSVC],
    'Recall Score' : [recall_scoreKNN[0],recall_scoreDT[0],recall_scoreSVM[0]]
}).sort_values('Recall Score', ascending=False)

Unnamed: 0,Model,Accuracy Score,Recall Score
0,KNN,0.964912,1.0
2,SVC,0.982456,0.986111
1,Decission Tree,0.964912,0.958333


[[41  1]
 [ 1 71]]

True Positives: 71
True Negatives: 41
False Positives: 1
False Negatives: 1
