In [1]:
!pip install seaborn

from __future__ import absolute_import, division, print_function, unicode_literals

import pathlib

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns




In [2]:
import tensorflow as tf

from tensorflow import keras
from tensorflow.keras import layers

print(tf.__version__)

2.7.0


In [3]:
# Download the UCI breast-cancer-wisconsin data file (get_file returns the
# local path of the downloaded copy). The origin must be a bare URL: the
# previous value had literal single quotes inside the string, which is not a
# valid URL.
DATA_URL = ('https://archive.ics.uci.edu/ml/machine-learning-databases/'
            'breast-cancer-wisconsin/breast-cancer-wisconsin.data')
path = keras.utils.get_file("breast-cancer-wisconsin.data", DATA_URL)


In [4]:
# Column names for the header-less data file, in file order.
columns = [
    'sample_code',
    'clump_thickness',
    'cell_size_uniformity',
    'cell_shape_uniformity',
    'marginal_adhesion',
    'single_epithelial_cell_size',
    'bare_nuclei',
    'bland_chromatin',
    'normal_nucleoli',
    'mitoses',
    'class',
]

# The data is numerical in meaning, so na_values is used to detect the listed
# markers (including '?') and read them as NaN; this keeps every column of the
# DataFrame numeric.
data = pd.read_csv(
    path,
    names=columns,
    header=None,
    na_values=[np.nan, '?'],
)


In [5]:
# Check the dtype and non-null count of every column.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 699 entries, 0 to 698
Data columns (total 11 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   sample_code                  699 non-null    int64  
 1   clump_thickness              699 non-null    int64  
 2   cell_size_uniformity         699 non-null    int64  
 3   cell_shape_uniformity        699 non-null    int64  
 4   marginal_adhesion            699 non-null    int64  
 5   single_epithelial_cell_size  699 non-null    int64  
 6   bare_nuclei                  683 non-null    float64
 7   bland_chromatin              699 non-null    int64  
 8   normal_nucleoli              699 non-null    int64  
 9   mitoses                      699 non-null    int64  
 10  class                        699 non-null    int64  
dtypes: float64(1), int64(10)
memory usage: 60.2 KB


In [6]:
# Count missing values per column.
data.isnull().sum()

sample_code                     0
clump_thickness                 0
cell_size_uniformity            0
cell_shape_uniformity           0
marginal_adhesion               0
single_epithelial_cell_size     0
bare_nuclei                    16
bland_chromatin                 0
normal_nucleoli                 0
mitoses                         0
class                           0
dtype: int64

In [7]:
# (rows, columns) of the loaded frame.
data.shape

(699, 11)

In [8]:
# Show the last five rows whose bare_nuclei value is missing.
data.loc[data['bare_nuclei'].isnull(),['bare_nuclei']].tail(5)

Unnamed: 0,bare_nuclei
297,
315,
321,
411,
617,


In [9]:
# Replace every missing value with the median of its column.
# NOTE(review): the median is computed on the full dataset, before the
# train/test split further down, so held-out rows influence the imputed value.
data = data.fillna(data.median())

In [10]:
# Summary statistics after imputation, one row per column.
data.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
sample_code,699.0,1071704.0,617095.729819,61634.0,870688.5,1171710.0,1238298.0,13454352.0
clump_thickness,699.0,4.41774,2.815741,1.0,2.0,4.0,6.0,10.0
cell_size_uniformity,699.0,3.134478,3.051459,1.0,1.0,1.0,5.0,10.0
cell_shape_uniformity,699.0,3.207439,2.971913,1.0,1.0,1.0,5.0,10.0
marginal_adhesion,699.0,2.806867,2.855379,1.0,1.0,1.0,4.0,10.0
single_epithelial_cell_size,699.0,3.216023,2.2143,1.0,2.0,2.0,4.0,10.0
bare_nuclei,699.0,3.486409,3.621929,1.0,1.0,1.0,5.0,10.0
bland_chromatin,699.0,3.437768,2.438364,1.0,2.0,3.0,5.0,10.0
normal_nucleoli,699.0,2.866953,3.053634,1.0,1.0,1.0,4.0,10.0
mitoses,699.0,1.589413,1.715078,1.0,1.0,1.0,1.0,10.0


In [11]:
# Fix the random seeds so a fresh Restart & Run All gives repeatable results.
# NumPy's global seed is what pandas `sample` falls back to when no
# random_state is passed (used for the train/test split); TensorFlow keeps its
# own global seed for its random ops, so it has to be set separately.
np.random.seed(1)
tf.random.set_seed(1)

In [12]:
# Row counts per raw class value.
data.groupby(['class']).count()

Unnamed: 0_level_0,sample_code,clump_thickness,cell_size_uniformity,cell_shape_uniformity,marginal_adhesion,single_epithelial_cell_size,bare_nuclei,bland_chromatin,normal_nucleoli,mitoses
class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2,458,458,458,458,458,458,458,458,458,458
4,241,241,241,241,241,241,241,241,241,241


In [13]:
# Add a new 'label' column holding 0/1, where 1 means Malignant (the rows whose
# original class value is 4).
is_malignant = data['class'].eq(4)
data['label'] = is_malignant.astype(int)

In [14]:
# Cross-check the class -> label mapping by counting rows per (class, label) pair.
data.groupby(['class','label']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,sample_code,clump_thickness,cell_size_uniformity,cell_shape_uniformity,marginal_adhesion,single_epithelial_cell_size,bare_nuclei,bland_chromatin,normal_nucleoli,mitoses
class,label,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2,0,458,458,458,458,458,458,458,458,458,458
4,1,241,241,241,241,241,241,241,241,241,241


In [15]:
# Draw 80% of the rows at random as the training set. The explicit
# random_state makes the split depend only on this cell, not on whatever the
# global NumPy RNG state happens to be when the cell is (re-)run.
train = data.sample(frac=0.8, random_state=1).copy()

In [16]:
# Training target: the 0/1 label column of the sampled rows.
y_train = train['label']

In [17]:
# Preview the training labels.
y_train

584    0
417    0
606    0
349    1
134    0
      ..
137    0
536    0
212    0
673    0
651    0
Name: label, Length: 559, dtype: int64

In [18]:
# Keep only the feature columns as model inputs: drop the sample identifier
# and both forms of the target. Re-assigning (instead of inplace=True) together
# with errors='ignore' makes the cell safe to run more than once; the in-place
# version raised KeyError on a second run because the columns were already gone.
train = train.drop(columns=['sample_code', 'class', 'label'], errors='ignore')

In [19]:
# Confirm which columns remain in the training features and their dtypes.
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 559 entries, 584 to 651
Data columns (total 9 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   clump_thickness              559 non-null    int64  
 1   cell_size_uniformity         559 non-null    int64  
 2   cell_shape_uniformity        559 non-null    int64  
 3   marginal_adhesion            559 non-null    int64  
 4   single_epithelial_cell_size  559 non-null    int64  
 5   bare_nuclei                  559 non-null    float64
 6   bland_chromatin              559 non-null    int64  
 7   normal_nucleoli              559 non-null    int64  
 8   mitoses                      559 non-null    int64  
dtypes: float64(1), int64(8)
memory usage: 43.7 KB


In [20]:
# (rows, feature columns) of the training set.
train.shape

(559, 9)

In [21]:
# Hold-out set: every row of `data` that was not drawn into `train`.
test = data.drop(index=train.index)

# Pull the target out first, then strip the identifier and target columns so
# the test features have the same columns as the training features.
y_test = test['label']
non_feature_columns = ['sample_code', 'class', 'label']
test = test.drop(columns=non_feature_columns)

In [22]:
# Confirm which columns remain in the test features and their dtypes.
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 140 entries, 7 to 691
Data columns (total 9 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   clump_thickness              140 non-null    int64  
 1   cell_size_uniformity         140 non-null    int64  
 2   cell_shape_uniformity        140 non-null    int64  
 3   marginal_adhesion            140 non-null    int64  
 4   single_epithelial_cell_size  140 non-null    int64  
 5   bare_nuclei                  140 non-null    float64
 6   bland_chromatin              140 non-null    int64  
 7   normal_nucleoli              140 non-null    int64  
 8   mitoses                      140 non-null    int64  
dtypes: float64(1), int64(8)
memory usage: 10.9 KB


In [23]:
# (rows, feature columns) of the test set.
test.shape

(140, 9)

In [24]:
# Summary statistics of the training features, one row per feature.
train.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
clump_thickness,559.0,4.434705,2.850947,1.0,2.0,4.0,6.0,10.0
cell_size_uniformity,559.0,3.08229,3.029193,1.0,1.0,1.0,4.0,10.0
cell_shape_uniformity,559.0,3.223614,3.003895,1.0,1.0,1.0,5.0,10.0
marginal_adhesion,559.0,2.742397,2.818559,1.0,1.0,1.0,3.0,10.0
single_epithelial_cell_size,559.0,3.189624,2.190686,1.0,2.0,2.0,4.0,10.0
bare_nuclei,559.0,3.445438,3.603584,1.0,1.0,1.0,5.0,10.0
bland_chromatin,559.0,3.411449,2.44633,1.0,2.0,3.0,4.0,10.0
normal_nucleoli,559.0,2.844365,3.057824,1.0,1.0,1.0,3.5,10.0
mitoses,559.0,1.592129,1.742884,1.0,1.0,1.0,1.0,10.0


In [25]:
def build_model(n_features=None):
  """Build and compile a one-hidden-layer classifier for the 0/1 label.

  Args:
    n_features: Number of input features. When omitted, the number of columns
      of the notebook-level ``train`` frame is used, which is what the
      function previously hard-coded.

  Returns:
    A compiled ``keras.Sequential`` model with a 64-unit sigmoid hidden layer
    and a 2-unit softmax output (one probability per class).
  """
  if n_features is None:
    n_features = len(train.keys())

  model = keras.Sequential([
    layers.Dense(64, activation='sigmoid', input_shape=[n_features]),
    layers.Dense(2, activation='softmax'),
  ])

  # The targets are integer class ids (0/1) while the network emits one
  # probability per class, so the matching loss is sparse categorical
  # cross-entropy. The previous 'mse' loss compared BOTH softmax outputs with
  # the same broadcast 0/1 target; since the two outputs sum to 1, that loss is
  # minimised by predicting 0.5/0.5 for every sample, whatever the input.
  optimizer = tf.optimizers.SGD(learning_rate=0.007)
  model.compile(loss='sparse_categorical_crossentropy',
                optimizer=optimizer,
                metrics=['accuracy'])
  return model

In [26]:
# Instantiate the network and print its layers and parameter counts.
model = build_model()
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 64)                640       
                                                                 
 dense_1 (Dense)             (None, 2)                 130       
                                                                 
Total params: 770
Trainable params: 770
Non-trainable params: 0
_________________________________________________________________


In [None]:
class PrintDot(keras.callbacks.Callback):
  """Minimal progress display: one dot per epoch, a new line every 100 epochs."""

  def on_epoch_end(self, epoch, logs=None):
    # `logs` is not used here; the None default matches the Keras Callback
    # method signature so the hook can also be called without it.
    if epoch % 100 == 0:
      print('')
    print('.', end='')


# Train with one sample per gradient step (batch_size=1), holding out the last
# 20% of the training rows for validation. `epochs` is only an upper bound:
# early stopping ends the run once the validation loss has failed to improve
# for 50 consecutive epochs and restores the best weights seen, so the cell
# does not have to grind through all 10000 epochs.
early_stop = keras.callbacks.EarlyStopping(
    monitor='val_loss', patience=50, restore_best_weights=True)

history = model.fit(train, y_train, 
          batch_size=1, 
          epochs=10000, verbose=0, 
          validation_split=0.2, callbacks=[early_stop, PrintDot()])



....................................................................................................
....................................................................................................
....................................................................................................
....................................................................................................
....................................................................................................
....................................................................................................
....................................................................................................
....................................................................................................
....................................................................................................
..........................................................................................

In [None]:
# Per-epoch metrics as a table, with the epoch number as an extra column;
# display the last few epochs.
hist = pd.DataFrame(history.history).assign(epoch=history.epoch)
hist.tail()


In [None]:
def plot_history(history):
  """Plot training and validation accuracy against the epoch number.

  Args:
    history: Object exposing a ``history`` mapping with 'accuracy' and
      'val_accuracy' entries and an ``epoch`` sequence, such as the value
      returned by ``model.fit`` in this notebook.
  """
  metrics = pd.DataFrame(history.history)
  metrics['epoch'] = history.epoch
  epochs = metrics['epoch']

  fig, ax = plt.subplots()
  ax.set_xlabel('Epoch')
  ax.set_ylabel('Accuracy')
  # One curve per metric, training first so the legend order is unchanged.
  for column, legend_label in (('accuracy', 'Train Accuracy'),
                               ('val_accuracy', 'Val Accuracy')):
    ax.plot(epochs, metrics[column], label=legend_label)
  ax.legend()
  plt.show()

plot_history(history)


In [None]:
def define_feature_columns_layers(data_df, categorical_cols, numeric_cols):
    """Create feature columns and the matching Keras inputs for the named columns.

    Args:
        data_df: DataFrame used to look up the distinct values of each
            categorical column (its vocabulary).
        categorical_cols: Names of the columns to one-hot encode.
        numeric_cols: Names of the columns to treat as float32 numerics.

    Returns:
        A ``(feature_columns, feature_layer_inputs)`` pair: the list of
        feature columns (numeric ones first, then the one-hot categorical
        ones) and a dict mapping each feature name to its ``tf.keras.Input``.
    """
    fc = tf.feature_column
    columns = []
    inputs = {}

    # Numeric features: passed through as float32, one scalar input each.
    for name in numeric_cols:
        columns.append(fc.numeric_column(name, dtype=tf.float32))
        inputs[name] = tf.keras.Input(shape=(1,), name=name)

    # Categorical features: vocabulary taken from the values present in
    # data_df, then one-hot encoded via an indicator column.
    for name in categorical_cols:
        by_vocabulary = fc.categorical_column_with_vocabulary_list(
            name, data_df[name].unique())
        columns.append(fc.indicator_column(by_vocabulary))
        inputs[name] = tf.keras.Input(shape=(1,), name=name, dtype=tf.int32)

    return columns, inputs


In [None]:
def create_logreg(feature_columns, feature_layer_inputs, optimizer,
                  loss='binary_crossentropy', metrics=['accuracy'],
                  l2=0.01):
    """Build and compile a logistic-regression model on top of feature columns.

    The inputs go through a DenseFeatures layer, batch normalisation, and a
    single sigmoid unit whose kernel carries an L2 penalty.

    Args:
        feature_columns: Feature columns handed to ``DenseFeatures``.
        feature_layer_inputs: Dict of Keras inputs; it is fed to the feature
            layer and its values become the model inputs.
        optimizer: Optimizer passed to ``compile``.
        loss: Loss passed to ``compile``.
        metrics: Metrics passed to ``compile``.
        l2: Factor of the L2 kernel regulariser on the output unit.

    Returns:
        The compiled ``keras.Model``.
    """
    dense_features = keras.layers.DenseFeatures(feature_columns)(feature_layer_inputs)
    normalized = keras.layers.BatchNormalization()(dense_features)
    probability = keras.layers.Dense(
        1,
        activation='sigmoid',
        kernel_initializer='normal',
        kernel_regularizer=keras.regularizers.l2(l2),
    )(normalized)

    logreg = keras.Model(inputs=list(feature_layer_inputs.values()),
                         outputs=probability)
    logreg.compile(optimizer=optimizer, loss=loss, metrics=metrics)
    return logreg


In [None]:
# Every feature is numeric here, so no column is one-hot encoded.
categorical_cols = []
numeric_cols = [
    'clump_thickness',
    'cell_size_uniformity',
    'cell_shape_uniformity',
    'marginal_adhesion',
    'single_epithelial_cell_size',
    'bare_nuclei',
    'bland_chromatin',
    'normal_nucleoli',
    'mitoses',
]

# Build the feature columns and inputs, then the logistic-regression model.
# Note that this rebinds `model`, replacing the dense network built earlier.
feature_columns, feature_layer_inputs = define_feature_columns_layers(
    data, categorical_cols, numeric_cols)
optimizer = keras.optimizers.Ftrl(learning_rate=0.007)
model = create_logreg(
    feature_columns, feature_layer_inputs, optimizer, l2=0.01)
model.summary()


In [None]:
def _as_named_inputs(frame):
    """Split a feature DataFrame into a {column name: float32 array} dict."""
    return {name: frame[name].to_numpy(dtype='float32') for name in frame.columns}


# The logistic-regression model is built from one named Keras Input per
# feature column, so it has to be fed a dict keyed by those names. Passing the
# DataFrame itself hands Keras a single 2-D array for a multi-input model.
# NOTE(review): the test split doubles as validation data here, so it is not
# an untouched hold-out for a final evaluation.
history = model.fit(_as_named_inputs(train), y_train, 
          batch_size=128, 
          epochs=100, verbose=1, 
          validation_data=(_as_named_inputs(test), y_test))
