# Breast Cancer with Datret (Tensorflow)

Datret: Tensorflow implementation for structured tabular data.

A simple implementation of a deep neural network architecture for tabular data with automatic layer-by-layer reduction in the number of neurons and functionality similar to classical machine learning methods.

source: https://github.com/AbdualimovTP/datret

---

Architecture:
```text
Model: "DatRet with number_neurons = 500"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 input_1 (InputLayer)        [(None, X_train.shape[0)]      0         

 dense (Dense)               (None, 500)               150500    

 dense_1 (Dense)             (None, 250)               125250    

 dense_2 (Dense)             (None, 125)               31375     

 dense_3 (Dense)             (None, 62)                7812      

 dense_4 (Dense)             (None, 31)                1953      

 dense_5 (Dense)             (None, 15)                480       

 dense_6 (Dense)             (None, 7)                 112       

 dense_7 (Dense)             (None, 3)                 24        

 dense_8 (Dense)             (None, 2)                 8         
                       (2 predictable classes)                               
=================================================================
Total params: 317,514
Trainable params: 317,514
Non-trainable params: 0
```
---

In [75]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix, classification_report

from sklearn.model_selection import train_test_split
from fast_ml.model_development import train_valid_test_split

import tensorflow as tf
from tensorflow.keras.layers import *
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam, Nadam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.losses import CategoricalCrossentropy, MeanSquaredError, BinaryCrossentropy
from datret.datret import DatRetClassifier, DatRetRegressor, DatRetMultilabelClassifier

In [76]:
# Random Seed
random_seed = 42 # set random seed for reproducibility
tf.random.set_seed(random_seed) # set random seed for TensorFlow
np.random.seed(random_seed) # set random seed for NumPy

## Data

In [77]:
df = pd.read_csv('datasets/mf_df_2_breast_cancer.csv')
df

Unnamed: 0,Id,Clump_thickness,Uniformity_cell_size,Uniformity_cell_shape,Marginal_adhesion,Single_e_cell_size,Bare_nuclei,Bland_chromatin,Normal_nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1.0,3,1,1,2
1,1002945,5,4,4,5,7,10.0,3,2,1,2
2,1015425,3,1,1,1,2,2.0,3,1,1,2
3,1016277,6,8,8,1,3,4.0,3,7,1,2
4,1017023,4,1,1,3,2,1.0,3,1,1,2
...,...,...,...,...,...,...,...,...,...,...,...
694,776715,3,1,1,1,3,2.0,1,1,1,2
695,841769,2,1,1,1,2,1.0,1,1,1,2
696,888820,5,10,10,3,7,3.0,8,10,2,4
697,897471,4,8,6,4,3,4.0,10,6,1,4


In [78]:
target_column='Class'

In [79]:
df[target_column].replace(to_replace={2:0, 4:1}, inplace=True)

In [80]:
df[:3]

Unnamed: 0,Id,Clump_thickness,Uniformity_cell_size,Uniformity_cell_shape,Marginal_adhesion,Single_e_cell_size,Bare_nuclei,Bland_chromatin,Normal_nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1.0,3,1,1,0
1,1002945,5,4,4,5,7,10.0,3,2,1,0
2,1015425,3,1,1,1,2,2.0,3,1,1,0


In [81]:
y = np.array(df[target_column]) # value dari label
df = df.drop(target_column, axis=1) # drop target
X = np.array(df) # df dalam bentuk numpy array

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [82]:
# X_train, y_train, X_valid, y_valid, X_test, y_test = train_valid_test_split(df, target=target_column, train_size=0.8, valid_size=0.1, test_size=0.1)

In [83]:
print(X_train.shape, y_train.shape)
# print(X_valid.shape, y_valid.shape)
print(X_test.shape, y_test.shape)

(559, 10) (559,)
(140, 10) (140,)


In [84]:
# scaler = StandardScaler()
scaler = MinMaxScaler()

X_train = scaler.fit_transform(X_train)
X_valid = scaler.fit_transform(X_valid)
X_test = scaler.fit_transform(X_test)

In [85]:
temp = pd.DataFrame(X_train)
temp[:3]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.080816,0.444444,0.111111,0.0,0.0,0.111111,0.0,0.222222,0.0,0.0
1,0.078163,0.444444,0.222222,0.222222,0.333333,0.111111,0.333333,0.222222,0.333333,0.0
2,0.086815,0.0,0.0,0.0,0.111111,0.111111,0.0,0.222222,0.0,0.0


## Modeling

In [86]:
# Call the regressor or classifier and train the model.
DR = DatRetClassifier(epoch=50,
                      optimizer=Nadam(learning_rate=0.001),
                      loss=BinaryCrossentropy(),
                      verbose=1,
                      number_neurons=1000,
                      validation_split = 0.2,
                      batch_size=100,
                      shuffle=True,
                      callback=[]
                      )

DR.fit(X_train, y_train)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


## Model Evaluation

In [87]:
# predict the actual label (or class) over a new set of data.
DR_predict = DR.predict(X_test)

# predict the class probabilities for each data point.
DR_predict_proba = DR.predict_proba(X_test)



In [88]:
cr = classification_report(y_test, DR_predict)
print(cr)

              precision    recall  f1-score   support

           0       0.90      0.99      0.94        95
           1       0.97      0.78      0.86        45

    accuracy                           0.92       140
   macro avg       0.94      0.88      0.90       140
weighted avg       0.93      0.92      0.92       140

