In [None]:
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.metrics import confusion_matrix,classification_report
import tensorflow as tf
import joblib
%matplotlib inline

# Seed the NumPy and TensorFlow global random generators.
np.random.seed(3)
tf.random.set_seed(3)

## Dataset setting

### NHIS

In [None]:
# Load the NHIS table from CSV.
pre = pd.read_csv('nhis_data.csv')

# pd.read_csv already returns a DataFrame; this wraps it in a second DataFrame object.
pre_df = pd.DataFrame(pre)

# NOTE(review): displaying the frame writes person_id values into the saved output — confirm this is acceptable before sharing.
pre_df

Unnamed: 0,person_id,gender,age_1,age_2,age_3,age_4,age_5,age_6,age_7,age_8,...,Malignant neoplastic disease,Chronic liver disease,chronic obstructive lung disease,cerebrovascular disease,chronic kidney disease,Diabetes mellitus,Ischemic heart disease,hyperlipidemia,Hypertensive disorder,cancer
0,28610148,0,0,0,0,1,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,39709673,0,0,0,1,0,0,0,0,0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,18979243,0,0,0,0,0,1,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0
3,29817862,0,0,0,0,0,1,0,0,0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0
4,26339248,0,0,0,1,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36570,12746702,0,0,0,0,0,0,1,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1
36571,65929166,0,0,0,0,1,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
36572,85131223,1,0,0,0,0,0,1,0,0,...,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1
36573,60915675,0,0,0,0,0,0,1,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [None]:
# Features are the columns at positions 1..26 (position 0 is skipped); the target is the 'cancer' column.
# NOTE(review): presumably position 0 is person_id and positions 1..26 are the binary predictors — confirm against the CSV header.
x_data = pre_df.iloc[:,1:27]
y_data = pre_df['cancer']

### Kangdong

In [None]:
# Load the Kangdong (KD) table from CSV.
kd = pd.read_csv('kd_data.csv')

# pd.read_csv already returns a DataFrame; this wraps it in a second DataFrame object.
kd_df = pd.DataFrame(kd)

# NOTE(review): displaying the frame writes person_id values into the saved output — confirm this is acceptable before sharing.
kd_df

Unnamed: 0,person_id,gender,age_1,age_2,age_3,age_4,age_5,age_6,age_7,age_8,...,Malignant neoplastic disease,Chronic liver disease,chronic obstructive lung disease,cerebrovascular disease,chronic kidney disease,Diabetes mellitus,Ischemic heart disease,hyperlipidemia,Hypertensive disorder,cancer
0,1673221,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,1289235,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
2,1690134,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
3,530465,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
4,737827,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
404,1756845,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
405,14298,1,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
406,1646575,1,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
407,352251,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the KD rows as the test set, stratified on the 'cancer' label.
X_train, x_test, Y_train, y_test = train_test_split(
    kd_df.iloc[:, 1:27],
    kd_df['cancer'],
    test_size=0.3,
    random_state=3,
    stratify=kd_df['cancer'],
)

In [None]:
# Carve a stratified 30% validation set out of the remaining KD training rows.
x_train, x_valid, y_train, y_valid = train_test_split(
    X_train,
    Y_train,
    test_size=0.3,
    random_state=3,
    stratify=Y_train,
)

In [None]:
# Report the shape of every KD split (features and labels).
for split_name, split_part in (
    ("x_train", x_train),
    ("y_train", y_train),
    ("x_valid", x_valid),
    ("y_valid", y_valid),
    ("x_test", x_test),
    ("y_test", y_test),
):
    print("Number " + split_name + " dataset: ", split_part.shape)

Number x_train dataset:  (200, 26)
Number y_train dataset:  (200,)
Number x_valid dataset:  (86, 26)
Number y_valid dataset:  (86,)
Number x_test dataset:  (123, 26)
Number y_test dataset:  (123,)


In [None]:
# Class balance of the train, validation and test labels, in that order.
for split_labels in (y_train, y_valid, y_test):
    print(Counter(split_labels))

Counter({0: 157, 1: 43})
Counter({0: 67, 1: 19})
Counter({0: 96, 1: 27})


### Class weight

In [None]:
from sklearn.utils import class_weight

# 'balanced' class weights computed from the NHIS labels (y_data), one weight per value in np.unique(y_data).
class_weights_gcd = class_weight.compute_class_weight('balanced',classes=np.unique(y_data),y=y_data)
class_weights_gcd

array([0.55312746, 5.20566467])

In [None]:
# Map class index -> weight for the NHIS labels.
# Built from class_weights_gcd (the array computed from y_data); the bare name
# `class_weights` is never defined in this notebook and raised NameError on a fresh kernel.
class_dict_gcd = dict(enumerate(class_weights_gcd))

class_dict_gcd

{0: 0.5531274575040832, 1: 5.205664674067749}

In [None]:
# 'balanced' class weights computed from the KD training labels (y_train), one weight per value in np.unique(y_train).
class_weights_kd = class_weight.compute_class_weight('balanced',classes=np.unique(y_train),y=y_train)
class_weights_kd

In [None]:
# Map class index -> weight for the KD training labels.
# Built from class_weights_kd (the array computed from y_train); the bare name
# `class_weights` is never defined in this notebook and raised NameError on a fresh kernel.
class_dict_kd = dict(enumerate(class_weights_kd))
class_dict_kd

{0: 0.6369426751592356, 1: 2.3255813953488373}

## Modeling

### SVM

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import roc_curve, auc, roc_auc_score  # ROC curve plotting

from sklearn.model_selection import learning_curve, validation_curve # learning curve, validation curve
from sklearn.model_selection import  cross_val_score, cross_val_predict  # hyperparameter tuning, cross-validation

In [None]:
# RBF-kernel SVM weighted with the NHIS class weights; probability=True enables predict_proba, used in the ROC cell below.
svm = SVC(kernel='rbf',C=9, gamma=0.1, class_weight = class_dict_gcd ,probability=True)
# fit() returns the estimator itself, so `model` and `svm` refer to the same object.
model = svm.fit(x_data, y_data)

In [None]:
# Predict on the NHIS rows the model was fitted on and store the labels as a plain list of 0/1 ints.
Y_pred = [int(label >= 0.5) for label in model.predict(x_data)]

In [None]:
# Confusion matrix with rows/columns ordered [1, 0]; printed explicitly because
# only the last expression of a cell is auto-displayed.
print(confusion_matrix(y_data,Y_pred,labels=[1,0]))
print(classification_report(y_data,Y_pred))

              precision    recall  f1-score   support

           0       1.00      0.84      0.91     33062
           1       0.40      0.98      0.56      3513

    accuracy                           0.85     36575
   macro avg       0.70      0.91      0.74     36575
weighted avg       0.94      0.85      0.88     36575



#### Application KD data

In [None]:
# NOTE(review): scikit-learn's fit() re-trains the estimator from scratch, so this replaces the NHIS fit
# rather than fine-tuning it, and the SVC still carries class_dict_gcd (NHIS weights), not class_dict_kd — confirm intent.
model = model.fit(x_train, y_train)
Y_pred = model.predict(x_valid)
# predict() already returns class labels; this only converts them to a plain list of 0/1 ints.
Y_pred = [ 1 if x >= 0.5 else 0 for x in Y_pred]

In [None]:
# Confusion matrix with rows/columns ordered [1, 0]; printed explicitly because
# only the last expression of a cell is auto-displayed.
print(confusion_matrix(y_valid,Y_pred,labels=[1,0]))
print(classification_report(y_valid,Y_pred))

              precision    recall  f1-score   support

           0       0.97      0.52      0.68        67
           1       0.36      0.95      0.52        19

    accuracy                           0.62        86
   macro avg       0.67      0.73      0.60        86
weighted avg       0.84      0.62      0.64        86



In [None]:
# Same validation report as the previous cell.
# Confusion matrix with rows/columns ordered [1, 0]; printed explicitly because
# only the last expression of a cell is auto-displayed.
print(confusion_matrix(y_valid,Y_pred,labels=[1,0]))
print(classification_report(y_valid,Y_pred))

              precision    recall  f1-score   support

           0       0.97      0.52      0.68        67
           1       0.36      0.95      0.52        19

    accuracy                           0.62        86
   macro avg       0.67      0.73      0.60        86
weighted avg       0.84      0.62      0.64        86



#### Evaluation

In [None]:
# Predict class labels for the held-out KD test split.
Y_pred1 = model.predict(x_test)

In [None]:
# Store the predicted test labels as a plain list of 0/1 ints.
Y_pred1 = [int(label >= 0.5) for label in Y_pred1]

In [None]:
# Confusion matrix with rows/columns ordered [1, 0]; printed explicitly because
# only the last expression of a cell is auto-displayed.
print(confusion_matrix(y_test,Y_pred1,labels=[1,0]))
print(classification_report(y_test,Y_pred1))

              precision    recall  f1-score   support

           0       0.93      0.59      0.73        96
           1       0.37      0.85      0.52        27

    accuracy                           0.65       123
   macro avg       0.65      0.72      0.62       123
weighted avg       0.81      0.65      0.68       123



In [None]:
# ROC curve on the KD test split, scored with the predicted probability of class 1.
positive_proba = model.predict_proba(x_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, positive_proba)

# Area under that ROC curve.
score = auc(fpr, tpr)
print("AUC : ", score)

AUC :  0.7876157407407408


In [None]:
# Persist the current `model` (the SVM last fitted on the KD training split) to disk.
# joblib is already imported in the first cell; this import is redundant but harmless.
import joblib
joblib.dump(model, 'SVM.pkl')

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score # accuracy function

In [None]:
# Random forest weighted with the NHIS class weights, the same dict the SVM cell uses.
# The name `class_dict` is never defined in this notebook and raised NameError on a fresh kernel.
clf = RandomForestClassifier(n_estimators=100, random_state=0, max_depth=8, class_weight=class_dict_gcd)
# fit() returns the estimator itself, so `model` and `clf` refer to the same object.
model = clf.fit(x_data, y_data)

In [None]:
# Predict on the NHIS rows the forest was fitted on and store the labels as a plain list of 0/1 ints.
Y_pred = [int(label >= 0.5) for label in model.predict(x_data)]

In [None]:
# Confusion matrix with rows/columns ordered [1, 0]; printed explicitly because
# only the last expression of a cell is auto-displayed.
print(confusion_matrix(y_data,Y_pred,labels=[1,0]))
print(classification_report(y_data,Y_pred))

              precision    recall  f1-score   support

           0       1.00      0.81      0.90     33062
           1       0.36      0.99      0.53      3513

    accuracy                           0.83     36575
   macro avg       0.68      0.90      0.71     36575
weighted avg       0.94      0.83      0.86     36575



#### Application KD data

In [None]:
# NOTE(review): with the default warm_start=False, fit() rebuilds the forest from scratch, so this replaces the
# NHIS fit rather than fine-tuning it, and the class weights are still the ones given to the constructor — confirm intent.
model = model.fit(x_train, y_train)
Y_pred = model.predict(x_valid)
# predict() already returns class labels; this only converts them to a plain list of 0/1 ints.
Y_pred = [ 1 if x >= 0.5 else 0 for x in Y_pred]

In [None]:
# Confusion matrix with rows/columns ordered [1, 0]; printed explicitly because
# only the last expression of a cell is auto-displayed.
print(confusion_matrix(y_valid,Y_pred,labels=[1,0]))
print(classification_report(y_valid,Y_pred))

              precision    recall  f1-score   support

           0       0.95      0.61      0.75        67
           1       0.40      0.89      0.55        19

    accuracy                           0.67        86
   macro avg       0.67      0.75      0.65        86
weighted avg       0.83      0.67      0.70        86



#### Evaluation

In [None]:
# Predict class labels for the held-out KD test split.
Y_pred1 = model.predict(x_test)

In [None]:
# NOTE(review): this cell repeats the previous cell verbatim; one of the two can be removed.
Y_pred1 = model.predict(x_test)

In [None]:
# Confusion matrix with rows/columns ordered [1, 0]; printed explicitly because
# only the last expression of a cell is auto-displayed.
print(confusion_matrix(y_test,Y_pred1,labels=[1,0]))
print(classification_report(y_test,Y_pred1))

              precision    recall  f1-score   support

           0       0.87      0.76      0.81        96
           1       0.41      0.59      0.48        27

    accuracy                           0.72       123
   macro avg       0.64      0.68      0.65       123
weighted avg       0.77      0.72      0.74       123



In [None]:
# ROC curve on the KD test split, scored with the predicted probability of class 1.
positive_proba = model.predict_proba(x_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, positive_proba)

# Area under that ROC curve.
score = auc(fpr, tpr)
print("AUC : ", score)

AUC :  0.7976466049382717


In [None]:
# Persist the current `model` (the random forest last fitted on the KD training split) to disk.
joblib.dump(model, 'Random Forest.pkl')

### DNN

In [None]:
# Rename the columns to space-free names; these names are later used as Keras Input names.
# The assignment requires pre_df to have exactly as many columns as this list has entries.
pre_df.columns = ['person_id', 'gender', 'age_1', 'age_2', 'age_3', 'age_4', 'age_5',
       'age_6', 'age_7', 'age_8', 'smoking_status_0.0', 'smoking_status_1.0',
       'smoking_status_2.0', 'smoking_status_3.0', 'bmi_1.0', 'bmi_2.0', 'bmi_3.0',
       'bmi_4.0', 'Malignant_neoplastic_disease', 'Chronic_liver_disease',
       'chronic_obstructive_lung_disease', 'cerebrovascular_disease',
       'chronic_kidney_disease', 'Diabetes_mellitus', 'Ischemic_heart_disease',
       'hyperlipidemia', 'Hypertensive_disorder', 'cancer']

# Cast every column to int64, matching the dtype='int64' Keras inputs created further down.
pre_df = pre_df.astype('int64')


#### Model architecture

In [None]:
import tensorflow as tf
from tensorflow.keras.layers.experimental import preprocessing

In [None]:
def get_normalization_layer(name, dataset):
    """Return a Normalization layer adapted to one feature of `dataset`.

    Args:
        name: key of the feature inside each element's feature dict.
        dataset: tf.data dataset yielding (features_dict, label) pairs.

    Returns:
        The Normalization layer after `adapt` has been run on that feature.
    """
    # Project the dataset down to the single feature of interest.
    single_feature_ds = dataset.map(lambda features, _label: features[name])

    # Build the layer and let it learn the feature's statistics.
    layer = preprocessing.Normalization()
    layer.adapt(single_feature_ds)
    return layer

In [None]:
def get_category_encoding_layer(name, dataset, dtype, max_tokens=None):
    """Build a callable that encodes one categorical feature.

    Args:
        name: key of the feature inside each element's feature dict.
        dataset: tf.data dataset yielding (features_dict, label) pairs; used
            to learn the feature's vocabulary.
        dtype: 'string' selects a StringLookup; any other value selects an
            IntegerLookup.
        max_tokens: optional cap on the vocabulary size, forwarded to the
            lookup layer.

    Returns:
        A function mapping a feature tensor to its category encoding
        (lookup index followed by CategoryEncoding).
    """
    # Lookup layer that turns raw values into integer indices.
    # `max_tokens` is the current argument name for IntegerLookup
    # (`max_values` is the deprecated spelling).
    if dtype == 'string':
        index = preprocessing.StringLookup(max_tokens=max_tokens)
    else:
        index = preprocessing.IntegerLookup(max_tokens=max_tokens)

    # Prepare a Dataset that only yields our feature.
    feature_ds = dataset.map(lambda x, y: x[name])

    # Learn the set of possible values and assign them a fixed integer index.
    index.adapt(feature_ds)

    # Category encoder sized to the learned vocabulary.
    # `num_tokens` / `vocabulary_size()` replace the deprecated
    # `max_tokens` / `vocab_size()` spellings.
    encoder = preprocessing.CategoryEncoding(num_tokens=index.vocabulary_size())

    # The lambda captures both layers so they can be applied to a Keras input
    # and become part of the functional model.
    return lambda feature: encoder(index(feature))

In [None]:
# Keras metrics tracked during DNN training. The `name` values become the keys in
# History.history, with a 'val_' prefix for validation (e.g. 'val_auc' is what early stopping monitors).
METRICS = [
      tf.keras.metrics.Recall(name='recall'),
      tf.keras.metrics.Precision(name='precision'),
      tf.keras.metrics.TruePositives(name='tp'),
      tf.keras.metrics.FalsePositives(name='fp'),
      tf.keras.metrics.TrueNegatives(name='tn'),
      tf.keras.metrics.FalseNegatives(name='fn'),
      tf.keras.metrics.BinaryAccuracy(name='accuracy'),
      tf.keras.metrics.AUC(name='auc'),
      tf.keras.metrics.AUC(name='prc', curve='PR'), # precision-recall curve
]

In [None]:
import matplotlib as mpl
import matplotlib.pyplot as plt
from sklearn import metrics
# Default figure size for every plot in the notebook.
mpl.rcParams['figure.figsize'] = (12, 10)
# Colours of the default matplotlib colour cycle, indexed by plot_metrics.
colors = plt.rcParams['axes.prop_cycle'].by_key()['color']

In [None]:
def plot_metrics(history):
    """Plot train vs. validation curves for loss, AUC, precision and recall.

    Args:
        history: the History object returned by `model.fit`; it must hold
            both the metric and its 'val_'-prefixed counterpart for each of
            'loss', 'auc', 'precision' and 'recall'.

    Draws a 2x2 grid of subplots on the current matplotlib figure.
    """
    # Local name chosen so it does not shadow the imported `sklearn.metrics` module.
    metric_names = ['loss', 'auc', 'precision', 'recall']
    for n, metric in enumerate(metric_names):
        name = metric.replace("_"," ").capitalize()
        plt.subplot(2,2,n+1)
        plt.plot(history.epoch, history.history[metric], color=colors[0], label='Train')
        plt.plot(history.epoch, history.history['val_'+metric],
                 color=colors[1], label='Val')
        plt.xlabel('Epoch')
        plt.ylabel(name)
        if metric == 'loss':
            # Anchor the loss axis at zero, keep the automatic upper bound.
            plt.ylim([0, plt.ylim()[1]])
        elif metric == 'auc':
            plt.ylim([0.8,1])
        else:
            plt.ylim([0,1])

        # Inside the loop so every subplot gets a legend, not only the last one.
        plt.legend()

In [None]:
def df_to_dataset(dataframe, shuffle=True, batch_size=32):
    """Convert a pandas DataFrame into a batched tf.data dataset.

    The 'cancer' column is split off as the label; the remaining columns are
    passed as a dict of feature name -> column. The input frame is copied
    first, so the caller's DataFrame is left untouched.

    Args:
        dataframe: DataFrame containing a 'cancer' column.
        shuffle: when true, shuffle with a buffer as large as the frame.
        batch_size: number of rows per batch.

    Returns:
        A dataset of (features_dict, labels) batches.
    """
    features = dataframe.copy()
    target = features.pop('cancer')
    dataset = tf.data.Dataset.from_tensor_slices((dict(features), target))
    if not shuffle:
        return dataset.batch(batch_size)
    return dataset.shuffle(buffer_size=len(features)).batch(batch_size)

##### Create Layer

In [None]:
from sklearn.metrics import roc_curve, auc, roc_auc_score

from tensorflow.keras import layers
from tensorflow.keras.models import Model
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.utils import class_weight
from tensorflow.keras.utils import plot_model
from tensorflow.keras.regularizers import L1, L2, L1L2
from sklearn.utils import class_weight

In [None]:
# mini_batch
batch_size = 32
# Shuffled, batched dataset over the full NHIS frame (df_to_dataset shuffles by default).
all_ds = df_to_dataset(pre_df, batch_size=batch_size)

# Pull a single batch out of the dataset for inspection.
[(features, label_batch)] = all_ds.take(1)

In [None]:
# Feature columns fed to the DNN: every renamed column except 'person_id' and the 'cancer' label.
categorical_cols = [ 'gender', 'age_1', 'age_2', 'age_3', 'age_4',
       'age_5', 'age_6', 'age_7', 'age_8', 'smoking_status_0.0',
       'smoking_status_1.0', 'smoking_status_2.0', 'smoking_status_3.0',
       'bmi_1.0', 'bmi_2.0', 'bmi_3.0', 'bmi_4.0',
       'Malignant_neoplastic_disease', 'Chronic_liver_disease',
       'chronic_obstructive_lung_disease', 'cerebrovascular_disease',
       'chronic_kidney_disease', 'Diabetes_mellitus',
       'Ischemic_heart_disease', 'hyperlipidemia', 'Hypertensive_disorder']

In [None]:
# NOTE(review): an identical cell appears again two cells below and resets both lists,
# so the (expensive) adapt passes done here are repeated there — one of the two cells can be removed.
all_inputs = []
encoded_features = []

# Categorical features: one int64 Keras input and one encoding layer per column.
for header in categorical_cols:
    print(header)
    categorical_col = tf.keras.Input(shape=(1,), name=header, dtype='int64')

    # Create the encoding layer (adapts a lookup over all_ds for this column)
    encoding_layer = get_category_encoding_layer(header, all_ds, dtype='int64')

    # Apply the encoding layer to the input column
    encoded_categorical_col = encoding_layer(categorical_col)

    all_inputs.append(categorical_col)
    encoded_features.append(encoded_categorical_col)

In [None]:
# Rebind class_weights_gcd from the weight array to a {class index: weight} dict.
class_weights_gcd = {0:class_weights_gcd[0], 1:class_weights_gcd[1]}

In [None]:
# NOTE(review): this cell is identical to the input-building cell two cells above; it resets both lists and rebuilds them.
all_inputs = []
encoded_features = []

# Categorical features: one int64 Keras input and one encoding layer per column.
for header in categorical_cols:
    print(header)
    categorical_col = tf.keras.Input(shape=(1,), name=header, dtype='int64')

    # Create the encoding layer (adapts a lookup over all_ds for this column)
    encoding_layer = get_category_encoding_layer(header, all_ds, dtype='int64')

    # Apply the encoding layer to the input column
    encoded_categorical_col = encoding_layer(categorical_col)

    all_inputs.append(categorical_col)
    encoded_features.append(encoded_categorical_col)

In [None]:
# Concatenate the per-column encodings and stack dense layers of shrinking
# width (256 -> 8), with 10% dropout after the 256-, 64- and 16-unit layers.
all_features = tf.keras.layers.concatenate(encoded_features)
x = tf.keras.layers.Dense(256, activation="relu")(all_features)
x = tf.keras.layers.Dropout(0.1)(x)
x = tf.keras.layers.Dense(128, activation="relu")(x)

x = tf.keras.layers.Dense(64, activation="relu")(x)
x = tf.keras.layers.Dropout(0.1)(x)
x = tf.keras.layers.Dense(32, activation="relu")(x)

x = tf.keras.layers.Dense(16, activation="relu")(x)
x = tf.keras.layers.Dropout(0.1)(x)
x = tf.keras.layers.Dense(8, activation="relu")(x)


# Single sigmoid unit: the model outputs a probability, not a logit.
output = tf.keras.layers.Dense(1, activation='sigmoid')(x)
model = tf.keras.Model(all_inputs, output)
# from_logits must be False because the output layer already applies the sigmoid;
# from_logits=True would tell the loss to treat these probabilities as raw logits.
model.compile(optimizer='rmsprop',
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
              metrics=METRICS)

In [None]:
# Render the model graph left-to-right with tensor shapes (the to_file argument is left commented out).
tf.keras.utils.plot_model(model, show_shapes=True, rankdir="LR")#, to_file='./model.png')

In [None]:
# Stop training once validation AUC ('val_auc', from the AUC metric named 'auc') has not improved
# for 10 epochs, and roll the weights back to the best epoch.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_auc',
    verbose=1,
    patience=10,
    mode='max',
    restore_best_weights=True)

In [None]:
# Train on the full NHIS dataset with the NHIS class weights.
# class_weights_gcd is the {class: weight} dict built above; the bare name
# `class_weights` is never defined in this notebook and raised NameError on a fresh kernel.
# NOTE(review): validation_data is the same dataset as the training data, so
# early stopping on val_auc is driven by training-set performance — confirm intent.
history = model.fit(
    all_ds,
    epochs=100,
    validation_data=all_ds,
    callbacks=[early_stopping],
    class_weight=class_weights_gcd,
)

In [None]:
# Unshuffled NHIS dataset delivered as one batch containing every row.
# NOTE(review): the name `all` shadows Python's builtin all() for the rest of the kernel session; the next cell reads this name.
all = df_to_dataset(pre_df, batch_size=len(pre_df), shuffle=False)

In [None]:
# Predicted probabilities for every NHIS row, then hard labels at a 0.5 cut-off.
pred = model.predict(all)
pred = [ 1 if x >= 0.5 else 0 for x in pred]


# The dataset was built as a single batch, so take(1) yields all the labels in row order.
[(_, label)] = all.take(1)
print(confusion_matrix(label,pred,labels=[1,0]))
print(classification_report(label,pred))

#### Application KD data

In [None]:
# Display the KD frame before its columns are renamed in the next cell.
kd_df

In [None]:
# Give the KD frame the same space-free column names as the NHIS frame, so its feature
# keys match the Keras Input names. Requires kd_df to have exactly as many columns as this list has entries.
kd_df.columns = ['person_id', 'gender', 'age_1', 'age_2', 'age_3', 'age_4', 'age_5',
       'age_6', 'age_7', 'age_8', 'smoking_status_0.0', 'smoking_status_1.0',
       'smoking_status_2.0', 'smoking_status_3.0', 'bmi_1.0', 'bmi_2.0', 'bmi_3.0',
       'bmi_4.0', 'Malignant_neoplastic_disease', 'Chronic_liver_disease',
       'chronic_obstructive_lung_disease', 'cerebrovascular_disease',
       'chronic_kidney_disease', 'Diabetes_mellitus', 'Ischemic_heart_disease',
       'hyperlipidemia', 'Hypertensive_disorder', 'cancer']


# Cast every column to int64, matching the dtype='int64' Keras inputs.
kd_df = kd_df.astype('int64')

In [None]:
# Stratified 70/30 split of the KD frame, then a stratified 70/30 split of the training part.
train, test = train_test_split(
    kd_df, test_size=0.3, stratify=kd_df['cancer'], random_state=3
)
train, val = train_test_split(
    train, test_size=0.3, stratify=train['cancer'], random_state=3
)

# Report the size of each split.
for split_label, split_frame in (('train', train), ('validation', val), ('test', test)):
    print(len(split_frame), split_label + ' examples')

In [None]:
# mini_batch
batch_size = 32
# Shuffled, batched dataset over the KD training frame.
train_ds = df_to_dataset(train, batch_size=batch_size)

# Pull one batch and print its feature keys, the 'gender' values and the labels as a sanity check.
[(train_features, label_batch)] = train_ds.take(1)
print('Every feature:', list(train_features.keys()))
print('A batch of gender:', train_features['gender'])
print('A batch of targets:', label_batch )

In [None]:
batch_size = 32
# Train and validation datasets are shuffled (the df_to_dataset default); the test dataset keeps row order.
train_ds = df_to_dataset(train, batch_size=batch_size)
valid_ds = df_to_dataset(val, batch_size=batch_size)
test_ds = df_to_dataset(test, shuffle=False, batch_size=batch_size)
# Rebinds all_ds to an unshuffled NHIS dataset (it was shuffled when first created).
all_ds = df_to_dataset(pre_df, shuffle=False, batch_size=batch_size)

In [None]:
# Rebind class_weights_kd from the weight array to a {class index: weight} dict.
class_weights_kd = {0:class_weights_kd[0], 1:class_weights_kd[1]}

In [None]:
# Continue training the NHIS-trained DNN on the KD training split, validating on the KD validation split.
# class_weights_kd is the {class: weight} dict computed from the KD training labels; the bare name
# `class_weights` is never defined in this notebook and raised NameError on a fresh kernel.
history = model.fit(
    train_ds,
    epochs=100,
    validation_data=valid_ds,
    callbacks=[early_stopping],
    class_weight=class_weights_kd,
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
1/7 [===>..........................] - ETA: 0s - loss: 0.4051 - recall: 0.8750 - precision: 0.6364 - tp: 7.0000 - fp: 4.0000 - tn: 20.0000 - fn: 1.0000 - accuracy: 0.8438 - auc: 0.8958 - prc: 0.7867Restoring model weights from the end of the best epoch: 30.
Epoch 40: early stopping


In [None]:
# Unshuffled KD validation dataset delivered as one batch containing every row.
valid = df_to_dataset(val, batch_size=len(val),shuffle=False)

In [None]:
# Predicted probabilities for the KD validation rows, then hard labels at a 0.5 cut-off.
pred = model.predict(valid)
pred = [ 1 if x >= 0.5 else 0 for x in pred]


# The dataset was built as a single batch, so take(1) yields all the labels in row order.
[(_, label)] = valid.take(1)
print(confusion_matrix(label,pred,labels=[1,0]))
print(classification_report(label,pred))

[[15  4]
 [15 52]]
              precision    recall  f1-score   support

           0       0.93      0.78      0.85        67
           1       0.50      0.79      0.61        19

    accuracy                           0.78        86
   macro avg       0.71      0.78      0.73        86
weighted avg       0.83      0.78      0.79        86



#### Evaluation

In [None]:
# Unshuffled KD test dataset delivered as one batch containing every row.
testing = df_to_dataset(test, batch_size=len(test),shuffle=False)

In [None]:
# Extract the test labels from the single batch (the next cell repeats this line).
[(_, label)] = testing.take(1)

In [None]:
# Predicted probabilities for the KD test rows, then hard labels at a 0.5 cut-off.
pred = model.predict(testing)
pred = [ 1 if x >= 0.5 else 0 for x in pred]


# The dataset was built as a single batch, so take(1) yields all the labels in row order.
[(_, label)] = testing.take(1)
print(confusion_matrix(label,pred,labels=[1,0]))
print(classification_report(label,pred))

[[14 13]
 [14 82]]
              precision    recall  f1-score   support

           0       0.86      0.85      0.86        96
           1       0.50      0.52      0.51        27

    accuracy                           0.78       123
   macro avg       0.68      0.69      0.68       123
weighted avg       0.78      0.78      0.78       123



In [None]:
# Recompute the raw probabilities: `pred` was overwritten with hard 0/1 labels in the previous cell.
pred = model.predict(testing)

# ROC curve of the test labels (`label`, from the previous cell) against those probabilities.
fpr, tpr, thresholds =roc_curve(label, pred)

# Area under that ROC curve.
score = auc(fpr, tpr)
print("AUC : ", score)

AUC :  0.7511574074074074


In [None]:
# Persist the trained Keras model with joblib.
# NOTE(review): Keras documents model.save() as its persistence API — confirm that this joblib pickle
# loads back correctly in the target environment before relying on DNN.pkl.
joblib.dump(model, 'DNN.pkl')