In [None]:
# install transfomers library if you need
!pip install transformers

In [19]:
# import required libraries
import os
import re
import numpy as np
from tqdm import tqdm

import tensorflow as tf
from tensorflow import keras
from transformers import *

from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.model_selection import train_test_split

import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

import gc
from tensorflow.keras.models import Model
from keras.models import load_model

In [5]:
# For visualization

def plot_graphs(history, string):
    """Plot a training metric together with its validation counterpart.

    Args:
        history: Object whose ``history`` attribute maps metric names to
            per-epoch value sequences (e.g. the value returned by ``fit``).
        string: Name of the metric to plot; ``'val_' + string`` must also
            be present in ``history.history``.
    """
    val_key = 'val_' + string
    curves = history.history

    plt.plot(curves[string])
    plt.plot(curves[val_key], '')  # empty format string -> default line style
    plt.xlabel("Epochs")
    plt.ylabel(string)
    plt.legend([string, val_key])
    plt.show()

In [68]:
# --- Training hyper-parameters ---
BATCH_SIZE, NUM_EPOCHS = 32, 50
VALID_SPLIT = 0.2   # fraction handed to Keras as `validation_split`
MAX_LEN = 237       # maximum sequence length, taken from the EDA

# --- Filesystem locations ---
DATA_IN_PATH = '/home/jovyan/Desktop'
DATA_OUT_PATH = "/home/jovyan/data_out"

# --- Reproducibility: seed NumPy and TensorFlow with the same value ---
np.random.seed(1234)
tf.random.set_seed(1234)

In [None]:
# Load the WordPiece tokenizer that belongs to the pretrained checkpoint.
# The "uncased" checkpoint was trained on lower-cased text, so lower-casing
# stays enabled here; do_lower_case=False would tokenize differently from
# pre-training. Truncation is a per-call encoding option (it is set in the
# encode_plus call), not a from_pretrained() argument, so it is not passed.
tokenizer = BertTokenizer.from_pretrained(
    "bert-base-multilingual-uncased",
    cache_dir='bert_ckpt',
    do_lower_case=True,
)

In [None]:
# Load files from Google Drive when running on Google Colab.
# Outside Colab the google.colab package does not exist; skip the mount
# instead of aborting the notebook with an ImportError.
try:
    from google.colab import drive
except ImportError:
    drive = None
    print("google.colab is not available -- skipping Google Drive mount")
else:
    drive.mount('/content/drive')

In [70]:
# Read the raw dataset. Replace the placeholder with the real CSV path.
# The file is decoded as cp949 (a Korean code page).
data = pd.read_csv(
    "YOUR FILE NAME",
    encoding='cp949',
)

In [None]:
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer,PorterStemmer
from nltk.corpus import stopwords
import re
# Fetch the NLTK resources used by this notebook.
for _nltk_resource in ('stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(_nltk_resource)

# Shared normaliser instances, created once.
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# Patterns are compiled once instead of on every call.
_KOREAN_RE = re.compile('[\u3131-\u3163\uac00-\ud7a3]+')  # Hangul jamo and syllables
_HTML_TAG_RE = re.compile('<.*?>')                         # anything that looks like <tag>
_URL_RE = re.compile(r'http\S+')
_DIGITS_RE = re.compile('[0-9]+')
_WORD_RE = re.compile(r'\w+')

_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return the NLTK English stop words as a frozenset, loading them only once."""
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORD_CACHE['english']


def preprocess(sentence):
    """Clean one free-text cell and return it as a space-joined token string.

    Steps, in order: drop Korean characters, lower-case, strip the literal
    ``{html}`` marker, remove ``<...>`` tags, remove URLs, remove digits,
    split into ``\\w+`` tokens and keep tokens longer than two characters
    that are not English stop words.

    Args:
        sentence: Raw cell value; any type is accepted and converted with
            ``str``. ``None`` and float NaN are treated as empty text.

    Returns:
        str: The remaining tokens joined by single spaces ("" if none remain).
    """
    # Missing values would otherwise be turned into the literal token "nan".
    if sentence is None or (isinstance(sentence, float) and sentence != sentence):
        return ""

    text = _KOREAN_RE.sub('', str(sentence))
    text = text.lower().replace('{html}', "")
    # The previous pattern ('=#$-+<.*?>') required characters after an
    # end-of-string anchor and therefore never matched; tags were left in.
    text = _HTML_TAG_RE.sub('', text)
    text = _URL_RE.sub('', text)
    text = _DIGITS_RE.sub('', text)

    # Set lookup instead of re-reading the stop-word list for every token.
    stop_words = _english_stopwords()
    kept = [w for w in _WORD_RE.findall(text) if len(w) > 2 and w not in stop_words]

    # Stemming/lemmatisation results were computed but never returned in the
    # original; the unstemmed tokens remain the output, so that work is dropped.
    return " ".join(kept)

# Clean every imaging-result text in place with preprocess().
data['IMG_RSLT'] = data['IMG_RSLT'].map(preprocess)

In [None]:
# Encode the class names of the target column as integer ids.
LABEL_COLUMN = 'DVT 부위'
LABEL_TO_ID = {
    "distal": 0,
    "proximal": 1,
    "DVT&PTE": 2,
    "other": 3,
    "PTE": 4,
    "thrombophlebitis": 5,
}

# One dict-based replace instead of six chained passes. Re-running the cell is
# harmless because already-encoded integers match none of the string keys.
data[LABEL_COLUMN] = data[LABEL_COLUMN].replace(LABEL_TO_ID)

# Rows whose label is missing or is not one of the known classes cannot be used
# for training; report and exclude them here instead of failing later when the
# labels are converted to an integer array.
label_is_known = data[LABEL_COLUMN].isin(list(LABEL_TO_ID.values()))
if not label_is_known.all():
    print("Dropping {} rows with missing/unrecognised labels: {}".format(
        int((~label_is_known).sum()),
        data.loc[~label_is_known, LABEL_COLUMN].unique().tolist()))

# Assign X and y after cleaning and replacement
X = data.loc[label_is_known, 'IMG_RSLT']
y = data.loc[label_is_known, LABEL_COLUMN].astype('int64')

# Check for any remaining null values in the target variable
print("Null values in 'DVT 부위' after cleaning:", y.isnull().sum())

In [77]:
# Bert Tokenizer

# Refer: https://huggingface.co/transformers/main_classes/tokenizer.html?highlight=encode_plus#transformers.PreTrainedTokenizer.encode_plus

def bert_tokenizer(sent, MAX_LEN):
    """Encode one sentence into fixed-length BERT inputs.

    Uses the module-level ``tokenizer``. Special tokens are added and the
    sequence is padded or truncated to ``MAX_LEN`` positions.

    Args:
        sent: Text to encode.
        MAX_LEN: Target sequence length.

    Returns:
        tuple: ``(input_ids, attention_mask, token_type_ids)`` taken from the
        encoding returned by ``tokenizer.encode_plus``.
    """
    encode_options = dict(
        text=sent,
        add_special_tokens=True,       # add '[CLS]' and '[SEP]'
        max_length=MAX_LEN,
        padding='max_length',          # pad every sentence up to max_length
        truncation=True,               # cut sentences longer than max_length
        return_attention_mask=True,    # mask separates real tokens from padding
    )
    encoded = tokenizer.encode_plus(**encode_options)

    wanted_fields = ('input_ids', 'attention_mask', 'token_type_ids')
    return tuple(encoded[field] for field in wanted_fields)

In [None]:
class TFBertClassifier(tf.keras.Model):
    """Pretrained BERT encoder followed by dropout and a dense classification layer."""

    def __init__(self, model_name, dir_path, num_class):
        """Build the encoder and the classification head.

        Args:
            model_name: Checkpoint identifier passed to ``TFBertModel.from_pretrained``.
            dir_path: Cache directory passed to ``from_pretrained``.
            num_class: Number of units of the output layer (one logit per class).
        """
        super().__init__()

        encoder = TFBertModel.from_pretrained(model_name, cache_dir=dir_path)
        bert_config = encoder.config

        self.bert = encoder
        # Dropout rate and initializer range are reused from the encoder config.
        self.dropout = tf.keras.layers.Dropout(bert_config.hidden_dropout_prob)
        self.classifier = tf.keras.layers.Dense(
            num_class,
            kernel_initializer=tf.keras.initializers.TruncatedNormal(bert_config.initializer_range),
            name="classifier",
        )

    def call(self, inputs, attention_mask=None, token_type_ids=None, training=False):
        """Return the unnormalised class logits for the given encoded inputs."""
        # Encoder outputs: sequence_output, pooled_output, (hidden_states), (attentions)
        encoder_outputs = self.bert(inputs, attention_mask=attention_mask, token_type_ids=token_type_ids)
        pooled = self.dropout(encoder_outputs[1], training=training)
        return self.classifier(pooled)

# Instantiate the classifier: multilingual uncased BERT with a 6-way output layer.
cls_model = TFBertClassifier(
    model_name='bert-base-multilingual-uncased',
    dir_path='bert_ckpt',
    num_class=6,
)

In [79]:
# Compile the classifier. The model outputs raw logits, hence from_logits=True.
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy(name='accuracy')
cls_model.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=[metric],
)


In [80]:
# Preparing cross validation (k=5)
# Shuffle before splitting so the folds do not depend on the row order of the
# input file; the fixed random_state keeps the split reproducible.
kfold = KFold(n_splits=5, shuffle=True, random_state=1234)
scores = []

In [None]:
def _build_fold_model():
    """Create and compile a classifier that starts from the pretrained weights."""
    model = TFBertClassifier(model_name='bert-base-multilingual-uncased',
                             dir_path='bert_ckpt',
                             num_class=6)
    model.compile(
        optimizer=tf.keras.optimizers.Adam(3e-5),
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        metrics=[tf.keras.metrics.SparseCategoricalAccuracy('accuracy')],
    )
    return model


def _encode_split(sentences, labels):
    """Tokenize one data split into model inputs and an int64 label array.

    Sentences that fail to tokenize are reported and left out, together with
    their labels, so inputs and labels always stay aligned.

    Returns:
        ((input_ids, attention_masks, token_type_ids), labels) as NumPy arrays.
    """
    input_ids, attention_masks, token_type_ids, kept_labels = [], [], [], []

    for sent, label in tqdm(zip(sentences, labels), total=len(sentences)):
        try:
            input_id, attention_mask, token_type_id = bert_tokenizer(sent, MAX_LEN)
        except Exception as e:  # best effort: report the sentence and continue
            print(f"Error tokenizing sentence: {sent}")
            print(e)
            continue

        input_ids.append(input_id)
        attention_masks.append(attention_mask)
        token_type_ids.append(token_type_id)
        kept_labels.append(label)

    inputs = (np.array(input_ids, dtype=int),
              np.array(attention_masks, dtype=int),
              np.array(token_type_ids, dtype=int))
    return inputs, np.asarray(kept_labels, dtype=np.int64)


model_itr = 0

for train_idx, test_idx in kfold.split(X):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    # Every fold must start from the pretrained weights. Re-using one model
    # across folds means it has already been trained on samples that belong
    # to the current test fold, which inflates the reported scores.
    cls_model = None                  # release the previous fold's model first
    tf.keras.backend.clear_session()
    gc.collect()
    cls_model = _build_fold_model()

    # input data
    train_inputs, train_data_labels = _encode_split(X_train, y_train)

    model_itr += 1
    model_name = "tf2_bert_dvt__multi" + str(model_itr)

    # Early stopping
    # min_delta: minimum improvement of val_accuracy that counts as progress
    # patience: number of epochs without improvement before training stops
    # restore_best_weights: roll back to the best epoch when stopping early, so
    #   evaluation does not use the weights of the last (worse) epoch
    earlystop_callback = EarlyStopping(monitor='val_accuracy', min_delta=0.0001, patience=5,
                                       restore_best_weights=True)

    checkpoint_path = os.path.join(DATA_OUT_PATH, model_name, 'weights.h5')
    checkpoint_dir = os.path.dirname(checkpoint_path)

    # Create the checkpoint directory unless it already exists
    if os.path.exists(checkpoint_dir):
        print("{} -- Folder already exists \n".format(checkpoint_dir))
    else:
        os.makedirs(checkpoint_dir, exist_ok=True)
        print("{} -- Folder create complete \n".format(checkpoint_dir))

    # A subclassed Keras model cannot be written as a full model to an HDF5
    # file, so only the weights are checkpointed.
    cp_callback = ModelCheckpoint(
        filepath=checkpoint_path, monitor='val_accuracy', verbose=1, save_best_only=True, save_weights_only=True)

    # Learning and evaluation
    history = cls_model.fit(train_inputs, train_data_labels, epochs=NUM_EPOCHS, batch_size=BATCH_SIZE,
                            validation_split=VALID_SPLIT, callbacks=[earlystop_callback, cp_callback])

    plot_graphs(history, 'loss')

    # output data
    test_inputs, test_data_labels = _encode_split(X_test, y_test)

    # Evaluate with the training batch size; 1024 sequences of MAX_LEN tokens
    # per forward pass is likely to exhaust accelerator memory.
    results = cls_model.evaluate(test_inputs, test_data_labels, batch_size=BATCH_SIZE)
    print("test loss, test acc: ", results)

    print("num sents, labels {}, {}".format(len(X_test), len(test_data_labels)))

    y_pred = cls_model.predict(test_inputs)
    y_pred_arg = np.argmax(y_pred, axis=1)
    score = f1_score(test_data_labels, y_pred_arg, average='micro')
    scores.append(score)

In [None]:
# Confusion matrix of the most recent fold only (the loop variables are
# overwritten on every iteration). Passing the full label list keeps the
# matrix 6x6 even when a class does not occur in this fold.
confusion_matrix(test_data_labels, y_pred_arg, labels=list(range(6)))

In [None]:
# Per-fold micro-averaged F1 scores appended by the cross-validation loop.
print(scores)

In [None]:
# Mean of the per-fold F1 scores collected in `scores`.
print("average f1 score: ", np.asarray(scores).mean())