In [1]:
from keras.layers import Input, Embedding, Conv1D, LSTM, Dense, Bidirectional
from keras.models import Model, Sequential
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import json
import numpy as np
import nltk
nltk.download('maxent_ne_chunker')
nltk.download('words')
from nltk import word_tokenize, pos_tag, ne_chunk
from nltk.chunk import conlltags2tree, tree2conlltags
from sklearn.metrics import classification_report

[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\GIGABYTE\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\GIGABYTE\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


In [2]:
# Load JSON

def load_data_from_json(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        data = [json.loads(line) for line in file]

    words = [entry["words"] for entry in data]
    tags = [entry["tags"] for entry in data]

    return words, tags


train_words, train_tags = load_data_from_json('PhoNER_COVID19-main-BIO/PhoNER_COVID19-main/data/syllable/train_syllable.json')
dev_words, dev_tags = load_data_from_json('PhoNER_COVID19-main-BIO/PhoNER_COVID19-main/data/syllable/dev_syllable.json')
test_words, test_tags = load_data_from_json('PhoNER_COVID19-main-BIO/PhoNER_COVID19-main/data/syllable/test_syllable.json')

def merge_data(words1, tags1, words2, tags2):
    merged_words = words1 + words2
    merged_tags = tags1 + tags2
    return merged_words, merged_tags

traindev_words, traindev_tags = merge_data(train_words, train_tags, dev_words, dev_tags)


In [3]:
# Tokenize words
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_words)

# Đổi chữ thành ID
traindev_sequences = tokenizer.texts_to_sequences(traindev_words)
train_sequences = tokenizer.texts_to_sequences(train_words)
dev_sequences = tokenizer.texts_to_sequences(dev_words)
test_sequences = tokenizer.texts_to_sequences(test_words)


# Đổi tag thành ID
tag_to_index = {'O': 0, 'B-AGE': 1, 'B-DATE': 2, 'B-GENDER': 3, 'B-JOB': 4, 'B-LOCATION': 5, 'B-NAME': 6, 'B-ORGANIZATION': 7, 'B-PATIENT_ID': 8, 'B-SYMPTOM_AND_DISEASE': 9, 'B-TRANSPORTATION': 10, 'I-AGE': 11, 'I-DATE': 12, 'I-GENDER': 13, 'I-JOB': 14, 'I-LOCATION': 15, 'I-NAME': 16, 'I-ORGANIZATION': 17, 'I-PATIENT_ID': 18, 'I-SYMPTOM_AND_DISEASE': 19, 'I-TRANSPORTATION': 20}

traindev_tags = [[tag_to_index.get(tag) for tag in seq] for seq in traindev_tags]
train_tags = [[tag_to_index.get(tag) for tag in seq] for seq in train_tags]
dev_tags = [[tag_to_index.get(tag) for tag in seq] for seq in dev_tags]
test_tags = [[tag_to_index.get(tag) for tag in seq] for seq in test_tags]

In [4]:
# Get the word-to-index mapping
word_index = tokenizer.word_index

# Print the ID of a specific token
token_to_check = '"'
token_id = word_index.get(token_to_check)

if token_id is not None:
    print(f"The ID of '{token_to_check}' is: {token_id}")
else:
    print(f"{token_to_check} is not in the vocabulary.")


The ID of '"' is: 9


In [5]:
def remove_tokens(data, tags, tokens_to_remove):
    new_data = []
    new_tags = []
    for seq, tag_seq in zip(data, tags):
        new_seq = [word for word in seq if word not in tokens_to_remove]
        new_tag_seq = [tag for word, tag in zip(seq, tag_seq) if word not in tokens_to_remove]
        new_data.append(new_seq)
        new_tags.append(new_tag_seq)
    return new_data, new_tags

tokens_to_remove = [1, 4, 9, 32, 33, 151, 769]

# Remove instances of the specified tokens
traindev_sequences, traindev_tags = remove_tokens(traindev_sequences, traindev_tags, tokens_to_remove)
train_sequences, train_tags = remove_tokens(train_sequences, train_tags, tokens_to_remove)
dev_sequences, dev_tags = remove_tokens(dev_sequences, dev_tags, tokens_to_remove)
test_sequences, test_tags = remove_tokens(test_sequences, test_tags, tokens_to_remove)


In [6]:
# Print the modified data
print(train_sequences[1])

[56, 2, 7, 11, 210, 31, 64, 2, 3, 140, 99, 299, 8, 190, 105, 81, 661, 870, 142, 512, 11, 347, 147, 188, 109, 2, 3, 63, 770, 95, 47, 12, 68, 228, 49, 172, 157, 745, 247, 871, 109, 2, 3, 143, 2, 905, 2429, 223, 61, 11, 88, 237, 12, 24, 35]


In [7]:
# Pad chuỗi entry theo câu
max_seq_len = max_seq_len = 150
traindev_data = pad_sequences(traindev_sequences, maxlen=max_seq_len, padding='post', truncating='post')
train_data = pad_sequences(dev_sequences, maxlen=max_seq_len, padding='post', truncating='post')
dev_data = pad_sequences(dev_sequences, maxlen=max_seq_len, padding='post', truncating='post')
test_data = pad_sequences(test_sequences, maxlen=max_seq_len, padding='post', truncating='post')

# Pad chuỗi tag theo câu
traindev_tags = pad_sequences(traindev_tags, maxlen=max_seq_len, padding='post')
train_tags = pad_sequences(train_tags, maxlen=max_seq_len, padding='post')
dev_tags = pad_sequences(dev_tags, maxlen=max_seq_len, padding='post')
test_tags = pad_sequences(test_tags, maxlen=max_seq_len, padding='post')

In [8]:
print(train_data[1])

[   2    3  808   37 1233   34   15    3  117  518 2154 1665 1485  171
  821  411 1923   58   55  209   10   92  348   21  680  150   21    2
    3 1028  134   15    3  117 1665 2154   54  191  222   38   39    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0]


In [9]:
# Module load file w2v
def load_embeddings(file_path):
    embeddings = {}
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            parts = line.strip().split()
            word = parts[0]
            vector = list(map(float, parts[-300:])) #  dims vector embedding
            embeddings[word] = vector
    return embeddings

def create_embedding_matrix(tokenizer, embeddings, embedding_dim):
    vocab_size = len(tokenizer.word_index) + 1
    embedding_matrix = np.zeros((vocab_size, embedding_dim))

    for word, i in tokenizer.word_index.items():
        embedding_vector = embeddings.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

    return embedding_matrix


embedding_file_path = 'word2vec_vi_syllables_300dims.txt' # Input path file embedding (.txt)
word_embeddings = load_embeddings(embedding_file_path)
embedding_dim = 300  # dims vector embedding


# Load pre-train embedding
embeddings = load_embeddings(embedding_file_path)
# Embedding matrix
embedding_matrix = create_embedding_matrix(tokenizer, embeddings, embedding_dim)

In [10]:
print(dev_data[42])

[  98    2    3   20  125  171 1005  422   11   31   57  111   16    2
    3   86  325  194    2    3  852   67   36   34  798   37 1068   34
  207  724   37  375   34  713   67  666   34   15 3283  182   48    2
    3   86  325    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0]


In [11]:
print(dev_tags[42])

[ 0  0  0  0  5 15 15 15  0  0  0  0  0  0  0  0  8  0  0  0  8  3  1  0
  8  3  1  0  0  8  3  1  0  8  3  1  0  0  0  0  0  0  0  0  8  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0]


In [12]:
lstm_units = 64
num_classes = len(tag_to_index)

model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=embedding_dim, weights=[embedding_matrix], input_length=max_seq_len, trainable=True)) # trainable = False
model.add(Conv1D(filters=64, kernel_size=3, activation='relu',padding='same'))
model.add(Bidirectional(LSTM(units=lstm_units, return_sequences=True)))  # Bidirectional LSTM
model.add(Dense(num_classes, activation='softmax'))
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 150, 300)          1019400   
                                                                 
 conv1d (Conv1D)             (None, 150, 64)           57664     
                                                                 
 bidirectional (Bidirection  (None, 150, 128)          66048     
 al)                                                             
                                                                 
 dense (Dense)               (None, 150, 21)           2709      
                                                                 
Total params: 1145821 (4.37 MB)
Trainable params: 1145821 (4.37 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [13]:
model.fit(traindev_data, traindev_tags, validation_data=(test_data, test_tags), epochs=10, batch_size=32)


_, accuracy = model.evaluate(test_data, test_tags)
print(f'Accuracy: {accuracy * 100:.2f}%')

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy: 98.20%


In [14]:
predictions = model.predict(test_data)
pred_tags = np.argmax(predictions, axis=-1)

print(classification_report(test_tags.flatten(), pred_tags.flatten(), target_names=list(tag_to_index.keys())))



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                       precision    recall  f1-score   support

                    O       0.99      0.99      0.99    421675
                B-AGE       0.73      0.67      0.70       517
               B-DATE       0.87      0.89      0.88      1577
             B-GENDER       0.79      0.70      0.75       396
                B-JOB       0.71      0.37      0.49       171
           B-LOCATION       0.71      0.71      0.71      4260
               B-NAME       0.79      0.62      0.69       252
       B-ORGANIZATION       0.85      0.74      0.79       768
         B-PATIENT_ID       0.88      0.88      0.88      1657
B-SYMPTOM_AND_DISEASE       0.80      0.77      0.78      1109
     B-TRANSPORTATION       0.75      0.60      0.67       157
                I-AGE       0.00      0.00      0.00         6
               I-DATE       0.90      0.95      0.92      1665
             I-GENDER       0.00      0.00      0.00         1
                I-JOB       0.68      0.37      0.47  

  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# JOB luôn là tag perform tệ nhất ở các lần thử, các model khác nhau, do ít dữ liệu train?

In [None]:
predictions = model.predict(test_data)
pred_tags = np.argmax(predictions, axis=-1)

print(classification_report(test_tags.flatten(), pred_tags.flatten(), target_names=list(tag_to_index.keys())))