In [None]:
# For Google Colab, use an A100 GPU runtime

# **Step 1:** Set up the environment and import the necessary libraries

In [None]:
# Install the notebook's third-party dependencies into the Colab runtime.
# tltk is not imported directly below; it is selected by name via engine="tltk" in word_tokenize.
!pip install tltk
!pip install pythainlp
!pip install llama-index-llms-openai-like
# Upgrade the OpenAI client, which provides OpenAI and APIError imported below.
!pip install --upgrade openai

In [None]:
import json, re, time, math
import os
from collections import defaultdict

from llama_index.core.llms import ChatMessage, MessageRole
from llama_index.llms.openai_like import OpenAILike
from openai import OpenAI, APIError
from pythainlp.tokenize import word_tokenize
from pythainlp.tag import pos_tag
from tqdm import tqdm

In [None]:
def clean_text(text):
  """Strip markup-like noise from `text` and normalise its whitespace.

  Steps, in order:
    1. remove every `<...>` span that sits on a single line,
    2. delete the characters # [ ] ( ) = and the curly double quotes,
    3. collapse every run of whitespace (spaces, tabs, newlines) to one space
       and trim both ends.

  Returns the cleaned string.
  """
  without_tags = re.sub(r'<.*?>', '', text)

  # One translation table removes all unwanted single characters in one pass.
  unwanted_chars = str.maketrans('', '', '#[]()=“”')
  without_symbols = without_tags.translate(unwanted_chars)

  # split() with no argument splits on any whitespace run, so re-joining with
  # a single space also removes separators such as \n and \t.
  return ' '.join(without_symbols.split())

In [None]:
def compound_word_constructor(word_list):
  """Merge POS-tagged words into compound (noun-phrase-like) words.

  Args:
    word_list: sequence of (word, tag) pairs, as produced by `pos_tag`. The
      tag names tested below (NCMN, NPRP, VACT, VSTA, FIXN, RPRE, ADVN, PUNC)
      are the only ones that influence merging; any other tag ends a compound.

  Returns:
    [compound_word_list, tokens_list] where
      * compound_word_list holds every merged compound, in order, and
      * tokens_list is the input sequence with each compound collapsed into
        one string and every other word kept as-is.

  Rules (a compound is opened by a noun or a prefix):
    * noun                       -> always joins the open compound
    * verb                       -> joins after a prefix, or between a noun and
                                    a following noun/punctuation, unless the
                                    word itself is a verb-to-be
    * prefix                     -> joins when followed by a verb, otherwise it
                                    closes the compound and opens a new one
    * adverb                     -> joins only directly after a noun
    * preposition                -> joins between noun/verb and noun/prefix
    * punctuation / anything else -> closes the compound

  Fixes relative to the previous implementation:
    * the verb-to-be exclusion compared the POS *tag* against the list of Thai
      words (always true); it now compares the *word*;
    * a compound still open when the input ends is emitted instead of lost;
    * a word that closes a compound without joining it is kept in tokens_list
      instead of being dropped, and a non-joining adverb now closes the
      compound like every other non-joining tag.
  """
  noun_list = ["NCMN", "NPRP"]
  verb_list = ["VACT", "VSTA"]
  prefix_list = ["FIXN"]
  preposition_list = ["RPRE"]
  adverb_list = ["ADVN"]
  end_list = ["PUNC"]
  verb_to_be = ["เป็น", "อยู่", "คือ"]

  compound_word = []
  compound_word_list = []
  tokens_list = []

  def flush_compound():
    # Emit the open compound (if any) to both outputs and reset it.
    if compound_word:
      joined = "".join(compound_word)
      compound_word_list.append(joined)
      tokens_list.append(joined)
      compound_word.clear()

  # Preserved special case from the original code: a two-item input is passed
  # through word-by-word and never produces a compound.
  if len(word_list) == 2:
    return [compound_word_list, [word for word, _ in word_list]]

  last_index = len(word_list) - 1
  for i, (word, tag) in enumerate(word_list):
    # No compound open: only a noun or a prefix may start one.
    if not compound_word:
      if tag in noun_list or tag in prefix_list:
        compound_word.append(word)
      else:
        tokens_list.append(word)
      continue

    # Punctuation always terminates the open compound.
    if tag in end_list:
      flush_compound()
      tokens_list.append(word)
      continue

    # A compound is open, so i >= 1 and the previous item exists.
    prev_tag = word_list[i-1][1]
    # The last word has no successor; "" matches none of the tag lists.
    next_tag = word_list[i+1][1] if i < last_index else ""

    if tag in prefix_list:
      if next_tag not in verb_list:
        # Prefix not followed by a verb: close the current compound and let
        # the prefix open the next one.
        flush_compound()
      compound_word.append(word)
      continue

    if tag in noun_list:
      joins = True
    elif tag in verb_list:
      after_prefix = prev_tag in prefix_list
      between_nouns = (prev_tag in noun_list) and (next_tag in noun_list or next_tag in end_list)
      # Compare the WORD (not the tag) against the verb-to-be list.
      joins = (word not in verb_to_be) and (after_prefix or between_nouns)
    elif tag in adverb_list:
      joins = prev_tag in noun_list
    elif tag in preposition_list:
      joins = ((prev_tag in noun_list or prev_tag in verb_list)
               and (next_tag in noun_list or next_tag in prefix_list))
    else:
      joins = False

    if joins:
      compound_word.append(word)
    else:
      flush_compound()
      tokens_list.append(word)

  # Emit a compound that runs up to the end of the input.
  flush_compound()

  return [compound_word_list, tokens_list]

# **Step 2:** Load sample data

In [None]:
# Mount Google Drive so the dataset paths under /content/drive used by later cells resolve.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
data_samples = []
# Read the sample articles; the context manager closes the file handle even
# if JSON parsing fails (the previous bare open() never closed it).
with open('/content/drive/MyDrive/Thai-Healthcare-Dataset/sample_articles.json', encoding="utf8") as file:
  data = json.load(file)
# `samples` is the name the annotation loop indexes into.
samples = data

# **Step 3:** Generate the generative words and use Typhoon to suggest their SNOMED CT top-level concepts

In [None]:
def generate_words(text):
  """Tokenize `text`, POS-tag it, and build compound words from the tags.

  The text is tokenized with the "tltk" engine, tagged with `pos_tag`, and
  the tagged pairs are handed to `compound_word_constructor`.

  Returns a (generative_words, tokens) tuple taken from the two-element
  result of `compound_word_constructor`.
  """
  tagged_words = pos_tag(word_tokenize(text, engine="tltk"))
  compounds, token_sequence = compound_word_constructor(tagged_words)
  return compounds, token_sequence

In [None]:
# System-role text for the annotator model (typos "domian" / "Youalways" corrected).
prompt_role = "You are Named Entity Recognition (NER) annotator named ThaiNer. You are an expert in the Thai healthcare domain. You always answer in Thai"

In [None]:
# Prefer a key supplied through the OPENAI_API_KEY environment variable so no
# real credential has to be typed into the notebook source; the original
# placeholder is kept as the fallback so the cell still runs unchanged.
openAI = OpenAI(api_key=os.environ.get("OPENAI_API_KEY", "YOUR_OPENAI_API"))

In [None]:
# SECURITY: the API key used to be hard-coded in this cell. A key that has been
# committed to a notebook should be treated as leaked and revoked. The key is
# now read from the TYPHOON_API_KEY environment variable instead.
typhoon_api_key = os.environ.get("TYPHOON_API_KEY")
if not typhoon_api_key:
  raise RuntimeError("Set the TYPHOON_API_KEY environment variable before running this cell.")

# NOTE(review): both max_tokens=2048 and max_new_tokens=2046 are passed, as in
# the original configuration — confirm which one the client actually honours.
typhoon = OpenAILike(model="typhoon-v2-70b-instruct",
                api_base="https://api.opentyphoon.ai/v1",
                context_window=8192,
                is_chat_model=True,
                max_tokens=2048,
                is_function_calling_model=False,
                request_timeout=180,
                max_new_tokens=2046,
                api_key=typhoon_api_key)
# Sends the role prompt once as a SYSTEM message.
# NOTE(review): annotateWithTyphoon later sends only a USER message, so this
# system prompt is presumably not in effect for the annotation calls — confirm
# whether it should be included with every request.
response = typhoon.chat([ChatMessage(role=MessageRole.SYSTEM, content=prompt_role)])

In [None]:
def getPromptNER(words):
  """Build the NER annotation prompt for `words`.

  Args:
    words: the text to annotate; it is interpolated verbatim into the prompt
      (the caller passes a comma-separated list of words).

  Returns:
    The full prompt string: the word list, the 18 SNOMED CT top-level concept
    descriptions, the required JSON output shape, and one Thai example.

  The garbled phrase "findin|hierarchy" in the Clinical finding description
  was corrected to "finding hierarchy". Doubled braces ({{ }}) are literal
  braces inside the f-string.
  """
  return f'''
Please annotate each word in list below:
{words}

follow a list of the Top Level Concepts of SNOMED CT with a brief description of the content represented in their branch of the hierarchy below:
1. Body structure represents normal and abnormal anatomical structures (e.g. mitral valve structure, adenosarcoma).
2. Clinical finding represents the result of a clinical observation, assessment or judgment and includes normal and abnormal clinical states (e.g. asthma, headache, normal breath sounds). The clinical finding hierarchy includes concepts used to represent diagnoses.
3. Environments and geographical locations represents types of environments as well as named locations such as countries, states and regions (e.g. intensive care unit, academic medical center, Denmark).
4. Event represents occurrences excluding procedures and interventions (e.g. flood, earthquake).
5. Observable entity represents a question or assessment which can produce an answer or result (e.g. systolic blood pressure, color of iris, gender).
6. Organism represents organisms of significance in human medicine (e.g. streptococcus pyogenes, domain Bacteria, glossina).
7. Pharmaceutical/biologic product represents drug products (e.g. amoxicillin 250mg oral capsule, product containing codeine and paracetamol).
8. Physical force represents physical forces that can play a role as mechanisms of injury (e.g. friction, radiation, alternating current).
9. Physical object represents natural and man-made physical objects (e.g. vena cava filter, implant device, automobile).
10. Procedure represents activities performed in the provision of health care. This includes not only invasive procedures but also administration of medicines, imaging, education, therapies and administrative procedures (e.g. appendectomy, physiotherapy, injection into subcutaneous tissue).
11. Qualifier value represents the values for some SNOMED CT attributes, where those values are not subtypes of other top level concepts. (e.g. left, abnormal result, severe).
12. Record artifact represents content created for the purpose of providing other people with information about record events or states of affairs. (e.g. patient held record, record entry, family history section).
13. Situation with explicit context represents concepts in which the clinical context is specified as part of the definition of the concept itself. These include presence or absence of a condition, whether a clinical finding is current, in the past or relates to someone other than the subject of the record (e.g. endoscopy arranged, past history of myocardial infarction, family history of glaucoma).
14. Social context represents social conditions and circumstances significant to health care (e.g. occupation, spiritual or religious belief).
15. Special concept represents concepts that do not play a part in the formal logic of the concept model of the terminology, but which may be useful for specific use cases (e.g. navigational concept, alternative medicine poisoning).
16. Specimen represents entities that are obtained (usually from the patient) for examination or analysis (e.g. urine specimen, specimen from prostate obtained by needle biopsy).
17. Staging and scales represents assessment scales and tumor staging systems (e.g. Glasgow Coma Scale, FIGO staging system of gynecological malignancy).
18. Substance represents general substances, the chemical constituents of pharmaceutical/biological products, body substances, dietary substances and diagnostic substances (e.g. methane, insulin, albumin)

Remarks: If not matching any concept, please annotate the word as None. Do not annotate values other than the Top Level Concepts list.

Strictly return it in the following JSON format:
{{"entities":[{{"e":"word","t":"type"}}]}}

ตัวอย่าง:
Input: "ผู้ป่วยมีอาการไข้และไอ ใช้ยาพาราเซตามอล"
Output: {{"entities":[{{"e":"ผู้ป่วย","t": "None"}},{{"e":"อาการไข้","t":"Clinical finding"}},{{"e":"ไอ","t":"Clinical finding"}},{{"e":"ยาพาราเซตามอล","t":"Pharmaceutical/biologic product"}}]}}
'''

In [None]:
def retryWithBackoff(func, retries=3, initial_delay=1, backoff_factor=2):
  """Call `func()` and retry with exponential backoff on `APIError`.

  Args:
    func: zero-argument callable to invoke.
    retries: maximum number of attempts. With retries <= 0 `func` is never
      called and None is returned (unchanged from the original behaviour).
    initial_delay: seconds to wait before the second attempt.
    backoff_factor: multiplier applied to the delay after each failed attempt.

  Returns:
    Whatever `func()` returns on the first successful attempt.

  Raises:
    APIError: the error from the final attempt, re-raised unchanged.
  """
  for i in range(retries):
    try:
      return func()
    except APIError:  # Subclasses of APIError are caught here as well
      if i >= retries - 1:
        # A bare `raise` re-raises the caught exception with its details.
        # The old `raise APIError` raised the bare class and lost them.
        raise
      delay = initial_delay * (backoff_factor ** i)
      # Announce the wait before sleeping so the message matches what happens.
      print(f"Retrying ({i + 1}/{retries}) after {delay} seconds...")
      time.sleep(delay)

def annotateWithTyphoon(text):
  """Ask the Typhoon model to annotate `text` and return its raw reply.

  The request goes through `retryWithBackoff`, so transient `APIError`s are
  retried here. Previously this function swallowed the error itself, so the
  retry wrapper around it never saw an exception and never retried.

  Returns:
    The reply's message content, or "" (after printing "Error") when the
    request still fails after all retries.
  """
  prompt = getPromptNER(text)
  messages = [ChatMessage(role=MessageRole.USER, content=prompt)]
  try:
    response = retryWithBackoff(lambda: typhoon.chat(messages))
    message = response.message.content
    return message
  except APIError:
    print("Error")
    return ""

def annotateLabel(words, batch_size=10):
  """Annotate `words` in batches and collect the returned entity labels.

  Args:
    words: list of word strings; it is not modified.
    batch_size: number of words sent per request (default 10, the batch size
      the original code aimed for). Must be a positive integer.

  Returns:
    A list of dicts, each with string values under "e" (entity text) and
    "t" (type), in the order the model returned them.

  Fixes relative to the previous implementation:
    * every word is sent exactly once; the old loop ran `loop-1` times, so
      short lists produced no request at all, and it popped index j from a
      shrinking list, which selected every other word and emptied the
      caller's list;
    * replies that are not valid JSON, lack an "entities" list, or contain
      malformed entries are skipped instead of raising KeyError/TypeError.
  """
  labels = []
  for start in range(0, len(words), batch_size):
    prompt_words = ",".join(words[start:start + batch_size])
    extract_labels = annotateWithTyphoon(prompt_words)
    if not extract_labels:
      # The request failed (already reported) or the reply was empty.
      continue
    try:
      entities = json.loads(extract_labels)['entities']
    except (json.JSONDecodeError, KeyError, TypeError):
      print('Error')
      continue
    if not isinstance(entities, list):
      print('Error')
      continue
    for label in entities:
      # Keep only well-formed entries; later cells index label["t"] and label["e"].
      if isinstance(label, dict) and isinstance(label.get("e"), str) and isinstance(label.get("t"), str):
        labels.append(label)

  return labels

In [None]:
# Annotate up to the first 500 samples that have a non-empty Thai abstract and
# group the returned entity strings by their SNOMED CT type.
snomed_datasets = []
# Cap the range at the number of loaded samples; the fixed range(0, 500)
# raised IndexError whenever fewer than 500 samples were available.
for n in tqdm(range(0, min(500, len(samples)))):
  labels = []
  sample = samples[n]
  # Skip samples with a missing or empty abstract.
  if "abstract-th" in sample and sample["abstract-th"]:
    text = clean_text(sample["abstract-th"])
    generative_words, tokens = generate_words(text)
    labels = annotateLabel(generative_words)

    entity_dict = defaultdict(set)
    for label in labels:
      # The labels come from model output: ignore entries without string
      # "t"/"e" values instead of failing on a KeyError or unhashable key.
      if isinstance(label, dict) and isinstance(label.get("t"), str) and isinstance(label.get("e"), str):
        entity_dict[label["t"]].add(label["e"])

    # type name -> list of distinct entity strings
    snomed_types = {type_name: list(entity_set) for type_name, entity_set in entity_dict.items()}

    snomed_datasets.append({"text":text, "types": snomed_types})

In [None]:
def sortByStart(e):
  """Sort key: return the value stored under 'start' in `e`."""
  start_value = e['start']
  return start_value

# **Step 4:** Group the generative keywords aligned with SNOMED CT

In [None]:
# Accepted SNOMED CT top-level concept names; entity types outside this list
# are ignored when entity positions are computed.
snomed_type = [
    "Body structure",
    "Clinical finding",
    "Environments and geographical locations",
    "Event",
    "Observable entity",
    "Organism",
    "Pharmaceutical/biologic product",
    "Physical force",
    "Physical object",
    "Procedure",
    "Qualifier value",
    "Record artifact",
    "Situation with explicit context",
    "Social context",
    "Special concept",
    "Specimen",
    "Staging and scales",
    "Substance",
]

In [None]:
# Per-type counters, all starting at zero, keyed by the SNOMED CT top-level
# concept names in their listing order.
snomed_type_count = dict.fromkeys(
    (
        "Body structure",
        "Clinical finding",
        "Environments and geographical locations",
        "Event",
        "Observable entity",
        "Organism",
        "Pharmaceutical/biologic product",
        "Physical force",
        "Physical object",
        "Procedure",
        "Qualifier value",
        "Record artifact",
        "Situation with explicit context",
        "Social context",
        "Special concept",
        "Specimen",
        "Staging and scales",
        "Substance",
    ),
    0,
)

In [None]:
# For every annotated article, find the character span of each occurrence of
# each entity string in the cleaned text, resolve conflicts between spans, and
# store the result in position_datasets[n] as
# {"text": ..., "entities": [{"start", "end", "type", "token"}, ...], "tokens": [...]}.
# Note: this initial assignment has no effect; the for loop rebinds n.
n = 0
position_datasets = {}
for n in tqdm(range(len(snomed_datasets))):
  text = clean_text(snomed_datasets[n]['text'])
  types = snomed_datasets[n]['types']
  entities = []
  tokens = []
  position_datasets[n] = {"text": text, "entities": [], "tokens": []}

  for key in types:
    # Only types named in snomed_type are processed; others are skipped.
    if key in snomed_type:
      for item in types[key]:
        # Counts one per (type, entity string) pair. This mutates the
        # module-level counter, so re-running the cell keeps adding to it.
        snomed_type_count[key] = snomed_type_count[key] + 1
        char_idx = 0
        token_end = 0
        # Number of non-overlapping occurrences of the entity string.
        token_num = text.count(item)
        token_list = []
        # Collect [string, start, end] for each occurrence, searching onward
        # from the end of the previous match.
        for k in range(token_num):
          token_start = text.find(item, char_idx)
          token_end = token_start + len(item)
          char_idx = token_end
          token_list.append([item, token_start, token_end])

        # i is incremented below but never read in this cell.
        i = 0
        for token in token_list:
          token_start = token[1]
          token_end = token[2]
          token_text = token[0]

          # dup is set when this span conflicts with an already stored entity;
          # a conflicting span is never appended as a new entity.
          dup = False
          if len(entities) > 0:
            for j, ent in enumerate(entities):
              # Same start: the longer span wins (the stored entity is
              # overwritten in place when the new span ends later).
              if token_start == ent["start"]:
                if token_end < ent["end"]:
                  dup = True
                elif token_end > ent["end"]:
                  entities[j]["start"] = token_start
                  entities[j]["end"] = token_end
                  entities[j]["type"] = key
                  entities[j]["token"] = token_text
                  dup = True
                else:
                  dup = True

              # Same end: the longer span wins (overwritten in place when the
              # new span starts earlier).
              if token_end == ent["end"]:
                if token_start > ent["start"]:
                  dup = True
                elif token_start < ent["start"]:
                  entities[j]["start"] = token_start
                  entities[j]["end"] = token_end
                  entities[j]["type"] = key
                  entities[j]["token"] = token_text
                  dup = True
                else:
                  dup = True

              # New span strictly inside a stored one: drop the new span.
              if token_start > ent["start"] and token_end < ent["end"]:
                dup = True

              # New span strictly contains a stored one: the new span is
              # dropped too and the stored (shorter) entity is kept.
              if token_start < ent["start"] and token_end > ent["end"]:
                dup = True

              # No effect: enumerate rebinds j on the next iteration.
              j = j+1

          if dup == False:
            entities.append({"start": token_start, "end": token_end, "type": key, "token": item})

          i = i+1
  entities.sort(key=sortByStart)

  new_set = entities
  # De-duplicate entities that serialize to the same JSON text (the last one wins).
  unique_data = list({json.dumps(item, ensure_ascii=False): item for item in new_set}.values())

  # If you want to sort them (optional)
  unique_data = sorted(unique_data, key=lambda x: (x['start'], x['end']))

  new_token_list = []
  # tokens is filled here, before the overlap removal below, so it can include
  # strings of entities that are deleted from new_token_list afterwards.
  for item in unique_data:
      new_token_list.append(item)
      tokens.append(item['token'])
  m = 0
  newlen = len(new_token_list)
  # Compare each entity with its successor and delete the successor when it is
  # strictly nested in, or starts inside and runs past, the current entity.
  # NOTE(review): the list is modified while it is being iterated, so after a
  # deletion the loop advances without re-checking the current entity against
  # its new neighbour — confirm this is intended.
  for item in new_token_list:
    if m == newlen-1:
      break
    else:
      next_token = new_token_list[m+1]
      if next_token['start'] > item["start"] and next_token["end"] < item["end"]:
        del new_token_list[m+1]
        newlen = len(new_token_list)
      if (next_token['start'] > item["start"] and next_token['start'] <= item["end"]) and next_token["end"] > item["end"]:
        del new_token_list[m+1]
        newlen = len(new_token_list)
    m = m+1

  position_datasets[n]["entities"] = new_token_list
  # Distinct entity strings; set() does not preserve order.
  position_datasets[n]["tokens"] = list(set(tokens))

  # No effect: the for loop rebinds n on the next iteration.
  n = n+1

# **Step 5:** Create BIO-tagged dataset

In [None]:
# Build, for every article, a flat [text, tag] sequence: each entity is emitted
# as [entity string, entity type] and the text between entities is tokenized
# and emitted as [token, 'O'].
#
# The previous branch-per-case version lost text in several situations: the
# gap before the last entity was never emitted, and the text after the last
# entity was dropped when that entity was the only one or directly followed
# the previous entity. A single cursor walk covers all of these cases.
bio_tags = []
for item in tqdm(position_datasets):
  data = position_datasets[item]
  text = clean_text(data["text"])
  text_len = len(text)
  dataset = []
  # startidx is the first character not yet emitted.
  startidx = 0
  for entity in data["entities"]:
    if entity['start'] > startidx:
      # Untagged text between the cursor and this entity.
      sentence = text[startidx:entity['start']]
      tokens = word_tokenize(sentence, engine="tltk", keep_whitespace=False)
      for token in tokens:
        dataset.append([token, 'O'])
    dataset.append([entity['token'], entity['type']])
    # Never move the cursor backwards if an entity ends before the previous
    # one did; otherwise already-emitted text would be emitted again.
    startidx = max(startidx, entity['end'])

  if startidx < text_len:
    # Untagged text after the last entity (or the whole text when the article
    # has no entities).
    sentence = text[startidx:text_len]
    tokens = word_tokenize(sentence, engine="tltk", keep_whitespace=False)
    for token in tokens:
      dataset.append([token, 'O'])
  bio_tags.append(dataset)

In [None]:
def align_tokens_to_bio(text, tokens, entities):
  """Assign a BIO tag to every token by locating it in `text`.

  Args:
    text: the string the tokens were produced from.
    tokens: token strings in text order. Tokens containing "<Fail>" are
      dropped from the output.
    entities: dicts with "start", "end" (character offsets into `text`) and
      "type".

  Returns:
    [new_tokens, types, entity_data] where new_tokens and types have the same
    length and are index-aligned, and entity_data holds one
    "type:<tag> token:<token>" string per (token, entity) match.

  A token gets "B-<type>" when it lies inside an entity span and starts at
  the span's start, "I-<type>" when it lies inside but starts later, and "O"
  otherwise. If several entities contain the token, the last one wins.

  Fixes relative to the previous implementation:
    * `types` used to be sized by the unfiltered token list, so skipping a
      "<Fail>" token left it longer than, and shifted against, new_tokens;
    * a token that cannot be found in `text` (find() == -1) is tagged "O" and
      no longer moves the search cursor, which previously corrupted the
      position of every following token.
  """
  new_tokens = []
  types = []
  entity_data = []
  char_idx = 0

  for token in tokens:
    if "<Fail>" in token:
      continue
    new_tokens.append(token)

    tag = 'O'
    token_start = text.find(token, char_idx)
    if token_start >= 0:
      token_end = token_start + len(token)
      char_idx = token_end

      for ent in entities:
        if token_start >= ent["start"] and token_end <= ent["end"]:
          if token_start == ent["start"]:
            tag = f"B-{ent['type']}"
          else:
            tag = f"I-{ent['type']}"
          entity_data.append("type:"+tag+" token:"+token)
    types.append(tag)
  return [new_tokens, types, entity_data]

In [None]:
# One record per article: the cleaned text, its whitespace-preserving tokens
# and the BIO tag computed for each token.
bio_datasets = []
for entry in tqdm(position_datasets.values()):
  text = clean_text(entry["text"])
  entities = entry["entities"]
  tokens = word_tokenize(text, engine="tltk", keep_whitespace=True)

  # align_tokens_to_bio returns [tokens, tags, entity_data]; only the first
  # two elements are stored.
  ner_tags = align_tokens_to_bio(text, tokens, entities)
  bio_datasets.append(dict(lang="th", tokens=ner_tags[0], ner_tags=ner_tags[1], text=text))

In [None]:
# Persist the BIO dataset to Drive as indented UTF-8 JSON with Thai characters
# written as-is rather than \u-escaped.
with open("/content/drive/MyDrive/Thai-Healthcare-Dataset/bio_dataset_500.json", "w", encoding="utf-8") as f:
    f.write(json.dumps(bio_datasets, ensure_ascii=False, indent=2))

# **Step 6:** Create instruction-tuned dataset

In [None]:
def bio_list_to_html_bio(tokens, labels):
    """Render token/label pairs as one string of HTML-like tags.

    Each pair becomes `<label>token</label>` after surrounding whitespace is
    stripped from both the label and the token, so an "O" label yields
    `<O>token</O>`. Pairs are concatenated without a separator. Pairing
    stops at the shorter of the two sequences.
    """
    pieces = []
    for raw_token, raw_label in zip(tokens, labels):
        tag = raw_label.strip()
        pieces.append("<{0}>{1}</{0}>".format(tag, raw_token.strip()))
    return "".join(pieces)

In [None]:
def build_instruction_prompt(text):
    """Return the instruction prompt for BIO-tagging `text`.

    The prompt consists of the task description, the tag-wrapping rule, the
    given text under "Input:", and a trailing "Output:" marker.
    """
    task = "Annotate the given Thai medical text using the BIO tagging format according to SNOMED CT top-level concepts. "
    tag_rule = "Wrap each word using tags like <O>...</O>, <B-Label>...</B-Label>, <I-Label>...</I-Label>.\n\n"
    # join() accepts only strings, so a non-string `text` still raises
    # TypeError as plain concatenation does.
    return "".join([task, tag_rule, "Input:\n", text, "\n\nOutput:"])

In [None]:
# Convert each BIO record into an instruction-tuning example with the fields
# "instruction", "input" and "output".
converted_data = []
for item in bio_datasets:
    input_text = item["text"]
    html_output = bio_list_to_html_bio(item["tokens"], item["ner_tags"])
    converted_data.append({
        # The notebook defines build_instruction_prompt(text); the previously
        # called create_instruction_prompt does not exist and raised NameError.
        "instruction": build_instruction_prompt(input_text),
        "input": input_text,
        "output": html_output
    })

In [None]:
# Persist the instruction-tuning examples to Drive as indented UTF-8 JSON with
# Thai characters written as-is rather than \u-escaped.
with open("/content/drive/MyDrive/Thai-Healthcare-Dataset/thai_healthcare_instruction.json", "w", encoding="utf-8") as f:
    f.write(json.dumps(converted_data, ensure_ascii=False, indent=2))