# Read JSON annotation files and convert them into BIO files

In [None]:
import nltk
from nltk.tokenize import word_tokenize, WordPunctTokenizer, RegexpTokenizer
import re
import json
import os
import shutil
import random
from glob import glob
import torch

def list_files_in_directory(directory_path):
    """Return the paths of the regular files directly inside *directory_path*.

    Sub-directories are ignored. Paths are built as ``directory_path + '/' + name``
    (not ``os.path.join``) so that code splitting on ``'/'`` to recover the file
    name keeps working. The result is sorted so runs are reproducible.

    If the directory cannot be listed, the error is printed and an empty list
    is returned.
    """
    try:
        items = os.listdir(directory_path)
    except OSError as e:
        print(f"An error occurred: {e}")
        return []
    return [
        directory_path + '/' + item
        for item in sorted(items)
        if os.path.isfile(os.path.join(directory_path, item))
    ]


# NOTE: the helper above must be defined before this call; the original order
# (call first, definition later) raised NameError on a fresh kernel.
file_path_list = list_files_in_directory('train_json/All_2')  # path of json files.
bio_path = 'example'  # path to store the bio files.


nltk.download('punkt')
# Tokens are: runs of word characters, the separators / ; - , or any other
# single non-space, non-word character.
tokenizer = RegexpTokenizer(r'\w+|[/;\-]|[^\w\s]', flags=re.UNICODE)

def tokenize_with_positions(sentence):
    """Tokenize *sentence* and locate every token in the original string.

    Returns a ``(tokens, positions)`` pair where ``positions[k]`` is the
    ``(start, end)`` character span of ``tokens[k]`` in *sentence* (end
    exclusive). Each token is searched for starting at the end of the previous
    one, so repeated words map to successive occurrences.
    """
    tokens = list(tokenizer.tokenize(sentence))
    positions = []
    cursor = 0  # end of the previously located token
    for piece in tokens:
        begin = sentence.index(piece, cursor)
        cursor = begin + len(piece)
        positions.append((begin, cursor))
    return tokens, positions


def _collect_entities(content, indexes):
    """Gather the entity annotations of one document.

    *indexes* is iterated as a mapping whose values may carry an ``'Entity'``
    list; only the first item of that list is used. Returns a list of dicts
    with ``start_pos``, ``end_pos``, ``entity_type`` and ``entity`` keys, or
    ``None`` if any ``'Entity'`` record is malformed (the caller then skips
    the whole file).
    """
    entities = []
    for record in indexes.values():
        if 'Entity' not in record:
            continue
        try:
            annotation = record['Entity'][0]
            begin_entity = annotation['begin']
            end_entity = annotation['end']
            entities.append({
                'start_pos': begin_entity,
                'end_pos': end_entity,
                'entity_type': annotation['semantic'],
                'entity': content[begin_entity:end_entity],
            })
        except (KeyError, IndexError, TypeError):
            return None
    return entities


def _bio_tags(positions, entities):
    """Return one BIO tag per token span in *positions*.

    Every token overlapping an entity's character range is tagged. The first
    overlapping token gets ``B-<type>`` and the following ones ``I-<type>``,
    so an entity whose start offset does not fall exactly on a token boundary
    still opens with a ``B-`` tag instead of a dangling ``I-``.
    """
    tags = ['O'] * len(positions)
    for entity in entities:
        ent_start = entity['start_pos']
        ent_end = entity['end_pos']
        ent_type = entity['entity_type']
        inside = False
        for i, (tok_start, tok_end) in enumerate(positions):
            if tok_end <= ent_start or tok_start >= ent_end:
                continue  # no overlap with this entity
            tags[i] = ('I-' if inside else 'B-') + ent_type
            inside = True
    return tags


# Make sure the output folder exists before writing into it.
os.makedirs(bio_path, exist_ok=True)

for file in file_path_list:
    # Step 0: load the document. A file that cannot be read or parsed, or that
    # lacks the expected keys, is reported and skipped; nothing is written for it.
    try:
        with open(file, 'r', encoding='utf-8', errors='ignore') as f1:
            data = json.load(f1)
        content = data["content"]
        indexes = data["indexes"]
    except (OSError, ValueError, KeyError, TypeError) as exc:
        print(f"Skipping '{file}': {exc!r}")
        continue

    # Step 1: load entities and tags.
    entity_list = _collect_entities(content, indexes)
    if entity_list is None:
        print(f"Skipping '{file}': malformed entity annotation")
        continue

    # Step 2: tokenize and assign BIO tags from the character offsets.
    tokens, positions = tokenize_with_positions(content)
    bio_tags = _bio_tags(positions, entity_list)

    # Step 3: write "<token>\t<tag>" lines. The output file is only opened once
    # the document is known to be valid, and is written as UTF-8 because the
    # BIO files are read back with encoding='utf-8'.
    out_name = os.path.splitext(os.path.basename(file))[0] + '.bio'
    with open(os.path.join(bio_path, out_name), 'w', encoding='utf-8') as f2:
        f2.writelines(f"{token}\t{tag}\n" for token, tag in zip(tokens, bio_tags))

In [None]:
# Split the BIO files into training and testing sets

def move_random_files(src_folder, dst_folder, percentage=0.2):
    """Move a random share of the files in *src_folder* into *dst_folder*.

    *dst_folder* is deleted (with all its contents) and recreated first, so it
    ends up holding only the files moved by this call. Sub-directories of
    *src_folder* are ignored.

    Args:
        src_folder: Folder to take files from. If it does not exist, a message
            is printed and nothing else happens.
        dst_folder: Folder to move the selected files into.
        percentage: Fraction of the files to move. At least one file is moved
            when the source is not empty, and never more than are available.

    Raises:
        ValueError: If *src_folder* and *dst_folder* are the same folder, since
            clearing the destination would delete the source files.
    """
    # Ensure src_folder exists
    if not os.path.exists(src_folder):
        print(f"Source folder '{src_folder}' does not exist.")
        return

    # Refuse to wipe the source folder itself.
    if os.path.abspath(src_folder) == os.path.abspath(dst_folder):
        raise ValueError("src_folder and dst_folder must be different folders.")

    # Clear or create destination folder
    if os.path.exists(dst_folder):
        shutil.rmtree(dst_folder)  # Delete existing folder and all contents
    os.makedirs(dst_folder)

    # Get all files in source folder (ignore subdirectories)
    all_files = [f for f in os.listdir(src_folder) if os.path.isfile(os.path.join(src_folder, f))]

    # Nothing to sample from: random.sample would raise on an empty population.
    if not all_files:
        print(f"\nTotal moved: 0 files to '{dst_folder}'.")
        return

    # Number to move: at least one, at most every available file.
    selected_count = min(len(all_files), max(1, int(len(all_files) * percentage)))
    selected_files = random.sample(all_files, selected_count)

    # Move files
    for file in selected_files:
        shutil.move(os.path.join(src_folder, file), os.path.join(dst_folder, file))

    print(f"\nTotal moved: {len(selected_files)} files to '{dst_folder}'.")

# Usage: move roughly 20% of the files in "example" (at least one) into
# "example_test". The destination folder is deleted and recreated by the call.
move_random_files("example", "example_test", percentage=0.2)

# Convert BIO files into LLM inputs and target outputs

In [10]:
# Folder holding the training data for named entity recognition (BIO files).
# Change this to the path of your own data.
folder_path = 'example'
# Absolute paths are derived from the folder that is actually listed, instead
# of a hard-coded machine-specific prefix, and sorted for a reproducible order.
train_files = sorted(
    os.path.join(os.path.abspath(folder_path), f)
    for f in os.listdir(folder_path)
    if f.endswith('.bio')
)  # files for NER


# Folder holding the held-out test data (BIO files). Change as needed.
test_folder_path = 'example_test'
test_files = sorted(
    os.path.join(os.path.abspath(test_folder_path), f)
    for f in os.listdir(test_folder_path)
    if f.endswith('.bio')
)

In [12]:
# Tag types used in this task (the class names of the <span> tags in the prompt).
entity_list = [
    'Language_Fluent',
    'Language_Some',
    'Language_No',
    'Language_Other',
]

In [24]:
# Instruction template for the model. The single `{}` placeholder is filled
# with the plain input text via `prompt.format(...)`; any literal brace added
# to this template later must be doubled ("{{" / "}}") for `str.format`.
prompt = '''### Your task is to generate an HTML version of an input text, using HTML <span> tags to mark up specific entities.

### Entity Markup Guides:
Use <span class="Language_Fluent"> to denote a language spoken by the patient fluently.
Use <span class="Language_Some"> to denote a language spoken by the patient in moderate level.
Use <span class="Language_No"> To denote a language which cannot be spoken or can only be spoken a little by the patient.
Use <span class="Language_Other"> to denote a language that not related to patient.

### Entity Definitions:
Language_Fluent: The person speaks the language fluently, including native speakers and those who have achieved nearnative fluency. They can use the language effectively in  various contexts with complete fluency and cultural understanding. Instances where a patient’s fluency is not explicitly stated but can be directly inferred (e.g. mention of interpreter/translator, preference of language on prescription) are included in this definition.
Language_Some: The person has a moderate level of proficiency in the language. They can understand and use the language for basic communication and simple conversations but are not fully fluent.
Language_No: The person knows a few words or phrases in the language but cannot use it for basic communication. Their knowledge is very limited and not sufficient for meaningful  interaction. Or the person does not know or speak the language at all. Cannot use this Language to communicate at all.
Language_Other: Languages mentioned in the text that are not related to the person’s language proficiency. These may be languages discussed in a different context or related to other individuals. 

### Additional Rules:
1. Only annotate the name of the language itself.
2. Do not annotate descriptive words about the language proficiency level (e.g. "some", "simple") or negations (e.g. "not", "no").
3. Hyphenated languages (ex. Chinese-Mandarin) or language connected by "/" should be labelled separately for each token.
4. Words that could refer to a language but are not used in that context (e.g. Greek yogurt, Irish Catholic, French catheter size) should not be annotated.

### Input Text: {} <EOS>
### Output Text:'''

In [14]:
# Substitutions applied in order by normalize_punctuation_spacing.
_PUNCT_SPACING_RULES = (
    # Drop whitespace in front of punctuation ("hello ,world" -> "hello,world").
    (re.compile(r'\s+([,.!?;:])'), r'\1'),
    # Put one space after punctuation that is directly followed by a
    # non-space character ("hello,world" -> "hello, world").
    (re.compile(r'([,.!?;:])(?=\S)'), r'\1 '),
    # Collapse every run of whitespace into a single space.
    (re.compile(r'\s+'), ' '),
)


def normalize_punctuation_spacing(text):
    """Normalize the spacing around ``, . ! ? ; :`` in *text*.

    Whitespace before these marks is removed, a space is inserted after a mark
    that is immediately followed by a non-space character, whitespace runs are
    collapsed to one space, and the result is stripped at both ends.
    """
    for pattern, replacement in _PUNCT_SPACING_RULES:
        text = pattern.sub(replacement, text)
    return text.strip()


def bio_file_to_text_and_highlight(bio_filename):
    """Convert a BIO file into plain text and <span>-annotated text.

    Each useful line of the file is ``token<TAB>tag``. Tokens are joined with
    single spaces. A ``B-TYPE`` token, followed by any ``I-TYPE`` tokens of the
    same type, forms one entity that is wrapped as
    ``<span class="TYPE">...</span>`` in the annotated text. Blank lines become
    newlines in both outputs, and lines without a tab are ignored.

    Returns:
        ``(plain_text, highlighted_text)``, both stripped of leading and
        trailing whitespace. The first is the LLM input, the second the LLM
        target output.

    Example:
        A file containing the single line ``english<TAB>B-Language_Some``
        gives ``("english", '<span class="Language_Some">english</span>')``.
    """
    plain_parts = []
    html_parts = []
    pending = ""       # text of the entity being assembled ("w1 w2 ")
    pending_type = ""  # its entity type

    def flush_entity():
        # Emit the entity collected so far (if any) to both outputs and reset.
        nonlocal pending, pending_type
        if pending:
            html_parts.append(f'<span class="{pending_type}">{pending.strip()}</span> ')
            plain_parts.append(pending)
        pending = ""
        pending_type = ""

    with open(bio_filename, 'r', encoding='utf-8') as handle:
        for raw in handle:
            row = raw.strip()

            if not row:
                # A blank line ends any open entity and starts a new line.
                flush_entity()
                html_parts.append("\n")
                plain_parts.append("\n")
                continue

            if "\t" not in row:
                continue  # ignore wrong lines

            token, label = row.split("\t", 1)

            if label.startswith("B-"):
                flush_entity()
                pending_type = label[2:]
                pending = token + " "
            elif label.startswith("I-") and label[2:] == pending_type:
                pending += token + " "
            else:
                # Any other tag (including an I- tag of a different type) is
                # treated as plain text.
                flush_entity()
                html_parts.append(token + " ")
                plain_parts.append(token + " ")

    flush_entity()

    return "".join(plain_parts).strip(), "".join(html_parts).strip()

In [15]:
# Sanity check on the first training file: `a` is the plain text (LLM input),
# `b` is the <span>-annotated text (LLM target output).
a,b = bio_file_to_text_and_highlight(train_files[0])

In [18]:
# For every BIO file, build the model input (the prompt template filled with
# the plain text) and the expected answer (the <span>-annotated text).
train_pairs = [bio_file_to_text_and_highlight(path) for path in train_files]
train_prompts = [prompt.format(text) for text, _ in train_pairs]
train_answers = [marked for _, marked in train_pairs]


test_pairs = [bio_file_to_text_and_highlight(path) for path in test_files]
test_prompts = [prompt.format(text) for text, _ in test_pairs]
test_answers = [marked for _, marked in test_pairs]

# Convert the data into fine-tuning dataset formats

In [39]:
# Root folder under which the generated dataset folders are written; it is the
# default `training_path` of transfer_local_data_into_huggingface_data.
path_for_training_data = '/home/jupyter/20000360102458359xu/LingfeiQian/saved_dataset/YBXL' # change to path for your training data

In [33]:
# Aliases for the dataset export step: inputs are the filled-in prompts,
# outputs are the <span>-annotated answers.
train_set_input, train_set_output = train_prompts, train_answers
test_set_input, test_set_output = test_prompts, test_answers

In [37]:
from datasets import Dataset, DatasetDict
import pandas as pd
import os
import json

def transfer_local_data_into_huggingface_data(inputs, outputs, task, train_test, training_path=None):
    """Write one dataset split to disk as parquet and return it as a DatasetDict.

    The split is stored at ``<training_path>/<task>_<train_test>/data/<train_test>.parquet``
    together with a ``.gitattributes`` file (Git LFS rule for parquet files)
    and a ``README.md``.

    Args:
        inputs: Prompts, one per example.
        outputs: Expected answers, aligned with *inputs*.
        task: Task name used as the prefix of the dataset folder.
        train_test: ``'train'`` or ``'val'`` (stored as a ``conversations``
            column of user/assistant message pairs) or ``'test'`` (stored as
            ``query`` / ``answer`` columns).
        training_path: Root output folder. Defaults to the current value of the
            module-level ``path_for_training_data``, looked up at call time so
            that re-assigning that variable takes effect without redefining
            this function.

    Returns:
        A ``DatasetDict`` with the single key *train_test*.

    Raises:
        ValueError: If *train_test* is not one of the supported split names.
    """
    # Validate before touching the file system; an unknown split previously
    # failed later with an UnboundLocalError on `data_df`.
    if train_test not in ('train', 'val', 'test'):
        raise ValueError(
            f"train_test must be 'train', 'val' or 'test', got {train_test!r}"
        )
    if training_path is None:
        training_path = path_for_training_data

    dataset_folder = os.path.join(training_path, f'{task}_{train_test}')
    data_folder = os.path.join(dataset_folder, 'data')
    os.makedirs(data_folder, exist_ok=True)

    if train_test == 'test':
        data_df = pd.DataFrame({"query": inputs, "answer": outputs})
    else:
        # 'train' / 'val': one two-turn conversation per example.
        data_list = [
            [{'role': 'user', 'content': inp}, {'role': 'assistant', 'content': oup}]
            for inp, oup in zip(inputs, outputs)
        ]
        data_df = pd.DataFrame({"conversations": data_list})

    data_dataset = Dataset.from_pandas(data_df)
    dataset_dict = DatasetDict({train_test: data_dataset})
    data_path = os.path.join(data_folder, f"{train_test}.parquet")
    dataset_dict[train_test].to_parquet(data_path)

    # Git only reads attribute rules from a file named ".gitattributes".
    gitattributes_content = """*.parquet filter=lfs diff=lfs merge=lfs -text"""
    with open(os.path.join(dataset_folder, ".gitattributes"), "w", encoding="utf-8") as f:
        f.write(gitattributes_content)

    # The README describes the one split stored in this folder, and the
    # Markdown code fence is closed.
    readme_content = f"""# My Hugging Face Conversational Dataset

This folder contains the `{train_test}` split of a dataset in Hugging Face parquet format.

## Structure
- `data/{train_test}.parquet`: the `{train_test}` split.

## Usage
To load this dataset in Python:
```python
from datasets import load_dataset
dataset = load_dataset("parquet", data_files={{"{train_test}": "data/{train_test}.parquet"}})
```
"""
    with open(os.path.join(dataset_folder, "README.md"), "w", encoding="utf-8") as f:
        f.write(readme_content)
    return dataset_dict

#train_dataset = Dataset.from_pandas(train_conversations)

In [38]:
# Export both splits under the default training path: the training split in
# conversation format, the test split as query/answer columns.
train_dict = transfer_local_data_into_huggingface_data(train_set_input, train_set_output, 'Bilingual_example', train_test='train')
test_dict = transfer_local_data_into_huggingface_data(test_set_input, test_set_output, 'Bilingual_example', train_test='test')

Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 239.99ba/s]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 694.88ba/s]


In [None]:
# Inspect the first exported training example.
train_dict['train'][0]