In [None]:
# Check the data and convert the Excel sheet into JSON Lines format (for the BERT models)
import pandas as pd
import json
import random
import string

def generate_random_prefix(length=16):
    """Return a random string of ``length`` lowercase ASCII letters and digits.

    Characters are drawn with replacement through ``random.choices``, so the
    result depends on the state of the global ``random`` generator.
    """
    alphabet = string.ascii_lowercase + string.digits
    picked = random.choices(alphabet, k=length)
    return ''.join(picked)

def generate_random_id_with_prefix(index, prefix, total_length=24):
    """Build an ID of exactly ``total_length`` characters.

    The ID is laid out as ``prefix`` + random filler + ``str(index)``; the
    filler consists of lowercase ASCII letters and digits and takes whatever
    room is left between the prefix and the index.

    Args:
        index: Value appended (as text) at the end of the ID.
        prefix: String placed at the start of the ID.
        total_length: Required length of the finished ID.

    Raises:
        ValueError: if ``prefix`` and ``str(index)`` together are longer than
            ``total_length``.
    """
    suffix = str(index)
    filler_size = total_length - (len(prefix) + len(suffix))
    if filler_size < 0:
        raise ValueError("total_length can't hold prefix and index_str, please adjust total_length or the length of prefix.")

    alphabet = string.ascii_lowercase + string.digits
    filler = ''.join(random.choices(alphabet, k=filler_size))
    return f"{prefix}{filler}{suffix}"

def generate_answer_json_from_xlsx(file_path, output_file):
    """Convert an Excel sheet of QA rows into a JSON Lines file.

    The sheet must provide the columns ``Sentence`` (used as the context),
    ``Question`` and ``Answer``.  An ``Answer`` cell may hold several answers
    separated by ``;``; each one is stripped and located in the context with
    ``str.find``, so ``answer_start`` is the offset of its first occurrence.
    One object with the keys ``id``, ``question``, ``context`` and ``answers``
    is written per row, and all IDs of one call share the same random prefix.

    Every row is validated before the output file is opened, so a bad row
    never leaves a truncated or partially written file behind.

    Args:
        file_path: Path of the Excel workbook to read.
        output_file: Path of the JSON Lines file to write (overwritten).

    Raises:
        ValueError: if the workbook or its required columns cannot be read,
            a row has an empty ``Sentence``/``Question``/``Answer`` cell, an
            ``Answer`` cell contains no answer text, or an answer does not
            occur in its context.
    """
    # Load excel file
    try:
        data = pd.read_excel(file_path, usecols=["Sentence", "Question", "Answer"])
    except Exception as e:
        # Chain the original error so the real cause (missing file, missing
        # column, missing Excel engine, ...) stays visible in the traceback.
        raise ValueError(f"can not read the columns：{e}") from e

    # Make sure the columns exist
    required_columns = {"Sentence", "Question", "Answer"}
    if not required_columns.issubset(data.columns):
        raise ValueError(f"The file doesn't have necessary column：{required_columns - set(data.columns)}")

    # Generate the random prefix shared by every ID of this run
    prefix = generate_random_prefix()

    lines = []
    for idx, row in data.iterrows():
        # An empty cell is read as NaN.  Without this check an empty Answer
        # would become the text "nan" (and silently match contexts such as
        # "nanobody"), and an empty Sentence would fail on ``.find``.
        empty_cells = [col for col in ("Sentence", "Question", "Answer") if pd.isna(row[col])]
        if empty_cells:
            raise ValueError(f"Empty cell in column(s) {empty_cells} at row {idx}; every row needs a Sentence, a Question and an Answer.")

        context = str(row["Sentence"])
        question = row["Question"]

        # Drop empty fragments produced by a trailing or doubled ';' — an
        # empty string is "found" at offset 0 and would yield a bogus answer.
        answers = [ans.strip() for ans in str(row["Answer"]).split(";")]
        answers = [ans for ans in answers if ans]
        if not answers:
            raise ValueError(f"The Answer cell contains no answer text, please check row {idx}:\nContext: {context}")

        answer_starts = [context.find(answer) for answer in answers]
        if -1 in answer_starts:
            raise ValueError(f"The answer is not found in the context， please check the row：\nContext: {context}\nAnswers: {answers}")

        # Constructing JSON object
        obj = {
            "id": generate_random_id_with_prefix(idx, prefix),
            "question": question,
            "context": context,
            "answers": {
                "answer_start": answer_starts,
                "text": answers
            }
        }
        lines.append(json.dumps(obj, ensure_ascii=False) + "\n")

    # Only touch the output file once every row has passed validation
    with open(output_file, "w", encoding="utf-8") as f:
        f.writelines(lines)

    print(f"JSON file completed：{output_file}")

# Convert the spreadsheet at input_file1 into the JSON Lines file output_file1.
input_file1, output_file1 = "/content/RNA_Protein_.xlsx", "/content/RPI_data.json"
generate_answer_json_from_xlsx(file_path=input_file1, output_file=output_file1)


JSON file completed：/content/RPI_data.json


In [None]:
#Train Dev Test
import json
import random

# Load the dataset: one JSON object per non-blank line.
data = []
with open("/content/PPI_RPI_Table_data.json", 'r', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        if line:
            data.append(json.loads(line))

# Shuffle with a fixed seed so that re-running this cell reproduces exactly
# the same train/val/test membership.  A private Random instance is used so
# the global ``random`` state seen by other cells is left untouched.
# NOTE: split files written by earlier, unseeded runs will not match this order.
SPLIT_SEED = 42
random.Random(SPLIT_SEED).shuffle(data)

# Divide: 80% train, 10% validation, and the remainder (about 10%) test.
n_total = len(data)
n_train = int(n_total * 0.8)
n_val = int(n_total * 0.1)

train_data = data[:n_train]
val_data = data[n_train:n_train + n_val]
test_data = data[n_train + n_val:]

def save_one_line_one_json(filename, dataset):
    """Write ``dataset`` to ``filename`` as JSON Lines (one object per line).

    Only the keys ``id``, ``question``, ``context`` and ``answers`` are kept,
    in that order.  A missing key falls back to an empty string, or for
    ``answers`` to empty ``answer_start`` / ``text`` lists.  The file is
    overwritten, and non-ASCII characters are written unescaped.
    """
    def _project(record):
        # Keep just the four expected fields, filling in defaults.
        return {
            "id": record.get("id", ""),
            "question": record.get("question", ""),
            "context": record.get("context", ""),
            "answers": record.get("answers", {"answer_start": [], "text": []}),
        }

    with open(filename, 'w', encoding='utf-8') as out:
        out.writelines(
            json.dumps(_project(record), ensure_ascii=False) + '\n'
            for record in dataset
        )

# Write each split to its own JSON Lines file in the working directory.
for split_file, split_records in (
    ('train.json', train_data),
    ('val.json', val_data),
    ('test.json', test_data),
):
    save_one_line_one_json(split_file, split_records)

# Report the size of every split.
print(f"Train: {len(train_data)}, Val: {len(val_data)}, Test: {len(test_data)}")



Train: 4262, Val: 532, Test: 534


In [None]:
#Divide the data into 10% 25% 50%
import json
import random

# Load the training split: one JSON object per line; blank lines are ignored.
with open("train.json", "r", encoding="utf-8") as f:
    data = [json.loads(line) for line in f if line.strip()]

# Shuffle with a fixed seed so the subsets are reproducible across runs.  A
# private Random instance keeps the global ``random`` state untouched.
SUBSET_SEED = 42
random.Random(SUBSET_SEED).shuffle(data)

# Number of samples in each subset (rounded down).
total = len(data)
sizes = {
    "10%": int(total * 0.10),
    "25%": int(total * 0.25),
    "50%": int(total * 0.50),
}

# Save the subsets.  Each one is a prefix of the same shuffled list, so the
# 10% subset is contained in the 25% subset, which is contained in the 50% one.
for key, size in sizes.items():
    subset = data[:size]
    output_path = f"/content/train_subset_{key}.json"
    with open(output_path, "w", encoding="utf-8") as f_out:
        for item in subset:
            # ensure_ascii=False keeps non-ASCII text unescaped, matching the
            # other JSON writers in this notebook.
            json.dump(item, f_out, ensure_ascii=False)
            f_out.write("\n")
    print(f"✅ Saved {key} subset ({size} samples) to {output_path}")


✅ Saved 10% subset (426 samples) to /content/train_subset_10%.json
✅ Saved 25% subset (1065 samples) to /content/train_subset_25%.json
✅ Saved 50% subset (2131 samples) to /content/train_subset_50%.json


In [None]:
#Transform the json file into jsonl file (for the GPT models)
import json

# Load the 50% training subset (one JSON object per line).
with open("/content/train_subset_50%.json", "r", encoding="utf-8") as f:
    data = list(map(json.loads, f))


def _to_chat_example(record):
    """Wrap one QA record as a chat-style fine-tuning example.

    Only the first entry of ``answers["text"]`` is used as the assistant reply.
    """
    context = record["context"].strip()
    question = record["question"].strip()
    return {
        "messages": [
            {"role": "system", "content": "You are a biomedical QA assistant."},
            {"role": "user", "content": f"Context:\n{context}\n\nQuestion:\n{question}\n\nAnswer:"},
            {"role": "assistant", "content": record["answers"]["text"][0].strip()},
        ]
    }


# Transform every record into the OpenAI fine-tuning format.
converted = [_to_chat_example(record) for record in data]

# Save as a JSONL file, one example per line.
with open("train50%_finetune.jsonl", "w", encoding="utf-8") as f:
    f.writelines(json.dumps(entry, ensure_ascii=False) + "\n" for entry in converted)


In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Table data preprocessing

In [None]:
#Random Order Without Annotation
# Google Drive
# from google.colab import drive
# drive.mount('/content/drive')


import pandas as pd
import numpy as np


# Source workbook and destination for the shuffled contexts.
input_path = '/content/Random_Context.xlsx'
output_path = '/content/500_Random_Order.xlsx'

# header=None treats the first row as data; every cell is read as text and
# empty cells become '' so they can be filtered out later.
df = pd.read_excel(
    io=input_path,
    header=None,
    dtype=str,
).fillna(value='')


def shuffle_and_join(row):
    """Return the non-blank cells of ``row`` in random order, joined by spaces.

    Cells that are empty or whitespace-only are dropped; the kept cells are
    not stripped.  The order comes from ``np.random.shuffle``, i.e. NumPy's
    global random state.
    """
    kept = list(filter(lambda cell: cell.strip() != '', row.tolist()))
    np.random.shuffle(kept)
    return ' '.join(kept)

# Seed NumPy's global generator, which shuffle_and_join draws from, so that
# re-running this cell reproduces the same cell order for every row.
np.random.seed(42)

# One shuffled, space-joined string per spreadsheet row.
df['context'] = df.apply(shuffle_and_join, axis=1)

# Export only the new column; the original cells are not written out.
df[['context']].to_excel(output_path, index=False)

print(f"Table_data_shuffled_without_annotation：{output_path}")



Table_data_shuffled_without_annotation：/content/500_Random_Order.xlsx


In [None]:
# Random Order With Annotation
# from google.colab import drive
# drive.mount('/content/drive')

import pandas as pd
import re
import random


file_path  = '/content/Table2_part2.xlsx'
output_csv = '/content/Table4_part2.csv'

# Fixed seed so the shuffled "header: value" order is reproducible across runs.
# A private Random instance leaves the global ``random`` state untouched.
SHUFFLE_SEED = 42
rng = random.Random(SHUFFLE_SEED)

# Compiled once instead of once per cell.  The first pattern maps the listed
# special space characters to a plain space; the second collapses any run of
# whitespace into one space.
_SPECIAL_SPACES = re.compile(r'[\u00A0\u2002\u2003\u2009]')
_WHITESPACE_RUN = re.compile(r'\s+')

contexts = []

# Open the workbook once, reuse the handle for every sheet, and close it when
# done (previously the file was re-opened per sheet and never closed).
with pd.ExcelFile(file_path) as xls:
    sheet_names = xls.sheet_names

    for sheet in sheet_names:
        # Read every cell as text; empty cells become ''.
        df = pd.read_excel(xls, sheet_name=sheet, dtype=str).fillna('')
        headers = df.columns.tolist()

        for _, row in df.iterrows():
            pairs = []
            for h in headers:
                val = _SPECIAL_SPACES.sub(' ', row[h])
                val = _WHITESPACE_RUN.sub(' ', val).strip()
                # Skip empty cells and literal "nan"/"none" placeholders.
                if val and val.lower() not in ('nan', 'none'):
                    pairs.append(f"{h}: {val}")
            rng.shuffle(pairs)
            contexts.append(' '.join(pairs))


# utf-8-sig writes a BOM at the start of the CSV file.
pd.DataFrame({'context': contexts}) \
  .to_csv(output_csv, index=False, encoding='utf-8-sig')

print(f"Table_data_shuffled_with_annotation：{output_csv}")


Table_data_shuffled_with_annotation：/content/Table4_part2.csv
