In [None]:
!pip install transformers
!pip install datasets

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Hugging Face model id used for both the tokenizer and the model.
plm = "EleutherAI/pythia-160m-deduped"

# Marker strings used when prompts are assembled (see collate_batch / sample_batch):
# prompt layout is "<bos> <content> <sep><label> <eos>".
bos = '<|endoftext|>'
eos = '<|END|>'
pad = '<|pad|>'
sep ='\n\n####\n\n'

# NOTE(review): the key order here may determine the ids given to newly added tokens,
# and saved checkpoints would depend on those ids — do not reorder without confirming.
special_tokens_dict = {'eos_token': eos, 'bos_token': bos, 'pad_token': pad, 'sep_token': sep}

# Tokenizer is loaded from the "step3000" revision of the model repository.
tokenizer = AutoTokenizer.from_pretrained(plm, revision="step3000")
# Return value is kept but not used elsewhere in this notebook.
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
# Pad on the left so that every sequence in a batch ends with its real tokens.
tokenizer.padding_side = 'left'

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the checkpoint save/load cells use paths under it.
drive.mount('/content/drive')

In [None]:
from datasets import load_dataset, Features, Value
# Read the tab-separated training file "All13.tsv" with four explicitly named and typed
# columns (fid, idx, content, label). The file is read without a header row of its own
# because column_names is supplied.
# NOTE(review): keep_default_na=False is forwarded to the CSV reader — presumably so that
# literal strings such as "NA" are kept as text instead of being parsed as missing; confirm.
dataset = load_dataset("csv", data_files="All13.tsv", delimiter='\t',
                       features = Features({
                              'fid': Value('string'), 'idx': Value('int64'),
                              'content': Value('string'), 'label': Value('string')}),
                       column_names=['fid', 'idx', 'content', 'label'], keep_default_na=False)

In [None]:
import torch
# Put every training record into the first split and leave the second split empty.
# The split length is taken from the dataset itself instead of a hard-coded row count,
# so the cell keeps working when the TSV file changes size.
train_size = len(dataset['train'])
sub_datasets = torch.utils.data.random_split(dataset['train'], [train_size, 0])
print(len(sub_datasets[0]))
# Preview up to four records (fewer if the dataset is smaller than that).
for i in range(min(4, train_size)): print(sub_datasets[0][i])

In [None]:
# Vocabulary id of the padding token registered on the tokenizer.
PAD_IDX = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
# Label value written over padding positions in collate_batch
# (-100 is the default ignore_index of torch.nn.CrossEntropyLoss).
IGNORED_PAD_IDX = -100
# Display the padding id as the cell output.
PAD_IDX

In [None]:
from torch.utils.data import DataLoader
import torch

# Materialise the first split into a plain Python list of records.
train_data = list(sub_datasets[0])

def collate_batch(batch):
    """Turn a list of records into padded tensors for causal-LM training.

    Each record contributes one training string laid out as
    "<bos> <content> <sep><label> <eos>", where literal backslash-n sequences in
    the label are replaced by real newlines.

    Args:
        batch: iterable of records exposing 'content' and 'label' string entries.

    Returns:
        A tuple (input_ids, labels, attention_mask) of tensors built from the
        tokenizer output. `labels` equals `input_ids` except that every position
        holding the pad token id is replaced by IGNORED_PAD_IDX.
    """
    prompts = []
    for sample in list(batch):
        target = sample['label'].replace('\\n', '\n')
        # Example prompt layout: bos, content, separator, label, eos.
        prompts.append(f"{bos} {sample['content']} {sep}" + target + f" {eos}")

    encoding = tokenizer(prompts, padding=True)

    input_ids = torch.tensor(encoding['input_ids'])
    mask = torch.tensor(encoding['attention_mask'])

    # Independent copy of the ids, with padding positions overwritten by the ignore value.
    targets = input_ids.clone()
    targets[targets == tokenizer.pad_token_id] = IGNORED_PAD_IDX

    return input_ids, targets, mask

# Smoke test of collate_batch: build a small, unshuffled loader with batches of two records.
train_dataloader = DataLoader(train_data, batch_size=2, shuffle=False, collate_fn=collate_batch)
titer = iter(train_dataloader)
# Pull the first batch and show the shape of its token-id tensor.
tks, labels, masks= next(titer)
print(tks.shape)
# Display the following batch as the cell output.
next(iter(titer))

In [None]:
import random
BATCH_SIZE = 4 # choose the batch size yourself

class BatchSampler():
    """Batch sampler that groups records of similar content length.

    On every iteration the record indices are shuffled, cut into pools of
    `batch_size * pool_factor` indices, and each pool is sorted by the length of
    the record's 'content' string (longest first). Batches are then consecutive
    slices of `batch_size` indices, so records within a batch have similar
    lengths while the pool order stays random.

    Args:
        data: sequence of records; each record must expose a 'content' string.
        batch_size: number of indices per yielded batch (the last batch may be
            shorter).
        pool_factor: how many batches worth of indices are sorted together.
            Defaults to 100, the value that was previously hard-coded.
    """

    def __init__(self, data, batch_size, pool_factor=100):
        self.pooled_indices = []
        self.data = data
        self.batch_size = batch_size
        self.pool_factor = pool_factor
        self.len = len(list(data))

    def __iter__(self):
        """Yield lists of record indices, one list per batch."""
        self.pooled_indices = []
        indices = [(index, len(data["content"])) for index, data in enumerate(self.data)]
        random.shuffle(indices)

        # Use the instance's own batch size (not a notebook-level global that other cells
        # may rebind) so that iteration always agrees with __len__.
        pool_size = self.batch_size * self.pool_factor
        for start in range(0, len(indices), pool_size):
            pool = sorted(indices[start:start + pool_size], key=lambda pair: pair[1], reverse=True)
            self.pooled_indices.extend(pool)
        self.pooled_indices = [pair[0] for pair in self.pooled_indices]

        for start in range(0, len(self.pooled_indices), self.batch_size):
            yield self.pooled_indices[start:start + self.batch_size]

    def __len__(self):
        """Number of batches per pass: ceil(number of records / batch_size)."""
        return (self.len + self.batch_size - 1) // self.batch_size

# Training loader: batches are chosen by BatchSampler and collated with collate_batch;
# pin_memory=True is requested on the DataLoader.
bucket_train_dataloader = DataLoader(train_data, batch_sampler=BatchSampler(train_data, BATCH_SIZE),
                                     collate_fn=collate_batch, pin_memory=True)

In [None]:
from transformers import AutoConfig
# Load the model configuration and overwrite its special-token ids with the ids the
# tokenizer currently reports; hidden states are not requested in model outputs.
# NOTE(review): the config is loaded without revision="step3000" while the tokenizer and
# the model weights use that revision — confirm this is intended.
config = AutoConfig.from_pretrained(plm,
                                    bos_token_id=tokenizer.bos_token_id,
                                    eos_token_id=tokenizer.eos_token_id,
                                    pad_token_id=tokenizer.pad_token_id,
                                    sep_token_id=tokenizer.sep_token_id,
                                    output_hidden_states=False)

# Causal-LM weights from the "step3000" revision, built with the config above.
model = AutoModelForCausalLM.from_pretrained(plm, revision="step3000", config=config)
# Display the model as the cell output.
model

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Move the model to the selected device (the returned model is shown as the cell output).
model.to(device)

In [None]:
import torch
from tqdm import tqdm#, tqdm_notebook
from torch.nn import functional as F

# Same device selection as in the earlier cell, repeated so this cell can run on its own.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def sample_text(model, tokenizer, seed, n_words=20):
    """Sample a continuation of `seed` one token at a time.

    The model is moved to `device` and switched to eval mode. At each step the
    next token is drawn at random from the softmax distribution over the last
    position's logits; the key/value cache returned by the model is fed back so
    only the newly sampled token is passed on later steps.

    Args:
        model: causal language model returning `.logits` and `.past_key_values`.
        tokenizer: tokenizer providing `encode` and `decode`.
        seed: prompt string to continue.
        n_words: maximum number of tokens to sample.

    Returns:
        The decoded string of the prompt tokens followed by the sampled tokens.
        Sampling stops early once a sampled token decodes to the `eos` string.
    """
    model = model.to(device)
    model.eval()

    token_ids = tokenizer.encode(seed)
    step_input = torch.tensor([token_ids])
    cache = None

    with torch.no_grad():
        for _ in tqdm(range(n_words)):
            step_out = model(step_input.to(device), past_key_values=cache)
            cache = step_out.past_key_values
            # Probabilities (not log-probabilities) for the token after the last position.
            next_token_probs = F.softmax(step_out.logits[:, -1], dim=-1)
            step_input = torch.multinomial(next_token_probs, 1)
            sampled_id = step_input.item()
            token_ids.append(sampled_id)
            if tokenizer.decode(sampled_id) == eos:
                break

    return tokenizer.decode(token_ids)

# Quick sanity check: sample a continuation for one hand-written prompt and show it.
sample_text(model, tokenizer, seed=f"{bos} DR AADLAND ABRAHAM {sep}")

In [None]:
from transformers import get_linear_schedule_with_warmup
from torch.optim import AdamW

EPOCHS = 2 # number of training epochs

# Resize the embedding matrices to the tokenizer's vocabulary BEFORE the optimizer is
# created. Resizing can replace the embedding parameters with new tensors; an optimizer
# built earlier would keep references to the old ones and never update the new ones.
model.resize_token_embeddings(len(tokenizer))
model.to(device)

optimizer = AdamW(model.parameters(),lr=5e-5)

# One optimizer step is taken per batch, so steps per epoch equals the number of batches.
steps = len(bucket_train_dataloader)
total_steps = steps * EPOCHS
print(steps, total_steps)

# Linear warmup over the first 10% of the steps, then linear decay.
# The warmup length is passed as a whole number of steps.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=int(total_steps*0.1),
    num_training_steps=total_steps
)

print(f'Total numbers of steps: {total_steps}')
# Display the (resized) model as the cell output.
model


In [None]:
from tqdm import tqdm,trange

global_step = 0
total_loss = 0

model.train()
for _ in trange(EPOCHS, desc="Epoch"):
    model.train()
    # Loss is accumulated per epoch and averaged over the number of batches below.
    total_loss = 0

    for step, (seqs, labels, masks) in enumerate(bucket_train_dataloader):
        seqs = seqs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)
        model.zero_grad()
        # Pass the attention mask: batches are left-padded, and without the mask the
        # model attends to the padding tokens during training while generation
        # (which does receive the mask) does not.
        outputs = model(seqs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        loss = loss.mean()
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
        scheduler.step()  # Update learning rate schedule
    avg_train_loss = total_loss / len(bucket_train_dataloader)
    print("Average train loss: {}".format(avg_train_loss))

In [None]:
# Persist only the model weights (state dict) to the mounted Google Drive.
torch.save(model.state_dict(),"/content/drive/MyDrive/160-13-2md.pt")

In [None]:
# Restore model weights from Google Drive. map_location makes the checkpoint loadable on
# whichever device this runtime selected (e.g. a CPU runtime reading a GPU-saved file).
# NOTE(review): this file name ("160md.pt") differs from the one written by the save cell
# ("160-13-2md.pt") — confirm which checkpoint is meant to be evaluated.
model.load_state_dict(torch.load("/content/drive/MyDrive/160md.pt", map_location=device))

In [None]:
from datasets import load_dataset, Features, Value
# Read the tab-separated validation file with the same four named, typed columns as the
# training file. keep_default_na=False matches the training load: without it the CSV
# reader may turn NA-like text (e.g. "NA", "null") into missing values, and a missing
# 'content' breaks prompt construction and offset lookup during inference.
valid_data = load_dataset("csv", data_files="opendid_valid.tsv", delimiter='\t',
                          features = Features({
                              'fid': Value('string'), 'idx': Value('int64'),
                              'content': Value('string'), 'label': Value('string')}),
                          column_names=['fid', 'idx', 'content', 'label'], keep_default_na=False)
valid_list= list(valid_data['train'])
# Show only the first few records instead of dumping the whole list into the output.
valid_list[:5]

In [None]:
import re
from tqdm import tqdm#, tqdm_notebook
import torch

# Re-assert left padding before generation so each prompt ends at the last position of its row.
tokenizer.padding_side = 'left'
# Surface strings that are always reported as LOCATION-OTHER when they carry no normalized value.
_PO_BOX_OVERRIDES = frozenset(['P.O. BOX 246', 'PO BOX 224', 'PO BOX 1322', 'PO BOX 1000'])
# "D/M/Y" date at the start of a span.
_DATE_RE = re.compile(r'(\d{1,2})/(\d{1,2})/(\d{2,4})')
# "<h>:<mm><am|pm> on <D/M/Y>" at the start of a span.
_TIME_ON_DATE_RE = re.compile(r'(\d{1,2}):(\d{2})(am|pm) on (\d{1,2})/(\d{1,2})/(\d{2,4})')


def _expand_year(year):
    """Map a year below 100 (i.e. written without a century) to 20YY; keep other years."""
    return year + 2000 if year < 100 else year


def _normalize_datetime(text):
    """Derive a normalized value for a date or time-on-date span.

    Returns "YYYY-MM-DD" for a span starting with D/M/Y, "YYYY-MM-DDTHH:MM"
    (24-hour clock) for a span starting with "h:mm(am|pm) on D/M/Y", and None
    when the span matches neither pattern.
    """
    date_match = _DATE_RE.match(text)
    if date_match:
        day, month, year = (int(part) for part in date_match.groups())
        return f"{_expand_year(year):04d}-{month:02d}-{day:02d}"

    time_match = _TIME_ON_DATE_RE.match(text)
    if time_match:
        hour, minute, meridiem, day, month, year = time_match.groups()
        # 12am -> 00, 1am..11am unchanged, 12pm -> 12, 1pm..11pm -> 13..23.
        hour_24 = int(hour) % 12 + (12 if meridiem == 'pm' else 0)
        return (f"{_expand_year(int(year)):04d}-{int(month):02d}-{int(day):02d}"
                f"T{hour_24:02d}:{int(minute):02d}")
    return None


def sample_batch(model, tokenizer, input):
    """Generate predictions for a batch of records and convert them to answer lines.

    For every record a prompt "<bos> <content> <sep>" is built and the model
    generates up to 400 new tokens. The text after the separator is read as one
    "LABEL: span" or "LABEL: span=>normalized" entry per line. Each span is
    located in the record's 'content'; repeated spans are searched for after the
    previous occurrence. The record's 'idx' value is added to the local offsets
    (presumably the offset of the content inside its source file — confirm).

    Spans that cannot be found in the content, or that are empty, are skipped
    instead of being reported with meaningless offsets. When the model supplies
    no "=>" value, a normalized value is derived for spans that look like a
    D/M/Y date or an "h:mm(am|pm) on D/M/Y" time.

    Args:
        model: causal language model supporting `generate`.
        tokenizer: tokenizer used for encoding prompts and decoding outputs.
        input: sequence of records with 'fid', 'idx' and 'content' entries.

    Returns:
        List of tab-separated strings:
        fid, label, start, end, span text, and (when available) normalized value.

    Raises:
        ValueError: if a decoded output does not contain the separator string.
    """
    model.eval()
    seeds = [f"{bos} {text['content']} {sep}" for text in input]
    texts = tokenizer(seeds, return_tensors = 'pt', padding=True).to(device)
    outputs = []

    with torch.cuda.amp.autocast():
        output_tokens = model.generate(**texts, max_new_tokens=400, pad_token_id = PAD_IDX,
                                        eos_token_id=tokenizer.convert_tokens_to_ids(eos))
    preds = tokenizer.batch_decode(output_tokens)

    for idx, pred in enumerate(preds):
        record = input[idx]
        # Keep only the generated part and drop padding / end markers.
        pred = pred[pred.index(sep)+len(sep):].replace(pad, "").replace(eos, "").strip()
        if pred == "PHI: NULL":
            continue

        content = record['content']
        sidx = int(record['idx'])
        # Per-record map: span text -> position right after its last reported occurrence.
        lidxs = {}

        for p in pred.split('\n'):
            tid = p.find(':')
            if tid <= 0:
                continue
            text = p[tid+1:].strip()

            nv = text.find('=>')
            if nv > 0:
                # The model supplied its own normalized value after "=>".
                normalizedV = text[nv+2:].strip()
                text = text[:nv].strip()
            else:
                normalizedV = _normalize_datetime(text)

            if not text:
                continue

            lidx = content.find(text, lidxs.get(text, 0))
            if lidx < 0:
                # The span does not occur in the source text; no valid offsets exist.
                continue
            eidx = lidx+len(text)
            lidxs[text] = eidx

            if normalizedV is None:
                if text in _PO_BOX_OVERRIDES:
                    outputs.append(f'{record["fid"]}\tLOCATION-OTHER\t{lidx + sidx}\t{eidx + sidx}\t{text}')
                else:
                    outputs.append(f'{record["fid"]}\t{p[:tid]}\t{lidx+sidx}\t{eidx+sidx}\t{text}')
            else:
                outputs.append(f'{record["fid"]}\t{p[:tid]}\t{lidx+sidx}\t{eidx+sidx}\t{text}\t{normalizedV}')
    return outputs

# NOTE: this rebinds the notebook-wide BATCH_SIZE that the training cells also define.
BATCH_SIZE = 8
# The context manager closes answer.txt even if generation raises part-way through.
with open("answer.txt", "w", encoding="utf-8") as f:
    # Gradients are not needed for inference.
    with torch.no_grad():
        for i in tqdm(range(0, len(valid_list), BATCH_SIZE)):
            seeds = valid_list[i:i+BATCH_SIZE]
            outputs = sample_batch(model, tokenizer, input=seeds)
            # One prediction per line.
            for o in outputs:
                f.write(o)
                f.write('\n')


In [None]:
def process_data(input_file, output_file):
    """Copy `input_file` to `output_file`, dropping repeated records.

    Every line is stripped of leading/trailing whitespace. Two lines count as
    the same record when their first five tab-separated fields are equal; only
    the first such line is written. Each written line ends with a newline.

    Args:
        input_file: path of the UTF-8 text file to read.
        output_file: path of the UTF-8 text file to (over)write.
    """
    seen_keys = set()
    with open(input_file, 'r', encoding="utf-8") as src:
        with open(output_file, "w", encoding="utf-8") as dst:
            for raw_line in src:
                record = raw_line.strip()  # drop surrounding whitespace / line break
                key = tuple(record.split('\t')[:5])
                if key not in seen_keys:
                    # First time this record is seen: keep it and remember its key.
                    seen_keys.add(key)
                    dst.write(record + '\n')


In [None]:
import pandas as pd
import numpy as np

# Read the raw predictions into a DataFrame. Every column is kept as text (dtype=str) so
# offsets and span contents are written back exactly as they were produced (no int/float
# coercion, no loss of leading zeros). Missing fields are read as NaN.
ipdata = pd.read_csv("answer.txt", sep='\t', header=None, dtype=str,
                     names=['file_name', 'label', 'start', 'end', 'content', 'nomal'])

# Set of accepted PHI labels.
PHI = frozenset(["PATIENT", "DOCTOR", "USERNAME", "PROFESSION", "ROOM", "DEPARTMENT", "HOSPITAL", "ORGANIZATION", "STREET", "CITY", "STATE",
                 "COUNTRY", "ZIP", "LOCATION-OTHER", "AGE", "DATE", "TIME", "DURATION", "SET", "PHONE", "FAX", "EMAIL", "URL", "IPADDR", "SSN",
                 "MEDICALRECORD", "HEALTHPLAN", "ACCOUNT", "LICENSE", "VECHICLE", "DEVICE", "BIOID", "IDNUM", "OTHER"])

# Keep a row only when it has span content and its label is an accepted PHI label.
# A single boolean mask replaces dropping rows one at a time inside a Python loop.
has_content = ipdata["content"].notna()
has_known_label = ipdata["label"].isin(PHI)
ipdata = ipdata[has_content & has_known_label]

# Missing normalized values are written as empty fields.
ipdata.to_csv("test.tsv", sep='\t', index=False, header=False)
x = 'test.tsv'
y = 'answer0.txt'
# Remove records whose first five fields repeat an earlier line.
process_data(x,y)

In [None]:
from google.colab import files

# Download the generated result files through the Colab file helper, in this order.
for _artifact in ('test.tsv', 'answer0.txt', 'answer.txt'):
    files.download(_artifact)
