# In [6]:
import re
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.model_selection import KFold

# Paths (relative to the current working directory) of the files used by this
# notebook: the NCBI training corpus, the NCBI development corpus, and a
# model file.
train_file = '../../Data/NCBItrainset_corpus.txt'
dev_file = '../../Data/NCBIdevelopset_corpus.txt'
# NOTE(review): presumably the location where the trained model weights are
# saved/loaded -- confirm against the training code.
model_name = '../../Models/BiLSTM_CrossAttention_NER_model.pth'

# In [10]:

def load_data(file_path):
    """Read a text file and return its blank-line-separated entries.

    Args:
        file_path: Path of the file to read (opened in text mode with the
            platform default encoding).

    Returns:
        A list of strings obtained by splitting the whole file content on
        two consecutive newlines. A file ending in a blank line therefore
        yields a trailing empty string.
    """
    with open(file_path, 'r') as handle:
        raw_text = handle.read()
    # An empty line (two consecutive newlines) separates entries.
    entries = raw_text.split('\n\n')
    return entries

def split_sentences(text):
    """Split text into sentences at whitespace following '.' or '?'.

    A single whitespace character is treated as a sentence boundary when the
    character before it is a period or a question mark, except when the
    preceding characters look like a dotted abbreviation (word character,
    period, word character, any character -- e.g. "e.g.") or like a capital
    letter followed by a lowercase letter and a period (e.g. "Dr.").

    Args:
        text: The string to split.

    Returns:
        A list of sentence strings; the boundary whitespace character is
        removed, all other characters are kept.
    """
    boundary = re.compile(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s')
    return boundary.split(text)

def extract_sentences_and_tags(data):
    """Parse raw corpus entries into sentence lists and entity annotations.

    Each entry is expected to consist of a title line containing ``|t|``, an
    abstract line containing ``|a|`` and any number of tab-separated
    annotation lines whose columns 1-4 are start offset, end offset, mention
    text and entity type. Further columns are ignored.

    Args:
        data: Iterable of raw entry strings.

    Returns:
        A list with one ``(sentence_list, annotations)`` tuple per entry.
        ``sentence_list`` is the result of ``split_sentences`` applied to
        ``title + " " + abstract``; ``annotations`` is a list of
        ``(start, end, entity, entity_type)`` tuples with integer offsets.
        Entries with fewer than two lines are skipped.

    Raises:
        IndexError: If the title/abstract delimiter is missing or an
            annotation line has fewer than five columns.
        ValueError: If an annotation offset is not an integer.
    """
    documents = []
    for entry in data:
        lines = entry.strip().split('\n')
        if len(lines) < 2:
            continue
        # maxsplit=1 keeps the full text even if the delimiter happens to
        # recur inside it; truncating would invalidate annotation offsets.
        title = lines[0].split('|t|', 1)[1]
        abstract = lines[1].split('|a|', 1)[1]
        text = title + " " + abstract
        tagged_sentences = split_sentences(text)
        sentence_tags = []
        for line in lines[2:]:
            if not line.strip():
                # Whitespace-only lines carry no annotation.
                continue
            parts = line.split('\t')
            # Only columns 1-4 are used; a trailing identifier column is
            # optional and deliberately not required.
            start, end = int(parts[1]), int(parts[2])
            entity, entity_type = parts[3], parts[4]
            sentence_tags.append((start, end, entity, entity_type))
        documents.append((tagged_sentences, sentence_tags))
    return documents

def tag_sentences(sentences):
    """Assign an entity tag to every whitespace-separated word of each sentence.

    Words are tagged by character-span overlap: a word receives
    ``I-<entity_type>`` when any of its characters lies inside an annotated
    span. This keeps mentions that touch punctuation or contain hyphens
    (e.g. "cancer.", "(A-T)") and tags only the annotated occurrence.

    Args:
        sentences: Iterable of ``(sentence_list, annotations)`` tuples, where
            ``annotations`` holds ``(start, end, entity, entity_type)`` tuples
            whose offsets refer to the text formed by joining the sentences
            with exactly one separator character between them.

    Returns:
        A list of ``(words, word_tags)`` tuples, one per sentence, where
        ``word_tags[i]`` is ``'O'`` or ``'I-<entity_type>'`` for ``words[i]``.
        Annotations that are empty or not fully contained in a single
        sentence are left untagged.
    """
    # Non-whitespace runs give the same tokens as str.split() plus offsets.
    token_pattern = re.compile(r'\S+')
    tagged_data = []
    for sent_group, tags in sentences:
        char_offset = 0
        for sent in sent_group:
            tokens = list(token_pattern.finditer(sent))
            words = [token.group() for token in tokens]
            word_tags = ['O'] * len(words)
            sent_end = char_offset + len(sent)
            for start, end, _entity, entity_type in tags:
                # Skip empty spans and spans outside this sentence.
                if start >= end or start < char_offset or end > sent_end:
                    continue
                entity_start = start - char_offset
                entity_end = end - char_offset
                for idx, token in enumerate(tokens):
                    # Half-open interval overlap test.
                    if token.start() < entity_end and token.end() > entity_start:
                        word_tags[idx] = f'I-{entity_type}'
            tagged_data.append((words, word_tags))
            # +1 for the single whitespace character removed between sentences.
            char_offset += len(sent) + 1
    return tagged_data

def format_for_model(tagged_data):
    """Render tagged sentences as tab-separated ``word<TAB>tag`` lines.

    Args:
        tagged_data: Iterable of ``(words, tags)`` pairs of parallel
            sequences.

    Returns:
        A list with one string per sentence; each string contains one
        ``word<TAB>tag`` line per word, joined by newlines.
    """
    def render_sentence(sentence_words, sentence_tags):
        rows = []
        for word, tag in zip(sentence_words, sentence_tags):
            rows.append(f'{word}\t{tag}')
        return '\n'.join(rows)

    return [render_sentence(pair[0], pair[1]) for pair in map(tuple, tagged_data)]

# Load and preprocess data
# Pipeline: raw entries -> (sentences, annotations) per entry ->
# (words, tags) per sentence -> one "word<TAB>tag" line per word.
data = load_data(train_file)
sentences = extract_sentences_and_tags(data)
tagged_data = tag_sentences(sentences)
formatted_data = format_for_model(tagged_data)

# Save formatted data to a file
# Sentences are separated by a blank line. The file is created in the current
# working directory and overwritten if it already exists ('w' mode).
with open('formatted_data.txt', 'w') as file:
    file.write('\n\n'.join(formatted_data))