In [None]:
### IMPORTS ###
import torch
import datasets

import os
import spacy
import numpy as np
import pandas as pd

from accelerate import Accelerator
from huggingface_hub import get_full_repo_name, Repository, notebook_login
from tqdm.auto import tqdm
from transformers import AutoModelForTokenClassification, AutoTokenizer, DataCollatorForTokenClassification, pipeline, Trainer, TrainingArguments

# First Step
Convert the data into aligned tokens and labels in IOB format so that it can be used for NER.

In [None]:
# Input locations, given relative to the notebook's working directory.
# TEXT_FOLDER is the folder that is scanned for the ".txt" documents;
# OG_ANN_FOLDER is the folder where the ".ann" file with the same base name
# as each document is looked up.
TEXT_FOLDER = './data/CADEC.v2/cadec/text/'
OG_ANN_FOLDER = './data/CADEC.v2/cadec/original/'

In [None]:
def read_text(file_path):
    """Return the full contents of the UTF-8 text file at ``file_path``.

    Args:
        file_path: Path of the file to read.

    Returns:
        The whole file as a single string.
    """
    with open(file_path, mode="r", encoding="utf-8") as handle:
        contents = handle.read()
    return contents

def read_annotations(file_path):
    """Parse the span annotations of a tab-separated ``.ann`` file.

    Each non-comment line is expected to look like
    ``ID<TAB>LABEL START END[;START END ...]<TAB>TEXT``. A discontinuous
    annotation lists several ``START END`` fragments separated by ``;``;
    every fragment is returned as its own entry carrying the line's label.

    Lines starting with ``#`` and blank lines are ignored.

    Args:
        file_path: Path of the ``.ann`` file.

    Returns:
        A list of ``{"start": int, "end": int, "label": str}`` dicts, in file
        order.

    Raises:
        IndexError: if a non-blank, non-comment line has no tab-separated
            second field.
        ValueError: if the second field does not have the
            ``LABEL START END[;START END ...]`` shape or an offset is not an
            integer.
    """
    annotations = []

    # Same encoding as read_text, so both files are decoded consistently
    # regardless of the platform's default locale encoding.
    with open(file_path, "r", encoding="utf-8") as file:
        for line in file:
            if line.startswith("#") or not line.strip():
                continue

            fields = line.rstrip("\r\n").split("\t")

            # Split the label from the offsets first. Splitting the whole
            # field on whitespace would yield e.g. ["9", "19;29", "50"] for
            # "9 19;29 50" and lose every fragment after the first.
            label, span_spec = fields[1].split(None, 1)

            for fragment in span_spec.split(";"):
                start, end = fragment.split()
                annotations.append({"start": int(start), "end": int(end), "label": label})

    return annotations

def annotate_text(doc, annotations):
    """Produce one IOB tag per token of ``doc`` from character-span annotations.

    A token belongs to an annotation when its character range
    ``[token.idx, token.idx + len(token))`` overlaps the annotation's
    ``[start, end)`` range. The first overlapping token is tagged
    ``B-<label>`` and the following ones ``I-<label>``. Using overlap rather
    than "first token starting at or after ``start``" keeps spans that begin
    or end inside a token from crashing or from tagging the wrong tokens.

    Annotations are applied in the order given, so a later annotation
    overwrites the tags written by an earlier one on shared tokens.

    Args:
        doc: Sequence of tokens; each token must expose ``idx`` (its character
            offset) and support ``len()``. Tokens are assumed to be in
            increasing ``idx`` order.
        annotations: Iterable of dicts with ``"start"``, ``"end"`` and
            ``"label"`` keys.

    Returns:
        A list of tag strings with the same length as ``doc``; tokens not
        covered by any annotation are tagged ``"O"``.
    """
    tags = ["O"] * len(doc)
    for annotation in annotations:
        start, end, label = annotation["start"], annotation["end"], annotation["label"]

        # An empty or inverted span covers no characters.
        if end <= start:
            continue

        inside = False
        for i, token in enumerate(doc):
            # Tokens are ordered, so nothing after this one can overlap.
            if token.idx >= end:
                break
            # Token ends before the span begins.
            if token.idx + len(token) <= start:
                continue
            tags[i] = f"I-{label}" if inside else f"B-{label}"
            inside = True
    return tags

In [None]:
# Input locations, given relative to the notebook's working directory.
# These repeat the values already assigned earlier in the notebook.
# TEXT_FOLDER is scanned below for ".txt" documents; OG_ANN_FOLDER is where the
# ".ann" file with the same base name as each document is looked up.
TEXT_FOLDER = './data/CADEC.v2/cadec/text/'
OG_ANN_FOLDER = './data/CADEC.v2/cadec/original/'

def read_text(file_path):
    """Return the full contents of the UTF-8 text file at ``file_path``.

    Args:
        file_path: Path of the file to read.

    Returns:
        The whole file as a single string.
    """
    with open(file_path, mode="r", encoding="utf-8") as handle:
        contents = handle.read()
    return contents

def read_annotations(file_path):
    """Parse the span annotations of a tab-separated ``.ann`` file.

    Each non-comment line is expected to look like
    ``ID<TAB>LABEL START END[;START END ...]<TAB>TEXT``. A discontinuous
    annotation lists several ``START END`` fragments separated by ``;``;
    every fragment is returned as its own entry carrying the line's label.

    Lines starting with ``#`` and blank lines are ignored.

    Args:
        file_path: Path of the ``.ann`` file.

    Returns:
        A list of ``{"start": int, "end": int, "label": str}`` dicts, in file
        order.

    Raises:
        IndexError: if a non-blank, non-comment line has no tab-separated
            second field.
        ValueError: if the second field does not have the
            ``LABEL START END[;START END ...]`` shape or an offset is not an
            integer.
    """
    annotations = []

    # Same encoding as read_text, so both files are decoded consistently
    # regardless of the platform's default locale encoding.
    with open(file_path, "r", encoding="utf-8") as file:
        for line in file:
            if line.startswith("#") or not line.strip():
                continue

            fields = line.rstrip("\r\n").split("\t")

            # Split the label from the offsets first. Splitting the whole
            # field on whitespace would yield e.g. ["9", "19;29", "50"] for
            # "9 19;29 50" and lose every fragment after the first.
            label, span_spec = fields[1].split(None, 1)

            for fragment in span_spec.split(";"):
                start, end = fragment.split()
                annotations.append({"start": int(start), "end": int(end), "label": label})

    return annotations

def annotate_text(doc, annotations):
    """Produce one IOB tag per token of ``doc`` from character-span annotations.

    A token belongs to an annotation when its character range
    ``[token.idx, token.idx + len(token))`` overlaps the annotation's
    ``[start, end)`` range. The first overlapping token is tagged
    ``B-<label>`` and the following ones ``I-<label>``. Using overlap rather
    than "first token starting at or after ``start``" keeps spans that begin
    or end inside a token from crashing or from tagging the wrong tokens.

    Annotations are applied in the order given, so a later annotation
    overwrites the tags written by an earlier one on shared tokens.
    Nothing is printed while tagging.

    Args:
        doc: Sequence of tokens; each token must expose ``idx`` (its character
            offset) and support ``len()``. Tokens are assumed to be in
            increasing ``idx`` order.
        annotations: Iterable of dicts with ``"start"``, ``"end"`` and
            ``"label"`` keys.

    Returns:
        A list of tag strings with the same length as ``doc``; tokens not
        covered by any annotation are tagged ``"O"``.
    """
    tags = ["O"] * len(doc)
    for annotation in annotations:
        start, end, label = annotation["start"], annotation["end"], annotation["label"]

        # An empty or inverted span covers no characters.
        if end <= start:
            continue

        inside = False
        for i, token in enumerate(doc):
            # Tokens are ordered, so nothing after this one can overlap.
            if token.idx >= end:
                break
            # Token ends before the span begins.
            if token.idx + len(token) <= start:
                continue
            tags[i] = f"I-{label}" if inside else f"B-{label}"
            inside = True
    return tags

# Load the spaCy pipeline; the loop below uses it to split each document into
# tokens whose text and character offsets feed the tagging step.
nlp = spacy.load("en_core_web_sm")

# Collect the text files in sorted order. os.listdir returns entries in
# arbitrary order, which would make the dataframe's row order differ between
# runs and machines.
text_files = sorted(f for f in os.listdir(TEXT_FOLDER) if f.endswith(".txt"))

# One record (token list + tag list) per document
data = []

# tqdm shows a single progress bar instead of printing every file name.
for text_file in tqdm(text_files, desc="Tagging documents"):
    # Build file paths. Only the extension is swapped: str.replace would also
    # rewrite a ".txt" occurring elsewhere in the file name.
    txt_file_path = os.path.join(TEXT_FOLDER, text_file)
    ann_file_path = os.path.join(OG_ANN_FOLDER, os.path.splitext(text_file)[0] + ".ann")

    # Read the raw text and its character-span annotations
    text = read_text(txt_file_path)
    annotations = read_annotations(ann_file_path)

    # Tokenize the text with spaCy
    doc = nlp(text)

    # One IOB tag per token
    tags_array = annotate_text(doc, annotations)

    # Token strings, parallel to tags_array
    words_array = [token.text for token in doc]

    # Store the data for this document
    data.append({"Words": words_array, "Tags": tags_array})

# Build the dataframe once from the collected records; naming the columns keeps
# the schema stable even when no text file was found.
df = pd.DataFrame(data, columns=["Words", "Tags"])

# Show the first rows (rich notebook display)
df.head()


In [None]:
# # Load the spaCy model
# nlp = spacy.load("en_core_web_sm")

# # Sample text and annotations
# text = (
#     "I feel a bit drowsy & have a little blurred vision, so far no gastric problems. "
#     "I've been on Arthrotec 50 for over 10 years on and off, only taking it when I needed it. "
#     "Due to my arthritis getting progressively worse, to the point where I am in tears with the agony, "
#     "gp's started me on 75 twice a day and I have to take it. "
#     "every day for the next month to see how I get on, here goes. "
#     "So far its been very good, pains almost gone, but I feel a bit weird, didn't have that when on 50."
# )

# annotations = [
#     {"start": 9, "end": 19, "label": "ADR"},  # Drowsy
#     {"start": 29, "end": 50, "label": "ADR"},  # Blurred Vision
#     {"start": 93, "end": 102, "label": "Drug"},  # Arthrotec
#     {"start": 179, "end": 188, "label": "Disease"},  # arthritis
#     {"start": 260, "end": 265, "label": "Symptom"},  # agony
#     {"start": 62, "end": 78, "label": "ADR"},  # gastric problems
#     {"start": 412, "end": 417, "label": "Symptom"},  # pains
#     {"start": 437, "end": 453, "label": "ADR"},  # feel a bit weird
# ]

# # Process the text with spaCy
# doc = nlp(text)

# # Update annotations based on spaCy entities
# for ent in doc.ents:
#     if ent.label_ in ["Drug", "ADR", "Disease", "Symptom"]:
#         start, end, label = ent.start_char, ent.end_char, ent.label_
#         annotations.append({"start": start, "end": end, "label": label})

# # Initialize IOB tags for each token
# tags = ["O"] * len(doc)

# # Update tags based on annotations
# for annotation in annotations:
#     start, end, label = annotation["start"], annotation["end"], annotation["label"]
#     start_token = None
#     for i, token in enumerate(doc):
#         if start_token is None and token.idx >= start:
#             start_token = i
#         if token.idx + len(token) >= end:
#             for j in range(start_token, i + 1):
#                 if j == start_token:
#                     tags[j] = f"B-{label}"
#                 else:
#                     tags[j] = f"I-{label}"
#             break

# # Convert the tags to a string array
# tags_array = [f"{tag}" for tag in tags]

# # Create a word by word array
# words_array = [token.text for token in doc]

# # Print the result
# print("Words array:", words_array)
# print("Tags array:", tags_array)
