In [1]:
### IMPORTS ###
import torch
import datasets

import os
import spacy
import numpy as np
import pandas as pd

from accelerate import Accelerator
from huggingface_hub import get_full_repo_name, Repository, notebook_login
from tqdm.auto import tqdm
from transformers import AutoModelForTokenClassification, AutoTokenizer, DataCollatorForTokenClassification, pipeline, Trainer, TrainingArguments

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

from pathlib import Path

from pickle import dump, load

# exploration

So original is just the post with all annotations, and SCT and MEDDRA are different types of tags, with SCT non drug terms and Meddra drug terms

In [3]:
# explore data

# original example
pd.read_csv("./data/CADEC.v2/cadec/original/ARTHROTEC.1.ann", sep='\t', header=None)

Unnamed: 0,0,1,2
0,T1,ADR 9 19,bit drowsy
1,#1,AnnotatorNotes T1,Drowsy
2,T2,ADR 29 50,little blurred vision
3,#2,AnnotatorNotes T2,Blurred Vision
4,T3,Drug 93 102,Arthrotec
5,T5,Disease 179 188,arthritis
6,T6,Symptom 260 265,agony
7,T4,ADR 62 78,gastric problems
8,T7,Symptom 412 417,pains
9,T8,ADR 437 453,feel a bit weird


In [4]:
#  MedDRA example
pd.read_csv("./data/CADEC.v2/cadec/meddra/ARTHROTEC.1.ann", sep='\t', header=None)

Unnamed: 0,0,1,2
0,TT1,10013649 9 19,bit drowsy
1,TT2,10005886 29 50,little blurred vision
2,TT4,10056819 62 78,gastric problems
3,TT8,10025482 437 453,feel a bit weird


In [5]:
# SCT wxamples
pd.read_csv("./data/CADEC.v2/cadec/sct/ARTHROTEC.1.ann", sep='\t', header=None)

Unnamed: 0,0,1,2
0,TT1,271782001 | Drowsy | 9 19,bit drowsy
1,TT2,246636008 | Blurred vision - hazy | 29 50,little blurred vision
2,TT4,162076009 | Excessive upper gastrointestinal g...,gastric problems
3,TT3,3384011000036100 | Arthrotec | 93 102,Arthrotec
4,TT5,3723001 | Arthritis | 179 188,arthritis
5,TT6,102498003 | Agony | or 76948002|Severe pain| 2...,agony
6,TT7,22253000 | Pain | 412 417,pains
7,TT8,367391008 | Malaise | 437 453,feel a bit weird


In [30]:
annotation_directory = Path('./data/CADEC.v2/cadec/original')
unique_drugs = set()

for file_path in annotation_directory.glob('*.ann'):
    with open(file_path, 'r') as file:
        lines = file.readlines()
        
        for line in lines:
            if '\tDrug ' in line:
                drug_name = line.split('\t')[2].strip()
                unique_drugs.add(drug_name)

print('unique drug names: \n')
for i in unique_drugs:
    print(i)

    
print ("\n num: ", len(unique_drugs))

unique drug names: 

Chondroitin
prednisone
Folgard
fish oils
arthotec
corti-cortisones
Red Yeast
phytomega vitamins
fish oil (salmon)
IB 800 mg
Pravochol
pseudoephedrine
Lopid (Gemfibrozil)
Lopid
Gas-X
arthrotec
Fosamax 35
Nosterol
vioxx
Litpitor
Atenolol
Lopressor
niaspan
CQ10
Zochor
lipoic acid
provacal
Hydrochlorothiazide
Vit. C
CoQ-10
insalin
celebrex 8
birth control
cortisone injection
vitamin C
mevacor
mypaid forte
clonidine
coenzyme Q10
Omega 3
tylenol #3
zetia
alpha lipoid acid
Celebrx
diclofenac
Vioxx
Olmetec(olmesartan)
RX
Vitamins A
testonrone
caffeine
saw palmetto
Zipsor
occupycet
Fish Oil
CQ-10
Tylenol PM
ibu
cymbalta
B3
coenzyme10
Lopids
zanax
mobic
B complex
Baycor
pravastatin
LIPEX
Simcor
Mavik 1mg
antibiotic
Birth control pills
B6
Rhinocort
C Q10
glucosamine sulfate
Vit C
toprol
Glucosamine
POLICOSANOL
Prednisone
Lipior
crestor
voltaren rapid 25
Phytosterol
Choleast
MSM
pravachol
capiten
Arthrotec 75
pamprin
IB 1600
Niacin
Lipidil Micro
Pravachol
GEMFIBROSIL
darvon
so

In [31]:
#drugs from paper

drugs = ['ARTHROTEC', 'CAMBIA', 'CATAFLAM', 'DICLOFENAC-POTASSIUM', 'DICLOFENAC-SODIUM',
         'FLECTOR', 'LIPITOR', 'PENNSAID', 'SOLARAZE', 'VOLTAREN', 'VOLTAREN-XR', 'ZIPSOR']

In [32]:
reports_per_drug = dict()


print('amount of reports for each drug: \n')

# iterate over each drug
for drug in drugs:
    reports_directory = Path('./data/CADEC.v2/cadec/original')
    
    drug_files = reports_directory.glob(drug + '*.ann')
    num_reports_for_current_drug = sum(1 for file in drug_files)
    reports_per_drug[drug] = num_reports_for_current_drug
    print(f'{drug} --> {num_reports_for_current_drug}')

amount of reports for each drug: 

ARTHROTEC --> 145
CAMBIA --> 4
CATAFLAM --> 10
DICLOFENAC-POTASSIUM --> 3
DICLOFENAC-SODIUM --> 7
FLECTOR --> 1
LIPITOR --> 1000
PENNSAID --> 4
SOLARAZE --> 3
VOLTAREN --> 68
VOLTAREN-XR --> 22
ZIPSOR --> 5


# First Step
Convert data into with aligned labels and tokens to perform the NER in IOB format.

In [7]:
# Path to data
TEXT_FOLDER = './data/CADEC.v2/cadec/text/'
OG_ANN_FOLDER = './data/CADEC.v2/cadec/original/'

In [8]:
# Function to read text from a .txt file
def read_text(file_path):
    with open(file_path, "r", encoding="utf-8") as file:
        return file.read()

# Function to read annotations from a .ann file
def read_annotations(file_path):
    annotations = []

    with open(file_path, 'r') as file:
        lines = file.readlines()

    for line in lines:
        if not line.startswith('#'):
            parts = line.split('\t')

            # Get only the important information
            information = parts[1].split()

            # Get values from info
            label = information[0]
            # Semicolon problem not solved
            start_values = [value for value in information[1].split(';')]
            end_values = [value for value in information[2].split(';')]

            for start, end in zip(start_values, end_values):
                annotations.append({"start": int(start), "end": int(end), "label": label})

    return annotations

# Update tags based on the annotations
def annotate_text(doc, annotations):
    tags = ["O"] * len(doc)
    for annotation in annotations:
        start, end, label = annotation["start"], annotation["end"], annotation["label"]
        start_token = None
        for i, token in enumerate(doc):
            if start_token is None and token.idx >= start:
                start_token = i
            if token.idx + len(token) >= end:
                for j in range(start_token, i + 1):
                    if j == start_token:
                        tags[j] = f"B-{label}"
                    else:
                        tags[j] = f"I-{label}"
                break
    return tags

In [9]:
# Path to data
TEXT_FOLDER = './data/CADEC.v2/cadec/text/'
OG_ANN_FOLDER = './data/CADEC.v2/cadec/original/'

# Function to read text from a .txt file
def read_text(file_path):
    with open(file_path, "r", encoding="utf-8") as file:
        return file.read()

# Function to read annotations from a .ann file
def read_annotations(file_path):
    annotations = []

    with open(file_path, 'r') as file:
        lines = file.readlines()

    for line in lines:
        if not line.startswith('#'):
            parts = line.split('\t')

            # Get only the important information
            information = parts[1].split()

            # Get values from info
            label = information[0]
            # Semicolon problem not solved
            start_values = [value for value in information[1].split(';')]
            end_values = [value for value in information[2].split(';')]

            for start, end in zip(start_values, end_values):
                annotations.append({"start": int(start), "end": int(end), "label": label})

    return annotations

# Update tags based on the annotations
def annotate_text(doc, annotations):
    tags = ["O"] * len(doc)
    for annotation in annotations:
        start, end, label = annotation["start"], annotation["end"], annotation["label"]
        start_token = None
        for i, token in enumerate(doc):
            if start_token is None and token.idx >= start:
                start_token = i
            if token.idx + len(token) >= end:
                print(start_token)
                for j in range(start_token, i + 1):
                    if j == start_token:
                        tags[j] = f"B-{label}"
                    else:
                        tags[j] = f"I-{label}"
                break
    return tags

# Load the spaCy model
nlp = spacy.load("en_core_web_sm")

# Get a list of all text files in the folder
text_files = [f for f in os.listdir(TEXT_FOLDER) if f.endswith(".txt")]

# Initialize lists to store data
data = []

# Loop through each text file
for text_file in text_files:
    print(text_file)
    # Build file paths
    txt_file_path = os.path.join(TEXT_FOLDER, text_file)
    ann_file_path = os.path.join(OG_ANN_FOLDER, text_file.replace(".txt", ".ann"))

    # Read text from the .txt file
    text = read_text(txt_file_path)

    # Read annotations from the .ann file
    annotations = read_annotations(ann_file_path)

    # Process the text with spaCy
    doc = nlp(text)

    # Perform the annotation loop
    tags_array = annotate_text(doc, annotations)

    # Create a word by word array
    words_array = [token.text for token in doc]

    # Store the data for this document
    data.append({"Words": words_array, "Tags": tags_array})

# Convert the list of dictionaries to a pandas dataframe
df = pd.DataFrame(data)

# Print the dataframe
print(df.head())


OSError: [E050] Can't find model 'en_core_web_sm'. It doesn't seem to be a Python package or a valid path to a data directory.

In [None]:
# # Load the spaCy model
# nlp = spacy.load("en_core_web_sm")

# # Sample text and annotations
# text = (
#     "I feel a bit drowsy & have a little blurred vision, so far no gastric problems. "
#     "I've been on Arthrotec 50 for over 10 years on and off, only taking it when I needed it. "
#     "Due to my arthritis getting progressively worse, to the point where I am in tears with the agony, "
#     "gp's started me on 75 twice a day and I have to take it. "
#     "every day for the next month to see how I get on, here goes. "
#     "So far its been very good, pains almost gone, but I feel a bit weird, didn't have that when on 50."
# )

# annotations = [
#     {"start": 9, "end": 19, "label": "ADR"},  # Drowsy
#     {"start": 29, "end": 50, "label": "ADR"},  # Blurred Vision
#     {"start": 93, "end": 102, "label": "Drug"},  # Arthrotec
#     {"start": 179, "end": 188, "label": "Disease"},  # arthritis
#     {"start": 260, "end": 265, "label": "Symptom"},  # agony
#     {"start": 62, "end": 78, "label": "ADR"},  # gastric problems
#     {"start": 412, "end": 417, "label": "Symptom"},  # pains
#     {"start": 437, "end": 453, "label": "ADR"},  # feel a bit weird
# ]

# # Process the text with spaCy
# doc = nlp(text)

# # Update annotations based on spaCy entities
# for ent in doc.ents:
#     if ent.label_ in ["Drug", "ADR", "Disease", "Symptom"]:
#         start, end, label = ent.start_char, ent.end_char, ent.label_
#         annotations.append({"start": start, "end": end, "label": label})

# # Initialize IOB tags for each token
# tags = ["O"] * len(doc)

# # Update tags based on annotations
# for annotation in annotations:
#     start, end, label = annotation["start"], annotation["end"], annotation["label"]
#     start_token = None
#     for i, token in enumerate(doc):
#         if start_token is None and token.idx >= start:
#             start_token = i
#         if token.idx + len(token) >= end:
#             for j in range(start_token, i + 1):
#                 if j == start_token:
#                     tags[j] = f"B-{label}"
#                 else:
#                     tags[j] = f"I-{label}"
#             break

# # Convert the tags to a string array
# tags_array = [f"{tag}" for tag in tags]

# # Create a word by word array
# words_array = [token.text for token in doc]

# # Print the result
# print("Words array:", words_array)
# print("Tags array:", tags_array)
