## Dataset

In [15]:
# Import the libraries used throughout the notebook
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os


In [16]:
import xml.etree.ElementTree as ET

def _parse_char_offset(char_offset):
    """Turn a ``charOffset`` attribute into a list of ``(start, end)`` int pairs.

    The attribute is either a single ``"start-end"`` pair or several such
    fragments joined by ``';'`` (e.g. ``"13-19;37-44"``). A fragment that holds
    only one number is treated as a one-character span.

    Raises:
        ValueError: if the attribute contains no fragment or a bound is not
            an integer.
    """
    spans = []
    for fragment in char_offset.split(';'):
        fragment = fragment.strip()
        if not fragment:
            continue
        start_str, _, end_str = fragment.partition('-')
        start = int(start_str)
        end = int(end_str) if end_str else start
        spans.append((start, end))

    if not spans:
        raise ValueError(f'Empty charOffset attribute: {char_offset!r}')
    return spans


def parse_ddi_corpus(xml_file):
    """Read one XML file and collect its sentences with their entities.

    Every ``<sentence>`` element contributes its ``text`` attribute, and every
    ``<entity>`` element below it contributes one annotation dict.

    Args:
        xml_file: Path or file object accepted by ``ET.parse``.

    Returns:
        A ``(sentences, entities)`` tuple of parallel lists. ``sentences[i]``
        is the sentence text and ``entities[i]`` is a list of dicts with keys:

        * ``'text'``  - the entity's ``text`` attribute
        * ``'type'``  - the entity's ``type`` attribute
        * ``'start'`` - smallest start offset over all fragments
        * ``'end'``   - largest end offset over all fragments
        * ``'spans'`` - list of ``(start, end)`` pairs, one per fragment

        For an entity with a single ``"start-end"`` offset, ``'start'`` and
        ``'end'`` are exactly those two numbers.

    Raises:
        KeyError: if a required attribute is missing on a sentence or entity.
        ValueError: if a ``charOffset`` attribute cannot be parsed.
    """
    root = ET.parse(xml_file).getroot()

    sentences = []
    entities = []

    for sentence in root.iter('sentence'):
        sent_entities = []

        for entity in sentence.iter('entity'):
            # Multi-fragment offsets ("a-b;c-d") would break a plain
            # split('-'), so parse each fragment separately.
            spans = _parse_char_offset(entity.attrib['charOffset'])

            sent_entities.append({
                'text': entity.attrib['text'],
                'type': entity.attrib['type'],
                'start': min(start for start, _ in spans),
                'end': max(end for _, end in spans),
                'spans': spans,
            })

        sentences.append(sentence.attrib['text'])
        entities.append(sent_entities)

    return sentences, entities



In [44]:
# Function to convert character offsets into BIO tags
def bio_tagging(sent_text, sent_entities):
    """Convert character-offset entity annotations into token-level BIO tags.

    The sentence is tokenized on whitespace. A token is assigned to an entity
    when its character range overlaps the entity's inclusive ``start``-``end``
    range, so tokens with attached punctuation such as ``"(Plenaxis)"`` are
    still tagged. The first token an entity tags receives ``B-<type>`` and any
    further tokens of that entity receive ``I-<type>``.

    Args:
        sent_text: The sentence the offsets refer to.
        sent_entities: Iterable of dicts providing ``'start'`` and ``'end'``
            (inclusive character offsets into ``sent_text``) and ``'type'``.

    Returns:
        A ``(tokens, tags)`` tuple of equal-length lists. Tokens that belong to
        no entity are tagged ``'O'``. When entities compete for the same token,
        the entity listed first keeps it.
    """
    tokens = sent_text.split()  # Whitespace tokenization; swap in a real tokenizer if needed
    tags = ['O'] * len(tokens)

    # Locate every token once (inclusive offsets) instead of once per entity.
    token_spans = []
    cursor = 0
    for token in tokens:
        token_start = sent_text.find(token, cursor)
        token_end = token_start + len(token) - 1
        token_spans.append((token_start, token_end))
        cursor = token_end + 1

    for entity in sent_entities:
        entity_start = entity['start']
        entity_end = entity['end']
        entity_type = entity['type']

        started = False  # becomes True once this entity has emitted its B- tag
        for i, (token_start, token_end) in enumerate(token_spans):
            # Skip tokens that lie completely before or after the entity.
            if token_end < entity_start or token_start > entity_end:
                continue
            # Never overwrite a tag placed by an earlier entity.
            if tags[i] != 'O':
                continue

            prefix = 'I' if started else 'B'
            tags[i] = f'{prefix}-{entity_type}'
            started = True

    return tokens, tags



In [62]:
# Function to process all XML files in a directory
def process_all_files_in_directory(directory, verbose=True):
    """Parse and BIO-tag every ``.xml`` file directly inside ``directory``.

    Files are visited in sorted name order so the returned list has the same
    order on every run and platform (``os.listdir`` alone gives no ordering
    guarantee). Subdirectories are not descended into.

    Args:
        directory: Folder containing the XML files.
        verbose: When true (the default), print one ``Processing <path>`` line
            per file.

    Returns:
        A list with one dict per sentence, holding the keys ``'sentence'``
        (the sentence text), ``'tokens'`` and ``'tags'`` (the two lists
        returned by ``bio_tagging``).
    """
    all_results = []

    for filename in sorted(os.listdir(directory)):
        if not filename.endswith(".xml"):
            continue

        file_path = os.path.join(directory, filename)
        if verbose:
            print(f'Processing {file_path}')

        # Sentences and their entity lists come back as parallel lists.
        sentences, entities = parse_ddi_corpus(file_path)

        for sent_text, sent_entities in zip(sentences, entities):
            tokens, tags = bio_tagging(sent_text, sent_entities)
            all_results.append({
                'sentence': sent_text,
                'tokens': tokens,
                'tags': tags
            })

    return all_results


# Folder holding the DrugBank training XML files (relative path; adjust to your setup)
directory = "DDICorpus/Train/DrugBank"

# Parse and BIO-tag every XML file found in that folder
results = process_all_files_in_directory(directory=directory)

Processing DDICorpus/Train/DrugBank\19-norandrostenedione_ddi.xml
Processing DDICorpus/Train/DrugBank\Abarelix_ddi.xml
Processing DDICorpus/Train/DrugBank\Abatacept_ddi.xml
Processing DDICorpus/Train/DrugBank\Abciximab_ddi.xml
Processing DDICorpus/Train/DrugBank\Acamprosate_ddi.xml
Processing DDICorpus/Train/DrugBank\Acarbose_ddi.xml
Processing DDICorpus/Train/DrugBank\Acebutolol_ddi.xml
Processing DDICorpus/Train/DrugBank\Acetazolamide_ddi.xml
Processing DDICorpus/Train/DrugBank\Acetohydroxamic Acid_ddi.xml
Processing DDICorpus/Train/DrugBank\Aciclovir_ddi.xml
Processing DDICorpus/Train/DrugBank\Acitretin_ddi.xml
Processing DDICorpus/Train/DrugBank\Adalimumab_ddi.xml
Processing DDICorpus/Train/DrugBank\Adapalene_ddi.xml
Processing DDICorpus/Train/DrugBank\Adefovir Dipivoxil_ddi.xml
Processing DDICorpus/Train/DrugBank\Adenosine_ddi.xml
Processing DDICorpus/Train/DrugBank\Adinazolam_ddi.xml
Processing DDICorpus/Train/DrugBank\Agalsidase beta_ddi.xml
Processing DDICorpus/Train/DrugBank\A

In [64]:
# Show each sentence with its BIO tag sequence, followed by a blank line
for result in results:
    print(
        f"Sentence: {result['sentence']}",
        f"BIO Tags: {result['tags']}",
        "",
        sep="\n",
    )

Sentence: No drug, nutritional supplement, food or herb interactions have yet been reported.
BIO Tags: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']

Sentence: No formal drug/drug interaction studies with Plenaxis were performed.
BIO Tags: ['O', 'O', 'O', 'O', 'O', 'O', 'B-brand', 'O', 'O']

Sentence: Cytochrome P-450 is not known to be involved in the metabolism of Plenaxis.
BIO Tags: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-brand']

Sentence: Plenaxis is highly bound to plasma proteins (96 to 99%).
BIO Tags: ['B-brand', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']

Sentence: Laboratory Tests Response to Plenaxis should be monitored by measuring serum total testosterone concentrations just prior to administration on Day 29 and every 8 weeks thereafter.
BIO Tags: ['O', 'O', 'O', 'O', 'B-brand', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-drug', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']

Sentence: Serum transaminase levels should be