### **Import Libraries**

In [1]:
import xml.etree.ElementTree as ET
import requests
import json
import os
import pandas as pd

### **Dataset**

In [2]:
def parse_ddi_corpus(xml_file):
    tree = ET.parse(xml_file)
    root = tree.getroot()

    sentences = []
    entities = []

    for sentence in root.iter('sentence'):
        sent_text = sentence.attrib['text']
        sent_entities = []

        for entity in sentence.iter('entity'):
            ent_text = entity.attrib['text']
            ent_type = entity.attrib['type']
            char_offset = entity.attrib['charOffset']
            start, end = map(int, char_offset.split('-'))
            
            sent_entities.append({
                'text': ent_text,
                'type': ent_type,
                'start': start,
                'end': end
            })

        sentences.append(sent_text)
        entities.append(sent_entities)

    return sentences, entities

In [3]:
def bio_tagging(sent_text, sent_entities):
    tokens = sent_text.split()  # Tokenizing by spaces for simplicity; you can use a tokenizer here
    tags = ['O'] * len(tokens)  # Initialize all tags as 'O'
    
    for entity in sent_entities:
        entity_start = entity['start']
        entity_end = entity['end']
        entity_type = entity['type']

        # Iterate over the tokens and update the BIO tags
        current_pos = 0
        for i, token in enumerate(tokens):
            token_start = sent_text.find(token, current_pos)
            token_end = token_start + len(token) - 1
            current_pos = token_end + 1

            # Assign 'B' if the token is the start of an entity, 'I' for inside entity
            if entity_start <= token_start <= entity_end:
                if tags[i] == 'O':  # Only update if it's currently 'O'
                    if token_start == entity_start:
                        tags[i] = f'B-{entity_type}'
                    else:
                        tags[i] = f'I-{entity_type}'

    return tokens, tags

In [4]:
def process_all_files_in_directory(directory):
    all_results = []
    
    # Iterate through all XML files in the directory
    for filename in os.listdir(directory):
        if filename.endswith(".xml"):
            file_path = os.path.join(directory, filename)
            # print(f'Processing {file_path}')
            
            # Parse the file and get sentences and entities
            sentences, entities = parse_ddi_corpus(file_path)
            
            # BIO tag each sentence
            for i, sent_text in enumerate(sentences):
               
                sent_entities = entities[i]
                # print(sent_entities)
                # print(type(sent_entities))
                tokens, tags = bio_tagging(sent_text, sent_entities)
                
                # Store the results
                all_results.append({
                    'sentence': sent_text,
                    'entity': sent_entities,
                    'tokens': tokens,
                    'tags': tags
                })
    
    return all_results

In [5]:
train_directory = '../DDICorpus/Train/DrugBank'
train_results = process_all_files_in_directory(train_directory)
train_directory_2 = '../DDICorpus/Train/MedLine'
train_results_2 = process_all_files_in_directory(train_directory_2)
train_results.extend(train_results_2)
print(len(train_results))  # Check the length of the train results

6967


In [6]:
for result in train_results:
    print(f"Sentence: {result['sentence']}")
    print(f"BIO Tags: {result['tags']}")
    print(f"Entities: {result['entity']}")
    print()

Sentence: No drug, nutritional supplement, food or herb interactions have yet been reported.
BIO Tags: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Entities: []

Sentence: No formal drug/drug interaction studies with Plenaxis were performed.
BIO Tags: ['O', 'O', 'O', 'O', 'O', 'O', 'B-brand', 'O', 'O']
Entities: [{'text': 'Plenaxis', 'type': 'brand', 'start': 45, 'end': 52}]

Sentence: Cytochrome P-450 is not known to be involved in the metabolism of Plenaxis.
BIO Tags: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-brand']
Entities: [{'text': 'Plenaxis', 'type': 'brand', 'start': 66, 'end': 73}]

Sentence: Plenaxis is highly bound to plasma proteins (96 to 99%).
BIO Tags: ['B-brand', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Entities: [{'text': 'Plenaxis', 'type': 'brand', 'start': 0, 'end': 7}]

Sentence: Laboratory Tests Response to Plenaxis should be monitored by measuring serum total testosterone concentrations just prior to administration on Da

### **Groq Console**

In [7]:
import time
from groq import Groq

In [19]:
# Function to make a request with rate-limiting
def make_request(sentence):
    client = Groq(
    api_key='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',
    )
    chat_completion = client.chat.completions.create(
        messages=[
            {
                "role": "system",
                "content": """ 
                You are a specialized AI model trained to identify
                and extract drug-related entities from text. Your goal is to accurately 
                recognize and classify entities such as drug names, drug categories, and 
                related medical terminology. Always output your response in the requested 
                structured format. If no relevant entities are found, respond with 'No entities found'.
                """
            },
            {
                "role": "user",
                "content": f"""
                Your task is to extract drug names from the given text. Only include valid drug names explicitly mentioned in the text. Provide the list of drug names as shown.

                Example 1:
                Input: 'Ketoconazole/Itraconazole, Macrolides, Including Erythromycin.'
                Output: ["Ketoconazole/Itraconazole","Macrolides", "Erythromycin"]

                Example 2:
                Input: 'Methotrexate: HUMIRA has been studied in rheumatoid arthritis patients taking concomitant MTX.'
                Output: ["Methotrexate", "HUMIRA", "MTX"]

                Now process the following text:
                {sentence}
                 """,
            }
        ],
        model="llama-3.3-70b-versatile",
        stream=False,
    )
    print(chat_completion.choices[0].message.content)
    return chat_completion.choices[0].message.content

# Rate-limit loop
requests_per_minute = 15
interval = 60 / requests_per_minute


all_results_generated = []
for result in train_results[:100]:
    entities_present = []
    drugs = make_request(result['sentence'])
    if drugs == 'No entities found':
        drugs = []
    else:
        drugs = json.loads(drugs)
    print(f"Sentence: {result['sentence']}")
    print(f"BIO_Tags: {result['tags']}")
    for entity in result['entity']:
        entities_present.append(entity['text'])
    print(entities_present)
    print(f"Extracted Drugs: {drugs}")
    all_results_generated.append({
        'sentence': result['sentence'],
        'tags' : result['tags'],
        'drugs_present': entities_present,
        'extracted_drugs': drugs,
    })
    time.sleep(interval)  # Wait to maintain the rate limit

No entities found
Sentence: No drug, nutritional supplement, food or herb interactions have yet been reported.
BIO_Tags: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
[]
Extracted Drugs: []
["Plenaxis"]
Sentence: No formal drug/drug interaction studies with Plenaxis were performed.
BIO_Tags: ['O', 'O', 'O', 'O', 'O', 'O', 'B-brand', 'O', 'O']
['Plenaxis']
Extracted Drugs: ['Plenaxis']
["Plenaxis"]
Sentence: Cytochrome P-450 is not known to be involved in the metabolism of Plenaxis.
BIO_Tags: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-brand']
['Plenaxis']
Extracted Drugs: ['Plenaxis']
["Plenaxis"]
Sentence: Plenaxis is highly bound to plasma proteins (96 to 99%).
BIO_Tags: ['B-brand', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
['Plenaxis']
Extracted Drugs: ['Plenaxis']
["Plenaxis", "testosterone"]
Sentence: Laboratory Tests Response to Plenaxis should be monitored by measuring serum total testosterone concentrations just prior to administration on D

Arrange Drugs Present and Extracted Drugs in a dataframe


In [20]:
df = pd.DataFrame(all_results_generated)

# Convert the drug present to lower case and set 
# it as a set to remove duplicates
df['drugs_present'] = df['drugs_present'].apply(lambda x: set([drug.lower() for drug in x]))
df['extracted_drugs'] = df['extracted_drugs'].apply(lambda x: set([drug.lower() for drug in x]))

# df['drugs_present'] = df['drugs_present'].apply(lambda x: ', '.join(x))
# df['extracted_drugs'] = df['extracted_drugs'].apply(lambda x: ', '.join(x))
df['correct'] = df['drugs_present'] == df['extracted_drugs']
df

Unnamed: 0,sentence,tags,drugs_present,extracted_drugs,correct
0,"No drug, nutritional supplement, food or herb ...","[O, O, O, O, O, O, O, O, O, O, O, O]",{},{},True
1,No formal drug/drug interaction studies with P...,"[O, O, O, O, O, O, B-brand, O, O]",{plenaxis},{plenaxis},True
2,Cytochrome P-450 is not known to be involved i...,"[O, O, O, O, O, O, O, O, O, O, O, O, B-brand]",{plenaxis},{plenaxis},True
3,Plenaxis is highly bound to plasma proteins (9...,"[B-brand, O, O, O, O, O, O, O, O, O]",{plenaxis},{plenaxis},True
4,Laboratory Tests Response to Plenaxis should b...,"[O, O, O, O, B-brand, O, O, O, O, O, O, O, B-d...","{plenaxis, testosterone}","{plenaxis, testosterone}",True
...,...,...,...,...,...
95,Do not start or stop any medicine without doct...,"[O, O, O, O, O, O, O, O, O, O, O, O]",{},{},True
96,Intravenous Adenocard (adenosine) has been eff...,"[O, B-brand, O, O, O, O, O, O, O, O, O, O, O, ...","{angiotensin converting enzyme inhibitors, bet...","{quinidine, adenosine, adenocard}",False
97,Digoxin and verapamil use may be rarely associ...,"[B-drug, O, B-drug, O, O, O, O, O, O, O, O, O,...","{verapamil, digoxin, adenocard}","{verapamil, digoxin, adenocard}",True
98,Because of the potential for additive or syner...,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",{adenocard},{adenocard},True


Save the result as a CSV file

In [21]:
df.to_csv('results/drug_extraction_results.csv', index=False)

Convert the CSV file to dataframe and calculate the accuracy


In [22]:
df = pd.read_csv('results/drug_extraction_results.csv')

In [23]:
# Calculate the accuracy
accuracy = df['correct'].mean()
print(f"Accuracy: {accuracy:.2f}")

# now we need to check the number of false positives and false negatives
# False positives: Extracted drugs that are not present in the text
# False negatives: Drugs present in the text but not extracted

df['false_positive'] = df.apply(lambda x: set(x['extracted_drugs']) if isinstance(x['extracted_drugs'], (list, set)) else set(), axis=1) - \
                        df.apply(lambda x: set(x['drugs_present']) if isinstance(x['drugs_present'], (list, set)) else set(), axis=1)

df['false_negative'] = df.apply(lambda x: set(x['drugs_present']) if isinstance(x['drugs_present'], (list, set)) else set(), axis=1) - \
                        df.apply(lambda x: set(x['extracted_drugs']) if isinstance(x['extracted_drugs'], (list, set)) else set(), axis=1)


# Calculate the total number of false positives and false negatives
total_false_positives = df['false_positive'].apply(len).sum()
total_false_negatives = df['false_negative'].apply(len).sum()

print(f"Total False Positives: {total_false_positives}")
print(f"Total False Negatives: {total_false_negatives}")

# Calculate the precision and recall
precision = (len(df) - total_false_positives) / len(df)
recall = (len(df) - total_false_negatives) / len(df)

print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")

# Calculate the F1 score
f1_score = 2 * (precision * recall) / (precision + recall)
print(f"F1 Score: {f1_score:.2f}")

Accuracy: 0.74
Total False Positives: 0
Total False Negatives: 0
Precision: 1.00
Recall: 1.00
F1 Score: 1.00


In [None]:
df

### **Gemini**

In [14]:
api_key = "AIzaSyAlWzliqQKSKyRW2xBM9op_fToJNyId7R0"

# The endpoint URL
url = f"https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-flash-latest:generateContent?key={api_key}"

def fetch_text(message):

    # Headers and data
  headers = {
      "Content-Type": "application/json"
  }

  data = {
      "contents": [
          {
              "parts": [
                  {
                      "text": message
                  },
              ]
          }
      ]
  }

  # Make the POST request
  response = requests.post(url, headers=headers, data=json.dumps(data))
  
  # Check the response
  if response.status_code == 200:
      
      result = response.json()["candidates"][0]['content']['parts'][0]['text']
      
      return result
  if response.status_code == 429:
    return "429"
  return []

In [15]:
import json

def fetch_drug_entities(sentence):
    prompt = f"""
    Your task is to extract drug names from the given text. Only include valid drug names explicitly mentioned in the text. Provide the list of drug names as shown.

    Example 1:
    Input: 'Ketoconazole/Itraconazole, Macrolides, Including Erythromycin.'
    Output: ["Ketoconazole/Itraconazole","Macrolides", "Erythromycin"]

    Example 2:
    Input: 'Methotrexate: HUMIRA has been studied in rheumatoid arthritis patients taking concomitant MTX.'
    Output: ["Methotrexate", "HUMIRA", "MTX"]

    Now process the following text:
    {sentence}
    """
    response = fetch_text(prompt)  # Use your existing fetch_text function
    try:
        # Ensure the response is valid JSON
        drug_entities = json.loads(response)
    except json.JSONDecodeError:
        print(f"Error parsing JSON response: {response}")
        drug_entities = []
    return drug_entities

all_results_generated = []
for result in train_results[:10]:
    entities_present = []
    drugs = fetch_drug_entities(result['sentence'])
    print(f"Sentence: {result['sentence']}")
    print(f"BIO_Tags: {result['tags']}")
    for entity in result['entity']:
        entities_present.append(entity['text'])
    print(entities_present)
    print(f"Extracted Drugs: {drugs}")
    all_results_generated.append({
        'sentence': result['sentence'],
        'tags' : result['tags'],
        'drugs_present': entities_present,
        'extracted_drugs': drugs,
    })


Sentence: No drug, nutritional supplement, food or herb interactions have yet been reported.
BIO_Tags: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
[]
Extracted Drugs: []
Sentence: No formal drug/drug interaction studies with Plenaxis were performed.
BIO_Tags: ['O', 'O', 'O', 'O', 'O', 'O', 'B-brand', 'O', 'O']
['Plenaxis']
Extracted Drugs: ['Plenaxis']
Sentence: Cytochrome P-450 is not known to be involved in the metabolism of Plenaxis.
BIO_Tags: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-brand']
['Plenaxis']
Extracted Drugs: ['Plenaxis']
Error parsing JSON response: ```
[]
```
There are no drug names explicitly mentioned in the provided text.

Sentence: Plenaxis is highly bound to plasma proteins (96 to 99%).
BIO_Tags: ['B-brand', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
['Plenaxis']
Extracted Drugs: []
Sentence: Laboratory Tests Response to Plenaxis should be monitored by measuring serum total testosterone concentrations just prior to adminis

KeyboardInterrupt: 