In [None]:
import nltk  # Natural Language Toolkit
from nltk import word_tokenize, pos_tag, ne_chunk  # Import specific NLTK functions
from nltk.tree import Tree  # Import the Tree data structure

# Download required NLTK data files for tokenization, POS tagging, and Named Entity Recognition (NER).
# Both the classic resource names and the names looked up by NLTK >= 3.9 are requested so the
# notebook runs on either generation of the library.
# NOTE(review): an unknown resource name should make nltk.download report an error and return
# False rather than raise — confirm on the NLTK version you run.
nltk.download('punkt')  # Tokenizer models
nltk.download('punkt_tab')  # Tokenizer models (name used by NLTK >= 3.9)
nltk.download('averaged_perceptron_tagger')  # Part-of-Speech (POS) tagging models
nltk.download('averaged_perceptron_tagger_eng')  # POS tagging models (name used by NLTK >= 3.9)
nltk.download('maxent_ne_chunker')  # Named Entity Recognition (NER) chunker
nltk.download('maxent_ne_chunker_tab')  # NER chunker (name used by NLTK >= 3.9)
nltk.download('words')  # Word list used by the NER chunker

def remove_names_and_locations(text):
    """
    Remove names of persons and locations from the given text using Named Entity Recognition (NER).

    Entities labeled 'PERSON' or 'GPE' by the NLTK chunker are deleted from the text.
    Each entity is matched as a whole word sequence, so an entity such as "Al" is not
    cut out of a longer word like "Also", and the words of a multi-word entity may be
    separated by any run of whitespace in the original text. Surrounding whitespace and
    punctuation are left as they were.

    :param text: The input text to process
    :return: Text with names and locations replaced by an empty string
    """
    import re  # local import: the notebook's import cell does not bring in `re`

    # Tokenize -> POS-tag -> NER-chunk the text
    chunked_text = ne_chunk(pos_tag(word_tokenize(text)))

    def extract_entity_names_and_locations(t):
        """
        Recursively extract entity names labeled as 'PERSON' or 'GPE' from the NER tree structure.

        :param t: NER tree node
        :return: List of names and locations recognized as 'PERSON' or 'GPE'
        """
        entity_names_and_locations = []

        # Only Tree nodes carry a label; any other node (a tagged token) contributes nothing
        if isinstance(t, Tree):
            if t.label() in ('PERSON', 'GPE'):
                # Join all child tokens (words) to form the complete entity name
                entity_names_and_locations.append(' '.join(child[0] for child in t))
            else:
                # Recursively check each child node in the tree
                for child in t:
                    entity_names_and_locations.extend(extract_entity_names_and_locations(child))
        return entity_names_and_locations

    # Extract all names and locations recognized as 'PERSON' or 'GPE' from the chunked NER tree
    entity_names_and_locations = []
    for tree in chunked_text:
        entity_names_and_locations.extend(extract_entity_names_and_locations(tree))

    cleaned_text = text
    # Longest entities first, so a multi-word entity is removed before a shorter entity
    # that happens to be one of its words.
    for entity in sorted(set(entity_names_and_locations), key=len, reverse=True):
        words = entity.split()
        if not words:
            continue
        # (?<!\w) / (?!\w) act as word boundaries that also work when the entity itself
        # starts or ends with punctuation; \s+ tolerates irregular spacing in the text.
        pattern = r'(?<!\w)' + r'\s+'.join(re.escape(word) for word in words) + r'(?!\w)'
        cleaned_text = re.sub(pattern, '', cleaned_text)

    return cleaned_text

In [None]:
# Example usage: run the NER-based scrubber on one sample sentence and print what is left
text = "Hi I am Jason and I live in Toronto, I’m not feeling that well today because I have a runny nose and a headache."
cleaned_text = remove_names_and_locations(text)
print(cleaned_text)

In [None]:
import os
import requests
from bs4 import BeautifulSoup
import time

# Define the base URL of the MedlinePlus articles (get_disease_info appends "<article_id>.htm" to it)
base_url = 'https://medlineplus.gov/ency/article/'

# Define the directory to save the disease definitions (a path relative to the working directory)
save_dir = './data/medlineplus/'

# Ensure the save directory exists; exist_ok=True makes re-running this cell harmless
os.makedirs(save_dir, exist_ok=True)

def get_disease_info(article_id):
    """
    Fetch one MedlinePlus encyclopedia article and extract its title and first section.

    The URL is built from the module-level ``base_url`` plus ``<article_id>.htm``.

    :param article_id: Article identifier as it appears in the URL (e.g. '000123')
    :return: dict with keys 'name' and 'definition', or None when the request fails
             or the page has no <h1> heading to use as the name
    """
    # Built before the try block so the error message below can always reference it
    disease_url = f'{base_url}{article_id}.htm'
    try:
        # A timeout keeps one stalled connection from hanging the whole crawl
        response = requests.get(disease_url, timeout=10)
        response.raise_for_status()  # Raise an HTTPError for bad responses
    except requests.exceptions.RequestException as e:
        print(f"Error fetching disease info from {disease_url}: {e}")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')

    # Extract the disease name; a page without a heading cannot be named or saved
    heading = soup.find('h1')
    if heading is None:
        print(f"No <h1> heading found at {disease_url}; skipping")
        return None
    disease_name = heading.text.strip()
    print(f"Fetching info for: {disease_name}")  # Debugging print

    # The first <div class="section"> is taken as the definition
    definition_section = soup.find('div', class_='section')
    definition = definition_section.text.strip() if definition_section else "No definition found."

    return {
        'name': disease_name,
        'definition': definition
    }

def save_disease_info(disease_info):
    """
    Write one disease record to a text file inside the module-level ``save_dir``.

    The file name is the disease name reduced to alphanumerics, spaces and underscores
    (trailing whitespace stripped) with a ``.txt`` suffix.

    :param disease_info: dict with the keys 'name' and 'definition'
    """
    keep_as_is = (' ', '_')
    safe_chars = [ch for ch in disease_info['name'] if ch.isalnum() or ch in keep_as_is]
    filename = "".join(safe_chars).rstrip()
    filepath = os.path.join(save_dir, f"{filename}.txt")

    with open(filepath, 'w', encoding='utf-8') as out:
        # print() supplies the trailing newline for each line
        print(f"Name: {disease_info['name']}", file=out)
        print(f"Definition: {disease_info['definition']}", file=out)

    print(f"Saved: {filepath}")

def main():
    """Walk a fixed range of MedlinePlus article IDs, saving every article that can be fetched."""
    first_id, stop_id = 0, 1000  # Example range of article IDs; adjust as needed
    for article_number in range(first_id, stop_id):
        # Zero-pad to six digits to match the article ID format
        padded_id = str(article_number).zfill(6)
        info = get_disease_info(padded_id)
        if info:
            save_disease_info(info)
        # Pause after every attempt to avoid overloading the server
        time.sleep(1)

# Start the crawl when this code runs as the main module
if __name__ == "__main__":
    main()


In [None]:
import voyageai
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
import seaborn as sns

# Read the Voyage AI key from the environment instead of writing it into the notebook.
# Export VOYAGE_API_KEY before starting the kernel.
import os

api_key = os.environ.get("VOYAGE_API_KEY")
if not api_key:
    # Fail here with a clear message rather than later with an authentication error
    raise RuntimeError("Set the VOYAGE_API_KEY environment variable before running this cell.")

vo = voyageai.Client(api_key=api_key)

# Smoke test: embed one short string to confirm the key and the model name are accepted
result = vo.embed(["hello world"], model="voyage-large-2")

In [None]:
# Fixed list of candidate drug categories.
# Later cells pass this list to best_drug_category as the set of labels to choose from.
distinct_drug_categories = [
    'Analgesics',
    'Antibiotics',
    'Antifungal Agents',
    'Antiviral Agents',
    'Antipyretics',
    'Antiseptics',
    'Mood Stabilizers',
    'Anti-Inflammatory Agents',
    'Anticoagulants',
    'Antihistamines',
    'Diuretics',
    'Laxatives',
    'Bronchodilators',
    'Anticonvulsants',
    'Antidepressants'
]

In [None]:
def embed_vector(list_of_text):
    """
    Embed a batch of texts with the module-level Voyage AI client ``vo``.

    :param list_of_text: list of texts, sent to the API in a single ``vo.embed`` call
    :return: the ``embeddings`` attribute of the API result
             (presumably one vector per input text — confirm against the voyageai docs)
    :raises TypeError: if ``list_of_text`` is not a list
    """
    # Raise instead of returning the exception class, so a bad call fails where it happens
    # rather than later when the "result" is used as embeddings.
    if not isinstance(list_of_text, list):
        raise TypeError("The input must be a list")
    return vo.embed(list_of_text, model="voyage-large-2").embeddings

def best_drug_category(categories, standard_list):
    """
    Pick the entry of ``standard_list`` whose embedding is closest to ``categories``.

    ``categories`` and every candidate are embedded in one ``embed_vector`` call, and the
    candidate with the highest cosine similarity to ``categories`` is returned.

    :param categories: text describing one drug's categories
    :param standard_list: candidate category names to choose from
    :return: the element of ``standard_list`` with the highest similarity
    :raises ValueError: if ``standard_list`` is empty
    """
    if len(standard_list) == 0:
        raise ValueError("standard_list must contain at least one category")

    vec_array = np.array(embed_vector([categories] + list(standard_list)))

    # Only the query row against the candidates is needed, not the full pairwise matrix
    first_line_similarities = cosine_similarity(vec_array[:1], vec_array[1:])[0]

    # Find the index of the highest similarity
    best_fit_index = int(np.argmax(first_line_similarities))

    # Index into the list that was actually embedded, not a module-level default list
    return standard_list[best_fit_index]

def plot_variance_distribution(data):
    """
    Show a 20-bin histogram with a KDE curve of the given variance values.

    :param data: values to plot (passed straight to ``sns.histplot``)
    :return: always True
    """
    plt.figure(figsize=(8, 6))
    # histplot draws on the figure just created and hands back its axes
    axes = sns.histplot(data, bins=20, kde=True)
    axes.set_title('Distribution of Variance Values (without self-similarity)')
    axes.set_xlabel('Variance')
    axes.set_ylabel('Frequency')
    plt.show()
    return True

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

# Base URL for DrugBank. Kept under its own name so that `base_url`, which
# get_disease_info reads for MedlinePlus, is not overwritten when this cell runs.
DRUGBANK_BASE_URL = "https://go.drugbank.com/drugs/"

# Range of DrugBank IDs to scrape
start_id = 1000
end_id = 1500  # Example range, adjust as needed

# Initialize a list to store drug data
drugs_data = []

# Output column -> text of the <dt> label on the DrugBank page
DRUGBANK_FIELD_LABELS = {
    'Mechanism of Action': 'Mechanism of action',
    'Indication': 'Indication',
    'Pharmacodynamics': 'Pharmacodynamics',
    'Absorption': 'Absorption',
    'Volume of Distribution': 'Volume of distribution',
    'Protein Binding': 'Protein binding',
    'Metabolism': 'Metabolism',
    'Drug Categories': 'Drug Categories',
    # Add more fields as necessary
}


def get_text_for_label(label, page=None):
    """
    Return the stripped text of the <dd> that follows the <dt> whose text equals ``label``.

    :param label: exact text of the <dt> element to look for
    :param page: parsed BeautifulSoup document; defaults to the module-level ``soup``
    :return: the <dd> text, or 'N/A' when the <dt> or its <dd> sibling is missing
    """
    page = soup if page is None else page
    # `string=` is the current name of the bs4 argument that used to be called `text=`
    tag = page.find('dt', string=label)
    if tag:
        next_tag = tag.find_next_sibling('dd')
        if next_tag:
            return next_tag.text.strip()
    return 'N/A'


def _get_meta_content(page, meta_name):
    """Return the ``content`` attribute of ``<meta name=meta_name>``, or 'N/A' when absent."""
    meta = page.find('meta', attrs={'name': meta_name})
    return meta.get('content', 'N/A') if meta else 'N/A'


for i in range(start_id, end_id + 1):
    drug_id = f"DB{i:05d}"
    url = f"{DRUGBANK_BASE_URL}{drug_id}"
    try:
        # The timeout keeps a stalled connection from blocking the loop forever
        response = requests.get(url, timeout=10)
    except requests.exceptions.RequestException as e:
        # One failed request must not throw away everything scraped so far
        print(f"Failed to retrieve data for {drug_id}: {e}")
        continue

    if response.status_code != 200:
        print(f"Failed to retrieve data for {drug_id}")
        continue

    soup = BeautifulSoup(response.content, 'html.parser')

    # ID, name and description come from <meta> tags; the rest from dt/dd pairs
    record = {
        'DrugBank ID': _get_meta_content(soup, 'dc.identifier'),
        'Name': _get_meta_content(soup, 'dc.title'),
        'Description': _get_meta_content(soup, 'description'),
    }
    for column, label in DRUGBANK_FIELD_LABELS.items():
        record[column] = get_text_for_label(label, soup)

    # Append the data to the list
    drugs_data.append(record)
    print(f"Gathered data for {drug_id}")
    # Just so not to overwhelm the server
    time.sleep(0.01)

# Convert the list to a pandas DataFrame
df = pd.DataFrame(drugs_data)

In [None]:
# Convert the list to a pandas DataFrame (same construction as at the end of the scraping cell)
df = pd.DataFrame(drugs_data)

# Save the DataFrame to a CSV file; index=False keeps the row index out of the file
df.to_csv('drugbank_data.csv', index=False)

In [None]:
# Fit the drugs into their respective categories
# (best_drug_category is called once per row, and each call goes through embed_vector)
df["one_category"] = [best_drug_category(x, distinct_drug_categories) for x in df["Drug Categories"]]
# NOTE: a notebook cell only displays its last expression, so this value_counts() result is not shown
df.one_category.value_counts()
df.head()

In [None]:
# One-line test
# Fetch data: take the second row (iloc[1]), drop its first column, and leave out
# every field whose text is "not available" (case-insensitive, surrounding spaces ignored)
source = [x for x in list(df.iloc[1])[1:] if x.lower().strip() != "not available"]
# Embed all remaining fields of that row in a single call
vectors = embed_vector(source)

# Convert the list of vectors to a numpy array
vector_array = np.array(vectors)

# Compute the cosine similarity matrix (every field against every other field of the row)
similarity_matrix = cosine_similarity(vector_array)

# Print the similarity matrix
# print(similarity_matrix)

# Create a heatmap to visualize the similarity matrix
plt.figure(figsize=(8, 6))
sns.heatmap(similarity_matrix, annot=True, cmap='coolwarm', fmt='.2f', linewidths=.5)
plt.title('Cosine Similarity Heatmap')
plt.xlabel('Vector Index')
plt.ylabel('Vector Index')
plt.show()

def analyze_similarity_matrix_without_self(similarity_matrix, plot = False):
    """
    Compute the variance of a similarity matrix's off-diagonal entries.

    The diagonal (each item compared with itself) is excluded. The caller's matrix is
    not modified: the work is done on a float copy.

    :param similarity_matrix: 2-D array-like of pairwise similarity values
    :param plot: when True, also show a histogram of the off-diagonal values
    :return: variance of the non-NaN off-diagonal values (NaN when there are none)
    """
    # Float copy: NaN cannot be stored in an integer array, and filling the diagonal
    # in place would otherwise overwrite the matrix the caller passed in.
    off_diagonal = np.array(similarity_matrix, dtype=float)

    # Remove self-similarity by setting the diagonal to NaN
    np.fill_diagonal(off_diagonal, np.nan)

    # Calculate the variance of the similarity matrix ignoring NaN values
    variance = np.nanvar(off_diagonal)

    if plot:
        # Boolean indexing flattens the matrix and drops the NaN entries in one step
        distribution = off_diagonal[~np.isnan(off_diagonal)]

        # Plot the distribution as a histogram
        plt.figure(figsize=(8, 6))
        sns.histplot(distribution, bins=20, kde=True)
        plt.title('Distribution of Similarity Values (without self-similarity)')
        plt.xlabel('Similarity Value')
        plt.ylabel('Frequency')
        plt.show()

    return variance

# Rebuild the similarity matrix from the stored vectors, then compute the variance of its
# off-diagonal values with the histogram enabled
vector_array = np.array(vectors)
similarity_matrix = cosine_similarity(vector_array)

variance = analyze_similarity_matrix_without_self(similarity_matrix, plot=True)
print(f"Variance of the similarity matrix (without self-similarity): {variance}")


In [None]:
# Iterate and add the variance column to the DF.
# For every row: keep the fields that are not "not available", embed them (one embed_vector
# call per row), build their cosine-similarity matrix, and store the variance of its
# off-diagonal values.
df["sim_variance"] = [analyze_similarity_matrix_without_self(
    cosine_similarity(
        np.array(
            embed_vector(
                [i for i in list(df.iloc[x]) if i.strip().lower() != "not available"]
                )
            )
        )
    ) for x in range(len(df))]
# Histogram of the per-row variances
plot_variance_distribution(df.sim_variance)

In [None]:
# Reload the scraped data. keep_default_na=False stops pandas from converting the scraper's
# 'N/A' placeholder (and empty cells) to NaN, so every cell stays a string for the
# embedding and string-comparison steps in the cells that follow.
df = pd.read_csv("./drugbank_data.csv", keep_default_na=False)
# Fit the drugs into their respective categories
df["one_category"] = [best_drug_category(x, distinct_drug_categories) for x in df["Drug Categories"]]

In [None]:
import pandas as pd
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# Column whose values label the points (read as df[label_column] further down in this cell)
label_column = "Name"

def chunk_list(lst, chunk_size, token_limit):
    """
    Yield successive chunks from lst with a limit on chunk size and token count.

    Tokens are approximated by whitespace-separated words. A text that on its own
    exceeds ``token_limit`` is still yielded, alone in its chunk. Empty chunks are
    never yielded.

    :param lst: iterable of strings
    :param chunk_size: maximum number of texts per chunk
    :param token_limit: maximum total word count per chunk
    """
    chunk = []
    total_tokens = 0
    for text in lst:
        token_count = len(text.split())
        chunk_is_full = len(chunk) >= chunk_size or (total_tokens + token_count) > token_limit
        # Flush only a non-empty chunk: an oversized first text must not emit []
        if chunk and chunk_is_full:
            yield chunk
            chunk = []
            total_tokens = 0
        chunk.append(text)
        total_tokens += token_count
    if chunk:
        yield chunk

def transform_text_columns_to_embeddings(data, text_columns, chunk_size=128, token_limit=6000):
    """
    Embed each row of ``data`` as one vector built from its text columns.

    The selected columns of a row are joined with single spaces, every occurrence of
    "Not Available" is removed from the joined text, and the texts are embedded in
    batches produced by ``chunk_list``.

    :param data: DataFrame holding the text columns
    :param text_columns: columns to concatenate for each row
    :param chunk_size: maximum number of texts per embedding call
    :param token_limit: maximum total word count per embedding call
    :return: list of embeddings, one per row, in row order
    """
    # Concatenate all text columns into a single string for each row
    combined_text = data[text_columns].apply(lambda row: ' '.join(row.values.astype(str)), axis=1)

    # Series.str.replace works on substrings and returns a new Series, so the result
    # has to be assigned back; regex=False treats the placeholder as literal text.
    combined_text = combined_text.str.replace("Not Available", "", regex=False)

    embeddings = []

    # Process the text data in chunks to avoid batch size and token limits
    for text_chunk in chunk_list(combined_text.tolist(), chunk_size, token_limit):
        embeddings.extend(embed_vector(text_chunk))

    return embeddings

# Define the text columns (assuming the text columns are all except 'DrugBank ID')
# NOTE(review): this also picks up derived columns already on df, such as 'one_category' — confirm that is intended
text_columns = df.columns.drop('DrugBank ID')

# Transform the text columns into vector embeddings
embeddings = transform_text_columns_to_embeddings(df, text_columns)

# Extract the labels (drug names) and the category assigned to each drug
labels = df[label_column]
categories = df["one_category"]

def perform_pca(embeddings, n_components=2):
    """
    Project the embeddings onto their first ``n_components`` principal components.

    :param embeddings: data to fit and transform (handed to ``PCA.fit_transform`` unchanged)
    :param n_components: number of components to keep
    :return: the transformed data returned by ``fit_transform``
    """
    reducer = PCA(n_components=n_components)
    return reducer.fit_transform(embeddings)

# Perform PCA to reduce the dimensionality to 2 components (perform_pca's default n_components)
pca_result = perform_pca(embeddings)

In [None]:
# Collect the two PCA coordinates together with each drug's label and category in one frame
pca_data = pd.DataFrame({
        'Principal Component 1': pca_result[:, 0],
        'Principal Component 2': pca_result[:, 1],
        'Label': labels,
        'Category': categories
    })

In [None]:
# Scatter-plot the PCA projection with Matplotlib, coloured by label
def plot_pca_result_with_labels(pca_result, labels):
    """
    Draw a scatter plot of the first two PCA components.

    Each distinct label is mapped to an integer code with ``pd.factorize`` and the
    points are coloured by that code using the 'viridis' colormap.

    :param pca_result: 2-D array; column 0 and column 1 are plotted
    :param labels: one label per row of ``pca_result``
    """
    x_name = 'Principal Component 1'
    y_name = 'Principal Component 2'
    projected = pd.DataFrame({
        x_name: pca_result[:, 0],
        y_name: pca_result[:, 1],
        'Label': labels
    })

    plt.figure(figsize=(12, 8))
    label_codes = pd.factorize(projected['Label'])[0]
    plt.scatter(projected[x_name], projected[y_name], c=label_codes, cmap='viridis', alpha=0.7)

    plt.title('PCA of Text Embeddings')
    plt.xlabel('Principal Component 1')
    plt.ylabel('Principal Component 2')
    plt.grid(True)
    plt.show()

plot_pca_result_with_labels(pca_result, labels)

In [None]:
import pandas as pd
import os

# Directory to save text files (relative to the working directory); created if it does not exist yet
output_dir = './data/drugbank/'
os.makedirs(output_dir, exist_ok=True)

# Function to save each row as a text file
def save_row_as_text(row, output_dir):
    """
    Write one record to ``<output_dir>/<DrugBank ID>.txt`` as ``column: value`` lines.

    Rows whose 'DrugBank ID' is the 'N/A' placeholder or a missing value (None/NaN)
    are skipped, so no ``N/A.txt`` or ``nan.txt`` file is created.

    :param row: mapping-like record with a 'DrugBank ID' key and an ``items()`` method
                (for example a DataFrame row)
    :param output_dir: existing directory to write into
    """
    drug_id = row["DrugBank ID"]
    if pd.isna(drug_id) or drug_id == "N/A":
        return

    file_path = os.path.join(output_dir, f"{drug_id}.txt")
    # Explicit UTF-8 so non-ASCII text is written the same way on every platform
    with open(file_path, 'w', encoding='utf-8') as file:
        for column, value in row.items():
            file.write(f"{column}: {value}\n")

# Save each row in the dataframe as a text file
# (apply is used only for its side effect here; its return value is not used)
df.apply(lambda row: save_row_as_text(row, output_dir), axis=1)

# List the created files to verify
os.listdir(output_dir)

In [None]:
from groq import Groq
from openai import OpenAI

def get_drug_category(drug_name, client_type="openai"):
    """
    Ask a chat model to assign one drug to a category from a fixed list.

    API keys are read from the environment: ``GROQ_API_KEY`` for the Groq client and
    ``OPENAI_KEY`` for the OpenAI client.

    :param drug_name: name of the drug, sent as the user message
    :param client_type: "openai" (default) or "groq"
    :return: the text content of the model's first choice
    :raises TypeError: if ``client_type`` is neither "groq" nor "openai"
    """
    if client_type == "groq":
        # Key comes from the environment, mirroring the OpenAI branch
        client = Groq(
            api_key=os.getenv("GROQ_API_KEY")
        )
        # Placeholder: replace with a real Groq model name before using this branch
        model_type = "<MODEL_TYPE>"
    elif client_type == "openai":
        client = OpenAI(
            api_key=os.getenv("OPENAI_KEY")
        )
        model_type = "gpt-4o"
    else:
        # Same exception type as before; the explanation now travels with the exception
        raise TypeError("The `client_type` parameter only supports `groq` and `openai`.")
    completion = client.chat.completions.create(
        model=model_type,
        messages=[
            {"role": "system", "content": "You are a pharmacology expert, you will help me categorize the drugs into their respective category.\
                For example: input-'Ibuprofen', output-'Analgesics'\
                ONLY select from these categories:\
                [Analgesics, Antibiotics, Antivirals, Antifungals, Antihypertensives, Antidiabetics, Statins, Antidepressants, Antipsychotics, Bronchodilators, Unknown]\
                ONLY return the output category and nothing else."},
            {"role": "user", "content": drug_name}
        ]
    )

    # Get the content of the response
    result = completion.choices[0].message.content

    return result

In [None]:
# Ask the OpenAI-backed classifier for a category for every drug label, one request per row,
# printing the percentage of rows started so far on a single self-overwriting line
pca_data["category"] = ""
total = len(pca_data)
for i in range(total):
    drug_label = pca_data.Label.loc[i]
    pca_data.loc[i, "category"] = get_drug_category(drug_label, client_type="openai")
    percent_done = round(100*i/total, 2)
    print(f"{percent_done}%", end="\r")

In [None]:
# Persist the PCA coordinates, labels and categories; index=False keeps the row index out of the file
pca_data.to_csv("./drug_bank_pca_data.csv", index=False)