In [1]:
import os
from datasets import load_dataset, Dataset
import pandas as pd


dataset_url = "https://downloads.thebiogrid.org/Download/BioGRID/Release-Archive/BIOGRID-4.4.220/BIOGRID-ALL-4.4.220.tab3.zip"
dataset_path = "biogrid_data.zip"

# Download and extract dataset
if not os.path.exists(dataset_path):
    import urllib.request
    urllib.request.urlretrieve(dataset_url, dataset_path)
    os.system(f"unzip -o {dataset_path} -d biogrid_data")

# Load the extracted dataset
files = [f for f in os.listdir("biogrid_data") if f.endswith(".tab3.txt")]
file_path = os.path.join("biogrid_data", files[0])

df = pd.read_csv(file_path, sep="\t", low_memory=False)

df.to_csv("biogrid.csv", index=False)



In [2]:
!pip install llama-cpp-python



In [3]:
from llama_cpp import Llama

# Set the path to the downloaded model file
model_path = "/Users/masudip/Library/Application Support/nomic.ai/GPT4All/Llama-3.2-1B-Instruct-Q4_0.gguf"

# Load the Llama model
llm = Llama(model_path=model_path)

# Check if the model is loaded successfully
print("Model loaded successfully!")

llama_model_load_from_file_impl: using device Metal (Apple M1) - 5461 MiB free
llama_model_loader: loaded meta data with 35 key-value pairs and 147 tensors from /Users/masudip/Library/Application Support/nomic.ai/GPT4All/Llama-3.2-1B-Instruct-Q4_0.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = Llama 3.2 1B Instruct
llama_model_loader: - kv   3:                           general.finetune str              = Instruct
llama_model_loader: - kv   4:                           general.basename str              = Llama-3.2
llama_model_loader: - kv   5:                         general.size_label str              = 1B
llama_model_lo

Model loaded successfully!


Using gguf chat template: {{- bos_token }}
{%- if custom_tools is defined %}
    {%- set tools = custom_tools %}
{%- endif %}
{%- if not tools_in_user_message is defined %}
    {%- set tools_in_user_message = true %}
{%- endif %}
{%- if not date_string is defined %}
    {%- if strftime_now is defined %}
        {%- set date_string = strftime_now("%d %b %Y") %}
    {%- else %}
        {%- set date_string = "26 Jul 2024" %}
    {%- endif %}
{%- endif %}
{%- if not tools is defined %}
    {%- set tools = none %}
{%- endif %}

{#- This block extracts the system message, so we can slot it into the right place. #}
{%- if messages[0]['role'] == 'system' %}
    {%- set system_message = messages[0]['content']|trim %}
    {%- set messages = messages[1:] %}
{%- else %}
    {%- set system_message = "" %}
{%- endif %}

{#- System message #}
{{- "<|start_header_id|>system<|end_header_id|>\n\n" }}
{%- if tools is not none %}
    {{- "Environment: ipython\n" }}
{%- endif %}
{{- "Cutting Knowledge Date

In [4]:
import pandas as pd

# Load your dataset
df = pd.read_csv("biogrid.csv", sep=",", low_memory=False)

# Focus on relevant columns such as 'Interactor A', 'Interactor B', and organism information
df_cleaned = df[['Official Symbol Interactor A', 'Official Symbol Interactor B', 
                 'Organism Name Interactor A', 'Organism Name Interactor B', 
                 'Experimental System', 'Score']]

# Filter out irrelevant or missing data
df_cleaned.dropna(subset=['Official Symbol Interactor A', 'Official Symbol Interactor B'], inplace=True)

# You can filter for human drug interactions if needed
df_cleaned = df_cleaned[df_cleaned['Organism Name Interactor A'] == 'Homo sapiens']
df_cleaned = df_cleaned[df_cleaned['Organism Name Interactor B'] == 'Homo sapiens']

# Create interaction descriptions for training
df_cleaned['interaction'] = df_cleaned.apply(
    lambda x: f"Drug 1: {x['Official Symbol Interactor A']} interacts with Drug 2: {x['Official Symbol Interactor B']}", axis=1
)

# Print cleaned data preview
print(df_cleaned.head())

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned.dropna(subset=['Official Symbol Interactor A', 'Official Symbol Interactor B'], inplace=True)


  Official Symbol Interactor A Official Symbol Interactor B  \
0                       MAP2K4                         FLNC   
1                         MYPN                        ACTN2   
2                        ACVR1                         FNTA   
3                        GATA2                          PML   
4                         RPA2                        STAT3   

  Organism Name Interactor A Organism Name Interactor B Experimental System  \
0               Homo sapiens               Homo sapiens          Two-hybrid   
1               Homo sapiens               Homo sapiens          Two-hybrid   
2               Homo sapiens               Homo sapiens          Two-hybrid   
3               Homo sapiens               Homo sapiens          Two-hybrid   
4               Homo sapiens               Homo sapiens          Two-hybrid   

  Score                                 interaction  
0     -  Drug 1: MAP2K4 interacts with Drug 2: FLNC  
1     -   Drug 1: MYPN interacts with 

In [5]:
from transformers import LlamaTokenizer
# Initialize the tokenizer
tokenizer = LlamaTokenizer.from_pretrained("nomic-ai/gpt4all-13b-snoozy")  # Use the appropriate tokenizer

# Set the pad_token to eos_token
tokenizer.pad_token = tokenizer.eos_token

# Tokenize the interaction data
def tokenize_function(examples):
    return tokenizer(examples['interaction'], padding="max_length", truncation=True)

# Convert your DataFrame into a HuggingFace dataset and tokenize
from datasets import Dataset

dataset = Dataset.from_pandas(df_cleaned[['interaction']])

# Lower the dataset size to approximately 3000 tokens
dataset = dataset.select(range(3000))

# Apply the tokenization
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Split into training and testing
train_test_split = tokenized_datasets.train_test_split(test_size=0.1)
train_dataset = train_test_split['train']
eval_dataset = train_test_split['test']

You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file you can ignore this message


Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

In [9]:
from tdc.multi_pred import DDI

# Load the DrugBank dataset
data = DDI(name='DrugBank')

# Split the dataset into train and test sets
split = data.get_split()

# Get training and testing data
train_data = split['train']
test_data = split['test']

# Preview the training and test data
print(train_data.head())
print(test_data.head())

Found local copy...
Loading...
Done!


  Drug1_ID                                              Drug1 Drug2_ID  \
0  DB04571                CC1=CC2=CC3=C(OC(=O)C=C3C)C(C)=C2O1  DB00460   
1  DB09536                                           O=[Ti]=O  DB00460   
2  DB01600              CC(C(O)=O)C1=CC=C(S1)C(=O)C1=CC=CC=C1  DB00460   
3  DB09000         CC(CN(C)C)CN1C2=CC=CC=C2SC2=C1C=C(C=C2)C#N  DB00460   
4  DB11630  OC1=CC=CC(=C1)C-1=C2\CCC(=N2)\C(=C2/N\C(\C=C2)...  DB00460   

                                               Drug2  Y  
0  COC(=O)CCC1=C2NC(\C=C3/N=C(/C=C4\N\C(=C/C5=N/C...  1  
1  COC(=O)CCC1=C2NC(\C=C3/N=C(/C=C4\N\C(=C/C5=N/C...  1  
2  COC(=O)CCC1=C2NC(\C=C3/N=C(/C=C4\N\C(=C/C5=N/C...  1  
3  COC(=O)CCC1=C2NC(\C=C3/N=C(/C=C4\N\C(=C/C5=N/C...  1  
4  COC(=O)CCC1=C2NC(\C=C3/N=C(/C=C4\N\C(=C/C5=N/C...  1  
  Drug1_ID                                              Drug1 Drug2_ID  \
0  DB00503  CC(C)[C@H](NC(=O)N(C)CC1=CSC(=N1)C(C)C)C(=O)N[...  DB00169   
1  DB04930  CC1(C)C(C=C(Cl)Cl)C1C(=O)OCC1=CC(OC2=CC=CC=C2).

In [33]:
import pandas as pd

# Assuming train_data is already loaded with columns 'Drug1', 'Drug2', and 'Y'
df_interactions = train_data[['Drug1', 'Drug2', 'Y']]  # Use the correct column names

# Add a description for each interaction
df_interactions['interaction_description'] = df_interactions.apply(
    lambda x: f"Drug 1: {x['Drug1']} interacts with Drug 2: {x['Drug2']} -> Interaction: {x['Y']}", axis=1
)

# Preview the formatted data
print(df_interactions.head())

                                               Drug1  \
0                CC1=CC2=CC3=C(OC(=O)C=C3C)C(C)=C2O1   
1                                           O=[Ti]=O   
2              CC(C(O)=O)C1=CC=C(S1)C(=O)C1=CC=CC=C1   
3         CC(CN(C)C)CN1C2=CC=CC=C2SC2=C1C=C(C=C2)C#N   
4  OC1=CC=CC(=C1)C-1=C2\CCC(=N2)\C(=C2/N\C(\C=C2)...   

                                               Drug2  Y  \
0  COC(=O)CCC1=C2NC(\C=C3/N=C(/C=C4\N\C(=C/C5=N/C...  1   
1  COC(=O)CCC1=C2NC(\C=C3/N=C(/C=C4\N\C(=C/C5=N/C...  1   
2  COC(=O)CCC1=C2NC(\C=C3/N=C(/C=C4\N\C(=C/C5=N/C...  1   
3  COC(=O)CCC1=C2NC(\C=C3/N=C(/C=C4\N\C(=C/C5=N/C...  1   
4  COC(=O)CCC1=C2NC(\C=C3/N=C(/C=C4\N\C(=C/C5=N/C...  1   

                             interaction_description  
0  Drug 1: CC1=CC2=CC3=C(OC(=O)C=C3C)C(C)=C2O1 in...  
1  Drug 1: O=[Ti]=O interacts with Drug 2: COC(=O...  
2  Drug 1: CC(C(O)=O)C1=CC=C(S1)C(=O)C1=CC=CC=C1 ...  
3  Drug 1: CC(CN(C)C)CN1C2=CC=CC=C2SC2=C1C=C(C=C2...  
4  Drug 1: OC1=CC=CC(=C1)C-1=C2\C

In [43]:
import gym
from gym import spaces
import numpy as np
import pandas as pd
from stable_baselines3 import PPO

class DrugInteractionEnv(gym.Env):
    def __init__(self, dataset):
        super(DrugInteractionEnv, self).__init__()
        self.dataset = dataset
        self.action_space = spaces.Discrete(2)  # Two actions: 0 -> conflict, 1 -> no conflict
        
        # Observation space: two encoded drug indices (size 2)
        self.observation_space = spaces.Box(low=0, high=1, shape=(2,), dtype=np.float32)
        
        # Initialize the state index
        self.state_idx = 0

    def reset(self):
        # Reset to the first drug interaction
        self.state_idx = 0
        state = self.dataset.iloc[self.state_idx]
        return self._encode_state(state)

    def step(self, action):
        # Get current interaction details
        state = self.dataset.iloc[self.state_idx]
        drug1 = state['Drug1']
        drug2 = state['Drug2']
        expected_interaction = state['Y']  # Expected interaction (1 or 0)

        # Reward logic: if action matches expected interaction, give positive reward, else negative
        if action == expected_interaction:
            reward = 1  # Correct prediction
        else:
            reward = -1  # Incorrect prediction

        # Move to the next state
        self.state_idx += 1
        done = self.state_idx >= len(self.dataset)

        # Return next state, reward, done, and info
        next_state = self.dataset.iloc[self.state_idx] if not done else None
        return self._encode_state(next_state) if next_state is not None else np.zeros(self.observation_space.shape), reward, done, {}

    def _encode_state(self, state):
        """
        Encodes the state (drug interactions) into a numerical format suitable for RL.
        This could involve converting the drug names to indices or using one-hot encoding.
        """
        drug1_index = self.dataset['Drug1'].unique().tolist().index(state['Drug1'])
        drug2_index = self.dataset['Drug2'].unique().tolist().index(state['Drug2'])
        return np.array([drug1_index, drug2_index], dtype=np.float32)

# Load your dataset (drug interactions)
df_interactions = train_data[['Drug1', 'Drug2', 'Y']]
df_interactions['interaction_description'] = df_interactions.apply(
    lambda x: f"Drug 1: {x['Drug1']} interacts with Drug 2: {x['Drug2']} -> Interaction: {x['Y']}", axis=1
)

# Initialize the environment
env = DrugInteractionEnv(df_interactions)

# Initialize PPO model
ppo_model = PPO("MlpPolicy", env, verbose=1)

# Train PPO for a set number of timesteps
ppo_model.learn(total_timesteps=10000)

# Save the fine-tuned PPO model
ppo_model.save("ppo_finetuned_drug_interactions")

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
-----------------------------
| time/              |      |
|    fps             | 39   |
|    iterations      | 1    |
|    time_elapsed    | 52   |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 39          |
|    iterations           | 2           |
|    time_elapsed         | 104         |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.016120436 |
|    clip_fraction        | 0.0292      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.684      |
|    explained_variance   | -0.0434     |
|    learning_rate        | 0.0003      |
|    loss                 | 8.3         |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.000911   |
|    value_loss         

In [None]:
from llama_cpp import Llama

# Set the path to the downloaded model file
model_path = "/Users/masudip/Library/Application Support/nomic.ai/GPT4All/Llama-3.2-1B-Instruct-Q4_0.gguf"

# Load the Llama model
llm = Llama(model_path=model_path)

def check_drug_interaction(drug1, dosage1, drug2, dosage2, patient_info):
    """
    Checks for drug interactions and returns:
    - '+1' if no conflict
    - '-1' if there is a conflict
    
    Parameters:
    - drug1, drug2: Names of the drugs
    - dosage1, dosage2: Dosages of the drugs
    - patient_info: Other relevant information (age, conditions, etc.)
    
    Returns:
    - "+1" if no interaction
    - "-1" if there is a conflict
    """
    
    prompt = f"""
    You are a medical AI that checks drug interactions. 
    Please only output either "+1" if the drugs are safe together or "-1" if there is a conflict.
    Do not provide any other text.

    Patient Info: {patient_info}
    Drug 1: {drug1}, Dosage: {dosage1}
    Drug 2: {drug2}, Dosage: {dosage2}

    Make sure your output contains a "+1"  OR "-1" if there is a conflict.
    """

    # Generate response from the Llama model
    response = llm(prompt, max_tokens=100, temperature=0.3)

    # Extract and clean the response
    output_text = response["choices"][0]["text"].strip()

    print(output_text)

    if "-1" in output_text:
        return "-1"
    elif "+1" in output_text:
        return "-1"
    else:
        return "ERROR: NO RESULT"

# Example usage
drug1 = "Ibuprofen"
dosage1 = "400mg"
drug2 = "Aspirin"
dosage2 = "300mg"
patient_info = "65-year-old male"

interaction_result = check_drug_interaction(drug1, dosage1, drug2, dosage2, patient_info)

print(interaction_result)  # Expecting "+1" or "-1, Alternative: [Drug]"


llama_model_load_from_file_impl: using device Metal (Apple M1) - 5454 MiB free
llama_model_loader: loaded meta data with 35 key-value pairs and 147 tensors from /Users/masudip/Library/Application Support/nomic.ai/GPT4All/Llama-3.2-1B-Instruct-Q4_0.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = Llama 3.2 1B Instruct
llama_model_loader: - kv   3:                           general.finetune str              = Instruct
llama_model_loader: - kv   4:                           general.basename str              = Llama-3.2
llama_model_loader: - kv   5:                         general.size_label str              = 1B
llama_model_lo

- Ibuprofen, Dosage: 400mg
     - Paracetamol, Dosage: 300mg
     - Ibuprofen, Dosage: 400mg
     - Paracetamol, Dosage: 300mg
     - Ibuprofen, Dosage: 400mg
     - Paracetamol, Dosage: 300mg
     - Ibuprofen, Dosage: 400mg
     - Paracetamol,
ERROR: NO RESULT


In [4]:
# Example usage
drug1 = "Ibuprofen"
dosage1 = "400mg"
drug2 = "Aspirin"
dosage2 = "300mg"
patient_info = "65-year-old male"

interaction_result = check_drug_interaction(drug1, dosage1, drug2, dosage2, patient_info)

print(interaction_result)  # Expecting "+1" or "-1, Alternative: [Drug]"


Llama.generate: 77 prefix-match hit, remaining 39 prompt tokens to eval
llama_perf_context_print:        load time =    1942.85 ms
llama_perf_context_print: prompt eval time =    1175.27 ms /    39 tokens (   30.14 ms per token,    33.18 tokens per second)
llama_perf_context_print:        eval time =    4109.23 ms /    99 runs   (   41.51 ms per token,    24.09 tokens per second)
llama_perf_context_print:       total time =    5357.73 ms /   138 tokens


1. Ibuprofen
     2. Aspirin
     3. Ibuprofen
     4. Aspirin
     5. Ibuprofen
     6. Aspirin
     7. Ibuprofen
     8. Aspirin
     9. Ibuprofen
    10. Aspirin
    11. Ibuprofen
    12. Aspirin
    13. Ib
ERROR: NO RESULT
