### Importing Libraries

In [1]:
from sentence_transformers import SentenceTransformer
 
from langchain.document_loaders import DataFrameLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores.faiss import FAISS
from langchain.chat_models import ChatOpenAI
# from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.schema import BaseRetriever
from langchain.prompts import SystemMessagePromptTemplate, HumanMessagePromptTemplate, ChatPromptTemplate
from langchain.base_language import BaseLanguageModel
from langchain import LLMChain
# EXTRA
# from langchain.prompts.base import BasePromptTemplate
# from langchain.prompts import PromptTemplate
import openai

import time
import pandas as pd
import numpy as np
# import seaborn as sns
# import matplotlib.pyplot as plt
import warnings
import os
import json

# import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *

# import delta
from delta import *

warnings.filterwarnings("ignore")
pd.set_option("display.max_colwidth",100)

  from .autonotebook import tqdm as notebook_tqdm


### Initializing environmental configuration

In [2]:
from include.env_values import *
os.environ["HADOOP_HOME"] = hadoop_loc
os.environ["OPENAI_API_KEY"] = openai_api_key

In [3]:
with open('./include/config.json') as file:
    config = json.load(file)

### Initializing PySpark Session

In [4]:
spark = SparkSession.builder.getOrCreate()

In [5]:
spark

## Data Preperation

#### Loading raw data

In [6]:
ade_events_df=spark.read.load(f"{config['delta_path']}/bronze/ade_events").orderBy(F.rand(seed=42)).repartition(64).cache()

In [7]:
ade_events_df.show()

+--------------------+--------------------+------+
|                  id|                text|is_ADE|
+--------------------+--------------------+------+
|9e30ac5d-8f3d-456...| Its duration of ...| false|
|e1b28265-fd24-4d7...| An angiogram sho...| false|
|2bed409e-2f8f-488...| We report a 14-y...| false|
|cde4ccbb-53c7-43c...| The objective of...| false|
|75d85cf6-c386-48d...| Perforated appen...| false|
|0f52fbd0-564d-476...| CONCLUSIONS: Thi...| false|
|d8b04629-3ddb-439...| Treatment was st...| false|
|3ef074bf-bfe3-4ca...| At 26 weeks' ges...| false|
|188654af-33e5-4b7...| The mother had u...| false|
|52188914-747c-4b3...| Detailed immunol...| false|
|e1a43c5e-58f9-4b1...| Early detection ...| false|
|6784d899-bb4d-421...| To the best of o...| false|
|57a5d6e8-1182-45d...| PURPOSE: We repo...| false|
|949ed5b8-1df2-493...| Neuroimaging sho...| false|
|5583f0c7-0cad-45a...| The liver other ...| false|
|6d7d0461-0382-407...|Neuroleptic malig...|  true|
|f25a72b3-176a-425...| This pat

In [11]:
ade_events_df.select('*').where('is_ADE==True').count()

4271

In [12]:
ade_events_df.select('*').where('is_ADE==False').count()

16625

In [16]:
ade_events_df.select('*').where('is_ADE==True').limit(100).union(ade_events_df.select('*').where('is_ADE==False').limit(100)).orderBy(F.rand()).show()

+--------------------+--------------------+------+
|                  id|                text|is_ADE|
+--------------------+--------------------+------+
|67d4b4d0-fd69-49c...| Pyogenic liver a...| false|
|b5131b8f-26b4-455...|Deposits of plasm...|  true|
|3c111b07-a684-4d3...|We recommend that...|  true|
|b03bd85d-1b2f-48d...|In summary, we re...|  true|
|2c41e4e3-b05f-46c...| In addition to s...| false|
|46712487-120e-4e9...|We report on a pa...|  true|
|38cfe93c-eaab-430...|There have been o...|  true|
|188654af-33e5-4b7...| The mother had u...| false|
|35d49a8f-d9d9-497...|Severe hepatotoxi...|  true|
|8a7e14c0-af5b-49c...|Serious phenytoin...|  true|
|57429bb7-e78b-446...|The authors repor...|  true|
|8a282f1f-5306-4dd...| CASE: A perimeno...| false|
|de4be3ef-e780-4af...| The following is...| false|
|4a420779-2d96-4bb...|Phenobarbital hep...|  true|
|b82bc5b3-cf69-41c...| Our findings sug...| false|
|143fca15-773e-445...| Clinical aspects...| false|
|3f448803-999d-49b...| The long

### Splitting Data 

#### Data for Vector DB

In [305]:
# for loading in vector db
ade_events_loading = ade_events_df.selectExpr('text', 'is_ADE').limit(int(ade_events_df.count()/2))
# ade_events_loading = ade_events_loading.withColumn('is_ADE', F.when(F.col('is_ADE')=='true', "is_ADE").otherwise("not_ADE")).select('*')
ade_events_loading.limit(5).show()

+--------------------+------+
|                text|is_ADE|
+--------------------+------+
| Its duration of ...| false|
| An angiogram sho...| false|
| We report a 14-y...| false|
| The objective of...| false|
| Perforated appen...| false|
+--------------------+------+



In [254]:
ade_events_loading = ade_events_loading.withColumn('is_ADE', F.when(F.col('is_ADE')=='true', "is_ADE").otherwise("not_ADE")).select('*')
ade_events_loading.limit(5).show()

+--------------------+-------+
|                text| is_ADE|
+--------------------+-------+
| Its duration of ...|not_ADE|
| An angiogram sho...|not_ADE|
| We report a 14-y...|not_ADE|
| The objective of...|not_ADE|
| Perforated appen...|not_ADE|
+--------------------+-------+



In [252]:
ade_events_loading.count()

10448

#### Data for testing

In [306]:
# for testing tuned model
ade_events_testing = ade_events_df.selectExpr('text', 'is_ADE').subtract(ade_events_loading)
ade_events_testing.limit(5).show()

+--------------------+------+
|                text|is_ADE|
+--------------------+------+
| A 57-year-old ma...| false|
| Vitamin B12 (cya...| false|
|Psychosis in a 12...|  true|
|The induction of ...|  true|
| The fact that ox...| false|
+--------------------+------+



In [307]:
# saving this data for later analysis
ade_events_testing.write.format('delta').mode('overwrite').save(f"{config['delta_path']}\\silver\\test_data")

In [256]:
ade_events_testing = ade_events_testing.withColumn('is_ADE', F.when(F.col('is_ADE')=='true', "is_ADE").otherwise("not_ADE")).select('*')
ade_events_testing.limit(5).show()

+--------------------+-------+
|                text| is_ADE|
+--------------------+-------+
| A 57-year-old ma...|not_ADE|
| Vitamin B12 (cya...|not_ADE|
|Psychosis in a 12...| is_ADE|
|The induction of ...| is_ADE|
| The fact that ox...|not_ADE|
+--------------------+-------+



In [257]:
ade_events_testing.count()

10448

#### Function to transform Data so that it can be fed into LLM as embedding document

In [19]:
# this is the format we have to convert the above data
# """
# [statement]: "All patients had normal levels..."
# [nature]: "not_ADE"
# """
def transform_data(df):
    df = df.withColumn('text_input', F.lit("[statement]:")) \
        .withColumn('text_input', F.concat(F.col('text_input'), F.col('text'))) \
        .drop('text') \
        .withColumn('ade_value', F.lit("\n "+"[nature]:")) \
        .withColumn('ade_value', F.concat(F.col('ade_value'), F.col('is_ADE'))) \
        .drop('is_ADE') \
        .withColumn('text_input', F.concat(F.col('text_input'), F.col('ade_value'))) \
        .drop('ade_value')
    return df

## Model Development

In [15]:
statement = "Selective serotonin reuptake inhibitors and benzodiazepines appear to be the most common pharmacologic treatment approaches."

#### With Few-Shot Encoding

In [None]:
unfine_tuned_system_message_template = """
You are a helpful assistant built by Yash, you are good at helping classification of drug and it's affect. 
"""
# unfine_tuned_human_message_template = """
# Classify the drug and it's affect in the sentence '{statement}'.
# """
unfine_tuned_human_message_template = """
'\n' means next line

For each statement, analyse the sentiment of the statement, if it is indicating a side effect or not, as 'is_ADE' for side effect indication or 'not_ADE' for otherwise:

[statement]: "The postoperative course was uneventful and topical steroids were combined with neomycin and propamidine."
[nature]: "is_ADE"
###
[statement]: "His AFP was initially 9828 microg/L and rapidly dropped to 5597 microg/L in ten days after oral sorafenib treatment."
[nature]: "is_ADE"
###
[statement]: "All patients had normal levels of thyroid-stimulating hormone and no other gastrointestinal complaints; evaluation revealed no other cause for the weight loss."
[nature]: "not_ADE"
###
[statement]: {statement}
[nature]: ""

'###' means end of the line

only return generated [nature] form last line
"""

Above we have used few-shot learning approaching for making the output of our model more aligned with our needs.

In [166]:
# define system-level instructions
system_message_prompt = SystemMessagePromptTemplate.from_template(unfine_tuned_system_message_template)

In [167]:
# define human-driven instructions
human_message_prompt = HumanMessagePromptTemplate.from_template(unfine_tuned_human_message_template)

In [168]:
# combine instructions into a single prompt
chat_prompt = ChatPromptTemplate.from_messages([system_message_prompt, human_message_prompt])

In [169]:
# define model to respond to prompt
llm = ChatOpenAI(model_name=config['openai_chat_model'], temperature=0.9)

In [170]:
# combine prompt and model into a unit of work (chain)
ade_classifier_chain = LLMChain(
  llm = llm,
  prompt = chat_prompt
  )

Generating a Response

In [171]:
output = ade_classifier_chain.generate([{'statement':statement}])

In [172]:
# get answer from result
generation = output.generations[0][0]
answer = generation.text

In [173]:
# display answer
if answer is not None:
    print(f"Statement : {statement}", '\n', f"Output: {answer}")

Statement : Selective serotonin reuptake inhibitors and benzodiazepines appear to be the most common pharmacologic treatment approaches. 
 Output: is_ADE


#### With Vector DB integration

In [274]:
# transforming data for few shot encoding
data_for_vector_db = transform_data(ade_events_loading)

In [284]:
data_for_vector_db.limit(5).show()

+--------------------+
|          text_input|
+--------------------+
|[statement]: Its ...|
|[statement]: An a...|
|[statement]: We r...|
|[statement]: The ...|
|[statement]: Perf...|
+--------------------+



In [299]:
# loading 'data_for_vector_store' in silver stage after transformation
data_for_vector_db.write.format('delta').mode('overwrite').save(f"{config['delta_path']}\\silver\\text_inputs")

#### Loading transformed data which is to be used for few shot encoding

In [4]:
few_shot_data = spark.read.format('delta').load(f"{config['delta_path']}\\silver\\text_inputs")

AttributeError: 'NoneType' object has no attribute 'read'

In [12]:
few_shot_data.limit(5).show()

+--------------------+
|          text_input|
+--------------------+
|[statement]: Its ...|
|[statement]: An a...|
|[statement]: We r...|
|[statement]: The ...|
|[statement]: Perf...|
+--------------------+



In [13]:
few_shot_data.count()

10448

Download the Embedding Model

In [279]:
original_model = SentenceTransformer('all-MiniLM-L12-v2')

Downloading (…)5dded/.gitattributes: 100%|████████████████████████████████████████| 1.18k/1.18k [00:00<00:00, 1.19MB/s]
Downloading (…)_Pooling/config.json: 100%|█████████████████████████████████████████████| 190/190 [00:00<00:00, 208kB/s]
Downloading (…)4d81d5dded/README.md: 100%|████████████████████████████████████████| 10.6k/10.6k [00:00<00:00, 10.6MB/s]
Downloading (…)81d5dded/config.json: 100%|█████████████████████████████████████████████| 573/573 [00:00<00:00, 520kB/s]
Downloading (…)ce_transformers.json: 100%|████████████████████████████████████████████████████| 116/116 [00:00<?, ?B/s]
Downloading (…)ded/data_config.json: 100%|█████████████████████████████████████████| 39.3k/39.3k [00:00<00:00, 196kB/s]
Downloading pytorch_model.bin: 100%|████████████████████████████████████████████████| 134M/134M [00:19<00:00, 7.01MB/s]
Downloading (…)nce_bert_config.json: 100%|██████████████████████████████████████████████████| 53.0/53.0 [00:00<?, ?B/s]
Downloading (…)cial_tokens_map.json: 100

To use our model with our vector store, we need to wrap it as a LangChain HuggingFaceEmbeddings object. We could have had that object download the model for us, skipping the previous step, but if we had done that, future references to the model would trigger additional downloads. By downloading it, saving it to a local path, and then having the LangChain object read it from that path, we are bypassing unnecessary future downloads:

In [8]:
# Load Model as HuggingFaceEmbeddings Object
# encoder path
embedding_model_path = f"{config['base_path']}/embedding_model"

In [282]:
# reaload model using langchain wrapper
original_model.save(embedding_model_path)

In [9]:
embedding_model = HuggingFaceEmbeddings(model_name=embedding_model_path)

### Loading Data in Vector DB

In [286]:
# convert inputs to pandas dataframe
inputs = data_for_vector_db.toPandas()

In [292]:
inputs_list = (inputs['text_input'].to_list())

In [293]:
inputs_list

['[statement]: Its duration of action is brief and serious adverse effects have not been reported.\n [nature]:not_ADE',
 '[statement]: An angiogram showed a thrombus cranial to the screw.\n [nature]:not_ADE',
 '[statement]: We report a 14-year-old boy with high risk medulloblastoma treated with craniospinal radiation followed by chemotherapy, who experienced severe HSOS after only one course of chemotherapy including carboplatin, vincristine, and cyclophosphamide.\n [nature]:not_ADE',
 '[statement]: The objective of this paper is to report 5 cases of rhabdomyolysis (RML) in patients with acute leukemia (AL).\n [nature]:not_ADE',
 '[statement]: Perforated appendicitis was a coincidental event following IVF and embryo transfer.\n [nature]:not_ADE',
 '[statement]: CONCLUSIONS: This study shows that psoriasis can be driven by the innate immune system through TLR ligation.\n [nature]:not_ADE',
 '[statement]: Treatment was started with heparin and vasodilators.\n [nature]:not_ADE',
 "[statem

In [11]:
# instantiate vector store object
vector_store = FAISS.from_texts(
  embedding=embedding_model, 
  texts=inputs_list
  )

In [303]:
# Persist Vector Store to Storage
vector_store.save_local(folder_path=config['vector_store_path'])

### Model Building with Vector Store

In [12]:
# load vector data if not defined already
vector_store = FAISS.load_local(embeddings=embedding_model, folder_path=config['vector_store_path'])

In [13]:
# configure document retrieval 
n_documents = 5 # number of documents to retrieve 
retriever = vector_store.as_retriever(search_kwargs={'k': n_documents}) # configure retrieval mechanism

In [16]:
# get relevant documents
docs = retriever.get_relevant_documents(statement)
for doc in docs: 
  print(doc,'\n') 

page_content='[statement]: The combination of selective serotonin reuptake inhibitors and tricyclic antidepressants may be useful in treating patients who experience intolerable side effects or who are resistant to therapy with a single antidepressant.\n [nature]:not_ADE' metadata={} 

page_content='[statement]: Initially, antipsychotics and especially pimozide were considered to be the pharmacological approach of choice but, subsequently, tryciclic anti-depressants and selective serotonin re-uptake inhibitors (SSRIs) were also suggested to be effective, implicating the serotonergic system in the pathophysiology of the disorder.\n [nature]:not_ADE' metadata={} 

page_content='[statement]: Prevention, early recognition of the clinical presentation, identification and removal of the offending agents, supportive care, and specific pharmacologic therapy are all important to the successful management of serotonin syndrome.\n [nature]:not_ADE' metadata={} 

page_content='[statement]: Increas

In [12]:
type(docs[1].page_content)

str

Define Chain to Generate Responses

In [17]:
system_message_template = """
You are a helpful assistant built by Yash, you are good at helping classification of drug and it's affect. 
"""
human_message_template = """
'/n' means next-line thourghout the prompt

understand the statements for any adverse events and predict the [nature]. 'is_ADE' means [statement] reports an adverse event medically and 'not_ADE' means not adverse event.

{context} 
[statement]: {statement} 
[nature]: ''

'###' means end of line.

return output for last 'statement' in this way:
[nature]: 'is_ADE' (if adverse avtivity present)
[nature]: 'not_ADE' (if adverset avtivity not present)
[nature]: 'I can't Identify'
"""

In [18]:
# define system-level instructions
system_message_prompt = SystemMessagePromptTemplate.from_template(system_message_template)

In [19]:
# define human-driven instructions
human_message_prompt = HumanMessagePromptTemplate.from_template(human_message_template)

In [20]:
# combine instructions into a single prompt
chat_prompt = ChatPromptTemplate.from_messages([system_message_prompt, human_message_prompt])

In [21]:
# define model to respond to prompt
llm = ChatOpenAI(model_name=config['openai_chat_model'], temperature=0.9)

In [22]:
# combine prompt and model into a unit of work (chain)
ade_classifier_chain = LLMChain(
  llm = llm,
  prompt = chat_prompt
  )

Generating a Response

In [23]:
context = ""
for doc in docs:
    # get document text
    context = context + "\n" + doc.page_content + "\n" + "###"

In [24]:
print(context)


[statement]: The combination of selective serotonin reuptake inhibitors and tricyclic antidepressants may be useful in treating patients who experience intolerable side effects or who are resistant to therapy with a single antidepressant.
 [nature]:not_ADE
###
[statement]: Initially, antipsychotics and especially pimozide were considered to be the pharmacological approach of choice but, subsequently, tryciclic anti-depressants and selective serotonin re-uptake inhibitors (SSRIs) were also suggested to be effective, implicating the serotonergic system in the pathophysiology of the disorder.
 [nature]:not_ADE
###
[statement]: Prevention, early recognition of the clinical presentation, identification and removal of the offending agents, supportive care, and specific pharmacologic therapy are all important to the successful management of serotonin syndrome.
 [nature]:not_ADE
###
[statement]: Increasing clinical experience with the selective serotonin reuptake inhibitors and tricyclic anti

In [25]:
output = ade_classifier_chain.generate([{'context': context, 'statement':statement}])

In [24]:
# get answer from result
generation = output.generations[0][0]
answer = generation.text

In [25]:
# display answer
if answer is not None:
    print(f"Output: {answer}")

Output: [statement]: Selective serotonin reuptake inhibitors and benzodiazepines appear to be the most common pharmacologic treatment approaches. 

[nature]: not_ADE


## Conversational ADE Classification
Text Classification According To Contains ADE or not.

PySpark UDF

In [174]:
# @udf(returnType=StringType())
def ade_classify(statement):
    output = ade_classifier_chain.generate([{'statement':statement}])
    # get answer from result
    generation = output.generations[0][0]
    answer = generation.text
    return answer[-8:-2]

ADE_classify = F.udf(lambda text: ade_classify(text),StringType())

spark.udf.register("ADE_classify", ade_classify,StringType())

<function __main__.ade_classify(statement)>

Without Vector DB

In [175]:
def ade_classify(statement):
    output = ade_classifier_chain.generate([{'statement':statement}])
    # get answer from result
    generation = output.generations[0][0]
    answer = generation.text
#     return answer
    return answer


In [176]:
test = "Before recognition of pregnancy, the latter had been treated for acute myelocytic leukaemia receiving cytarabine, daunorubicin and doxorubicin at conception and cytarabine and thioguanine at about 35-37 days post conception."

In [177]:
print(ade_classify(test))

is_ADE


With Vector DB

In [26]:
def vector_ade_classify(statement, n_examples = 5):
    retriever = vector_store.as_retriever(search_kwargs={'k': n_examples}) # configure retrieval mechanism
    docs = retriever.get_relevant_documents(statement)
    
    context = ""
    for doc in docs:
        # get document text
        context = context + "\n" + doc.page_content + "\n" + "###"
    
    output = ade_classifier_chain.generate([{'context': context, 'statement':statement}])
    # get answer from result
    generation = output.generations[0][0]
    answer = generation.text
    
    return answer

In [71]:
# loading test data
test_data = spark.read.format('delta').load(f"{config['delta_path']}\\silver\\test_data")
test_data.collect()

[Row(text=' A 57-year-old man with nodular rheumatoid arthritis was started on a combination of etanercept and methotrexate.', is_ADE=False),
 Row(text=' Vitamin B12 (cyanocobalamin) is an integral component of two biochemical reactions in man: the conversion of L-methylmalonyl coenzyme A into succinyl coenzyme A and the formation of methionine by methylation of homocysteine.', is_ADE=False),
 Row(text='Psychosis in a 12-year-old HIV-positive girl with an increased serum concentration of efavirenz.', is_ADE=True),
 Row(text='The induction of hypoglycaemia with PAS in this patient suggests a potential role for PAS in the treatment of diabetes mellitus.', is_ADE=True),
 Row(text=' The fact that oxcarbazepine is a prodrug and that the formation of the active MHD metabolite is a rate-limiting process may contribute to the relative low toxicity of the drug in overdose.', is_ADE=False),
 Row(text='METHODS: Five cases of contact dermatitis due to budesonide, a nonhalogenated steroid, are desc

In [27]:
answer = vector_ade_classify('Vitamin B12 (cyanocobalamin) is an integral component of two biochemical reactions in man: the conversion of L-methylmalonyl coenzyme A into succinyl coenzyme A and the formation of methionine by methylation of homocysteine.')



In [28]:
print(answer)

[statement]: Vitamin B12 (cyanocobalamin) is an integral component of two biochemical reactions in man: the conversion of L-methylmalonyl coenzyme A into succinyl coenzyme A and the formation of methionine by methylation of homocysteine. 
 [nature]:not_ADE


# YaaaaaaY!!! 🥳🍾

### Model Inference

In [76]:
# for _ in ade_text.collect()[25]:
#     print(_)
#     output = ade_classify(_)
#     print(output)

Define ADE Classifier Wrapper Class

In [51]:
llm = ChatOpenAI(model_name=config['openai_chat_model'], temperature=0.9)
retriever = vector_store.as_retriever(search_kwargs={'k': n_examples}) # configure retrieval mechanism
chat_prompt = ChatPromptTemplate.from_messages([system_message_prompt, human_message_prompt])

NameError: name 'system_message_prompt' is not defined

In [124]:
class ADE_Classifier():
    from langchain import LLMChain
    
    def __init__(self, llm, retriever, prompt):
        self.llm = llm
        self.retriever = retriever
        self.prompt = prompt
        self.ade_classifier = LLMChain(llm=self.llm, prompt=prompt)
        
    def _get_class(self, context, statement, timeout_sec=60):
        '''get answer from llm with timeout handling'''
        
        # default result
        result = None
        
        # define end time
        end_time = time.time() + timeout_sec
        
        # try timeout
        while time.time() < end_time:
            # attempt to get a response
            try:
                result = self.ade_classifier.generate([{'context': context, 'statement':statement}])
                break # if successful response, stop lopping
                
            # if rate limit error...
            except openai.error.RateLimitError as rate_limit_error:
                if time.time() < end_time: # if time permits, sleep
                    time.sleep(2)
                    continue
                else: # otherwise, raise the exception
                    raise rate_limit_error
            
            # if other error, raise it
            except Exception as e:
                print(f'LLM ADE Classifier Chain encountered unexpected error: {e}')
                raise e
        return result
    
    def get_class(self, statement):
        '''get answer to provide question'''
        
        # default result
        result = {'generated_output':None}
        # get relevant documents
#         retriever = vector_store.as_retriever(search_kwargs={'k': n_examples}) # configure retrieval mechanism
        docs = self.retriever.get_relevant_documents(statement)
    
        context = ""
        for doc in docs:
            # get document text
            context = context + "\n" + doc.page_content + "\n" + "###"

        # get an answer from llm
        output = self._get_class(context=context, statement=statement)
        
        # get output from results
        generation = output.generations[0][0]
        answer = generation.text
        
        result['generated_output'] = answer
        
        return result

Testing the above inference class

In [110]:
# instantiate bot llm object
ade_classifier = ADE_Classifier(llm=llm, retriever=retriever, prompt=chat_prompt)

In [115]:
# get response to question
ade_classifier.get_class("Radiation-induced fibrosis (RIF) is a terminal sequela to irradiation that does not regress spontaneously.") 

{'generated_output': '[statement]: Radiation-induced fibrosis (RIF) is a terminal sequela to irradiation that does not regress spontaneously.\n[nature]: not_ADE'}

### Creating Modules for required functions

In [4]:
%%writefile ./include/ade_classifier.py
# importing libraries
from langchain import LLMChain
class ADE_Classifier():
    def __init__(self, llm, retriever, prompt):
        self.llm = llm
        self.retriever = retriever
        self.prompt = prompt
        self.ade_classifier = LLMChain(llm=self.llm, prompt=prompt)
        
    def _get_class(self, context, statement, timeout_sec=60):
        '''get answer from llm with timeout handling'''
        
        # default result
        result = None
        import time
        import openai
        # define end time
        end_time = time.time() + timeout_sec
        
        # try timeout
        while time.time() < end_time:
            # attempt to get a response
            try:
                result = self.ade_classifier.generate([{'context': context, 'statement':statement}])
                break # if successful response, stop lopping
                
            # if rate limit error...
            except openai.error.RateLimitError as rate_limit_error:
                if time.time() < end_time: # if time permits, sleep
                    time.sleep(2)
                    continue
                else: # otherwise, raise the exception
                    raise rate_limit_error
            
            # if other error, raise it
            except Exception as e:
                print(f'LLM ADE Classifier Chain encountered unexpected error: {e}')
                raise e
        return result
    
    def get_class(self, statement):
        '''get answer to provide question'''
        
        # default result
        result = {'generated_output':None}
        # get relevant documents
#         retriever = vector_store.as_retriever(search_kwargs={'k': n_examples}) # configure retrieval mechanism
        docs = self.retriever.get_relevant_documents(statement)
    
        context = ""
        for doc in docs:
            # get document text
            context = context + "\n" + doc.page_content + "\n" + "###"

        # get an answer from llm
        output = self._get_class(context=context, statement=statement)
        
        # get output from results
        generation = output.generations[0][0]
        answer = generation.text
        
        result['generated_output'] = answer
        
        return result

Overwriting ./include/ade_classifier.py


In [52]:
%%writefile ./include/utils.py
# importing libraries
import os
import json
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores.faiss import FAISS
from langchain.prompts import SystemMessagePromptTemplate, HumanMessagePromptTemplate, ChatPromptTemplate
from langchain.chat_models import ChatOpenAI

class Utils():
    def __init__(self):
        self.embedding_model_path = embedding_model_path = "C:\\Users\\yraj\\Work\\POCs\\Drugs & Adverse Events\\embedding_model"
        self.model_name = 'gpt-3.5-turbo'
        self.vector_store_path = 'C:\\Users\\yraj\\Work\\POCs\\Drugs & Adverse Events\\data\\test_vector_store'
        self.system_message_template = """
        You are a helpful assistant built by Yash, you are good at helping classification of drug and it's affect. 
        """
        self.human_message_template = """
        '/n' means next-line thourghout the prompt

        understand the statements for any adverse events and predict the [nature]. 'is_ADE' means [statement] reports an adverse event medically and 'not_ADE' means not adverse event.

        {context} 
        [statement]: {statement} 
        [nature]: ''

        '###' means end of line.

        return output for last 'statement' in this way:
        [nature]: 'is_ADE' (if adverse avtivity present)
        [nature]: 'not_ADE' (if adverset avtivity not present)
        [nature]: 'I can't Identify'
        """
    def get_retriever(self, n_documents=5):
        # encoder path

        embedding_model = HuggingFaceEmbeddings(model_name=self.embedding_model_path)

        # load vector data if not defined already
        vector_store = FAISS.load_local(embeddings=embedding_model, folder_path=self.vector_store_path)
        # configure document retrieval 
        retriever = vector_store.as_retriever(search_kwargs={'k': n_documents}) 
        return retriever

    def get_prompt(self):

        # define system-level instructions
        system_message_prompt = SystemMessagePromptTemplate.from_template(self.system_message_template)
        # define human-driven instructions
        human_message_prompt = HumanMessagePromptTemplate.from_template(self.human_message_template)
        # combine instructions into a single prompt
        chat_prompt = ChatPromptTemplate.from_messages([system_message_prompt, human_message_prompt])
        return chat_prompt

    def get_llm(self):
        # define model to respond to prompt
        llm = ChatOpenAI(model_name=self.model_name, temperature=0.9)
        return llm

Overwriting ./include/utils.py


## Changes made to Vector Store

In [37]:
statement = "Psychosis in a 12-year-old HIV-positive girl with an increased serum concentration of efavirenz."

In [39]:
test_df = ade_events_df.select('*').where('is_ADE==True').limit(100).union(ade_events_df.select('*').where('is_ADE==False').limit(100)).orderBy(F.rand())
test_df = test_df.withColumn('is_ADE', F.when(F.col('is_ADE')=='true', "is_ADE").otherwise("not_ADE")).select('*')
test_df.show()

+--------------------+--------------------+-------+
|                  id|                text| is_ADE|
+--------------------+--------------------+-------+
|14b37ca0-8d71-46e...|Cefuroxime-induce...| is_ADE|
|3a46af5b-c3eb-41f...| Tolazoline induc...|not_ADE|
|d8b04629-3ddb-439...| Treatment was st...|not_ADE|
|eff99579-b0cd-45a...|CONCLUSIONS: Sust...| is_ADE|
|33495918-a296-45d...|A patient with We...| is_ADE|
|bbd3f096-3b01-450...|We strongly suspe...| is_ADE|
|24e39be8-2183-476...| Gastrointestinal...|not_ADE|
|46712487-120e-4e9...|We report on a pa...| is_ADE|
|eb33cbbb-b954-40b...| The mechanical i...|not_ADE|
|6e377798-54de-45e...|Phenylpropanolami...| is_ADE|
|d5a3dfc5-1145-47d...| The ocular press...|not_ADE|
|12b9c351-0ec9-46f...|Systemic capillar...| is_ADE|
|5583f0c7-0cad-45a...| The liver other ...|not_ADE|
|fccf6ebf-4519-413...| Focal nodular hy...|not_ADE|
|5bf62eee-7710-42a...|CONCLUSION: All d...| is_ADE|
|331a85fa-a5d5-4e8...|According to the ...| is_ADE|
|ce265969-36

In [40]:
test_data_for_vector_db = transform_data(test_df)

In [41]:
test_data_for_vector_db.select('text_input').limit(5).show()

+--------------------+
|          text_input|
+--------------------+
|[statement]:Cefur...|
|[statement]: Tola...|
|[statement]: Trea...|
|[statement]:CONCL...|
|[statement]:A pat...|
+--------------------+



In [42]:
# convert inputs to pandas dataframe
inputs = test_data_for_vector_db.toPandas()

In [43]:
inputs

Unnamed: 0,id,text_input
0,14b37ca0-8d71-46ea-be72-4955b58cf423,[statement]:Cefuroxime-induced acute renal failure.\n [nature]:is_ADE
1,3a46af5b-c3eb-41f3-9ec9-e82995ec3378,[statement]: Tolazoline induces a dilatation of the pulmonary vascular system by stimulating H2 ...
2,d8b04629-3ddb-4394-8933-7ea374037e4f,[statement]: Treatment was started with heparin and vasodilators.\n [nature]:not_ADE
3,eff99579-b0cd-45a4-966f-26ed5d717dc7,[statement]:CONCLUSIONS: Sustained-release verapamil is thought to be the cause of the asthma at...
4,33495918-a296-45d5-8900-8e9eb13b32fc,[statement]:A patient with Wegener's granulomatosis rapidly developed a circumferential subglott...
...,...,...
195,a1dcdf73-2c2d-4bf6-ace0-e123c1986db3,[statement]:Isotretinoin teratogenicity.\n [nature]:is_ADE
196,32ca30ca-0bee-44c9-a4b0-72fbeed71b4e,[statement]:Ritonavir should be added to the list of drugs that can induce adverse cutaneous rea...
197,e972f060-34e9-4192-a004-5c5b034b63ef,"[statement]: Even though anti-inflammatory, analgesic and antipyretic effects of henna have been..."
198,469abcb0-64a5-4b23-8fb6-13a5dd2f3b42,"[statement]: During IFN therapy, serum aminotransferases fell within the normal range and viremi..."


In [44]:
inputs_list = (inputs["text_input"].to_list())
inputs_list

['[statement]:Cefuroxime-induced acute renal failure.\n [nature]:is_ADE',
 '[statement]: Tolazoline induces a dilatation of the pulmonary vascular system by stimulating H2 receptors.\n [nature]:not_ADE',
 '[statement]: Treatment was started with heparin and vasodilators.\n [nature]:not_ADE',
 '[statement]:CONCLUSIONS: Sustained-release verapamil is thought to be the cause of the asthma attack in this patient because she was not taking any other preparations; the symptoms started with the administration of sustained-release verapamil and were relieved after its discontinuation.\n [nature]:is_ADE',
 "[statement]:A patient with Wegener's granulomatosis rapidly developed a circumferential subglottic stenosis while on a cyclophosphamide regimen that had caused resolution of systemic symptoms and pulmonary infiltrates.\n [nature]:is_ADE",
 '[statement]:We strongly suspect that this lethal anuria was mainly due to ifosfamide, occurring in a patient having received previous cisplatin chemother

In [45]:
# instantiate vector store object
vector_store = FAISS.from_texts(
  embedding=embedding_model, 
  texts=inputs_list
  )

In [46]:
# Persist Vector Store to Storage
vector_store.save_local(folder_path="C:\\Users\\yraj\\Work\\POCs\\Drugs & Adverse Events\\data\\test_vector_store")

In [47]:
# load vector data if not defined already
vector_store = FAISS.load_local(embeddings=embedding_model, folder_path="C:\\Users\\yraj\\Work\\POCs\\Drugs & Adverse Events\\data\\test_vector_store")

In [48]:
n_examples = 5
retriever = vector_store.as_retriever(search_kwargs={'k': n_examples}) # configure retrieval mechanism
docs = retriever.get_relevant_documents(statement)
    
context = ""
for doc in docs:
    # get document text
    context = context + "\n" + doc.page_content + "\n" + "###"
print(context)


[statement]:Eosinophilia caused by clozapine was observed in challenge, preceded by a faster neutrophil production and consecutive decrease (z = 2.27, p = 0.01).
 [nature]:is_ADE
###
[statement]:We present a 46-year-old African-American man with AIDS who was admitted on two different occasions within three weeks for signs and symptoms of meningitis after using trimethoprim/sulfamethoxazole (TMP/SMX).
 [nature]:is_ADE
###
[statement]: In addition to severe vegetative anticholinergic symptoms, the clinical picture is often dominated by a toxic psychosis with hallucinations, disturbances of orientation, and psychomotoric agitation, aggression, or anxiety.
 [nature]:not_ADE
###
[statement]: An extraordinarily rare case report of mixed fat redistribution syndrome associated with osteopenia but not with relevant metabolic abnormalities is documented in a prepubertal child with congenital HIV infection treated with antiretroviral therapy since the age of six months, up to the present age of 

In [49]:
print(f"""
understand the statements for any adverse events and predict the [nature]. 'is_ADE' means [statement] reports an adverse event medically and 'not_ADE' means not adverse event.

{context} 
[statement]: {statement} 
[nature]: ''

'###' means end of line.

return output for last 'statement' in this way:
[nature]: 'is_ADE' (if adverse avtivity present)
[nature]: 'not_ADE' (if adverset avtivity not present)
[nature]: 'I can't Identify'
""")


understand the statements for any adverse events and predict the [nature]. 'is_ADE' means [statement] reports an adverse event medically and 'not_ADE' means not adverse event.


[statement]:Eosinophilia caused by clozapine was observed in challenge, preceded by a faster neutrophil production and consecutive decrease (z = 2.27, p = 0.01).
 [nature]:is_ADE
###
[statement]:We present a 46-year-old African-American man with AIDS who was admitted on two different occasions within three weeks for signs and symptoms of meningitis after using trimethoprim/sulfamethoxazole (TMP/SMX).
 [nature]:is_ADE
###
[statement]: In addition to severe vegetative anticholinergic symptoms, the clinical picture is often dominated by a toxic psychosis with hallucinations, disturbances of orientation, and psychomotoric agitation, aggression, or anxiety.
 [nature]:not_ADE
###
[statement]: An extraordinarily rare case report of mixed fat redistribution syndrome associated with osteopenia but not with relevant met