### Importing Libraries

In [78]:
# importing mlflow library
import mlflow
import pandas as pd
import numpy as np
import time

In [5]:
from include.env_values import *
# setting up environment
os.environ["OPENAI_API_KEY"] = openai_api_key
with open('./include/config.json') as file:
    config = json.load(file)

In [6]:
from include.ade_classifier import ADE_Classifier
from include.utils import *

### Testing our Model

In [4]:
# instantiating 'utility' class for getting llm, retriever, chat_prompt
utils = Utils()

In [5]:
# instantiaing classifier
ade_classifier = ADE_Classifier(llm=utils.get_llm(), retriever=utils.get_retriever(), prompt=utils.get_prompt())

  from .autonotebook import tqdm as notebook_tqdm


In [12]:
answer = ade_classifier.get_class('The induction of hypoglycaemia with PAS in this patient suggests a potential role for PAS in the treatment of diabetes mellitus.')

In [13]:
print(answer)

{'generated_output': '[statement]:We report a male patient with advanced AIDS who developed hypercalcemia 2 weeks after institution of rhGH therapy.\n [nature]:is_ADE'}


## Registering Model to MlFlow

In [14]:
# setting mlflow experitment
_ = mlflow.set_experiment(f"{config['base_path']}\\ade_llm_classifier")

#### Define MLflow Wrapper for Model

In [15]:
class MLflowADEClassifier(mlflow.pyfunc.PythonModel):
    def __init__(self, llm, retriever, prompt):
        self.ade_classifier = ADE_Classifier(llm, retriever, prompt)
    
    def predict(self, context, input):
        statement = str(input["statement"])
        
        # return class
        return self.ade_classifier.get_class(statement)

#### Persist Model to MLflow

In [16]:
# instantiate mlflow model
model = MLflowADEClassifier(llm=utils.get_llm(), retriever=utils.get_retriever(), prompt=utils.get_prompt())

In [17]:
# persist model to mlflow
with mlflow.start_run():
  _ = (
    mlflow.pyfunc.log_model(
      python_model=model,
      extra_pip_requirements=['langchain==0.0.166', 'tiktoken==0.4.0', 'openai==0.27.6', 'faiss-cpu==1.7.4', 'typing-inspect==0.8.0', 'typing_extensions==4.5.0'],
      artifact_path='model',
      registered_model_name='ade_llm_classifier'
      )
    )

The git executable must be specified in one of the following ways:
    - be included in your $PATH
    - be set via $GIT_PYTHON_GIT_EXECUTABLE
    - explicitly set via git.refresh()

All git commands will error until this is rectified.

$GIT_PYTHON_REFRESH environment variable. Use one of the following values:
    - error|e|raise|r|2: for a raised exception

Example:
    export GIT_PYTHON_REFRESH=quiet

Registered model 'ade_llm_classifier' already exists. Creating a new version of this model...
2023/07/06 12:56:02 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: ade_llm_classifier, version 2
Created version '2' of model 'ade_llm_classifier'.


#### Elevate Model to Production Status

In [18]:
# connect to mlflow 
client = mlflow.MlflowClient()
 
# identify latest model version
latest_version = client.get_latest_versions('ade_llm_classifier', stages=['None'])[0].version
 
# move model into production
client.transition_model_version_stage(
    name='ade_llm_classifier',
    version=latest_version,
    stage='Production',
    archive_existing_versions=True
)

<ModelVersion: aliases=[], creation_timestamp=1688628361935, current_stage='Production', description=None, last_updated_timestamp=1688628362115, name='ade_llm_classifier', run_id='b502be00d75a42fc812089a9b0107ed2', run_link=None, source='file:///C:/Users/yraj/Work/POCs/Drugs%20%26%20Adverse%20Events/mlruns/928637442989638884/b502be00d75a42fc812089a9b0107ed2/artifacts/model', status='READY', status_message=None, tags={}, user_id=None, version=2>

#### Test Model

In [7]:
# retrieve model from mlflow
model = mlflow.pyfunc.load_model(f"models:/ade_llm_classifier/Production")

 - mlflow (current: 2.4.1, required: mlflow==2.4)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.
  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# model_uri = f"models:/{config['registered_model_name']}/Production"
# requirements_path = mlflow.pyfunc.get_model_dependencies(model_uri)
# %pip install -r $requirements_path

In [132]:
text = "Echocardiogram showed normal left ventricular systolic function with mild mitral and tricuspid regurgitation and trace aortic insufficiency."

In [133]:
# assemble statement input
statement = pd.DataFrame({'statement':[
  f'{text}'
]})
# get response 
model.predict(statement)

{'generated_output': '[statement]: Two cases of polymorphic ventricular tachycardia induced by the administration of verapamil against paroxysmal supraventricular tachycardia.\n [nature]:is_ADE'}

#### Seeing MLflow UI

In [22]:
!mlflow ui

^C


open this url http://localhost:5000/ to see the mlflow ui

### Performance

In [1]:
spark = SparkSession.builder.getOrCreate()

In [2]:
spark

In [22]:
import pyspark.sql.functions as F

In [104]:
test_df = spark.read.format('delta').load(f"{config['delta_path']}\\silver\\test_data")

In [105]:
test_df = test_df.select('*').filter("is_ADE==True").limit(20).union(test_df.select('*').filter("is_ADE==False").limit(80))

In [106]:
test_df = test_df.orderBy(F.rand())

In [107]:
test_df.count()

100

In [108]:
test_df.show()

+--------------------+------+
|                text|is_ADE|
+--------------------+------+
|Intravitreal tria...|  true|
| It has been desc...| false|
| Within twelve ho...| false|
|Celiprolol pneumo...|  true|
| Echocardiogram s...| false|
| The Jarisch-Herx...| false|
| A prolonged and ...| false|
|   Gold nephropathy.|  true|
| The IgA was norm...| false|
| Preoperative che...| false|
| During a period ...| false|
|Case 2 demonstrat...|  true|
| This 4-year-6-mo...| false|
| CONCLUSIONS: Pro...| false|
|This report descr...|  true|
| Vitamin B12 (cya...| false|
| A 57-year-old ma...| false|
| Continuous irreg...| false|
|CONCLUSIONS: SD-O...|  true|
| Intrathecal admi...| false|
+--------------------+------+
only showing top 20 rows



In [109]:
test_df = test_df.withColumn('is_ADE', F.when(F.col('is_ADE')=='true', 1).otherwise(0)).selectExpr('text as statement', 'is_ADE')

In [110]:
test_df.show()

+--------------------+------+
|           statement|is_ADE|
+--------------------+------+
|Intravitreal tria...|     1|
| It has been desc...|     0|
| Within twelve ho...|     0|
|Celiprolol pneumo...|     1|
| Echocardiogram s...|     0|
| The Jarisch-Herx...|     0|
| A prolonged and ...|     0|
|   Gold nephropathy.|     1|
| The IgA was norm...|     0|
| Preoperative che...|     0|
| During a period ...|     0|
|Case 2 demonstrat...|     1|
| This 4-year-6-mo...|     0|
| CONCLUSIONS: Pro...|     0|
|This report descr...|     1|
| Vitamin B12 (cya...|     0|
| A 57-year-old ma...|     0|
| Continuous irreg...|     0|
|CONCLUSIONS: SD-O...|     1|
| Intrathecal admi...|     0|
+--------------------+------+
only showing top 20 rows



In [111]:
# converting our pyspark dataframe to pandas dataframe
test_pd_df = test_df.toPandas()

In [113]:
df = test_pd_df[:5]
df

Unnamed: 0,statement,is_ADE
0,Intravitreal triamcinolone may have had an inf...,1
1,It has been described most recently in patien...,0
2,"Within twelve hours, his clinical state was s...",0
3,Celiprolol pneumonitis.,1
4,Echocardiogram showed normal left ventricular...,0


In [114]:
for _ in range(len(df)):
    statement = df.reindex(columns=['statement'])[_ - 1:_]
    output = model.predict(statement)

    if output['generated_output'].find('not_ADE'):
        df['predicted-is_ADE'] = 0
    else:
        df['predicted-is_ADE'] = 1
        
    # putting loop to sleep due to number of time a model can take input in 1 minute
    time.sleep(20)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['predicted-is_ADE'] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['predicted-is_ADE'] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['predicted-is_ADE'] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead



In [115]:
df

Unnamed: 0,statement,is_ADE,predicted-is_ADE
0,Intravitreal triamcinolone may have had an inf...,1,0
1,It has been described most recently in patien...,0,0
2,"Within twelve hours, his clinical state was s...",0,0
3,Celiprolol pneumonitis.,1,0
4,Echocardiogram showed normal left ventricular...,0,0


In [134]:
confusion_matrix = pd.crosstab(df['is_ADE'], df['predicted-is_ADE'], rownames=['Actual'], colnames=['Predicted'])
confusion_matrix

Predicted,0
Actual,Unnamed: 1_level_1
0,3
1,2


Based on the above confusion matrix our model accuracy can be calculated with of (TP+TN)/(P+N) = %

In [99]:
# generating dataframe with output
def generate_output_df(df):
    model = mlflow.pyfunc.load_model(f"models:/ade_llm_classifier/Production")
    for _ in range(len(df)):
        statement = df.reindex(columns=['statement'])[_ - 1:_]
        output = model.predict(statement)

        if output['generated_output'].find('not_ADE'):
            df['predicted-is_ADE'] = 0
        else:
            df['predicted-is_ADE'] = 1
        
        # putting loop to sleep due to number of time a model can take input in 1 minute
        time.sleep(20)
    return df

In [116]:
final_pd_df = generate_output_df(test_pd_df)

In [None]:
final_pd_df