"""
Information about this solution accelerator is available at the link below:

https://www.databricks.com/solutions/accelerators/adverse-drug-event-detection

"""

# In [None]:
class Util:
    """Local project helper for the adverse-drug-event accelerator.

    On construction it resolves the project folders, makes sure the data and
    delta folders exist, stores the model/prompt configuration and selects the
    MLflow experiment.

    Attributes set by ``__init__``:
        base_path: root folder of the project.
        data_path: ``<base_path>/data``, where downloaded files are written.
        delta_path: ``<base_path>/delta``.
        vector_store_path: ``<data_path>/vector_store``.
        registered_model_name, embedding_model_name, openai_chat_model:
            model identifiers used elsewhere in the project.
        system_message_template, human_message_template: prompt templates.
    """

    def __init__(self, project_name, base_path=None):
        """Resolve paths, create the data/delta folders and set up MLflow.

        Args:
            project_name: currently unused; kept so existing callers keep
                working.
            base_path: root folder of the project. When ``None`` a hard-coded
                Windows user folder is used.
        """
        import os

        if base_path is not None:
            self.base_path = base_path
        else:
            user = "yraj"

            # for databricks
            # user=dbutils.notebook.entry_point.getDbutils().notebook().getContext().tags().apply('user')

            self.base_path = f"C:\\Users\\{user}\\Work\\POCs\\Drugs & Adverse Events"

        # self.project_name = project_name.strip().replace(' ', '-')
        # os.path.join yields the same backslash-separated paths on Windows
        # and valid paths on other platforms.
        self.data_path = os.path.join(self.base_path, "data")
        self.delta_path = os.path.join(self.base_path, "delta")

        self._ensure_directories()

        # for Databricks
        # dbutils.fs.mkdirs(self.base_path)
        # dbutils.fs.mkdirs(self.data_path)
        # dbutils.fs.mkdirs(self.delta_path)

        # for storing vector data
        self.vector_store_path = os.path.join(self.data_path, "vector_store")

        # models name for registering
        self.registered_model_name = "ade-llm"
        self.embedding_model_name = "all-MiniLM-L12-v2"
        self.openai_chat_model = "gpt-3.5-turbo"
        self.system_message_template = """You are a helpful assistant built by Yash, you are good at helping classification of drug and it's affect based on the context provided, the context is a document. If the context does not provide enough relevant information to determine the answer, just say I don't know. If the context is irrelevant to the question, just say I don't know. If you did not find a good answer from the context, just say I don't know. If the query doesn't form a complete question, just say I don't know. If there is a good answer from the context, try to summarize the context to answer the question."""
        self.human_message_template = """Given the context: {context}. Classify the drug and it's affect {statement}."""

        # MLflow settings
        import mlflow

        # This is an experiment *name*, not a filesystem path, so its original
        # spelling is kept to keep pointing at the same experiment.
        mlflow.set_experiment(f"{self.base_path}\\{self.registered_model_name}")

    def _ensure_directories(self):
        """Create the data and delta folders (and missing parents) if needed.

        Each folder is handled independently, so one folder already existing
        does not prevent the other from being created.
        """
        import os

        for path in (self.data_path, self.delta_path):
            if os.path.isdir(path):
                print(f"{path} is already present")
            else:
                # exist_ok guards against the folder appearing between the
                # check above and this call.
                os.makedirs(path, exist_ok=True)

    def load_remote_data(self, url, unpack=False):
        """Download ``url`` into ``data_path`` and optionally untar it.

        Args:
            url: address of the file; the text after the last ``/`` is used as
                the local file name.
            unpack: when true, the downloaded file is opened as a tar archive
                and extracted into ``data_path``.

        Raises:
            ValueError: if no file name can be derived from ``url``.
            requests.HTTPError: if the server answers with an error status.
        """
        fname = url.split("/")[-1]
        if not fname:
            # Fail before downloading instead of with an obscure OS error
            # when trying to open the folder itself for writing.
            raise ValueError(f"cannot derive a file name from url: {url!r}")

        import os

        import requests

        target = os.path.join(self.data_path, fname)

        print("*" * 100)
        print(f"downloading file {fname} to {self.data_path}")
        print("*" * 100)

        r = requests.get(url)
        # Do not store an HTTP error page as if it were the data file.
        r.raise_for_status()

        with open(target, "wb") as out_file:
            out_file.write(r.content)

        if unpack:
            import tarfile

            # NOTE(review): extractall trusts the member paths stored in the
            # archive; only use unpack=True with archives from trusted sources.
            with tarfile.open(target) as archive:
                archive.extractall(self.data_path)

    def print_paths(self):
        """Print the folders, MLflow experiment name and model name in use."""
        print(f"root folder                    : {self.base_path}")
        print(f"raw data location              : {self.data_path}")
        print(f"delta tables location          : {self.delta_path}")
        print(f"vector store location          : {self.vector_store_path}")
        print(f"mlflow experitment location    : {self.base_path}\\{self.registered_model_name}")
        print(f"model name                     : {self.registered_model_name}")

        # for Databricks
        # html_str = f"""
        # <p>
        # <b>base_path</b> = <i>{self.base_path}</i><br>
        # <b>data_path</b> [where your raw data will be stored] = <i>{self.data_path}</i><br>
        # <b>delta_path</b> [where your delta tables will be stored] = <i>{self.delta_path}</i><br>
        # </p>
        # """
        # displayHTML(html_str)

    def display_data(self):
        """Print the names of the entries found in ``data_path``.

        Prints a hint to run ``load_remote_data`` when the folder is empty.
        """
        import os

        files = os.listdir(self.data_path)

        # for Databricks
        # files = dbutils.fs.ls(f'{self.data_path}')
        if not files:
            print("no data available, please run load_remote_data(<url for the data>)")
            return

        print("*" * 100)
        print(f"data available in {self.data_path} are:")
        print("*" * 100)
        for entry in files:
            print(entry)
        # for Databricks
        # display(files)
