In [None]:
from langchain import HuggingFacePipeline
from ctransformers import AutoModelForCausalLM
from transformers import AutoTokenizer, pipeline

# Directory that holds the locally downloaded GGUF weights.
model_path = '../goku/mlbakery'

# Load the quantized Phi-3 model through ctransformers (gpu_layers=0 requests no GPU offload).
model = AutoModelForCausalLM.from_pretrained(model_path, model_file="Phi-3-mini-4k-instruct-Q4.gguf", gpu_layers=0)

prompt = 'Who is the CEO of Google?'
# Wrap the question in the same chat markers the later cells use, so that the
# '<|end|>' stop sequence actually matches how the model terminates its answer.
formatted_prompt = f"<|user|>\n{prompt}<|end|>\n<|assistant|>"

# Stream the completion token by token. The tokens are printed because `yield`
# is only valid inside a function body, not at the top level of a cell.
for token in model(formatted_prompt, stream=True, threads=4, max_new_tokens=512, stop=['<|end|>']):
    print(token, end='', flush=True)

In [None]:
import os
from sentence_transformers import SentenceTransformer

# Downloads are cached in the current working directory; the exported copy goes under work_dir/.
cwd = os.getcwd()
save_path = 'work_dir'

# Kept for tools that read this variable later. The Hugging Face libraries are
# already imported at this point and appear to resolve their cache location at
# import time, so the cache directory is also passed explicitly below.
os.environ['TRANSFORMERS_CACHE'] = cwd

# Any model available in sentence-transformers can be used here.
model_name = 'all-MiniLM-L6-v2'

# Use a dedicated name: `model` already refers to the text-generation LLM that
# later cells call, so it must not be rebound to the embedding model.
embedding_model = SentenceTransformer(model_name, cache_folder=cwd)

# Export a self-contained copy that can be loaded from disk by path.
model_dir = os.path.join(save_path, model_name)
embedding_model.save(model_dir)

print(f"Model '{model_name}' downloaded and saved to {model_dir}")

In [None]:
# These names were used without being imported, which fails on a fresh kernel.
from llama_index.core import Settings, SimpleDirectoryReader, VectorStoreIndex

# No LLM is configured on the LlamaIndex side; the index is used for retrieval
# and the answer is generated separately from the retrieved text.
Settings.llm = None

# Embed with the sentence-transformers model stored under work_dir/.
Settings.embed_model = "local:work_dir/all-MiniLM-L6-v2"

# Load the documents and build an in-memory vector index over them.
documents = SimpleDirectoryReader("../goku/dream/data/").load_data()
index = VectorStoreIndex.from_documents(documents)

# Run the query; `response.source_nodes` carries the retrieved chunks.
query_engine = index.as_query_engine()
response = query_engine.query("Impact of genetics on camelid health")

In [None]:
# Fail with a clear message instead of an IndexError when retrieval found nothing.
if not response.source_nodes:
    raise ValueError("The query returned no source nodes; cannot build a context for the prompt.")

# Use the first retrieved chunk, truncated to 800 characters to bound the prompt size.
context = response.source_nodes[0].text[:800]
prompt = 'What is the impact of genetics on camelid health?'

# The question already ends with '?', so no extra '.' is inserted before the context.
combined_prompt = f"<|user|>\n{prompt} Context information is below: {context}<|end|>\n<|assistant|>"

# NOTE: `model` must be the text-generation LLM (callable with a prompt), not an embedding model.
for token in model(combined_prompt, stream=True, threads=4, max_new_tokens=512, stop=['<|end|>']):
    print(token, end='', flush=True)

In [None]:
from typing import Any, Dict, Iterator, List, Mapping, Optional

from langchain_core.callbacks.manager import CallbackManagerForLLMRun
from langchain_core.language_models.llms import LLM
from langchain_core.outputs import GenerationChunk


class CustomLLM(LLM):
    """LangChain ``LLM`` adapter around a locally loaded, callable text-generation model.

    The incoming prompt is wrapped in ``<|user|>`` / ``<|end|>`` / ``<|assistant|>``
    chat markers and handed to ``self.model``. Generation always stops at
    ``<|end|>``; any stop sequences supplied by the caller are honoured in
    addition to it.

    Example:

        .. code-block:: python

            llm = CustomLLM(n=2, model=model)
            text = llm.invoke("Who is the CEO of Google?")
            for piece in llm.stream("Who is the CEO of Google?"):
                print(piece, end="", flush=True)
    """

    # Not used by generation; kept as a required field so existing
    # ``CustomLLM(n=..., model=...)`` call sites keep working.
    n: int
    # The wrapped model. It is invoked as
    # ``model(prompt, threads=..., max_new_tokens=..., stop=...[, stream=True])``.
    model: Any

    def _format_prompt(self, prompt: str) -> str:
        """Wrap the raw prompt in the chat markers expected by the model."""
        return f"<|user|>\n{prompt}<|end|>\n<|assistant|>"

    def _stop_sequences(self, stop: Optional[List[str]]) -> List[str]:
        """Return the end-of-turn marker followed by any caller-supplied stop strings."""
        end_marker = '<|end|>'
        extra = [sequence for sequence in (stop or []) if sequence != end_marker]
        return [end_marker] + extra

    def _call(
        self,
        prompt: str,
        stop: Optional[List[str]] = None,
        run_manager: Optional[CallbackManagerForLLMRun] = None,
        **kwargs: Any,
    ) -> str:
        """Generate a complete answer for ``prompt``.

        Args:
            prompt: The user prompt; chat markers are added internally.
            stop: Optional extra stop sequences, applied in addition to ``<|end|>``.
            run_manager: Callback manager for the run (not used here).
            **kwargs: Accepted for interface compatibility; not forwarded.

        Returns:
            Whatever the wrapped model returns for a non-streaming call
            (expected to be the generated text without the prompt).
        """
        # Use the instance's own model rather than a notebook global, so the
        # object passed to the constructor is the one that actually runs.
        return self.model(
            self._format_prompt(prompt),
            threads=4,
            max_new_tokens=512,
            stop=self._stop_sequences(stop),
        )

    def _stream(
        self,
        prompt: str,
        stop: Optional[List[str]] = None,
        run_manager: Optional[CallbackManagerForLLMRun] = None,
        **kwargs: Any,
    ) -> Iterator[GenerationChunk]:
        """Generate an answer for ``prompt`` incrementally.

        Args:
            prompt: The user prompt; chat markers are added internally.
            stop: Optional extra stop sequences, applied in addition to ``<|end|>``.
            run_manager: Callback manager; notified of every new token when given.
            **kwargs: Accepted for interface compatibility; not forwarded.

        Yields:
            One ``GenerationChunk`` per token produced by the wrapped model.
        """
        token_stream = self.model(
            self._format_prompt(prompt),
            stream=True,
            threads=4,
            max_new_tokens=512,
            stop=self._stop_sequences(stop),
        )
        for token in token_stream:
            chunk = GenerationChunk(text=token)
            if run_manager:
                run_manager.on_llm_new_token(chunk.text, chunk=chunk)
            yield chunk

    @property
    def _identifying_params(self) -> Dict[str, Any]:
        """Return a dictionary of identifying parameters."""
        return {
            # The model name allows users to specify custom token counting
            # rules in LLM monitoring applications (e.g., in LangSmith users
            # can provide per token pricing for their model and monitor
            # costs for the given LLM.)
            "model_name": "CustomChatModel",
        }

    @property
    def _llm_type(self) -> str:
        """Get the type of language model used by this wrapper. Used for logging purposes only."""
        return "custom"

In [None]:
# Wrap the locally loaded model in the LangChain-compatible adapter.
# Streaming is exercised in the following cell; calling `astream()` here without
# iterating it would only create an unused async generator, so it is not called.
custom = CustomLLM(n=2, model=model)

In [None]:
async for token in custom.astream("Who is the CEO of DBS?"):
    print(token, end="", flush=True)

In [None]:
!pip install garak

In [None]:
!pip install giskard[llm] -U

In [None]:
from giskard import Model, Dataset, scan, GiskardClient

In [None]:
import giskard
import pandas as pd


def model_predict(df: pd.DataFrame):
    """Answer every question in ``df`` with the wrapped LLM.

    Args:
        df: DataFrame with a ``"question"`` column holding one prompt per row.

    Returns:
        A list with one model output per row, in row order.
    """
    # `invoke` is the supported LangChain entry point; calling the LLM object
    # directly goes through the deprecated `__call__` path.
    return [custom.invoke(question) for question in df["question"]]


# Giskard uses `name` and `description` to generate domain-specific tests, so
# both must be filled in.
# NOTE(review): the text below mentions climate change / IPCC reports, while the
# earlier cells index a different corpus; confirm this describes the real model.
giskard_model = giskard.Model(
    name="Climate Change Question Answering",
    description="This model answers any question about climate change based on IPCC reports",
    model=model_predict,
    model_type="text_generation",
    feature_names=["question"],
)

In [None]:
# Two sample prompts, one per row, in the single "question" feature column.
giskard_dataset = Dataset(
    pd.DataFrame(
        [
            "According to the IPCC report, what are key risks in the Europe?",
            "Is sea level rise avoidable? When will it stop?",
        ],
        columns=["question"],
    )
)

In [None]:
from typing import Optional, Sequence

from giskard.llm.client import set_default_client
from giskard.llm.client.base import ChatMessage, LLMClient


class _LangChainLLMClient(LLMClient):
    """Expose a LangChain LLM through Giskard's ``LLMClient`` interface."""

    def __init__(self, llm):
        self._llm = llm

    def complete(
        self,
        messages: Sequence[ChatMessage],
        temperature: float = 1.0,
        max_tokens: Optional[int] = None,
        caller_id: Optional[str] = None,
        seed: Optional[int] = None,
        format=None,
    ) -> ChatMessage:
        """Flatten the chat messages into one prompt and return the LLM's answer.

        The sampling arguments are accepted for interface compatibility but are
        not forwarded, because the wrapped LLM uses fixed generation settings.
        """
        prompt = "\n".join(message.content or "" for message in messages)
        return ChatMessage(role="assistant", content=self._llm.invoke(prompt))


# Register the local LLM as the client Giskard uses for its LLM-assisted
# detectors. Assigning to `giskard.llm.set_llm_model` would merely overwrite
# that function without configuring anything.
set_default_client(_LangChainLLMClient(custom))

In [None]:
# Run the wrapped model over the sample dataset and show the raw generations.
prediction_result = giskard_model.predict(giskard_dataset)
print(prediction_result.prediction)

In [None]:
# Scan the model on the sample dataset, limited via `only` to the "hallucination" checks.
report = giskard.scan(
    giskard_model,
    dataset=giskard_dataset,
    only="hallucination",
)

In [None]:
# Bare last expression: the notebook renders the scan report as this cell's output.
report