In [1]:
from huggingface_hub import snapshot_download

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Download the  instruct model and tokenizer to a local directory cache
snapshot_location = snapshot_download(
    repo_id="microsoft/Phi-3.5-mini-instruct", local_dir="Phi-3.5"
)

Fetching 19 files: 100%|█████████████████████████████████████████████████| 19/19 [00:00<00:00, 7909.85it/s]


In [3]:
import mlflow
import torch
import transformers


class Phi3(mlflow.pyfunc.PythonModel):
    def load_context(self, context):
        """
        This method initializes the tokenizer and language model
        using the specified model snapshot directory.
        """
        # Initialize tokenizer and language model
        self.tokenizer = transformers.AutoTokenizer.from_pretrained(
            context.artifacts["snapshot"], padding_side="left"
        )

        config = transformers.AutoConfig.from_pretrained(
            context.artifacts["snapshot"], trust_remote_code=True
        )
        # If you are running this in a system that has a sufficiently powerful GPU with available VRAM,
        # uncomment the configuration setting below to leverage triton.
        # Note that triton dramatically improves the inference speed performance

        # config.attn_config["attn_impl"] = "triton"

        self.model = transformers.AutoModelForCausalLM.from_pretrained(
            context.artifacts["snapshot"],
            config=config,
            torch_dtype=torch.bfloat16,
            trust_remote_code=True,
        )

        # NB: If you do not have a CUDA-capable device or have torch installed with CUDA support
        # this setting will not function correctly. Setting device to 'cpu' is valid, but
        # the performance will be very slow.
        # self.model.to(device="cpu")
        # If running on a GPU-compatible environment, uncomment the following line:
        self.model.to(device="mps")

        self.model.eval()

    def _build_prompt(self, instruction):
        """
        This method generates the prompt for the model.
        """
        INSTRUCTION_KEY = "### Instruction:"
        RESPONSE_KEY = "### Response:"
        INTRO_BLURB = (
            "Below is an instruction that describes a task. "
            "Write a response that appropriately completes the request."
        )

        return f"""{INTRO_BLURB}
        {INSTRUCTION_KEY}
        {instruction}
        {RESPONSE_KEY}
        """

    def predict(self, context, model_input, params=None):
        """
        This method generates prediction for the given input.
        """
        prompt = model_input["prompt"][0]

        # Retrieve or use default values for temperature and max_tokens
        temperature = params.get("temperature", 0.1) if params else 0.1
        max_tokens = params.get("max_tokens", 1000) if params else 1000

        # Build the prompt
        prompt = self._build_prompt(prompt)

        # Encode the input and generate prediction
        # NB: Sending the tokenized inputs to the GPU here explicitly will not work if your system does not have CUDA support.
        # If attempting to run this with GPU support, change 'cpu' to 'cuda' for maximum performance
        encoded_input = self.tokenizer.encode(prompt, return_tensors="pt").to("mps")
        output = self.model.generate(
            encoded_input,
            do_sample=True,
            temperature=temperature,
            max_new_tokens=max_tokens,
        )

        # Removing the prompt from the generated text
        prompt_length = len(self.tokenizer.encode(prompt, return_tensors="pt")[0])
        generated_response = self.tokenizer.decode(
            output[0][prompt_length:], skip_special_tokens=True
        )

        return {"candidates": [generated_response]}

In [4]:
import numpy as np
import pandas as pd
from mlflow.models.signature import ModelSignature
from mlflow.types import ColSpec, DataType, ParamSchema, ParamSpec, Schema

# Define input and output schema
input_schema = Schema(
    [
        ColSpec(DataType.string, "prompt"),
    ]
)
output_schema = Schema([ColSpec(DataType.string, "candidates")])

parameters = ParamSchema(
    [
        ParamSpec("temperature", DataType.float, np.float32(0.1), None),
        ParamSpec("max_tokens", DataType.integer, np.int32(1000), None),
    ]
)

signature = ModelSignature(
    inputs=input_schema, outputs=output_schema, params=parameters
)


# Define input example
input_example = pd.DataFrame({"prompt": ["What is MLflow?"]})

In [5]:
mlflow.set_tracking_uri("http://127.0.0.1:5000")

mlflow.set_experiment(experiment_name="phi3.5-instruct-evaluation")

<Experiment: artifact_location='mlflow-artifacts:/207067003305533307', creation_time=1731605153405, experiment_id='207067003305533307', last_update_time=1731605153405, lifecycle_stage='active', name='phi3.5-instruct-evaluation', tags={}>

In [6]:
import accelerate

# Get the current base version of torch that is installed, without specific version modifiers
torch_version = torch.__version__.split("+")[0]

# Start an MLflow run context and log the PHi3 model wrapper along with the param-included signature to
# allow for overriding parameters at inference time
with mlflow.start_run():
    model_info = mlflow.pyfunc.log_model(
        "phi3.5-instruct",
        python_model=Phi3(),
        # NOTE: the artifacts dictionary mapping is critical! This dict is used by the load_context() method in our PHi3() class.
        artifacts={"snapshot": snapshot_location},
        pip_requirements=[
            f"torch=={torch_version}",
            f"transformers=={transformers.__version__}",
            f"accelerate=={accelerate.__version__}",
        ],
        input_example=input_example,
        signature=signature,
    )

Downloading artifacts: 100%|███████████████████████████████████████████████| 58/58 [00:04<00:00, 13.90it/s]
Downloading artifacts: 100%|███████████████████████████████████████████████| 65/65 [00:59<00:00,  1.09it/s]
`flash-attention` package not found, consider installing for better performance: No module named 'flash_attn'.
Current `flash-attention` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`.
Loading checkpoint shards: 100%|█████████████████████████████████████████████| 2/2 [00:00<00:00,  7.75it/s]
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
The `seen_tokens` attribute is deprecated and will be removed in v4.41. Use the `cache_position` model input instead.
`get_max_cache()` is deprecated for all Cache classes. Use `get_max_cache_shape()` instead. Calling `get_ma

In [7]:
model_info.model_uri

'runs:/743bd5d17b3741abbe5b90006d758d4b/phi3.5-instruct'

In [8]:
loaded_model = mlflow.pyfunc.load_model(model_info.model_uri)

Downloading artifacts: 100%|███████████████████████████████████████████████| 65/65 [01:07<00:00,  1.04s/it]
Loading checkpoint shards: 100%|█████████████████████████████████████████████| 2/2 [00:00<00:00,  2.02it/s]


In [9]:
loaded_model.predict(
    pd.DataFrame({"prompt": ["What is machine learning?"]}), params={"temperature": 0.6}
)

{'candidates': ['Machine learning is a subset of artificial intelligence that provides systems the ability to automatically learn and improve from experience without being explicitly programmed. It focuses on the development of algorithms that can parse data, learn from it, and then make informed decisions or predictions. These algorithms use statistical techniques to get data to gradually improve their accuracy. Machine learning is widely used in various industries for applications like image and speech recognition, medical diagnosis, stock market trading, and many more.\n\n']}