## Recipe-1: Loading Open Source Model

In [None]:
# --- Recipe: Loading an Open Source Model from Hugging Face Hub ---
# Goal: Load a pre-trained open-source model and its tokenizer.
# Library: Hugging Face Transformers
# Note: Ensure you have run `pip install transformers torch accelerate` in your environment.
#       Some models (like Meta's Llama) are "gated" and require authentication.

from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

In [None]:
# --- Configuration ---
# Hugging Face Hub ID of the model to load. Some alternatives:
#   'mistralai/Mistral-7B-Instruct-v0.1'  - good performance, Apache 2.0 license
#   'google/gemma-2b-it'                  - Google's Gemma instruct-tuned model
#   'distilgpt2'                          - very small, good for quick tests
# This notebook uses Gemma 2B instruct as its running example.
model_id = "google/gemma-2b-it"

In [None]:
import os

from huggingface_hub import HfApi

# Sanity-check Hugging Face authentication.
# The token is read from the HF_TOKEN environment variable; when that is unset,
# `token=None` lets huggingface_hub fall back to the token saved by
# `huggingface-cli login`. Never paste a token into the notebook itself.
api = HfApi()
whoami = api.whoami(token=os.environ.get("HF_TOKEN"))

# Print only the username: the full record contains the account's email address
# and other personal details that would be saved into the notebook output.
print(f"Authenticated to Hugging Face as: {whoami['name']}")


{'type': 'user', 'id': '<redacted>', 'name': '<redacted>', 'fullname': '<redacted>', 'email': '<redacted>', 'emailVerified': True, 'canPay': False, 'periodEnd': None, 'isPro': False, 'avatarUrl': '<redacted>', 'orgs': [], 'auth': {'type': 'access_token', 'accessToken': {'displayName': 'hugging_face_token_read', 'role': 'read', 'createdAt': '2025-04-22T09:03:46.223Z'}}}


In [None]:
import os

# --- Authentication (Optional - Needed for Gated Models like Llama or Gemma) ---
# If using a gated model, you need to:
# 1. Accept the license terms on the model's Hugging Face page.
# 2. Create an access token with *read* permission.
# 3. Make the token available WITHOUT writing it into this notebook, either by
#    - logging in once with the CLI: `huggingface-cli login` (token is cached locally), or
#    - exporting it before starting Jupyter: `export HF_TOKEN=hf_...`
# A token committed in a notebook is compromised and must be revoked.
use_auth_token = os.environ.get("HF_TOKEN")  # None -> fall back to the CLI-cached login, if any

print(f"Loading tokenizer for model: {model_id}")
try:
    # 1. Load the Tokenizer
    #    AutoTokenizer automatically selects the correct tokenizer class based on the model ID.
    #    `token=` is the current keyword (the older `use_auth_token=` is deprecated) and
    #    matches the keyword used when the model itself is loaded.
    tokenizer = AutoTokenizer.from_pretrained(model_id, token=use_auth_token)
    print("Tokenizer loaded successfully.")
except Exception as e:
    print(f"Error loading tokenizer: {e}")
    print("Check model ID, internet connection, and authentication if required.")
    # Re-raise instead of calling exit(): in a notebook exit() shuts the kernel down,
    # whereas an exception stops "Run All" here and keeps the traceback visible.
    raise


Loading tokenizer for model: google/gemma-2b-it




Tokenizer loaded successfully.


In [None]:
print(f"\nLoading model: {model_id}")
print("This might take a while depending on model size and download speed...")
try:
    # 2. Load the Model
    #    AutoModelForCausalLM is suitable for text generation models.
    #    `device_map='auto'` distributes the model across available GPUs (requires accelerate).
    #    `torch_dtype=torch.bfloat16` halves memory use compared with float32 on GPUs that support it;
    #    switch to torch.float16 on GPUs that do not.
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        token=use_auth_token,  # token resolved in the authentication cell (may be None)
        device_map="auto",  # Automatically use available GPU(s) or CPU
        torch_dtype=torch.bfloat16  # Use bfloat16 for efficiency if available
    )
    print(f"Model loaded successfully onto device: {model.device}")  # Will show cuda:0 if GPU is used
except Exception as e:
    print(f"Error loading model: {e}")
    print("Check model ID, internet connection, authentication, and available GPU memory.")
    # Re-raise instead of calling exit(): exit() would shut the notebook kernel down
    # and discard the traceback; raising halts execution here with full context.
    raise


Loading model: google/gemma-2b-it
This might take a while depending on model size and download speed...


config.json:   0%|          | 0.00/627 [00:00<?, ?B/s]

2025-04-22 09:07:01.633017: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1745312821.836290     220 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1745312821.898286     220 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


model.safetensors.index.json:   0%|          | 0.00/13.5k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

Model loaded successfully onto device: cuda:0


In [None]:
# --- Verification ---
# At this point 'tokenizer' and 'model' are available for the remaining recipes.
print("\nModel and Tokenizer are ready!")

# Round-trip a short string through the tokenizer: text -> token IDs -> text.
sample_text = "Hello, LLM Chef!"
tokens = tokenizer.encode(sample_text)
print(f"\nSample text: '{sample_text}'", f"Tokens: {tokens}", sep="\n")

decoded_text = tokenizer.decode(tokens)
print(f"Decoded tokens: '{decoded_text}'")


Model and Tokenizer are ready!

Sample text: 'Hello, LLM Chef!'
Tokens: [2, 4521, 235269, 629, 18622, 36614, 235341]
Decoded tokens: '<bos>Hello, LLM Chef!'


## Recipe-2: Calling a Proprietary Model API

In [None]:
# The OpenAI examples below read the API key from the OPENAI_API_KEY environment variable.
# Preferred: set it in the shell before starting Jupyter:
#   export OPENAI_API_KEY='sk-YOUR_ACTUAL_API_KEY'
# If it is not set, prompt for it here. getpass does not echo the key, and the key is
# never written into the notebook source or its saved output.
import os
from getpass import getpass

if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("Enter your OpenAI API key: ")

In [None]:
# --- Recipe: Calling a Proprietary Model API ---
# Goal: Demonstrate making an API call to a closed-source LLM endpoint.
# Library: requests (generic HTTP client), openai (provider-specific library)
# Note: Replace model names and endpoints with the actual details from your provider.
#       NEVER hardcode API keys directly in code. Use environment variables or secrets management.

import requests
import os
import json

# --- Using a specific library like OpenAI (Recommended if available) ---
# Ensure you have the library installed: pip install openai
# Requires setting the API key as an environment variable:
# export OPENAI_API_KEY='sk-YOUR_ACTUAL_API_KEY'

# Example using the OpenAI library structure (adapt for other providers like Anthropic, Google)
print("--- Example using OpenAI library ---")
try:
    from openai import OpenAI

    # Check for the key BEFORE building the client: OpenAI() raises when no key is
    # configured, so a check on `client.api_key` afterwards could never report it.
    if not os.environ.get("OPENAI_API_KEY"):
        print("Error: OPENAI_API_KEY environment variable not set.")
    else:
        # The client automatically picks up the OPENAI_API_KEY environment variable
        client = OpenAI()  # Add base_url if using a non-OpenAI compatible endpoint
        print("OpenAI client initialized.")
        prompt_text = "Explain the concept of Parameter-Efficient Fine-Tuning (PEFT) in one paragraph."
        model_to_use = "gpt-3.5-turbo"  # Or other available model like "gpt-4"

        print(f"\nSending prompt to model: {model_to_use}")
        completion = client.chat.completions.create(
            model=model_to_use,
            messages=[
                {"role": "system", "content": "You are a helpful AI assistant explaining complex ML concepts simply."},
                {"role": "user", "content": prompt_text}
            ],
            max_tokens=150,  # Limit the length of the response
            temperature=0.7  # Controls randomness (0=deterministic, >1=more random)
        )

        # Extract and print the response
        response_text = completion.choices[0].message.content
        print("\nAPI Response:")
        print(response_text)
        print("\nUsage Info:")
        print(completion.usage)  # Shows token usage

except ImportError:
    print("OpenAI library not found. Skipping OpenAI example.")
except Exception as e:
    print(f"Error during OpenAI API call: {e}")

--- Example using OpenAI library ---
OpenAI client initialized.

Sending prompt to model: gpt-3.5-turbo

API Response:
Parameter-Efficient Fine-Tuning (PEFT) is a technique in machine learning where a pre-trained model is fine-tuned on a smaller dataset with fewer parameters to achieve high performance. Instead of training the entire model from scratch, PEFT focuses on updating only a subset of the model's parameters that are essential for the new task, thereby reducing the computational resources and time required for training. This approach allows for faster adaptation of the pre-trained model to new tasks while maintaining or even improving its performance, making it a more efficient and effective way to leverage pre-trained models for various applications.

Usage Info:
CompletionUsage(completion_tokens=119, prompt_tokens=43, total_tokens=162, completion_tokens_details=CompletionTokensDetails(accepted_prediction_tokens=0, audio_tokens=0, reasoning_tokens=0, rejected_prediction_token

In [None]:
# API CALL
# --- Generic Example using 'requests' library ---
# Useful if the provider doesn't have a dedicated Python library or for simple calls.
# Relies on `os`, `json` and `requests` imported in the previous cell.
print("\n--- Example using generic 'requests' library ---")

# --- Configuration (Replace with actual values) ---
# **NEVER COMMIT ACTUAL KEYS TO VERSION CONTROL**
# The key is read from the environment; it is never written in the notebook.
api_key = os.getenv("OPENAI_API_KEY")
api_endpoint_url = "https://api.openai.com/v1/chat/completions"  # Replace with actual endpoint
request_timeout_seconds = 30  # Without a timeout, requests can wait forever on a stalled connection

if api_key == "YOUR_VENDOR_API_KEY_PLACEHOLDER":
    print("Warning: Using placeholder API key. Set a real key for actual use.")

# --- Prepare Request ---
headers = {
    "Authorization": f"Bearer {api_key}",
    "Content-Type": "application/json",
}

messages = [
    {"role": "user", "content": "Write a short tagline for an LLM cookbook."}
]

data = {
    "model": "gpt-3.5-turbo",  # Replace with actual model name
    "messages": messages,
    "max_tokens": 20,
    "temperature": 0.8
}

# Bind `response` up front so the error handler below can safely inspect it even when
# requests.post() itself fails (connection error, timeout) and never returns a response.
response = None

if not api_key:
    # Sending "Bearer None" would only produce a confusing 401 from the server.
    print("Error: OPENAI_API_KEY environment variable not set. Skipping request.")
else:
    print(f"\nSending request to generic endpoint: {api_endpoint_url}")
    try:
        # --- Make API Call ---
        # `json=` serializes the payload for us.
        response = requests.post(
            api_endpoint_url,
            headers=headers,
            json=data,
            timeout=request_timeout_seconds,
        )
        response.raise_for_status()  # Raise an exception for bad status codes (4xx or 5xx)

        # --- Process Response ---
        response_data = response.json()
        print("\nAPI Response (JSON):")
        print(json.dumps(response_data, indent=2))

        # Extract the generated text (structure depends on API provider;
        # this matches the OpenAI chat-completions response format).
        if "choices" in response_data and len(response_data["choices"]) > 0:
            # `content` can be present but null, so fall back with `or`.
            generated_text = response_data["choices"][0]["message"].get("content") or "N/A"
            print(f"\nGenerated Text: {generated_text.strip()}")
        else:
            print("\nCould not extract generated text from response.")

    except requests.exceptions.RequestException as e:
        print(f"Error during generic API call: {e}")
        if response is not None:
            print(f"Response status code: {response.status_code}")
            print(f"Response text: {response.text}")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")

# --- End of Recipe ---



--- Example using generic 'requests' library ---

Sending request to generic endpoint: https://api.openai.com/v1/chat/completions

API Response (JSON):
{
  "id": "chatcmpl-BP5wIeziwMdmgP4hAavUAabBAB9Dp",
  "object": "chat.completion",
  "created": 1745320730,
  "model": "gpt-3.5-turbo-0125",
  "choices": [
    {
      "index": 0,
      "message": {
        "role": "assistant",
        "content": "\"Elevate your home cooking with delicious and innovative recipes from our LLM cookbook!\"",
        "refusal": null,
        "annotations": []
      },
      "logprobs": null,
      "finish_reason": "stop"
    }
  ],
  "usage": {
    "prompt_tokens": 18,
    "completion_tokens": 18,
    "total_tokens": 36,
    "prompt_tokens_details": {
      "cached_tokens": 0,
      "audio_tokens": 0
    },
    "completion_tokens_details": {
      "reasoning_tokens": 0,
      "audio_tokens": 0,
      "accepted_prediction_tokens": 0,
      "rejected_prediction_tokens": 0
    }
  },
  "service_tier": "defaul

## Recipe-3: Basic Inference Tasks (Generation & Zero-Shot Classification)

In [None]:
# --- Recipe: Basic Inference Tasks (Generation & Zero-Shot Classification) ---
# Goal: Run common inference tasks through Hugging Face pipelines.
# Library: Hugging Face Transformers
# Prerequisite: Either a model/tokenizer loaded earlier (see Recipe-1),
#               or simply a model ID handed to the pipeline, as done below.

from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM, AutoModelForSequenceClassification
import torch

# --- Option 1: Use a model ID directly in the pipeline (Easier for standard tasks) ---
print("--- Option 1: Using Pipelines with Model ID ---")

# Text Generation
print("\nLoading text-generation pipeline...")
try:
    # distilgpt2 is tiny, which keeps this demo fast.
    generator = pipeline(
        'text-generation',
        model='distilgpt2',
        device=0 if torch.cuda.is_available() else -1,  # first GPU when present, otherwise CPU
    )
    prompt = "The secret ingredient in the best AI recipes is"
    print(f"Generating text for prompt: '{prompt}'")
    # max_new_tokens counts only tokens produced *after* the prompt.
    outputs = generator(
        prompt,
        max_new_tokens=30,
        num_return_sequences=1,
    )
    print("Generated Text:")
    print(outputs[0]['generated_text'])
except Exception as e:
    print(f"Error during text generation: {e}")

--- Option 1: Using Pipelines with Model ID ---

Loading text-generation pipeline...


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generating text for prompt: 'The secret ingredient in the best AI recipes is'
Generated Text:
The secret ingredient in the best AI recipes is that you can see exactly how many times you read the words, or by the amount of time you read them, or by the amount of time you


In [None]:
# Zero-Shot Classification
print("\nLoading zero-shot-classification pipeline...")
try:
    # bart-large-mnli is trained for Natural Language Inference (NLI), which is what
    # the zero-shot pipeline relies on to score arbitrary candidate labels.
    classifier = pipeline(
        "zero-shot-classification",
        model="facebook/bart-large-mnli",
        device=0 if torch.cuda.is_available() else -1,
    )
    sequence_to_classify = "This cookbook focuses on practical LLM fine-tuning."
    candidate_labels = ["machine learning", "cooking", "finance", "sports"]
    print(f"\nClassifying sequence: '{sequence_to_classify}'")
    print(f"With candidate labels: {candidate_labels}")
    results = classifier(sequence_to_classify, candidate_labels)
    print("\nClassification Results:")
    # Pair every label with its score, then order the pairs from highest to lowest score.
    sorted_results = list(zip(results['labels'], results['scores']))
    sorted_results.sort(key=lambda pair: pair[1], reverse=True)
    for label, score in sorted_results:
        print(f"- {label}: {score:.4f}")

except Exception as e:
    print(f"Error during zero-shot classification: {e}")



Loading zero-shot-classification pipeline...


config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Device set to use cuda:0



Classifying sequence: 'This cookbook focuses on practical LLM fine-tuning.'
With candidate labels: ['machine learning', 'cooking', 'finance', 'sports']

Classification Results:
- cooking: 0.9307
- machine learning: 0.0529
- sports: 0.0090
- finance: 0.0074


In [None]:
# --- Option 2: Use pre-loaded model and tokenizer (More control, useful if model already loaded) ---
print("\n--- Option 2: Using Pipelines with Pre-loaded Model/Tokenizer ---")
# If Recipe-1 was run in this kernel, its 'model' and 'tokenizer' objects are reused,
# which avoids holding a second copy of the weights in GPU memory.
# Otherwise the model is loaded here so this cell also works on its own.
# NOTE(review): reuse assumes the existing 'model' is a causal LM suitable for text generation.

model_id_loaded = "google/gemma-2b-it"  # Used only when nothing is pre-loaded
preloaded_model = globals().get("model")
preloaded_tokenizer = globals().get("tokenizer")

try:
    if preloaded_model is None or preloaded_tokenizer is None:
        print(f"\nAttempting to load {model_id_loaded} for Option 2 demo...")
        preloaded_tokenizer = AutoTokenizer.from_pretrained(model_id_loaded)
        # Load specifically for Causal LM if planning text generation
        preloaded_model = AutoModelForCausalLM.from_pretrained(
            model_id_loaded,
            device_map="auto",
            torch_dtype=torch.bfloat16  # Use bfloat16 for efficiency if available
        )
    else:
        print("\nReusing the 'model' and 'tokenizer' objects already loaded in this kernel.")
    print("Pre-loaded model and tokenizer ready.")

    # Create pipeline using the loaded components
    generator_loaded = pipeline('text-generation', model=preloaded_model, tokenizer=preloaded_tokenizer)  # device is inferred from model.device

    prompt_loaded = "To build a great AI application, you need"
    print(f"\nGenerating text using pre-loaded model: '{prompt_loaded}'")
    outputs_loaded = generator_loaded(prompt_loaded, max_new_tokens=30, num_return_sequences=1, do_sample=True, temperature=0.7)
    print("Generated Text (Pre-loaded):")
    print(outputs_loaded[0]['generated_text'])

    # Note: For zero-shot classification with a pre-loaded model, ensure the loaded model
    # is suitable for classification (e.g., AutoModelForSequenceClassification) and the task.
    # Using a CausalLM like Gemma directly in a zero-shot pipeline might not yield optimal results
    # without specific fine-tuning or prompt engineering for that task.
    print("\n(Skipping zero-shot with pre-loaded CausalLM model as it's not ideal for the task without adaptation)")

except Exception as e:
    print(f"\nError during Option 2 setup or inference: {e}")



--- Option 2: Using Pipelines with Pre-loaded Model/Tokenizer ---

Attempting to load google/gemma-2b-it for Option 2 demo...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0


Pre-loaded model and tokenizer ready.

Generating text using pre-loaded model: 'To build a great AI application, you need'
Generated Text (Pre-loaded):
To build a great AI application, you need to address several critical topics:

**1. Data Quality and Privacy:**

* **Data Sources:** Identify and assess the data sources used to train

(Skipping zero-shot with pre-loaded CausalLM model as it's not ideal for the task without adaptation)


## Recipe-4: Comparing Model Outputs

In [None]:
# --- Recipe: Comparing Outputs of Different Models ---
# Goal: Run the same prompt through two different models and compare their outputs.
# Library: Hugging Face Transformers

from transformers import pipeline, set_seed
import torch


def generate_or_report(generator, label, model_id, prompt_text, fallback):
    """Generate one sampled completion, or return a message explaining why none was produced.

    Args:
        generator: A text-generation pipeline, or None when loading it failed.
        label: Human-readable model label used in messages (e.g. "Model 1").
        model_id: Hub ID of the model; used only in the progress message.
        prompt_text: Prompt passed to the pipeline.
        fallback: Text returned when `generator` is None.

    Returns:
        The generated text, `fallback` when there is no pipeline, or a description
        of the error when generation raised.
    """
    if generator is None:
        return fallback
    print(f"\nGenerating with {label} ({model_id})...")
    try:
        results = generator(prompt_text, max_new_tokens=50, num_return_sequences=1, do_sample=True, temperature=0.7)
        return results[0]['generated_text']
    except Exception as e:
        return f"Error during generation with {label}: {e}"


# --- Configuration ---
# Choose two different model IDs
model_id_1 = "distilgpt2"  # Smaller, faster model
model_id_2 = "google/gemma-2b-it"  # Larger, potentially more capable model (requires more resources)

prompt = "The future of artificial intelligence is"

# Set seed for reproducibility of generation (both models sample below)
set_seed(123)

# Use device=0 for GPU if available, otherwise -1 for CPU.
# Defined once, outside the try blocks, because both pipelines need it.
device_index = 0 if torch.cuda.is_available() else -1

# --- Load Pipelines for Both Models ---
print(f"Loading pipeline for Model 1: {model_id_1}")
try:
    generator1 = pipeline('text-generation', model=model_id_1, device=device_index)
    print("Pipeline 1 loaded.")
except Exception as e:
    print(f"Error loading pipeline 1: {e}")
    generator1 = None  # Ensure variable exists but is None

print(f"\nLoading pipeline for Model 2: {model_id_2}")
try:
    # Gemma might require bfloat16 for efficiency on some GPUs
    dtype = torch.bfloat16 if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else torch.float32
    generator2 = pipeline('text-generation', model=model_id_2, device=device_index, torch_dtype=dtype)
    print("Pipeline 2 loaded.")
except Exception as e:
    print(f"Error loading pipeline 2: {e}")
    print("This could be due to insufficient GPU memory for Gemma-2B.")
    generator2 = None  # Ensure variable exists but is None

# --- Generate and Compare Outputs ---
print(f"\n--- Comparing Outputs for Prompt: '{prompt}' ---")

output1 = generate_or_report(generator1, "Model 1", model_id_1, prompt, "Pipeline 1 failed to load.")
print(f"\nOutput from {model_id_1}:\n{output1}")

# Gemma instruct models often need specific prompt formatting if not using pipeline defaults.
# For basic comparison, the raw prompt is used here.
output2 = generate_or_report(generator2, "Model 2", model_id_2, prompt, "Pipeline 2 failed to load or generation failed.")
print(f"\nOutput from {model_id_2}:\n{output2}")

print("\n--- Comparison Complete ---")
# Observe differences in style, coherence, length, factual accuracy (if applicable), etc.


Loading pipeline for Model 1: distilgpt2


Device set to use cuda:0


Pipeline 1 loaded.

Loading pipeline for Model 2: google/gemma-2b-it


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Pipeline 2 loaded.

--- Comparing Outputs for Prompt: 'The future of artificial intelligence is' ---

Generating with Model 1 (distilgpt2)...

Output from distilgpt2:
The future of artificial intelligence is as important as ever, and that's a great question. It's a very difficult question to answer. So, I think you might be able to answer it, but there's a lot of work to be done, and it has to be done

Generating with Model 2 (google/gemma-2b-it)...

Output from google/gemma-2b-it:
The future of artificial intelligence is rapidly evolving, and the pace of change is accelerating. As AI becomes more sophisticated, it will have a profound impact on society, both positive and negative.

Here are some of the key trends that are shaping the future of AI:

* **

--- Comparison Complete ---
