In [1]:
# all imports

import os
import torch
import gc
from dotenv import load_dotenv
from openai import OpenAI
from IPython.display import Markdown, display, update_display
from huggingface_hub import login
from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer, BitsAndBytesConfig

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# load environment variables from .env file
load_dotenv(override=True)
hf_token = os.getenv("HF_TOKEN")

In [3]:
# login to Hugging Face Hub using the token from environment variables
login(hf_token, add_to_git_credential=True)

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [4]:
# # initialize the ollama client

# OLLAMA_BASE_URL = "http://localhost:11434/v1"
# client = OpenAI(base_url=OLLAMA_BASE_URL, api_key="ollama")

In [5]:
# # test if the client is responsive

# messages = [{"role": "user", "content": "Hi there!"}]

# model = "llama3.1"

# response = client.chat.completions.create(
#     model=model,
#     messages=messages
# )

# print(response.choices[0].message.content)

In [6]:
# # function to stream response from the model, update the display in real-time, and return the final response as a string

# def stream_llm_response(model, messages):
#     response = client.chat.completions.create(
#         model=model,
#         messages=messages,
#         stream=True
#     )

#     full_response = ""
#     display_id = None

#     for chunk in response:
#         content = chunk.choices[0].delta.content or ""
#         full_response += content

#         if display_id is None:
#             display_id = display(Markdown(full_response), display_id=True).display_id
#         else:
#             update_display(Markdown(full_response), display_id=display_id)

#     return full_response


In [7]:
# # test the streaming function with a simple prompt

# messages = [{"role": "user", "content": "Tell a funny joke for a Machine Learning engineer."}]
# model = "llama3.1"
# response = stream_llm_response(model=model, messages=messages)
# print("\n\n--------------------------------------------------------------------------------\n\n")
# print(response)

In [8]:
# # define the system prompt for the extraction task

# system_prompt = """
# You are an AI assistant that extracts **accurate, structured insights** from markdown articles.

# ### Task
# From the given markdown article, produce:

# ## Key Takeaways
# - Core conclusions, claims, or findings
# - Important facts, numbers, decisions, or timelines

# ## Summary
# - A concise but complete overview
# - Capture the main topic, purpose, and key points

# ### Rules
# - Preserve meaning, context, and important qualifiers
# - Do not add opinions or external information
# - Do not include meta commentary
# - Use clear markdown headings and bullet points
# - Be concise but information-dense
# """

In [9]:
# # define the system prompt for the extraction task

# system_prompt = """
# You are an expert at extracting accurate, structured summaries and key takeaways from markdown articles.
# You are given a web scraped markdown article. 
# Your task is to extract the key takeaways and a concise summary from the article.
# The output should be in markdown format.
# Reply only with the markdown content, do not include any explanations or commentary.
# """

In [10]:
# # define the user prompt for the extraction task (for scraped-content/main_content.md)

# with open("scraped-content/main_content.md", "r", encoding="utf-8") as f:
#     content = f.read()

# user_prompt = f"""
# Here is a web scraped markdown article. 
# Extract the key takeaways and a concise summary according to the system prompt instructions.

# Article:
# {content}

# Return the output in markdown format. 
# Do not include any explanations or commentary, reply only with the markdown content.
# Do not explain your reasoning. 
# Start output immediately.
# """

In [11]:
# print(user_prompt)

In [12]:
# # call the model to perform the extraction task

# messages = [
#     {"role": "system", "content": system_prompt},
#     {"role": "user", "content": user_prompt}
# ]

# model = "llama3.1"

# response = stream_llm_response(model=model, messages=messages)

In [13]:
# generate a quantization config for loading the model in 4-bit precision using bitsandbytes

quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4"
)

In [None]:
# define a function to stream response from the model, update the display in real-time, and return the final response as a string

# simple in-memory cache to avoid reloading model every call
_cache = {}

def stream_response(model_id, messages):
    # load tokenizer + model once per model_id
    if model_id not in _cache:
        tokenizer = AutoTokenizer.from_pretrained(model_id)
        tokenizer.pad_token = tokenizer.eos_token

        model = AutoModelForCausalLM.from_pretrained(model_id)
        model.eval()  # inference mode

        _cache[model_id] = (tokenizer, model)

    tokenizer, model = _cache[model_id]

    # convert chat messages to model input tokens
    inputs = tokenizer.apply_chat_template(
        messages,
        return_tensors="pt",
        add_generation_prompt=True
    )
    input_ids = inputs["input_ids"]

    # stream tokens as they are generated
    streamer = TextStreamer(
        tokenizer,
        skip_prompt=True,
        skip_special_tokens=True
    )

    # generate response
    outputs = model.generate(
        input_ids=input_ids,
        streamer=streamer,
        max_new_tokens=1000
    )

    # remove prompt tokens and keep only generated output
    generated_tokens = outputs[0][input_ids.shape[-1]:]

    # return final assistant output as string
    return tokenizer.decode(
        generated_tokens,
        skip_special_tokens=True
    )

In [15]:
# define the model_id and messages for testing the streaming function

model_id = "meta-llama/Llama-3.2-1B-Instruct"

messages = [{"role": "user", "content": "Tell a joke for a room of Data Scientists"}]

In [24]:
# call the streaming function and print the final response

response = stream_response(model_id=model_id, messages=messages)
print("\n\n--------------------------------------------------------------------------------\n\n")
print(response)

Loading weights: 100%|██████████| 146/146 [00:00<00:00, 433.99it/s, Materializing param=model.norm.weight]                              
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Here's a joke tailored for a room of data scientists:

Why did the data set go to therapy?

Because it had a lot of 'data' to process, and it was struggling to 'handle' its emotions. But in the end, it just needed to 'analyze' its feelings and work through its issues.


--------------------------------------------------------------------------------


Here's a joke tailored for a room of data scientists:

Why did the data set go to therapy?

Because it had a lot of 'data' to process, and it was struggling to 'handle' its emotions. But in the end, it just needed to 'analyze' its feelings and work through its issues.
