## LangChain Chaining Techniques

### Introduction
This notebook demonstrates key chaining functionalities in LangChain:
- SimpleSequentialChain
- SequentialChain
- LLMRouterChain
- TransformChain

Each chaining method is designed for different levels of complexity and control. Use simple chains for straightforward tasks, sequential chains for workflows, router chains for conditional branching, and transform chains when integrating custom logic.

In [46]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline
from langchain_huggingface.llms import HuggingFacePipeline
from langchain_huggingface import ChatHuggingFace
from langchain_classic.chains import SimpleSequentialChain, SequentialChain, TransformChain, LLMChain, LLMMathChain
from langchain_classic.chains.router import LLMRouterChain, MultiPromptChain
from langchain_core.prompts import PromptTemplate
from langchain_core.prompts import ChatPromptTemplate

[No output generated]

In [47]:
# === GPU & Model Status Check ===
import gc

print("=== Initial Resource Status ===")

# GPU Status - Use pynvml for SYSTEM-WIDE memory (not just this process)
try:
    import pynvml
    pynvml.nvmlInit()
    device_count = pynvml.nvmlDeviceGetCount()
    print(f"\nGPU Count: {device_count}")

    for i in range(device_count):
        handle = pynvml.nvmlDeviceGetHandleByIndex(i)
        name = pynvml.nvmlDeviceGetName(handle)
        info = pynvml.nvmlDeviceGetMemoryInfo(handle)

        total_gb = info.total / 1024**3
        used_gb = info.used / 1024**3
        free_gb = info.free / 1024**3
        usage_pct = (info.used / info.total) * 100

        print(f"\nGPU {i}: {name}")
        print(f"  Total:  {total_gb:.2f} GB")
        print(f"  Used:   {used_gb:.2f} GB ({usage_pct:.1f}%)")
        print(f"  Free:   {free_gb:.2f} GB")

        # Warning if low on memory (7B model needs ~5GB with 4-bit quantization)
        if free_gb < 6.0:
            print(f"  ⚠️  WARNING: Low GPU memory! Model loading may fail.")
            print(f"      Consider running cleanup cells in other notebooks first.")

    pynvml.nvmlShutdown()
except ImportError:
    print("\n⚠️  pynvml not installed - falling back to PyTorch (per-process only)")
    import torch
    if torch.cuda.is_available():
        print(f"GPU Available: {torch.cuda.get_device_name(0)}")
        for i in range(torch.cuda.device_count()):
            total = torch.cuda.get_device_properties(i).total_memory / 1024**3
            allocated = torch.cuda.memory_allocated(i) / 1024**3
            print(f"  GPU {i}: {allocated:.2f} / {total:.2f} GB (THIS PROCESS ONLY)")
    else:
        print("No GPU available - using CPU")
except Exception as e:
    print(f"\nGPU status check failed: {e}")

# No Ollama in this notebook - HuggingFace only

print("\n" + "="*40)

=== Initial Resource Status ===

GPU Count: 1

GPU 0: b'NVIDIA GeForce RTX 4080 SUPER'
  Total:  15.99 GB
  Used:   2.49 GB (15.5%)
  Free:   13.51 GB



In [48]:
# Download model from HuggingFace (same base model as D1_01)
HF_LLM_MODEL = "NousResearch/Nous-Hermes-2-Mistral-7B-DPO"

[No output generated]

In [49]:
# 4-bit quantization config for efficient loading
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(HF_LLM_MODEL)

# Load model with 4-bit quantization
model = AutoModelForCausalLM.from_pretrained(
    HF_LLM_MODEL,
    device_map="auto",
    quantization_config=quantization_config,
)

# Verify model config
print(model.config)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

MistralConfig {
  "architectures": [
    "MistralForCausalLM"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "dtype": "float16",
  "eos_token_id": 32000,
  "head_dim": null,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "max_position_embeddings": 32768,
  "model_type": "mistral",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 8,
  "quantization_config": {
    "_load_in_4bit": true,
    "_load_in_8bit": false,
    "bnb_4bit_compute_dtype": "bfloat16",
    "bnb_4bit_quant_storage": "uint8",
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": false,
    "llm_int8_enable_fp32_cpu_offload": false,
    "llm_int8_has_fp16_weight": false,
    "llm_int8_skip_modules": null,
    "llm_int8_threshold": 6.0,
    "load_in_4bit": true,
    "load_in_8bit": false,
    "quant_method": "bitsandbytes"
  },
  "rms_norm_eps": 1e-05,
  "rope_theta": 10000.0,
  "sliding_window": 4096,
  "tie

In [4]:
# Pipeline setup
text_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=256,
    do_sample=True,
    temperature=0.7,
    return_full_text=False,
    eos_token_id=tokenizer.eos_token_id,
    skip_special_tokens=True,
)
llm = HuggingFacePipeline(pipeline=text_pipeline)

Device set to use cuda:0

In [5]:
chat_llm = ChatHuggingFace(llm=llm)

### SimpleSequentialChain

The `SimpleSequentialChain` is the most basic form of a chain. It takes a single input, passes it to a prompt, and the output of one step is directly passed as input to the next. It does not track intermediate steps or provide access to named outputs, making it suitable for linear, single-purpose chains.

Use case: quick linear pipelines like "generate → explain" or "summarize → expand".

In [6]:
template1 = "Give me a simple bullet point outline for a blog post on {topic}"
prompt1 = ChatPromptTemplate.from_template(template1)
chain1 = prompt1|chat_llm

template2 = "Write a blog post using this outline: {outline}"
prompt2 = ChatPromptTemplate.from_template(template2)
chain2 = prompt2|chat_llm

In [7]:
full_chain = chain1|chain2

In [8]:
result = full_chain.invoke("Artificial Intelligence") # That piece of code takes quite some time to execute
print(result.content)

Title: Artificial Intelligence: A Game-Changer for the Future

Introduction to Artificial Intelligence

Artificial Intelligence (AI) is a rapidly growing field that has the potential to revolutionize various industries and our daily lives. It is a branch of computer science that deals with the development of intelligent machines that work and react like humans. AI enables systems to perform tasks that normally require human intelligence, such as visual perception, speech recognition, decision-making, and language translation.

Historical Development and Evolution of AI

The concept of AI dates back to the 1950s, when the first AI research began. Over the years, AI has evolved through several phases, including rule-based systems, expert systems, and machine learning. In recent times, AI has been significantly improved with the advent of deep learning and neural networks, which allow machines to learn and improve from experience without being explicitly programmed.

Types of AI

1. Narro

### SequentialChain

`SequentialChain` is more flexible than `SimpleSequentialChain`. It supports multiple input and output variables and keeps track of intermediate outputs. Each step can depend on one or more outputs from earlier steps.

Use case: more complex workflows that need to reuse or transform earlier outputs in later steps.

In [9]:
template1 = "Give a summary of this employee's performance review:\n{review}"
prompt1 = ChatPromptTemplate.from_template(template1)
chain_1 = prompt1|chat_llm

In [10]:
template2 = "Identify key employee weaknesses in this review summary:\n{review_summary}"
prompt2 = ChatPromptTemplate.from_template(template2)
chain_2 = prompt2|chat_llm

In [11]:
template3 = "Create a personalized plan to help address and fix these weaknesses:\n{weaknesses}"
prompt3 = ChatPromptTemplate.from_template(template3)
chain_3 = prompt3|chat_llm

In [12]:
# Note: The following PromptTemplate examples are for reference only.
# The actual chains (chain_1, chain_2, chain_3) above use ChatPromptTemplate.from_template().
# prompt1 = PromptTemplate(input_variables=["topic"], template="Generate a question about {topic}.")
# prompt2 = PromptTemplate(input_variables=["question"], template="Provide a short answer to: {question}")

In [13]:
seq_chain = chain_1|chain_2|chain_3

In [14]:
employee_review = '''
Employee Information:
Name: Simeon Harrison
Position: Machine Learning Engineer
Date of Review: 10 March, 2025

Strengths:
Simeon is a highly skilled machine learning engineer with a deep understanding of programming languages, algorithms, and data science. His technical expertise shines through in his ability to efficiently solve complex problems and deliver high-quality code.

One of Simeon's greatest strengths is his collaborative nature. He actively engages with cross-functional teams, contributing valuable insights and seeking input from others. His open-mindedness and willingness to learn from colleagues make him a true team player.

Simeon consistently demonstrates initiative and self-motivation. He takes the lead in seeking out new projects and challenges, and his proactive attitude has led to significant improvements in existing processes and systems. His dedication to self-improvement and growth is commendable.

Another notable strength is Simeon's teaching skills. He has shown great prowess in developing teaching materials and delivering high-end online courses. His adaptability allows him to seamlessly transition between different projects and tasks such as teaching, which makes him a valuable asset to the team.


Weaknesses:
While Simeon possesses numerous strengths, there are a few areas where he could benefit from improvement. One such area is time management. Occasionally, Simeon struggles with effectively managing his time, resulting in missed deadlines or the need for additional support to complete tasks on time, especially before delivering courses for the first time. Developing better prioritization and time management techniques would greatly enhance his efficiency.

Another area for improvement is Simeon's written communication skills. He does not answer customer requests promptly, as he finds it difficult to focus on several tasks simultaneously. There were also instances where his written documentation lacked clarity, leading to confusion among team members. Focusing on enhancing his written communication abilities will help him effectively convey ideas and instructions.

Additionally, Simeon tends to take on too many responsibilities and hesitates to delegate tasks to others. This can result in an excessive workload and potential burnout. Encouraging him to delegate tasks appropriately will not only alleviate his own workload but also foster a more balanced and productive team environment.
'''

In [15]:
results = seq_chain.invoke(employee_review) # This too takes time to run

In [16]:
print(results.content)

To address and fix these weaknesses, a personalized plan for Simeon could include the following steps:

1. Time management and prioritization:
a. Schedule regular check-ins with a supervisor or team leader to review progress and adjust priorities as needed.
b. Implement time-blocking techniques to prioritize tasks and maximize productivity.
c. Use tools like project management software to help manage tasks and deadlines.

2. Written communication skills:
a. Provide training resources and guidance on improving written communication skills.
b. Encourage Simeon to focus on one task at a time, minimizing distractions and improving focus.
c. Provide a checklist or template for writing responses to customer requests to ensure clarity and consistency.

3. Delegation and workload management:
a. Offer guidance and support for Simeon to identify tasks that can be delegated to other team members.
b. Encourage open communication among team members to foster a culture of collaboration and support.


In [17]:
print(chain_1.invoke(employee_review).content)

Simeon Harrison, a Machine Learning Engineer, performed exceptionally in his role, showcasing strong technical skills, collaborative nature, initiative, and adaptability. However, he could improve his time management, written communication, and delegation skills.

In [18]:
print((chain_1|chain_2).invoke(employee_review).content)

The key employee weaknesses identified in this review summary are:

1. Time Management: Simeon Harrison needs to improve his time management skills in order to increase efficiency and productivity.
2. Written Communication Skills: The review suggests that he needs to work on his written communication skills for better team collaboration and effective communication.
3. Delegation: There is a need for Simeon Harrison to develop his skills in delegation, which will help in optimizing team performance and distributing workload more effectively.

In [19]:
print((chain_1|chain_2|chain_3).invoke(employee_review).content)

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset

To address and fix these weaknesses, a personalized plan can be created for Simeon as follows:

1. Time management:

   a. Break tasks into smaller, more manageable parts with clear deadlines.
   b. Prioritize tasks based on urgency and importance.
   c. Use time-tracking or project management tools to stay organized.
   d. Communicate with team members about progress and seek help if needed.
   e. Establish a routine/schedule to ensure balanced workload.
   f. Take breaks to avoid burnout and maintain productivity.

2. Written communication skills:

   a. Set a specific time each day to respond to customer requests promptly.
   b. Use templates and clear language to ensure written documentation is concise and easy to understand.
   c. Seek feedback from team members on written communication and implement changes accordingly.
   d. Encourage open communication to clarify any misunderstandings.

3. Task delegation:

   a. Identify team members' strengths and delegate tasks accordingly.


### LLMRouterChain

`LLMRouterChain` is used when you want to route a prompt to different chains or prompts depending on the input. It allows conditional execution paths, where an LLM can decide which destination (e.g., math, history, writing) to route a given input to based on predefined criteria or patterns.

Use case: topic routing, multi-skill assistants, task-specific logic dispatching.

In [20]:
beginner_template = '''You are an elementary school teacher who is really
focused on students in the age group of 6 to 10 and explain complex topics in easy to understand terms for the given age group. 
You assume no prior knowledge. Here is the question\n{input}'''

In [21]:
expert_template = '''You are a world expert physics professor who explains physics topics
to advanced audience members. You can assume anyone you answer has a 
PhD level understanding of Physics. Here is the question\n{input}'''

In [22]:
prompt_infos = [
    {'name':'advanced physics','description': 'Answers advanced physics questions',
     'prompt_template':expert_template},
    {'name':'beginner physics','description': 'Answers basic beginner physics questions',
     'prompt_template':beginner_template},
]

In [23]:
chain = MultiPromptChain.from_prompts(chat_llm, prompt_infos, verbose=True)

  validated_self = self.__pydantic_validator__.validate_python(data, self_instance=self)

In [24]:
print(chain.invoke("Why does a basket ball bounce?")['text'])



[1m> Entering new MultiPromptChain chain...[0mbeginner physics: {'input': 'Why does a basketball bounce?'}
[1m> Finished chain.[0m
A basketball bounces because of a concept called elastic potential energy. When you throw or hit a basketball, it gains kinetic energy, which is the energy of motion. As the basketball hits the ground, all that kinetic energy is transformed into elastic potential energy. Elastic potential energy is the energy stored in an object that is compressed, stretched, or deformed.

When the basketball hits the ground, it gets compressed or squished a little bit. Imagine pushing down on a toy that bounces back when you let go. That's because the energy you used to push it down is stored as elastic potential energy, and when you let go, the energy is released and the toy bounces back up. The same thing happens with a basketball!

Now, when the basketball is on the ground and has this stored energy, it wants to go back to its original shape. This is called "sprin

In [25]:
print(chain.invoke("How do Feynman Diagrams work?")['text'])



[1m> Entering new MultiPromptChain chain...[0madvanced physics: {'input': 'How do Feynman Diagrams work in the context of quantum field theory?'}
[1m> Finished chain.[0m
Feynman diagrams are a graphical representation of the mathematical expressions of quantum field theory (QFT). They provide a visual and intuitive way to represent and analyze processes in particle physics and quantum electrodynamics (QED). These diagrams were invented by Nobel laureate Richard Feynman and are named after him.

In QFT, particles are treated as excitations of fields, and their interactions are described by the coupling of these fields. Feynman diagrams represent the amplitudes of different processes as a sum over all possible ways the particles can interact. These diagrams consist of lines, representing particles, and vertices, representing interactions.

The basic elements of a Feynman diagram include:

1. Vertices (interaction points): Each vertex represents an interaction between particles, gov

In [26]:
print(chain.invoke("How high can an astronaut jump on the moon?")['text'])



[1m> Entering new MultiPromptChain chain...[0mbeginner physics: {'input': 'How high can an astronaut jump on the moon?'}
[1m> Finished chain.[0m
On the Moon, an astronaut can jump much higher than on Earth! That's because there's less gravity on the Moon. On Earth, gravity pulls us down with a force of about 9.8 meters per second squared (or m/s^2). But on the Moon, the gravity is only about 1/6 of Earth's gravity, which means it pulls objects down at about 1.6 m/s^2.

This makes it easier for astronauts to jump high! Imagine you're jumping on a trampoline. On Earth, each jump reaches 10 times higher than your normal height. On the Moon, each jump would reach about 6 times higher! So, if you're 5 feet tall on Earth, you would jump about 50 feet high on the Moon. Isn't that amazing?

### TransformChain

`TransformChain` allows you to insert arbitrary Python logic into a LangChain pipeline. It lets you define a transformation function that takes in inputs and returns a modified dictionary of outputs. This is useful for pre- or post-processing data before or after it passes through a model or another chain.

Use case: text normalization, formatting, filtering, or enrichment between model steps.

In [27]:
# Define a simple transformation function
def uppercase_fn(inputs: dict) -> dict:
    return {"output": inputs["text"].upper()}

transform_chain = TransformChain(input_variables=["text"], output_variables=["output"], transform=uppercase_fn)

In [28]:
# Run it
output = transform_chain.invoke({"text": "this should be uppercase"})
print("TransformChain output:", output)

TransformChain output: {'text': 'this should be uppercase', 'output': 'THIS SHOULD BE UPPERCASE'}

### MathChain
LangChain's MathChain is a specialized chain used to evaluate or solve math-related prompts, especially those involving multi-step reasoning or intermediate calculations. It's part of LangChain’s approach to tool-augmented reasoning, where LLMs use helper functions (like a calculator) to improve accuracy.

It does so by:

 - Having the LLM generate a math expression or plan

 - Using a Python REPL tool (or custom calculator tool) to actually compute the result

 - Returning the final result in a structured way

It's especially useful for:

 - Word problems

 - Problems involving arithmetic, algebra, or logic

 - Cases where hallucination of numbers is problematic

In [29]:
# pip install numexpr

In [30]:
# Initialize the math chain
math_chain = LLMMathChain.from_llm(llm=llm)

# Run a word problem
result = math_chain.invoke("If a train travels 60 km in 1.5 hours, what is its average speed?")
print(result)

{'question': 'If a train travels 60 km in 1.5 hours, what is its average speed?', 'answer': 'Answer: 0.011111111111111112'}

In [31]:
# Run a word problem that breaks it
result = math_chain.invoke("A football is kicked from the ground and reaches its maximum height of 5m in 10m horizontal distance from where it was kicked. How far from the kicking point will it land, assuming there is no air resistance and it flies in a perfectly parabolic arc?")
print(result)

{'question': 'A football is kicked from the ground and reaches its maximum height of 5m in 10m horizontal distance from where it was kicked. How far from the kicking point will it land, assuming there is no air resistance and it flies in a perfectly parabolic arc?', 'answer': 'Answer:  8.0'}

In [32]:
# Running an algebra problem does not work
result = math_chain.invoke("Can you solve the following equation for x? x^2 + x - 2 = 0")
print(result)

ValueError: LLMMathChain._evaluate("
np.roots([1, 1, -2])
") raised error: Expression np.roots([1, 1, -2]) has forbidden control characters.. Please try again with a valid numerical expression

### Applying Chains
**Task decomposition** (or "dividing labor") is a key concept in prompt engineering and chain design.

It involves the technique of:

 - Breaking down a complex task into smaller, manageable sub-tasks

 - Solving them step-by-step, and optionally recombining the results

This leads to:

 - Better accuracy

 - Clearer LLM reasoning

 - Easier chaining of logic

---

Take for example this task: “Write a summary of the main arguments in this article, and list 3 questions the reader should consider.”

We break it down into two steps:
1. Summarize the text

2. Generate reflective questions based on the summary



In [33]:
# Summarization Prompt
summarize_prompt = PromptTemplate.from_template(
    "Summarize the main arguments of the following article:\n\n{article}"
)

In [34]:
# Reflection Prompt
question_prompt = PromptTemplate.from_template(
    "Based on the following summary, list 3 important questions the reader should consider:\n\n{summary}"
)

In [35]:
# Compose chains using the pipe syntax
summarize_chain = summarize_prompt | llm
question_chain = question_prompt | llm

In [36]:
# Input text
article_text = (
    "The demand for AI skills has exploded in recent years, and it’s likely to continue as more and more businesses embrace artificial intelligence solutions."
    "The number of employers looking for AI-literate employees quadrupled between 2010 and 2019, and AI skills are becoming essential across a wide range of industries, making them a valuable asset for advancing your career and staying competitive in a rapidly evolving job market."
    "AI-related jobs typically pay 11% more than non-AI roles within the same company."
    "Skills with AI are particularly useful if you plan to work in the information, professional services, administrative, or finance sectors."
    "AI adoption offers several potential benefits. It helps automate repetitive processes like data entry to improve operational efficiency."
    "AI can also process and analyze large data sets rapidly, enabling it to identify patterns and make reasoned predictions to aid robust decision-making."
    "For some businesses, using tools such as call bots and chatbots helps streamline customer interactions to boost engagement and satisfaction."
    "Numerous businesses have used AI to improve customer experiences and drive growth. For example, J.P. Morgan and Chase developed the award-winning OmniAI platform to deliver accurate financial insights."
    "The model can perform deep, comprehensive analyses of vast data sets, reducing operational costs and enabling faster solution development."
    "AI has the potential to automate non-routine tasks and solve some of the world’s most complex problems."
    "For example, AI technologies can model climate change predictions, improve energy grid efficiency, and even help you reduce your household energy consumption through smart home heating systems."
    "Other applications include analyzing data during clinical trials and optimizing journeys to reduce the load on transport infrastructure."
    "However, calculating the impact of AI on global challenges is complex, and even seemingly perfect solutions can have unintended outcomes."
    "For instance, improving your home’s efficiency may encourage you to spend more time in your perfectly heated house, increasing your use of energy-hungry appliances."
    "Accounting for unforeseen effects is just one potential pitfall of relying on AI."
    "Poor data protection practices increase the risk of privacy violations, while training models on biased data could lead to discrimination."
    "This is why ethical practices are essential for responsible AI development."
)

In [37]:
# Step-by-step execution
summary = summarize_chain.invoke({"article": article_text})
questions = question_chain.invoke({"summary": summary})

In [38]:
# Output
print("Summary:\n", summary)
print("\nReflective Questions:\n", questions)

Summary:
 Some AI developers advocate for an AI Bill of Rights to ensure responsible AI development. The AI Bill of Rights aims to protect individuals’ right to privacy, right to transparency, and right to fairness.The AI Bill of Rights should also include a right to explanation, ensuring that AI systems are always transparent and explainable, as well as a right to redress, ensuring that responsible data-sharing practices are in place.However, implementing these rights will require cooperation among governments, businesses, and individuals.

The article highlights the growing demand for AI skills in the job market and the potential benefits of AI adoption for businesses. AI-related jobs typically pay 11% more than non-AI roles within the same company. AI skills are becoming essential across various industries, making them a valuable asset for advancing your career and staying competitive in a rapidly changing job market. AI technologies can automate repetitive processes, improve operat

## ⚠️ Important: GPU Memory Management

**Running cleanup cells alone does NOT fully release GPU memory!**

The cleanup cell removes Python references and clears PyTorch cache, but the kernel process
still holds GPU allocations until it terminates.

**To fully release GPU memory:**
1. Run the cleanup cell below
2. Then: **Kernel → Shutdown** (or restart) this notebook's kernel

**Before running the next notebook:**
- Check GPU status in the startup cell
- If GPU is still >80% used, shutdown unused notebook kernels

In [50]:
# === Resource Cleanup ===
import gc
import torch

print("=== Cleaning Up Resources ===\n")

# 1. Delete HuggingFace objects
print("1. Releasing HuggingFace resources...")
hf_objects = ['model', 'tokenizer', 'text_pipeline', 'llm', 'chat_llm', 'pipe']
deleted = []
for obj_name in hf_objects:
    if obj_name in globals():
        del globals()[obj_name]
        deleted.append(obj_name)
if deleted:
    print(f"   ✓ Deleted: {', '.join(deleted)}")
else:
    print("   No HuggingFace objects to delete")

# 2. Clear GPU cache
print("\n2. Clearing GPU cache...")
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    torch.cuda.synchronize()

# 3. Python garbage collection
gc.collect()
print("3. ✓ Garbage collection complete")

# 4. Show SYSTEM-WIDE GPU memory status (using pynvml)
print("\n4. Final GPU Memory Status (System-Wide):")
try:
    import pynvml
    pynvml.nvmlInit()
    for i in range(pynvml.nvmlDeviceGetCount()):
        handle = pynvml.nvmlDeviceGetHandleByIndex(i)
        info = pynvml.nvmlDeviceGetMemoryInfo(handle)
        used_gb = info.used / 1024**3
        free_gb = info.free / 1024**3
        print(f"   GPU {i}: {used_gb:.2f} GB used, {free_gb:.2f} GB free")
    pynvml.nvmlShutdown()
except:
    # Fallback to per-process only
    for i in range(torch.cuda.device_count()):
        allocated = torch.cuda.memory_allocated(i) / 1024**3
        print(f"   GPU {i}: {allocated:.2f} GB (this process only)")

print("\n" + "="*40)
print("✓ Cleanup complete!")

# === OPTIONAL: Shutdown Kernel to Fully Release GPU Memory ===
# Uncomment the next line to shutdown this kernel after cleanup
# (Required to fully release GPU memory for other notebooks)

# from IPython import get_ipython; get_ipython().kernel.do_shutdown(restart=False)

=== Cleaning Up Resources ===

1. Releasing HuggingFace resources...
   ✓ Deleted: model, tokenizer

2. Clearing GPU cache...
3. ✓ Garbage collection complete

4. Final GPU Memory Status (System-Wide):
   GPU 0: 8.88 GB used, 7.11 GB free

✓ Cleanup complete!
