# Basic LLM Integration - Interactive Notebook

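This notebook provides hands-on examples for integrating and working with different LLMs in LangChain. The examples call the OpenAI API, so make sure `OPENAI_API_KEY` is set (for example in a `.env` file, which the setup cell loads) before running the cells.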

## 1. Setup and Imports

In [None]:
# Import necessary libraries
import os
import time
import hashlib
import json
import sys
from functools import lru_cache
from typing import Optional, Dict, Any

# Load environment variables
from dotenv import load_dotenv
load_dotenv()

# Core LangChain imports
from langchain_core.messages import HumanMessage, AIMessage, SystemMessage
from langchain_core.callbacks import StreamingStdOutCallbackHandler

# Model imports
from langchain_openai import OpenAI, ChatOpenAI
from langchain_huggingface import HuggingFaceEndpoint

# NOTE(review): HuggingFaceHub is a legacy class and does not appear to be
# exported by langchain_huggingface - confirm against the installed version.
# Guard the import so a missing name does not abort the whole setup cell;
# the name stays defined (as None) for any cell that still references it.
try:
    from langchain_huggingface import HuggingFaceHub
except ImportError:
    HuggingFaceHub = None

# Token counting
import tiktoken

print("Successfully imported all libraries!")

## 2. Working with Different Model Types

In [None]:
# Initialize different types of models

# OpenAI completion model (plain text in, plain text out)
openai_completion = OpenAI(
    model="gpt-3.5-turbo-instruct",
    temperature=0.7,
    max_tokens=200
)

# OpenAI chat model (list of messages in, message object out)
chat_model = ChatOpenAI(
    model="gpt-3.5-turbo",
    temperature=0.7,
    max_tokens=200
)

print("Models initialized successfully!")

# Test completion model.
# Use .invoke() instead of calling the model object directly: the direct-call
# form is deprecated in LangChain in favour of the Runnable interface.
completion_response = openai_completion.invoke("Write a haiku about technology:")
print("Completion Model Response:")
print(completion_response)

# Test chat model
messages = [
    SystemMessage(content="You are a helpful assistant."),
    HumanMessage(content="Write a haiku about technology.")
]
chat_response = chat_model.invoke(messages)
print("\nChat Model Response:")
print(chat_response.content)

## 3. Model Parameters and Configuration

In [None]:
# Compare different temperature settings
low_temp_model = OpenAI(temperature=0.1, max_tokens=100)
high_temp_model = OpenAI(temperature=0.9, max_tokens=100)

prompt = "The future of artificial intelligence is"

# All calls below go through .invoke(); calling the model object directly is
# the deprecated form of the same request.
print("Low temperature (0.1) - More focused:")
low_response = low_temp_model.invoke(prompt)
print(low_response)

print("\nHigh temperature (0.9) - More creative:")
high_response = high_temp_model.invoke(prompt)
print(high_response)

# Test other parameters
advanced_model = OpenAI(
    temperature=0.7,
    max_tokens=50,
    stop=["\n\n", "###"],  # Stop at these sequences
    frequency_penalty=0.5,  # Reduce repetition
    presence_penalty=0.3   # Encourage new topics
)

print("\nAdvanced model with stop sequences:")
advanced_response = advanced_model.invoke("List three benefits of renewable energy:")
print(advanced_response)

## 4. Working with Different Model Providers

In [None]:
# Test different OpenAI models
models = {
    "gpt-3.5-turbo": ChatOpenAI(model="gpt-3.5-turbo", max_tokens=50),
    "gpt-3.5-turbo-instruct": OpenAI(model="gpt-3.5-turbo-instruct", max_tokens=50)
}

test_prompt = "Explain machine learning in one sentence."

for name, model in models.items():
    try:
        # Both model kinds expose .invoke(); only the return type differs:
        # the chat model yields an object with .content, the completion
        # model yields a plain string.
        response = model.invoke(test_prompt)
        content = response.content if hasattr(response, 'content') else str(response)
        print(f"{name}: {content}")
    except Exception as e:
        # Keep the comparison going even if one model is unavailable.
        print(f"{name}: Error - {e}")

# Test Hugging Face model (if token is available)
try:
    # HuggingFaceEndpoint is the class this notebook imports from
    # langchain_huggingface; generation settings are passed as direct
    # arguments. NOTE(review): whether this repo_id is served by the hosted
    # inference API depends on the account/provider - confirm before relying on it.
    hf_model = HuggingFaceEndpoint(
        repo_id="google/flan-t5-large",
        temperature=0.5,
        max_new_tokens=100,
    )

    hf_response = hf_model.invoke("Translate to French: 'Hello, how are you?'")
    print(f"\nHugging Face response: {hf_response}")
except Exception as e:
    print(f"\nHugging Face model not available: {e}")

## 5. Streaming Responses

In [None]:
# Chat model configured to deliver its answer incrementally
streaming_model = ChatOpenAI(
    max_tokens=150,
    temperature=0.7,
    streaming=True,
    model="gpt-3.5-turbo",
)

print("Streaming response:")
# Print each chunk's text as it arrives, all on one line; the bare print()
# afterwards terminates that line.
for chunk in streaming_model.stream("Tell me a short story about a robot learning to paint:"):
    print(chunk.content, end="", flush=True)
print()

# Same idea, but output is produced by a stdout callback handler attached to the model
print()
print("Streaming with callbacks:")
callback_model = ChatOpenAI(
    max_tokens=100,
    temperature=0.7,
    callbacks=[StreamingStdOutCallbackHandler()],
    streaming=True,
    model="gpt-3.5-turbo",
)

# The result is assigned so the cell does not also display the returned message.
response = callback_model.invoke("Explain the concept of neural networks in simple terms:")

## 6. Token Management

In [None]:
# Token counting function
def count_tokens(text, model="gpt-3.5-turbo"):
    """Return how many tokens tiktoken produces for ``text``.

    The encoding registered for ``model`` is used; if tiktoken raises
    ``KeyError`` for that model name, the "cl100k_base" encoding is used
    instead.
    """
    try:
        tokenizer = tiktoken.encoding_for_model(model)
    except KeyError:
        tokenizer = tiktoken.get_encoding("cl100k_base")
    token_ids = tokenizer.encode(text)
    return len(token_ids)

# Quick check of the token counter on a short sentence
test_text = "This is a sample text to count tokens."
token_count = count_tokens(test_text)
print(f"Text: '{test_text}'\nToken count: {token_count}")

# Optimize prompts for token usage
def optimize_prompt(prompt, max_tokens=100):
    """Truncate ``prompt`` on word boundaries so it fits within ``max_tokens``.

    Args:
        prompt: Text to shorten.
        max_tokens: Token budget, measured with ``count_tokens``.

    Returns:
        ``prompt`` unchanged when it already fits; otherwise its leading
        words, joined by single spaces, whose token count does not exceed
        ``max_tokens`` (possibly an empty string if not even the first word
        fits).
    """
    token_count = count_tokens(prompt)
    if token_count <= max_tokens:
        return prompt

    words = prompt.split()
    # First guess: keep the same fraction of words as the fraction of tokens allowed.
    keep = max(0, int(len(words) * max_tokens / token_count))
    truncated = " ".join(words[:keep])

    # Tokens are not spread evenly across words, so the proportional guess can
    # still be over budget; drop trailing words until the result really fits.
    while keep > 0 and count_tokens(truncated) > max_tokens:
        keep -= 1
        truncated = " ".join(words[:keep])
    return truncated

# Build an over-long prompt by repeating one sentence, then report its size
long_prompt = "This is a very long prompt that contains a lot of text and details about various topics " * 10
print()
print(f"Original prompt length: {len(long_prompt)} characters")
print(f"Original token count: {count_tokens(long_prompt)}")

# Shrink it to a 50-token budget and report the same figures for the result
optimized = optimize_prompt(long_prompt, max_tokens=50)
print()
print(f"Optimized prompt length: {len(optimized)} characters")
print(f"Optimized token count: {count_tokens(optimized)}")
print(f"Optimized prompt: {optimized[:100]}...")

## 7. Response Caching

In [None]:
# Implement caching for LLM responses
class CachedLLM:
    """In-memory response cache around any object that exposes ``invoke``.

    Responses are stored as text, keyed by a hash of the prompt together with
    the keyword arguments, so a repeated identical call does not reach the
    wrapped model again. The cache is unbounded and lives only as long as
    this wrapper instance.
    """

    def __init__(self, model):
        """Wrap ``model`` and start with an empty cache."""
        self.model = model
        self.cache = {}

    def _get_cache_key(self, prompt, **kwargs):
        """Return a hex digest identifying ``prompt`` plus ``kwargs``."""
        key_data = {"prompt": prompt, **kwargs}
        # default=str lets prompts/kwargs that are not JSON-native (for
        # example message objects) contribute their string form instead of
        # making json.dumps raise TypeError.
        serialized = json.dumps(key_data, sort_keys=True, default=str)
        # MD5 is used only as a compact fingerprint here, not for security.
        return hashlib.md5(serialized.encode()).hexdigest()

    def invoke(self, prompt, **kwargs):
        """Return the response text for ``prompt``, using the cache when possible.

        Args:
            prompt: Input forwarded to the wrapped model's ``invoke``.
            **kwargs: Extra arguments forwarded to ``invoke``; they are part
                of the cache key.

        Returns:
            The response's ``content`` attribute when present, otherwise
            ``str(response)``.
        """
        cache_key = self._get_cache_key(prompt, **kwargs)

        if cache_key in self.cache:
            print("Using cached response")
            return self.cache[cache_key]

        print("Generating new response")
        response = self.model.invoke(prompt, **kwargs)
        content = response.content if hasattr(response, 'content') else str(response)
        self.cache[cache_key] = content
        return content

# Wrap a chat model in the cache and exercise it three times
cached_model = CachedLLM(ChatOpenAI(model="gpt-3.5-turbo", max_tokens=50))

# 1) New prompt: the wrapper has nothing stored yet
print("First call:")
response1 = cached_model.invoke("What is machine learning?")
print("Response:", response1)

# 2) Same prompt again: answered from the cache
print()
print("Second call:")
response2 = cached_model.invoke("What is machine learning?")
print("Response:", response2)

# 3) A different prompt: not in the cache, so a new response is generated
print()
print("Different prompt:")
response3 = cached_model.invoke("What is deep learning?")
print("Response:", response3)

## 8. Error Handling and Retry Logic

In [None]:
# Implement robust error handling
class RobustLLM:
    """Retry wrapper around any object that exposes ``invoke``.

    A failing call is retried up to ``max_retries`` attempts in total, with
    the wait between attempts doubling each time (``retry_delay``,
    ``2 * retry_delay``, ...).
    """

    def __init__(self, model, max_retries=3, retry_delay=1):
        """Store the wrapped model and the retry policy.

        Args:
            model: Object with an ``invoke(prompt, **kwargs)`` method.
            max_retries: Total number of attempts per call.
            retry_delay: Base delay in seconds before the second attempt.
        """
        self.model = model
        self.max_retries = max_retries
        self.retry_delay = retry_delay

    def invoke(self, prompt, **kwargs) -> Optional[str]:
        """Call the wrapped model, retrying on any exception.

        Returns:
            The response's ``content`` attribute when present, otherwise
            ``str(response)``; ``None`` when every attempt failed (or when
            ``max_retries`` allows no attempt at all).
        """
        for attempt in range(self.max_retries):
            try:
                response = self.model.invoke(prompt, **kwargs)
                return response.content if hasattr(response, 'content') else str(response)
            except Exception as e:
                print(f"Attempt {attempt + 1} failed: {e}")
                if attempt < self.max_retries - 1:
                    time.sleep(self.retry_delay * (2 ** attempt))  # Exponential backoff
                else:
                    # str() first: the prompt may be an object that cannot be
                    # sliced, and a TypeError here would hide the real failure.
                    print(f"Max retries exceeded for prompt: {str(prompt)[:50]}...")
                    return None
        # Reached only when the loop body never ran (max_retries <= 0).
        return None

# Test robust LLM
robust_model = RobustLLM(ChatOpenAI(model="gpt-3.5-turbo", max_tokens=50))

response = robust_model.invoke("Explain the concept of blockchain in simple terms:")
# RobustLLM signals failure with None; compare against None explicitly so an
# empty-string response is not misreported as a failure.
if response is not None:
    print(f"Response: {response}")
else:
    print("Failed to get response after retries")

## 9. Performance Monitoring

In [None]:
# Implement performance monitoring
class MonitoredLLM:
    """Wrapper that records timing and token statistics for each call.

    ``stats`` accumulates, across calls: the number of successful requests,
    the sum of input and output tokens (as measured by ``count_tokens``), the
    total elapsed time of successful requests, and the number of failed calls.
    """

    def __init__(self, model):
        """Wrap ``model`` and start with zeroed statistics."""
        self.model = model
        self.stats = {
            "total_requests": 0,
            "total_tokens": 0,
            "total_time": 0,
            "errors": 0
        }

    def invoke(self, prompt, **kwargs) -> Dict[str, Any]:
        """Call the wrapped model and return the response with its metrics.

        Returns:
            On success, a dict with ``content``, ``response_time`` (seconds),
            ``input_tokens``, ``output_tokens`` and ``total_tokens``.
            On any exception, a dict with ``error`` (the exception text) and
            ``response_time``; the exception is not re-raised.
        """
        # perf_counter is a monotonic clock intended for measuring intervals.
        start_time = time.perf_counter()

        try:
            response = self.model.invoke(prompt, **kwargs)
            response_time = time.perf_counter() - start_time

            output_content = response.content if hasattr(response, 'content') else str(response)

            # Count tokens on text: a non-string prompt or content (such as a
            # list of messages) would otherwise make count_tokens raise and
            # turn a successful call into a recorded error.
            input_tokens = count_tokens(prompt if isinstance(prompt, str) else str(prompt))
            output_tokens = count_tokens(
                output_content if isinstance(output_content, str) else str(output_content)
            )
            total_tokens = input_tokens + output_tokens

            # Update stats
            self.stats["total_requests"] += 1
            self.stats["total_tokens"] += total_tokens
            self.stats["total_time"] += response_time

            return {
                "content": output_content,
                "response_time": response_time,
                "input_tokens": input_tokens,
                "output_tokens": output_tokens,
                "total_tokens": total_tokens
            }

        except Exception as e:
            self.stats["errors"] += 1
            return {
                "error": str(e),
                "response_time": time.perf_counter() - start_time
            }

    def get_stats(self):
        """Return the accumulated stats plus per-request averages.

        Averages are taken over successful requests; with no successful
        request yet the divisor is 1, so both averages are 0.
        """
        successful = max(1, self.stats["total_requests"])
        avg_time = self.stats["total_time"] / successful
        avg_tokens = self.stats["total_tokens"] / successful

        return {
            **self.stats,
            "average_response_time": avg_time,
            "average_tokens_per_request": avg_tokens
        }

# Test monitored LLM
monitored_model = MonitoredLLM(ChatOpenAI(model="gpt-3.5-turbo", max_tokens=100))

# Make several requests
prompts = [
    "Write a haiku about technology.",
    "Explain photosynthesis in one sentence.",
    "What is the capital of France?"
]

for prompt in prompts:
    result = monitored_model.invoke(prompt)
    print(f"Prompt: {prompt}")
    if "content" in result:
        print(f"Response: {result['content'][:50]}...")
        print(f"Tokens: {result['total_tokens']}, Time: {result['response_time']:.2f}s\n")
    else:
        # MonitoredLLM returns an "error" entry instead of raising; show it
        # rather than silently skipping the failed request.
        print(f"Error: {result['error']} (after {result['response_time']:.2f}s)\n")

# Print statistics
stats = monitored_model.get_stats()
print("Model Statistics:")
for key, value in stats.items():
    if isinstance(value, float):
        print(f"{key}: {value:.2f}")
    else:
        print(f"{key}: {value}")

## 10. Exercise: Smart LLM Wrapper

In [None]:
# TODO: Implement a smart LLM wrapper
# 1. Create task complexity detection
# 2. Implement model selection logic
# 3. Add caching layer
# 4. Include fallback mechanisms
# 5. Add performance monitoring

class SmartLLM:
    """LLM wrapper combining model selection, caching and fallback.

    Prompts judged "complex" (by keyword or length) go to ``complex_model``,
    everything else to ``simple_model``. If the selected model raises, the
    other model is tried once. Successful responses are cached in memory,
    keyed by the prompt and the call's keyword arguments.
    """

    def __init__(self):
        # Initialize different models for different tasks
        self.simple_model = ChatOpenAI(model="gpt-3.5-turbo", max_tokens=150)
        self.complex_model = ChatOpenAI(model="gpt-4", max_tokens=300)  # If available

        # Initialize cache
        self.cache = {}

        # Task complexity keywords
        self.complex_keywords = [
            "analyze", "compare", "synthesize", "evaluate", "create",
            "complex", "detailed", "comprehensive", "in-depth"
        ]

    def _detect_complexity(self, prompt: str) -> bool:
        """Determine if the task is complex based on keywords and length"""
        # Lower-case once instead of once per keyword; matching is by substring.
        lowered = prompt.lower()
        has_complex_keywords = any(keyword in lowered for keyword in self.complex_keywords)

        # Check prompt length (longer prompts tend to be more complex)
        is_long_prompt = len(prompt.split()) > 50

        return has_complex_keywords or is_long_prompt

    def _get_cache_key(self, prompt: str, **kwargs) -> str:
        """Generate a cache key from the prompt and any call parameters.

        The keyword arguments are part of the key so that the same prompt
        sent with different parameters is not answered from a stale entry.
        """
        key_data = {"prompt": prompt, **kwargs}
        # default=str keeps non-JSON-native parameter values from raising.
        serialized = json.dumps(key_data, sort_keys=True, default=str)
        return hashlib.md5(serialized.encode()).hexdigest()

    def _call_and_cache(self, model, prompt, cache_key, call_kwargs) -> str:
        """Invoke ``model``, store the response text under ``cache_key``, return it."""
        response = model.invoke(prompt, **call_kwargs)
        content = response.content if hasattr(response, 'content') else str(response)
        self.cache[cache_key] = content
        return content

    def invoke(self, prompt: str, **kwargs) -> str:
        """Smart invocation with model selection, caching, and fallback

        Returns:
            The response text from the selected model, or from the other
            model if the selected one raised. If both raise, a fixed apology
            message is returned (and not cached).
        """
        cache_key = self._get_cache_key(prompt, **kwargs)

        # Check cache first
        if cache_key in self.cache:
            print("Using cached response")
            return self.cache[cache_key]

        # Select appropriate model; the other one serves as the fallback
        is_complex = self._detect_complexity(prompt)
        if is_complex:
            selected_model, fallback_model = self.complex_model, self.simple_model
        else:
            selected_model, fallback_model = self.simple_model, self.complex_model

        print(f"Using {'complex' if is_complex else 'simple'} model")

        try:
            return self._call_and_cache(selected_model, prompt, cache_key, kwargs)
        except Exception as e:
            print(f"Selected model failed: {e}")

        # Try fallback model
        try:
            print("Trying fallback model")
            return self._call_and_cache(fallback_model, prompt, cache_key, kwargs)
        except Exception as fallback_error:
            print(f"Fallback also failed: {fallback_error}")
            return "I'm unable to process your request at the moment."

# Exercise the smart wrapper end to end
smart_llm = SmartLLM()

# A short factual question
print("=== Testing Simple Task ===")
simple_response = smart_llm.invoke("What is the capital of France?")
print("Response:", simple_response, end="\n\n")

# A prompt containing one of the complexity keywords ("analyze")
print("=== Testing Complex Task ===")
complex_response = smart_llm.invoke("Analyze the impact of artificial intelligence on global economics and society.")
print(f"Response: {complex_response[:200]}...", end="\n\n")

# Repeating the first question should be answered from the cache
print("=== Testing Caching ===")
cached_response = smart_llm.invoke("What is the capital of France?")
print("Cached response:", cached_response)

## 11. Summary

In this notebook, we've covered:
- Working with different types of LLM models (completion vs chat)
- Configuring model parameters for optimal performance
- Implementing streaming responses for better user experience
- Managing tokens and optimizing prompts
- Adding caching to improve performance and reduce costs
- Implementing robust error handling and retry logic
- Monitoring model performance metrics
- Building a smart LLM wrapper with automatic model selection

These techniques provide a solid foundation for building robust and efficient LLM-powered applications with LangChain.