In [2]:
#%pip install langchain langchain-community openai tiktoken faiss-cpu numpy nltk pypdf sentence-transformers transformers tokenizers unstructured


In [1]:
#Import Libraries
import os
import sys
import textwrap
import getpass
import nltk
from langchain.document_loaders import UnstructuredURLLoader
from langchain_text_splitters import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.chat_models import ChatOpenAI
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.embeddings import HuggingFaceBgeEmbeddings

# Fetch the NLTK 'punkt' resource when this cell runs; the call is cheap when
# the package is already present locally.
# NOTE(review): presumably needed by the unstructured-based URL loader for
# tokenization — confirm before removing.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\yogit\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
class ChatbotConfig:
    """Central configuration for the URL-based question-answering chatbot.

    Creating an instance makes sure ``OPENAI_API_KEY`` is present in the
    process environment (prompting for it when missing), fills in the default
    settings and validates them.

    Attributes:
        urls: Pages whose content is loaded and indexed.
        chunk_size: Maximum size of each text chunk passed to the splitter.
        chunk_overlap: Overlap between consecutive chunks.
        embedding_model: ``'openai'`` or ``'huggingface'``.
        llm_model: Name of the OpenAI chat model to use.

    Raises:
        ValueError: If no API key is supplied or a setting is invalid.
    """

    def __init__(self):
        # Treat a missing, empty or whitespace-only variable as "not set" so a
        # blank export does not silently pass for a valid key.
        if not os.environ.get('OPENAI_API_KEY', '').strip():
            # Strip before storing: pasted keys often carry a trailing newline
            # or spaces, which would otherwise make authentication fail later.
            api_key = getpass.getpass('Enter your OpenAI API Key: ').strip()
            if not api_key:
                raise ValueError("API key is required")
            os.environ['OPENAI_API_KEY'] = api_key

        # URLs to scrape
        self.urls = [
            'https://stability.ai/news/stability-ai-launches-the-first-of-its-stablelm-suite-of-language-models',
            'https://docs.aws.amazon.com/bedrock/latest/userguide/what-is-bedrock.html',
            'https://blog.gopenai.com/paper-review-llama-2-open-foundation-and-fine-tuned-chat-models-23e539522acb',
        ]

        # Text splitting parameters
        self.chunk_size = 1000
        self.chunk_overlap = 200

        # Model configuration
        self.embedding_model = 'openai'  # 'openai' or 'huggingface'
        self.llm_model = 'gpt-3.5-turbo'  # OpenAI model to use

        # Validation
        self.validate()

    def validate(self):
        """Check the current settings and raise ``ValueError`` on the first problem.

        Raises:
            ValueError: If ``urls`` is empty, ``chunk_size`` is not positive,
                or ``chunk_overlap`` is negative or not smaller than
                ``chunk_size``.
        """
        if not self.urls:
            raise ValueError("At least one URL is required")
        if self.chunk_size <= 0:
            raise ValueError("Chunk size must be positive")
        if self.chunk_overlap < 0:
            raise ValueError("Chunk overlap must not be negative")
        if self.chunk_overlap >= self.chunk_size:
            raise ValueError("Chunk overlap must be less than chunk size")

# Build the shared configuration object and echo a short summary of it.
try:
    config = ChatbotConfig()
    # One print call with a newline separator yields the same four lines.
    print(
        "Configuration loaded successfully",
        f"URLs: {len(config.urls)} configured",
        f"Chunk size: {config.chunk_size}",
        f"Embedding model: {config.embedding_model}",
        sep="\n",
    )
except Exception as config_error:
    # Report the problem, then let the original exception propagate.
    print(f"Configuration error: {config_error}")
    raise

Configuration loaded successfully
URLs: 3 configured
Chunk size: 1000
Embedding model: openai


In [10]:
class DocumentProcessor:
    """Handles document loading and processing"""

    @staticmethod
    def load_documents(urls, max_retries=3):
        """Load documents from URLs, retrying when the loader raises.

        Args:
            urls: URLs handed to ``UnstructuredURLLoader``.
            max_retries: Maximum number of load attempts.

        Returns:
            The list of loaded documents. An empty list is returned when the
            loader yields nothing, when every attempt fails, or when
            ``max_retries`` is less than 1 (no attempt is made).
        """
        for attempt in range(max_retries):
            try:
                print(f"Loading documents from {len(urls)} URLs...")
                loader = UnstructuredURLLoader(urls=urls)
                docs = loader.load()

                if not docs:
                    # An empty result is not retried; it is reported as-is.
                    print("No data loaded from URLs")
                    return []

                print(f"Successfully loaded {len(docs)} documents")
                return docs

            except Exception as e:
                print(f"Attempt {attempt + 1} failed: {e}")
                if attempt == max_retries - 1:
                    print("All retry attempts failed")
                    return []
                print("Retrying...")

        # Only reached when the loop body never ran (max_retries < 1). Return
        # a list so callers always get the documented type instead of None.
        return []

    @staticmethod
    def split_documents(docs, chunk_size, chunk_overlap):
        """Split documents into chunks on newline boundaries.

        Args:
            docs: Documents to split.
            chunk_size: Passed to ``CharacterTextSplitter`` as ``chunk_size``.
            chunk_overlap: Passed to ``CharacterTextSplitter`` as ``chunk_overlap``.

        Returns:
            The list of chunks, or an empty list if splitting raised (the
            error is printed rather than propagated).
        """
        try:
            splitter = CharacterTextSplitter(
                separator='\n',
                chunk_size=chunk_size,
                chunk_overlap=chunk_overlap
            )
            chunks = splitter.split_documents(docs)
            print(f"Split documents into {len(chunks)} chunks")
            return chunks
        except Exception as e:
            print(f"Error splitting documents: {e}")
            return []

class EmbeddingManager:
    """Manages embedding models"""

    @staticmethod
    def get_embeddings(model_type='openai', **kwargs):
        """Create the embedding model selected by ``model_type``.

        Args:
            model_type: ``'openai'`` or ``'huggingface'``.
            **kwargs: Only used for ``'huggingface'``:
                ``model_name`` (default ``'BAAI/bge-large-en'``),
                ``model_kwargs`` (default ``{'device': 'cpu'}``) and
                ``encode_kwargs`` (default ``{'normalize_embeddings': True}``).

        Returns:
            An ``OpenAIEmbeddings`` or ``HuggingFaceBgeEmbeddings`` instance.

        Raises:
            ValueError: If ``model_type`` is unknown, or is ``'huggingface'``
                while a module-level ``HUGGINGFACE_AVAILABLE`` flag is falsy.
            Exception: Any error raised while constructing the model is
                printed and re-raised.
        """
        try:
            if model_type == 'openai':
                embeddings = OpenAIEmbeddings()
                print("OpenAI embeddings initialized")
                return embeddings

            # HUGGINGFACE_AVAILABLE is not defined anywhere in this notebook,
            # so referencing it directly raised NameError for every
            # 'huggingface' request. Honour the flag only if some cell defines
            # it; otherwise assume the unconditional import succeeded.
            huggingface_ok = globals().get('HUGGINGFACE_AVAILABLE', True)

            if model_type == 'huggingface' and huggingface_ok:
                model_name = kwargs.get('model_name', 'BAAI/bge-large-en')
                model_kwargs = kwargs.get('model_kwargs', {'device': 'cpu'})
                encode_kwargs = kwargs.get('encode_kwargs', {'normalize_embeddings': True})

                embeddings = HuggingFaceBgeEmbeddings(
                    model_name=model_name,
                    model_kwargs=model_kwargs,
                    encode_kwargs=encode_kwargs
                )
                print(f"HuggingFace embeddings initialized ({model_name})")
                return embeddings

            raise ValueError(f"Unknown or unavailable embedding model type: {model_type}")

        except Exception as e:
            print(f"Error initializing embeddings: {e}")
            raise

class VectorStoreManager:
    """Builds the vector index used for retrieval."""

    @staticmethod
    def create_vectorstore(chunks, embeddings):
        """Index ``chunks`` with ``embeddings`` in a FAISS store and return it.

        Any exception from ``FAISS.from_documents`` is printed and re-raised.
        """
        try:
            store = FAISS.from_documents(chunks, embeddings)
        except Exception as build_error:
            print(f"Error creating vector store: {build_error}")
            raise
        else:
            print("FAISS vector store created successfully")
            return store

class LLMManager:
    """Factory for the chat model used to answer questions."""

    @staticmethod
    def create_llm(model_name='gpt-3.5-turbo'):
        """Return a ``ChatOpenAI`` instance for ``model_name``.

        Any exception raised by the constructor is printed and re-raised.
        """
        try:
            chat_model = ChatOpenAI(model_name=model_name)
        except Exception as init_error:
            print(f"Error initializing LLM: {init_error}")
            raise
        else:
            print(f"LLM initialized ({model_name})")
            return chat_model

class QASystem:
    """Answers questions through a retrieval chain that also reports sources."""

    def __init__(self, llm, vectorstore):
        """Store the collaborators and build the QA chain immediately.

        Args:
            llm: Language model passed to the chain.
            vectorstore: Store whose ``as_retriever()`` supplies the retriever.

        Raises:
            Exception: Whatever chain construction raises (printed first).
        """
        self.llm = llm
        self.vectorstore = vectorstore
        self.chain = None
        self._initialize_chain()

    def _initialize_chain(self):
        """Create ``self.chain`` via ``RetrievalQAWithSourcesChain.from_llm``."""
        try:
            self.chain = RetrievalQAWithSourcesChain.from_llm(
                llm=self.llm,
                retriever=self.vectorstore.as_retriever()
            )
            print("QA chain initialized successfully")
        except Exception as e:
            print(f"Error initializing QA chain: {e}")
            raise

    def ask_question(self, question):
        """Ask a question and return ``(answer, sources)``.

        Args:
            question: The user's question, sent to the chain under the
                ``"question"`` key.

        Returns:
            A 2-tuple of strings. ``answer`` defaults to ``'No answer found.'``
            and ``sources`` to ``''`` when the chain result lacks those keys.
            If the chain raises, the tuple is ``("Error: <message>", "")``
            rather than an exception.
        """
        try:
            # Calling a chain object directly is deprecated in LangChain in
            # favour of ``invoke``; fall back to the call form only for chain
            # objects that do not expose ``invoke``.
            run_chain = getattr(self.chain, "invoke", self.chain)
            result = run_chain({"question": question}, return_only_outputs=True)
            answer = result.get('answer', 'No answer found.')
            sources = result.get('sources', '')
            return answer, sources
        except Exception as e:
            return f"Error: {e}", ""

In [11]:
# Main workflow
def initialize_chatbot(config):
    """Build every chatbot component and wire them into a ``QASystem``.

    Stages run in this order: load documents from ``config.urls``; split them
    with ``config.chunk_size`` / ``config.chunk_overlap``; create the embedding
    model named by ``config.embedding_model``; build the vector store; create
    the LLM named by ``config.llm_model``; construct the ``QASystem``.

    Raises:
        RuntimeError: If loading or splitting produced nothing.
        Exception: Any stage failure is printed and then re-raised.
    """

    def _require(value, message):
        # Turn an empty stage result into a hard failure with a clear message.
        if not value:
            raise RuntimeError(message)
        return value

    try:
        print("Initializing chatbot system...")

        # Steps 1-2: fetch the pages, then cut them into chunks.
        loaded_docs = _require(
            DocumentProcessor.load_documents(config.urls),
            "No documents loaded. Check your URLs or network connection.",
        )
        doc_chunks = _require(
            DocumentProcessor.split_documents(
                loaded_docs, config.chunk_size, config.chunk_overlap
            ),
            "Failed to split documents into chunks.",
        )

        # Steps 3-4: embed the chunks into a searchable index.
        embedder = EmbeddingManager.get_embeddings(config.embedding_model)
        index = VectorStoreManager.create_vectorstore(doc_chunks, embedder)

        # Steps 5-6: create the LLM and bind it to the index.
        chat_model = LLMManager.create_llm(config.llm_model)
        system = QASystem(chat_model, index)

        print("Chatbot system initialized successfully!")
        return system

    except Exception as init_error:
        print(f"Failed to initialize chatbot: {init_error}")
        raise

# Build the chatbot; on any failure show a short checklist and re-raise.
try:
    qa_system = initialize_chatbot(config)
except Exception:
    # One print call with a newline separator yields the same five lines.
    print(
        "\nTroubleshooting tips:",
        "1. Check your internet connection",
        "2. Verify your OpenAI API key is valid",
        "3. Ensure all packages are installed correctly",
        "4. Try with different URLs if some are not accessible",
        sep="\n",
    )
    raise

Initializing chatbot system...
Loading documents from 3 URLs...
Successfully loaded 3 documents
Split documents into 37 chunks
OpenAI embeddings initialized
FAISS vector store created successfully
LLM initialized (gpt-3.5-turbo)
QA chain initialized successfully
Chatbot system initialized successfully!


In [13]:
def interactive_qa(qa_system, source_urls=None):
    """Run an interactive question/answer loop on standard input.

    Recognised commands (case-insensitive): ``exit`` ends the session,
    ``help`` prints usage tips, ``sources`` lists the configured URLs. Any
    other non-empty input is sent to ``qa_system.ask_question`` and the answer
    is printed wrapped to 80 columns, followed by the sources when present.

    Args:
        qa_system: Object exposing ``ask_question(question)`` that returns an
            ``(answer, sources)`` pair.
        source_urls: URLs to list for the ``sources`` command. When omitted,
            the ``urls`` attribute of a module-level ``config`` object is used
            if one exists, which keeps existing one-argument calls working.

    The loop ends on ``exit``, on Ctrl-C, or when the input stream is closed.
    """
    print("\n" + "="*50)
    print("CHATBOT READY!")
    print("="*50)
    print("\nHow to use:")
    print("  - Type your question and press Enter")
    print("  - Type 'exit' to quit")
    print("  - Type 'help' for usage tips")
    print("  - Type 'sources' to see available sources")
    print("\n" + "-"*50)

    while True:
        try:
            query = input("\nYour question: ").strip()

            if not query:
                continue

            command = query.lower()

            if command == 'exit':
                print("Goodbye!")
                break

            if command == 'help':
                print("\n Help:")
                print("  - Ask questions about the content from the configured URLs")
                print("  - The chatbot will search through the loaded documents")
                print("  - Answers are based on the actual content from the websites")
                continue

            if command == 'sources':
                if source_urls is not None:
                    urls = source_urls
                else:
                    # Fallback for callers that rely on the notebook-level
                    # `config`; tolerate its absence instead of raising.
                    urls = getattr(globals().get('config'), 'urls', [])
                print("\n Available sources:")
                if not urls:
                    print("  (none configured)")
                for i, url in enumerate(urls, 1):
                    print(f"  {i}. {url}")
                continue

            # Get answer
            print("\n Thinking...")
            answer, sources = qa_system.ask_question(query)

            # Display answer
            print("\n Answer:")
            wrapped_answer = textwrap.fill(answer, width=80)
            print(wrapped_answer)

            # Display sources if available
            if sources:
                print("\n Sources:")
                print(sources)

        except KeyboardInterrupt:
            print("\n\n Interrupted by user. Goodbye!")
            break
        except EOFError:
            # input() raises EOFError on every call once stdin is closed; if it
            # fell through to the generic handler the loop would never end.
            print("\n\n Input stream closed. Goodbye!")
            break
        except Exception as e:
            print(f"\n Error: {e}")
            print("Please try again or type 'exit' to quit.")

# Launch the interactive loop only when an earlier cell created `qa_system`.
try:
    qa_system
except NameError:
    print("Chatbot not initialized. Please run the previous cells first.")
else:
    interactive_qa(qa_system)


CHATBOT READY!

How to use:
  - Type your question and press Enter
  - Type 'exit' to quit
  - Type 'help' for usage tips
  - Type 'sources' to see available sources

--------------------------------------------------

 Thinking...

 Answer:
Amazon SageMaker is a fully managed service that provides developers and data
scientists with the ability to build, train, and deploy machine learning models
quickly.

 Sources:
https://blog.gopenai.com/paper-review-llama-2-open-foundation-and-fine-tuned-chat-models-23e539522acb

 Thinking...

 Answer:
LLM stands for Large Language Models.

 Sources:
https://blog.gopenai.com/paper-review-llama-2-open-foundation-and-fine-tuned-chat-models-23e539522acb
Goodbye!
