<a href="https://colab.research.google.com/github/bayas1820/python/blob/main/zluri_assesment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import re
from sentence_transformers import SentenceTransformer
import numpy as np
import openai
import time
from tqdm import tqdm

class DocumentationAgent:
    """Crawl a documentation site, embed its pages, and answer questions.

    Intended call order: ``set_base_url`` -> ``crawl`` ->
    ``generate_embeddings`` -> ``ask_question``.
    """

    # File extensions (matched against the URL *path*) that are never crawled.
    SKIPPED_EXTENSIONS = ('.pdf', '.jpg', '.png')

    # Class-name pattern used as a last resort to locate the main content <div>.
    _CONTENT_CLASS_RE = re.compile('content|main|article')

    def __init__(self, api_key: "str | None" = None):
        """Load the embedding model and create the OpenAI client.

        Args:
            api_key: OpenAI API key. When omitted, the ``OPENAI_API_KEY``
                environment variable is used; if that is unset too, the
                placeholder string is passed so the key can still be filled
                in later by editing the call.
        """
        import os  # local import: only needed to resolve the API key

        self.domain = None
        self.base_url = None
        self.visited_urls = set()
        self.document_chunks = []
        self.embeddings = None

        # Initialize models
        print("Loading embedding model...")
        self.embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

        # Initialize OpenAI client
        print("Initializing OpenAI client...")
        self.llm_client = openai.OpenAI(
            api_key=api_key or os.environ.get("OPENAI_API_KEY") or "your-api-key-here"
        )

        # Configuration
        self.max_pages = 30
        self.chunk_size = 1000  # not referenced by any method of this class
        self.max_depth = 2

    @staticmethod
    def _strip_fragment(url: str) -> str:
        """Return *url* without its ``#fragment`` part."""
        return url.split('#', 1)[0]

    def set_base_url(self, url: str):
        """Set and validate the base URL.

        A missing scheme is filled in with ``https://``.

        Args:
            url: Site address, with or without a scheme.

        Raises:
            ValueError: If the scheme is not http/https or no host is present.
        """
        candidate = url.strip()
        parsed = urlparse(candidate)
        if not parsed.scheme or not parsed.netloc:
            candidate = 'https://' + candidate  # Add scheme if missing
            parsed = urlparse(candidate)

        # Exact match: a prefix test would also accept schemes like "httpx".
        if parsed.scheme not in ('http', 'https'):
            raise ValueError("URL must start with http:// or https://")
        # An empty host would make every relative link look "same-domain".
        if not parsed.netloc:
            raise ValueError("URL must include a host name")

        self.base_url = candidate
        self.domain = parsed.netloc
        print(f"Base URL set to: {self.base_url}")

    def is_valid_url(self, url: str) -> bool:
        """Check if URL should be crawled.

        A URL qualifies when it is http(s), its host equals ``self.domain``
        and its path does not end with one of ``SKIPPED_EXTENSIONS``.
        """
        parsed = urlparse(url)
        if parsed.scheme not in ('http', 'https') or parsed.netloc != self.domain:
            return False
        # Look at the path only, so ".pdf" inside a slug or query string
        # does not disqualify an ordinary HTML page.
        return not parsed.path.lower().endswith(self.SKIPPED_EXTENSIONS)

    def clean_text(self, text: str) -> str:
        """Collapse every run of whitespace to one space and trim the ends."""
        return re.sub(r'\s+', ' ', text).strip()

    def extract_main_content(self, soup: "BeautifulSoup") -> str:
        """Extract main content from pages.

        Note: this mutates *soup* - navigation, scripts, forms and similar
        elements are removed from the tree before the text is read.

        Returns:
            Cleaned text of the first ``<article>``, ``<main>`` or
            content-like ``<div>``; the whole remaining page text otherwise.
        """
        # Remove navigation and other non-content elements
        for element in soup(['nav', 'footer', 'header', 'script', 'style',
                             'aside', 'form', 'iframe']):
            element.decompose()

        # Try to find the main content
        article = (soup.find('article') or
                   soup.find('main') or
                   soup.find('div', class_=self._CONTENT_CLASS_RE))

        if article:
            return self.clean_text(article.get_text())
        return self.clean_text(soup.get_text())

    def crawl(self, url: str, depth: int = 0):
        """Recursively crawl documentation pages.

        Each fetched page with non-empty content is appended to
        ``self.document_chunks`` as a dict with ``text``, ``url`` and
        ``title`` keys. Fetch/parse errors are printed and the page skipped.

        Args:
            url: Page to fetch; any ``#fragment`` is ignored.
            depth: Current recursion depth; links are followed only while
                ``depth < self.max_depth``.
        """
        # "page#a" and "page#b" are the same document - visit it once.
        url = self._strip_fragment(url)

        if (len(self.visited_urls) >= self.max_pages or
                url in self.visited_urls or
                not self.is_valid_url(url)):
            return

        self.visited_urls.add(url)

        try:
            # The session is closed before recursing so nested crawls do not
            # keep a stack of open sessions alive.
            with requests.Session() as session:
                headers = {'User-Agent': 'DocumentationBot/1.0'}
                response = session.get(url, headers=headers, timeout=15)
                response.raise_for_status()

            soup = BeautifulSoup(response.text, 'html.parser')

            # Gather links and title BEFORE extract_main_content(), which
            # strips <nav>/<header>/<footer>/<aside> from the tree - the very
            # places where documentation sites keep most of their links.
            hrefs = [anchor['href'] for anchor in soup.find_all('a', href=True)]
            raw_title = soup.title.string if soup.title else None

            content = self.extract_main_content(soup)
        except Exception as e:
            print(f"Error crawling {url}: {str(e)}")
            return

        if content:
            self.document_chunks.append({
                'text': content[:5000],
                'url': url,
                # .string is None for an empty or nested <title>; use the URL.
                'title': (raw_title or '').strip() or url
            })

        # Follow links if we haven't reached max depth
        if depth >= self.max_depth:
            return

        for href in tqdm(hrefs, desc=f"Crawling depth {depth}"):
            if len(self.visited_urls) >= self.max_pages:
                break
            try:
                next_url = self._strip_fragment(urljoin(url, href))
                should_visit = (self.is_valid_url(next_url) and
                                next_url not in self.visited_urls)
            except ValueError:
                # Malformed href (urllib could not parse it): skip this link
                # instead of abandoning the rest of the page.
                continue
            if should_visit:
                self.crawl(next_url, depth + 1)

    def generate_embeddings(self):
        """Generate embeddings for documents.

        Encodes the ``text`` of every crawled chunk and stores the result in
        ``self.embeddings``; prints a notice and does nothing if no pages
        were crawled.
        """
        if not self.document_chunks:
            print("No documents to embed. Please crawl first.")
            return

        print("Generating embeddings...")
        texts = [chunk['text'] for chunk in self.document_chunks]
        self.embeddings = self.embedding_model.encode(texts, show_progress_bar=True)
        print(f"Generated embeddings for {len(self.document_chunks)} documents")

    def ask_question(self, question: str, top_k: int = 3):
        """Answer a question with source references.

        Args:
            question: Natural-language question.
            top_k: Number of best-matching pages passed to the LLM as context.

        Returns:
            The answer followed by a "Sources:" list, or a message string if
            nothing has been indexed yet or the lookup/LLM call failed.
        """
        # `is None` rather than truthiness: once embeddings exist they are an
        # array, and `not array` raises ValueError for more than one element.
        if self.embeddings is None or not self.document_chunks:
            return "Please crawl the site and generate embeddings first."

        try:
            # Get question embedding
            question_embedding = self.embedding_model.encode(question)

            # Find most relevant chunks
            similarities = np.dot(self.embeddings, question_embedding)
            # Clamp so top_k <= 0 or top_k > number of pages still behaves.
            k = max(1, min(top_k, len(self.document_chunks)))
            top_indices = np.argsort(similarities)[-k:][::-1]
            relevant_chunks = [self.document_chunks[i] for i in top_indices]

            # Prepare context for LLM
            context = "\n\n".join([
                f"From {chunk['title']} ({chunk['url']}):\n{chunk['text'][:1500]}..."
                for chunk in relevant_chunks
            ])

            # Generate answer using LLM
            response = self.llm_client.chat.completions.create(
                model="gpt-3.5-turbo",
                messages=[
                    {"role": "system", "content": "You are a helpful documentation assistant. Answer questions based on the provided context. Be concise and accurate."},
                    {"role": "user", "content": f"Question: {question}\n\nContext:\n{context}"}
                ],
                temperature=0.3
            )

            answer = response.choices[0].message.content
            # dict.fromkeys de-duplicates while keeping best-match-first order.
            sources = list(dict.fromkeys(chunk['url'] for chunk in relevant_chunks))

            return f"{answer}\n\nSources:\n" + "\n".join(f"- {src}" for src in sources)

        except Exception as e:
            return f"Error generating answer: {str(e)}"

def main():
    """Run the documentation assistant interactively in the terminal.

    Asks for a site URL until one is accepted, crawls it, builds the
    embeddings, then answers questions until 'exit' or 'quit' is entered.
    """
    agent = DocumentationAgent()

    # Re-prompt until set_base_url() accepts what was typed.
    url_accepted = False
    while not url_accepted:
        url_input = input("Enter the help website URL (e.g., help.zluri.com or https://help.example.com): ").strip()
        try:
            agent.set_base_url(url_input)
        except ValueError as e:
            print(f"Invalid URL: {e}. Please try again.")
        else:
            url_accepted = True

    print("\nCrawling documentation...")
    agent.crawl(agent.base_url)

    print("\nProcessing documents...")
    agent.generate_embeddings()

    # Question loop: blank lines are ignored, 'exit'/'quit' ends the session.
    print("\nEnter your questions (type 'exit' to quit):")
    question = input("\nQuestion: ").strip()
    while question.lower() not in ('exit', 'quit'):
        if question:
            answer = agent.ask_question(question)
            print("\nAnswer:")
            print(answer)
        question = input("\nQuestion: ").strip()


# Entry point when the cell/script is executed directly.
if __name__ == "__main__":
    main()

In [None]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import re
from sentence_transformers import SentenceTransformer
import numpy as np
import openai
import time
from tqdm import tqdm

class DocumentationAgent:
    """Crawl a documentation site, embed its pages, and answer questions.

    Intended call order: ``set_base_url`` -> ``crawl`` ->
    ``generate_embeddings`` -> ``ask_question``.
    """

    # File extensions (matched against the URL *path*) that are never crawled.
    SKIPPED_EXTENSIONS = ('.pdf', '.jpg', '.png')

    # Class-name pattern used as a last resort to locate the main content <div>.
    _CONTENT_CLASS_RE = re.compile('content|main|article')

    def __init__(self, api_key: "str | None" = None):
        """Load the embedding model and create the OpenAI client.

        Args:
            api_key: OpenAI API key. When omitted, the ``OPENAI_API_KEY``
                environment variable is used; if that is unset too, the
                placeholder string is passed so the key can still be filled
                in later by editing the call.
        """
        import os  # local import: only needed to resolve the API key

        self.domain = None
        self.base_url = None
        self.visited_urls = set()
        self.document_chunks = []
        self.embeddings = None

        # Initialize models
        print("Loading embedding model...")
        self.embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

        # Initialize OpenAI client
        print("Initializing OpenAI client...")
        self.llm_client = openai.OpenAI(
            api_key=api_key or os.environ.get("OPENAI_API_KEY") or "your-api-key-here"
        )

        # Configuration
        self.max_pages = 30
        self.chunk_size = 1000  # not referenced by any method of this class
        self.max_depth = 2

    @staticmethod
    def _strip_fragment(url: str) -> str:
        """Return *url* without its ``#fragment`` part."""
        return url.split('#', 1)[0]

    def set_base_url(self, url: str):
        """Set and validate the base URL.

        A missing scheme is filled in with ``https://``.

        Args:
            url: Site address, with or without a scheme.

        Raises:
            ValueError: If the scheme is not http/https or no host is present.
        """
        candidate = url.strip()
        parsed = urlparse(candidate)
        if not parsed.scheme or not parsed.netloc:
            candidate = 'https://' + candidate  # Add scheme if missing
            parsed = urlparse(candidate)

        # Exact match: a prefix test would also accept schemes like "httpx".
        if parsed.scheme not in ('http', 'https'):
            raise ValueError("URL must start with http:// or https://")
        # An empty host would make every relative link look "same-domain".
        if not parsed.netloc:
            raise ValueError("URL must include a host name")

        self.base_url = candidate
        self.domain = parsed.netloc
        print(f"Base URL set to: {self.base_url}")

    def is_valid_url(self, url: str) -> bool:
        """Check if URL should be crawled.

        A URL qualifies when it is http(s), its host equals ``self.domain``
        and its path does not end with one of ``SKIPPED_EXTENSIONS``.
        """
        parsed = urlparse(url)
        if parsed.scheme not in ('http', 'https') or parsed.netloc != self.domain:
            return False
        # Look at the path only, so ".pdf" inside a slug or query string
        # does not disqualify an ordinary HTML page.
        return not parsed.path.lower().endswith(self.SKIPPED_EXTENSIONS)

    def clean_text(self, text: str) -> str:
        """Collapse every run of whitespace to one space and trim the ends."""
        return re.sub(r'\s+', ' ', text).strip()

    def extract_main_content(self, soup: "BeautifulSoup") -> str:
        """Extract main content from pages.

        Note: this mutates *soup* - navigation, scripts, forms and similar
        elements are removed from the tree before the text is read.

        Returns:
            Cleaned text of the first ``<article>``, ``<main>`` or
            content-like ``<div>``; the whole remaining page text otherwise.
        """
        # Remove navigation and other non-content elements
        for element in soup(['nav', 'footer', 'header', 'script', 'style',
                             'aside', 'form', 'iframe']):
            element.decompose()

        # Try to find the main content
        article = (soup.find('article') or
                   soup.find('main') or
                   soup.find('div', class_=self._CONTENT_CLASS_RE))

        if article:
            return self.clean_text(article.get_text())
        return self.clean_text(soup.get_text())

    def crawl(self, url: str, depth: int = 0):
        """Recursively crawl documentation pages.

        Each fetched page with non-empty content is appended to
        ``self.document_chunks`` as a dict with ``text``, ``url`` and
        ``title`` keys. Fetch/parse errors are printed and the page skipped.

        Args:
            url: Page to fetch; any ``#fragment`` is ignored.
            depth: Current recursion depth; links are followed only while
                ``depth < self.max_depth``.
        """
        # "page#a" and "page#b" are the same document - visit it once.
        url = self._strip_fragment(url)

        if (len(self.visited_urls) >= self.max_pages or
                url in self.visited_urls or
                not self.is_valid_url(url)):
            return

        self.visited_urls.add(url)

        try:
            # The session is closed before recursing so nested crawls do not
            # keep a stack of open sessions alive.
            with requests.Session() as session:
                headers = {'User-Agent': 'DocumentationBot/1.0'}
                response = session.get(url, headers=headers, timeout=15)
                response.raise_for_status()

            soup = BeautifulSoup(response.text, 'html.parser')

            # Gather links and title BEFORE extract_main_content(), which
            # strips <nav>/<header>/<footer>/<aside> from the tree - the very
            # places where documentation sites keep most of their links.
            hrefs = [anchor['href'] for anchor in soup.find_all('a', href=True)]
            raw_title = soup.title.string if soup.title else None

            content = self.extract_main_content(soup)
        except Exception as e:
            print(f"Error crawling {url}: {str(e)}")
            return

        if content:
            self.document_chunks.append({
                'text': content[:5000],
                'url': url,
                # .string is None for an empty or nested <title>; use the URL.
                'title': (raw_title or '').strip() or url
            })

        # Follow links if we haven't reached max depth
        if depth >= self.max_depth:
            return

        for href in tqdm(hrefs, desc=f"Crawling depth {depth}"):
            if len(self.visited_urls) >= self.max_pages:
                break
            try:
                next_url = self._strip_fragment(urljoin(url, href))
                should_visit = (self.is_valid_url(next_url) and
                                next_url not in self.visited_urls)
            except ValueError:
                # Malformed href (urllib could not parse it): skip this link
                # instead of abandoning the rest of the page.
                continue
            if should_visit:
                self.crawl(next_url, depth + 1)

    def generate_embeddings(self):
        """Generate embeddings for documents.

        Encodes the ``text`` of every crawled chunk and stores the result in
        ``self.embeddings``; prints a notice and does nothing if no pages
        were crawled.
        """
        if not self.document_chunks:
            print("No documents to embed. Please crawl first.")
            return

        print("Generating embeddings...")
        texts = [chunk['text'] for chunk in self.document_chunks]
        self.embeddings = self.embedding_model.encode(texts, show_progress_bar=True)
        print(f"Generated embeddings for {len(self.document_chunks)} documents")

    def ask_question(self, question: str, top_k: int = 3):
        """Answer a question with source references.

        Args:
            question: Natural-language question.
            top_k: Number of best-matching pages passed to the LLM as context.

        Returns:
            The answer followed by a "Sources:" list, or a message string if
            nothing has been indexed yet or the lookup/LLM call failed.
        """
        # `is None` rather than truthiness: once embeddings exist they are an
        # array, and `not array` raises ValueError for more than one element.
        if self.embeddings is None or not self.document_chunks:
            return "Please crawl the site and generate embeddings first."

        try:
            # Get question embedding
            question_embedding = self.embedding_model.encode(question)

            # Find most relevant chunks
            similarities = np.dot(self.embeddings, question_embedding)
            # Clamp so top_k <= 0 or top_k > number of pages still behaves.
            k = max(1, min(top_k, len(self.document_chunks)))
            top_indices = np.argsort(similarities)[-k:][::-1]
            relevant_chunks = [self.document_chunks[i] for i in top_indices]

            # Prepare context for LLM
            context = "\n\n".join([
                f"From {chunk['title']} ({chunk['url']}):\n{chunk['text'][:1500]}..."
                for chunk in relevant_chunks
            ])

            # Generate answer using LLM
            response = self.llm_client.chat.completions.create(
                model="gpt-3.5-turbo",
                messages=[
                    {"role": "system", "content": "You are a helpful documentation assistant. Answer questions based on the provided context. Be concise and accurate."},
                    {"role": "user", "content": f"Question: {question}\n\nContext:\n{context}"}
                ],
                temperature=0.3
            )

            answer = response.choices[0].message.content
            # dict.fromkeys de-duplicates while keeping best-match-first order.
            sources = list(dict.fromkeys(chunk['url'] for chunk in relevant_chunks))

            return f"{answer}\n\nSources:\n" + "\n".join(f"- {src}" for src in sources)

        except Exception as e:
            return f"Error generating answer: {str(e)}"

def main():
    """Run the documentation assistant interactively in the terminal.

    Asks for a site URL until one is accepted, crawls it, builds the
    embeddings, then answers questions until 'exit' or 'quit' is entered.
    """
    agent = DocumentationAgent()

    # Re-prompt until set_base_url() accepts what was typed.
    url_accepted = False
    while not url_accepted:
        url_input = input("Enter the help website URL (e.g., help.zluri.com or https://help.example.com): ").strip()
        try:
            agent.set_base_url(url_input)
        except ValueError as e:
            print(f"Invalid URL: {e}. Please try again.")
        else:
            url_accepted = True

    print("\nCrawling documentation...")
    agent.crawl(agent.base_url)

    print("\nProcessing documents...")
    agent.generate_embeddings()

    # Question loop: blank lines are ignored, 'exit'/'quit' ends the session.
    print("\nEnter your questions (type 'exit' to quit):")
    question = input("\nQuestion: ").strip()
    while question.lower() not in ('exit', 'quit'):
        if question:
            answer = agent.ask_question(question)
            print("\nAnswer:")
            print(answer)
        question = input("\nQuestion: ").strip()


# Entry point when the cell/script is executed directly.
if __name__ == "__main__":
    main()

In [None]:
#!pip install beautifulsoup4 requests sentence-transformers torch
import argparse
import requests
from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer, util
import torch
import numpy as np

# URL Content Scraper
def scrape_website(url, timeout=15):
    """Download *url* and return the text of its paragraphs, headings and list items.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on a stalled host.

    Returns:
        The text of every non-empty ``<p>``, ``<h1>``-``<h3>`` and ``<li>``
        element joined by single spaces (``""`` if there are none), or
        ``None`` when the request fails.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching URL: {e}")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')
    fragments = (tag.get_text(strip=True)
                 for tag in soup.find_all(['p', 'h1', 'h2', 'h3', 'li']))
    # Skip empty elements so they do not leave doubled spaces in the output.
    return " ".join(text for text in fragments if text)

# AI-Powered Question Answering
class QnAAgent:
    """Answer questions by returning the documentation sentence closest to them."""

    NOT_FOUND_MESSAGE = "Sorry, I couldn't find an answer to that question in the provided documentation."

    def __init__(self, documentation_text):
        """Split the documentation into sentences and embed each one.

        Args:
            documentation_text: Plain text of the documentation.
        """
        self.documentation = documentation_text
        self.model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
        # Split on ". " and drop blank fragments so they are never embedded
        # or returned as an "answer".
        self.doc_sentences = [
            sentence.strip()
            for sentence in documentation_text.split('. ')
            if sentence.strip()
        ]
        # Nothing to embed for empty documentation; answer_question() checks
        # doc_sentences before touching this.
        self.doc_embeddings = (
            self.model.encode(self.doc_sentences, convert_to_tensor=True)
            if self.doc_sentences else None
        )

    def answer_question(self, question, threshold=0.3):
        """Return the sentence most similar to *question*.

        Args:
            question: The user's question.
            threshold: Minimum cosine similarity the best sentence must
                exceed to be returned.

        Returns:
            The best-matching sentence, or a "not found" message when no
            sentence exceeds *threshold* or there is no documentation.
        """
        if not self.doc_sentences:
            return self.NOT_FOUND_MESSAGE

        question_embedding = self.model.encode(question, convert_to_tensor=True)
        similarity = util.pytorch_cos_sim(question_embedding, self.doc_embeddings)

        # torch.argmax works on whatever device the tensor lives on;
        # np.argmax needs a NumPy conversion, which fails for GPU tensors.
        scores = similarity[0]
        most_similar_idx = int(torch.argmax(scores).item())

        if scores[most_similar_idx].item() > threshold:
            return self.doc_sentences[most_similar_idx]
        return self.NOT_FOUND_MESSAGE

# Main Functionality
def main():
    """Scrape the documentation page and answer questions about it in a loop.

    The ``--url`` value is supplied in code (simulating a command line), the
    page is scraped, and questions are read from stdin until 'exit' or
    'quit' is entered.
    """
    parser = argparse.ArgumentParser(description="AI Q&A Agent")
    parser.add_argument('--url', type=str, required=True, help="URL of the help documentation")
    # Simulate passing command-line arguments:
    simulated_argv = ['--url', 'https://www.w3schools.com/sql/sql_intro.asp']
    args = parser.parse_args(simulated_argv)

    content = scrape_website(args.url)
    if not content:
        print("Failed to extract content from the URL.")
        return

    agent = QnAAgent(content)
    print("Documentation processed. You can now ask questions.")

    # Read-answer loop; the exit words are matched case-insensitively.
    question = input("> ")
    while question.lower() not in ('exit', 'quit'):
        print(agent.answer_question(question))
        question = input("> ")
    print("Exiting Q&A Agent. Goodbye!")


# Entry point when the cell/script is executed directly.
if __name__ == '__main__':
    main()

Documentation processed. You can now ask questions.
> what is rdbms
RDBMS is the basis for SQL, and for all modern database systems such as MS SQL Server, IBM DB2, Oracle, MySQL, and Microsoft Access
