In [1]:
!pip install googlesearch-python openai requests beautifulsoup4 pymupdf pdf2image pytesseract --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m328.5/328.5 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.5/3.5 MB[0m [31m20.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.6/75.6 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.7/15.7 MB[0m [31m24.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.9/77.9 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.3/58.3 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25h

## Módulo de Búsqueda en Internet

In [9]:
from googlesearch import search
import requests
from bs4 import BeautifulSoup

def buscar_en_internet(query, num_results=5, lang="es"):
    urls = search(query, num_results=num_results, lang=lang, advanced=True)
    return urls

def analizar_pagina(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        contenido_principal = soup.find('main') or soup.find('article') or soup
        parrafos = contenido_principal.find_all('p')
        texto_largo = " ".join([p.get_text().strip() for p in parrafos if len(p.get_text().strip()) > 50])
        return texto_largo
    except requests.RequestException as e:
        print(f'Error al acceder a {url}: {e}')
        return ""

## Módulo de Extracción de Texto de PDFs

In [10]:
import pymupdf
import pytesseract
from pdf2image import convert_from_path
import numpy as np
import cv2

pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

def is_digitalized(pdf_path):
    pdf_document = pymupdf.open(pdf_path)
    first_page = pdf_document.load_page(0)
    text = first_page.get_text()
    return bool(text.strip())

def extract_text_from_pdf(pdf_path):
    pdf_document = pymupdf.open(pdf_path)
    num_pages = pdf_document.page_count
    text_by_page = [pdf_document.load_page(page_num).get_text() for page_num in range(num_pages)]
    return text_by_page

def extract_text_from_scanned_pdf(pdf_path):
    images = convert_from_path(pdf_path)
    text_by_page = []
    for image in images:
        image_np = np.array(image)
        gray = cv2.cvtColor(image_np, cv2.COLOR_BGR2GRAY)
        text = pytesseract.image_to_string(gray)
        text_by_page.append(text)
    return text_by_page

def extract_text_from_pdf_file(pdf_path):
    if is_digitalized(pdf_path):
        return extract_text_from_pdf(pdf_path)
    else:
        return extract_text_from_scanned_pdf(pdf_path)

## Módulo de Interacción con el Usuario

In [11]:
class ConversationMemory:
    def __init__(self):
        self.history = []

    def add_message(self, role, content):
        self.history.append({"role": role, "content": content})

    def get_context(self):
        return self.history

memory = ConversationMemory()

def handle_conversation(user_input, model, api_key):
    memory.add_message("user", user_input)
    context = memory.get_context()
    client = OpenAI(api_key=api_key)

    try:
        completion = client.Completion.create(
            model=model,
            messages=context,
            temperature=0.5,
            top_p=1,
            max_tokens=1024,
            stream=True
        )

        ai_response = ""
        for chunk in completion:
            if chunk.choices[0].delta.content:
                ai_response += chunk.choices[0].delta.content
                print(chunk.choices[0].delta.content, end="")

        memory.add_message("assistant", ai_response)
        return ai_response
    except Exception as e:
        print(f"Error al procesar la solicitud: {e}")
        return None

# Version 1 del chatbot

## Integración con el Modelo Llama3

In [19]:
import requests
from bs4 import BeautifulSoup
from openai import OpenAI
import fitz  # PyMuPDF
import pytesseract
from pdf2image import convert_from_path
import numpy as np
import cv2
import os

client = OpenAI(
    base_url="https://integrate.api.nvidia.com/v1",
    api_key="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"
)

class ConversationMemory:
    def __init__(self):
        self.history = []

    def add_message(self, role, content):
        self.history.append({"role": role, "content": content})

    def get_context(self):
        return self.history

memory = ConversationMemory()

def handle_conversation(user_input):
    memory.add_message("user", user_input)
    context = memory.get_context()

    try:
        completion = client.chat.completions.create(
            model="meta/llama3-70b-instruct",
            messages=context,
            temperature=0.5,
            top_p=1,
            max_tokens=1024,
            stream=True
        )

        ai_response = ""
        for chunk in completion:
            if chunk.choices[0].delta.content:
                ai_response += chunk.choices[0].delta.content

        memory.add_message("assistant", ai_response)
        return ai_response
    except Exception as e:
        print(f"Error al procesar la solicitud: {e}")
        return None

def buscar_en_internet(query, num_results=5, lang="es"):
    try:
        response = requests.get(f"https://www.google.com/search?q={query}&num={num_results}&hl={lang}")
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        links = [a['href'] for a in soup.find_all('a', href=True) if 'url?q=' in a['href']]
        return [link.split('url?q=')[1].split('&sa=U')[0] for link in links]
    except Exception as e:
        print(f"Error al buscar en internet: {e}")
        return []

def analizar_pagina(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        contenido_principal = soup.find('main') or soup.find('article') or soup
        parrafos = contenido_principal.find_all('p')
        texto_largo = " ".join([p.get_text().strip() for p in parrafos if len(p.get_text().strip()) > 50])
        return texto_largo
    except requests.RequestException as e:
        print(f'Error al acceder a {url}: {e}')
        return ""

def descargar_pdf(url, nombre_archivo):
    try:
        response = requests.get(url)
        response.raise_for_status()
        with open(nombre_archivo, 'wb') as file:
            file.write(response.content)
        return nombre_archivo
    except requests.RequestException as e:
        print(f"Error al descargar el PDF: {e}")
        return None

pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract'  # Ajusta esto si es necesario

def is_digitalized(pdf_path):
    pdf_document = fitz.open(pdf_path)
    first_page = pdf_document.load_page(0)
    text = first_page.get_text()
    return bool(text.strip())

def extract_text_from_pdf(pdf_path):
    pdf_document = fitz.open(pdf_path)
    num_pages = pdf_document.page_count
    text_by_page = [pdf_document.load_page(page_num).get_text() for page_num in range(num_pages)]
    return text_by_page

def extract_text_from_scanned_pdf(pdf_path):
    images = convert_from_path(pdf_path)
    text_by_page = []
    for image in images:
        image_np = np.array(image)
        gray = cv2.cvtColor(image_np, cv2.COLOR_BGR2GRAY)
        text = pytesseract.image_to_string(gray)
        text_by_page.append(text)
    return text_by_page

def extract_text_from_pdf_file(pdf_path):
    if is_digitalized(pdf_path):
        return extract_text_from_pdf(pdf_path)
    else:
        return extract_text_from_scanned_pdf(pdf_path)

def process_request(user_input):
    if "buscar" in user_input:
        query = user_input.replace("buscar", "").strip()
        urls = buscar_en_internet(query)
        for url in urls:
            contenido = analizar_pagina(url)
            if contenido:
                response = handle_conversation(contenido)
                print(f"Assistant: {response}")

    elif "descargar pdf" in user_input:
        url = user_input.replace("descargar pdf", "").strip()
        pdf_path = descargar_pdf(url, "descargado.pdf")
        if pdf_path:
            try:
                text_by_page = extract_text_from_pdf_file(pdf_path)
                for page_text in text_by_page:
                    response = handle_conversation(page_text)
                    print(f"Assistant: {response}")
            except FileNotFoundError as e:
                print(f"Error: {e}")

    else:
        response = handle_conversation(user_input)
        print(f"Assistant: {response}")

# Ejemplo de uso
prompts = [
    "buscar cómo armar un backend",
    "descargar pdf https://url.del.pdf/documento.pdf",  # Asegúrate de proporcionar un URL válido de un PDF
    "Describe the significance of the Battle of Hastings."
]

for prompt in prompts:
    process_request(prompt)

# Asegúrate de eliminar el archivo descargado después de usarlo
if os.path.exists("descargado.pdf"):
    os.remove("descargado.pdf")

Error al acceder a https://maps.google.com/maps%3Fq%3Dc%25C3%25B3mo%2Barmar%2Bun%2Bbackend%26num%3D5%26hl%3Des%26um%3D1%26ie%3DUTF-8%26ved%3D1t:200713%26ictx%3D111&opi=89978449: 404 Client Error: Not Found for url: https://maps.google.com/maps%3Fq%3Dc%25C3%25B3mo%2Barmar%2Bun%2Bbackend%26num%3D5%26hl%3Des%26um%3D1%26ie%3DUTF-8%26ved%3D1t:200713%26ictx%3D111&opi=89978449
Error al acceder a /search%3Fq%3Dc%25C3%25B3mo%2Barmar%2Bun%2Bbackend%26num%3D5%26sca_esv%3D458fc5d25ecd7a59%26sca_upv%3D1%26hl%3Des%26ie%3DUTF-8%26tbm%3Dshop%26source%3Dlnms%26ved%3D1t:200713%26ictx%3D111&opi=89978449: No connection adapters were found for '/search%3Fq%3Dc%25C3%25B3mo%2Barmar%2Bun%2Bbackend%26num%3D5%26sca_esv%3D458fc5d25ecd7a59%26sca_upv%3D1%26hl%3Des%26ie%3DUTF-8%26tbm%3Dshop%26source%3Dlnms%26ved%3D1t:200713%26ictx%3D111&opi=89978449'
Error al acceder a https://www.youtube.com/watch%3Fv%3Ds_0ad5JYkMk: 429 Client Error: Too Many Requests for url: https://www.youtube.com/watch%3Fv%3Ds_0ad5JYkMk
Error 

# Version Final

In [17]:
import requests
from bs4 import BeautifulSoup
from openai import OpenAI
import fitz  # PyMuPDF
import pytesseract
from pdf2image import convert_from_path
import numpy as np
import cv2
import os

client = OpenAI(
    base_url="https://integrate.api.nvidia.com/v1",
    api_key="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"
)

class ConversationMemory:
    def __init__(self, max_tokens=8192):
        self.history = []
        self.max_tokens = max_tokens

    def add_message(self, role, content):
        self.history.append({"role": role, "content": content})
        self.trim_history()

    def get_context(self):
        return self.history

    def trim_history(self):
        total_tokens = sum(len(message["content"]) for message in self.history)
        while total_tokens > self.max_tokens:
            removed_message = self.history.pop(0)
            total_tokens -= len(removed_message["content"])

memory = ConversationMemory(max_tokens=6000)

def handle_conversation(user_input):
    memory.add_message("user", user_input)
    context = memory.get_context()

    try:
        completion = client.chat.completions.create(
            model="meta/llama3-70b-instruct",
            messages=context,
            temperature=0.5,
            top_p=1,
            max_tokens=1024,
            stream=True
        )

        ai_response = ""
        for chunk in completion:
            if chunk.choices[0].delta.content:
                ai_response += chunk.choices[0].delta.content

        memory.add_message("assistant", ai_response)
        return ai_response
    except Exception as e:
        print(f"Error al procesar la solicitud: {e}")
        return None

def buscar_en_internet(query, num_results=5, lang="es"):
    try:
        response = requests.get(f"https://www.google.com/search?q={query}&num={num_results}&hl={lang}")
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        links = [a['href'] for a in soup.find_all('a', href=True) if 'url?q=' in a['href']]
        return [link.split('url?q=')[1].split('&sa=U')[0] for link in links]
    except Exception as e:
        print(f"Error al buscar en internet: {e}")
        return []

def analizar_pagina(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        contenido_principal = soup.find('main') or soup.find('article') or soup
        parrafos = contenido_principal.find_all('p')
        texto_largo = " ".join([p.get_text().strip() for p in parrafos if len(p.get_text().strip()) > 50])
        return texto_largo
    except requests.RequestException as e:
        print(f'Error al acceder a {url}: {e}')
        return ""

def descargar_pdf(url, nombre_archivo):
    try:
        response = requests.get(url)
        response.raise_for_status()
        with open(nombre_archivo, 'wb') as file:
            file.write(response.content)
        return nombre_archivo
    except requests.RequestException as e:
        print(f"Error al descargar el PDF: {e}")
        return None

pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract'  # Ajusta esto si es necesario

def is_digitalized(pdf_path):
    pdf_document = fitz.open(pdf_path)
    first_page = pdf_document.load_page(0)
    text = first_page.get_text()
    return bool(text.strip())

def extract_text_from_pdf(pdf_path):
    pdf_document = fitz.open(pdf_path)
    num_pages = pdf_document.page_count
    text_by_page = [pdf_document.load_page(page_num).get_text() for page_num in range(num_pages)]
    return text_by_page

def extract_text_from_scanned_pdf(pdf_path):
    images = convert_from_path(pdf_path)
    text_by_page = []
    for image in images:
        image_np = np.array(image)
        gray = cv2.cvtColor(image_np, cv2.COLOR_BGR2GRAY)
        text = pytesseract.image_to_string(gray)
        text_by_page.append(text)
    return text_by_page

def extract_text_from_pdf_file(pdf_path):
    if is_digitalized(pdf_path):
        return extract_text_from_pdf(pdf_path)
    else:
        return extract_text_from_scanned_pdf(pdf_path)

def process_request(user_input):
    if "buscar" in user_input:
        query = user_input.replace("buscar", "").strip()
        urls = buscar_en_internet(query)
        for url in urls:
            contenido = analizar_pagina(url)
            if contenido:
                response = handle_conversation(contenido)
                print(f"Assistant: {response}")

    elif "descargar pdf" in user_input:
        url = user_input.replace("descargar pdf", "").strip()
        pdf_path = descargar_pdf(url, "descargado.pdf")
        if pdf_path:
            try:
                text_by_page = extract_text_from_pdf_file(pdf_path)
                for page_text in text_by_page:
                    response = handle_conversation(page_text)
                    print(f"Assistant: {response}")
            except FileNotFoundError as e:
                print(f"Error: {e}")

    else:
        response = handle_conversation(user_input)
        print(f"Assistant: {response}")

# Ejemplo de uso
prompts = [
    "buscar cómo armar un backend",
    "descargar pdf https://example.com/documento.pdf",  # Asegúrate de proporcionar un URL válido de un PDF
    "Describe the significance of the Battle of Hastings."
]

for prompt in prompts:
    process_request(prompt)

# Asegúrate de eliminar el archivo descargado después de usarlo
if os.path.exists("descargado.pdf"):
    os.remove("descargado.pdf")

Error al acceder a https://maps.google.com/maps%3Fq%3Dc%25C3%25B3mo%2Barmar%2Bun%2Bbackend%26num%3D5%26hl%3Des%26um%3D1%26ie%3DUTF-8%26ved%3D1t:200713%26ictx%3D111&opi=89978449: 404 Client Error: Not Found for url: https://maps.google.com/maps%3Fq%3Dc%25C3%25B3mo%2Barmar%2Bun%2Bbackend%26num%3D5%26hl%3Des%26um%3D1%26ie%3DUTF-8%26ved%3D1t:200713%26ictx%3D111&opi=89978449
Error al acceder a /search%3Fq%3Dc%25C3%25B3mo%2Barmar%2Bun%2Bbackend%26num%3D5%26sca_esv%3D458fc5d25ecd7a59%26sca_upv%3D1%26hl%3Des%26ie%3DUTF-8%26tbm%3Dshop%26source%3Dlnms%26ved%3D1t:200713%26ictx%3D111&opi=89978449: No connection adapters were found for '/search%3Fq%3Dc%25C3%25B3mo%2Barmar%2Bun%2Bbackend%26num%3D5%26sca_esv%3D458fc5d25ecd7a59%26sca_upv%3D1%26hl%3Des%26ie%3DUTF-8%26tbm%3Dshop%26source%3Dlnms%26ved%3D1t:200713%26ictx%3D111&opi=89978449'
Error al acceder a https://www.youtube.com/watch%3Fv%3Ds_0ad5JYkMk: 429 Client Error: Too Many Requests for url: https://www.youtube.com/watch%3Fv%3Ds_0ad5JYkMk
Error 