<a href="https://colab.research.google.com/github/arielcintra/smart_bot_boy/blob/main/smart_boy_bot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install Flask sentence-transformers pymongo Werkzeug python-docx openpyxl PyPDF2 requests beautifulsoup4 lxml Pillow torch pytesseract

Collecting pymongo
  Downloading pymongo-4.11-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (22 kB)
Collecting python-docx
  Downloading python_docx-1.1.2-py3-none-any.whl.metadata (2.0 kB)
Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Collecting dnspython<3.0.0,>=1.16.0 (from pymongo)
  Downloading dnspython-2.7.0-py3-none-any.whl.metadata (5.8 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1

In [2]:
import os
import json
from flask import Flask, request, render_template, jsonify
from sentence_transformers import SentenceTransformer, util, InputExample, losses
from torch.utils.data import DataLoader
from pymongo import MongoClient
from werkzeug.utils import secure_filename
import docx
import openpyxl
import csv
import PyPDF2
import requests
from bs4 import BeautifulSoup
from PIL import Image
from abc import ABC, abstractmethod
from pytesseract import image_to_string
import torch

In [3]:
app = Flask(__name__)
app.config['UPLOAD_FOLDER'] = "uploads"
os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)

In [18]:
class DocumentRepository:
    def __init__(self):
        self.client = MongoClient(os.getenv("MONGO_URI", "mongodb://localhost:27017/"))
        self.db = self.client['smart-boy_db']
        self.collection = self.db['SmartBoy']

    def insert_document(self, text, embedding):
        self.collection.insert_one({"text": text, "embedding": embedding.tolist()})

    def find_all_documents(self):
        return self.collection.find()

    def get_all_texts(self):
        return [doc['text'] for doc in self.collection.find()]

In [5]:
class NLPModel:
    def __init__(self, model_name="paraphrase-MiniLM-L6-v2"):
        self.model = SentenceTransformer(model_name)

    def encode(self, text):
        return self.model.encode(text, convert_to_tensor=True)

    def fine_tune(self, train_data: list):
        examples = [InputExample(texts=[text], label=1.0) for text in train_data]
        train_dataloader = DataLoader(examples, shuffle=True, batch_size=16)
        train_loss = losses.CosineSimilarityLoss(self.model)
        self.model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=1)

In [6]:
class BaseTextExtractor(ABC):
    @abstractmethod
    def extract_text(self, file):
        pass

In [7]:
class TxtTextExtractor(BaseTextExtractor):
    def extract_text(self, file):
        return file.read().decode("utf-8")

In [8]:
class DocxTextExtractor(BaseTextExtractor):
    def extract_text(self, file):
        doc = docx.Document(file)
        return "\n".join(para.text for para in doc.paragraphs)

In [9]:
class XlsxTextExtractor(BaseTextExtractor):
    def extract_text(self, file):
        workbook = openpyxl.load_workbook(file)
        sheet = workbook.active
        return "\n".join(" ".join(str(cell) for cell in row) for row in sheet.iter_rows(values_only=True))

In [10]:
class CsvTextExtractor(BaseTextExtractor):
    def extract_text(self, file):
        reader = csv.reader(file.read().decode('utf-8').splitlines())
        return "\n".join(" ".join(row) for row in reader)

In [11]:
class PdfTextExtractor(BaseTextExtractor):
    def extract_text(self, file):
        reader = PyPDF2.PdfReader(file)
        return "\n".join(page.extract_text() for page in reader.pages)

In [12]:
class ImageTextExtractor(BaseTextExtractor):
    def extract_text(self, file):
        try:
            from pytesseract import image_to_string
            image = Image.open(file)
            return image_to_string(image)
        except ImportError:
            return "OCR library (pytesseract) not installed. Cannot extract text from images."

In [13]:
class TextExtractorFactory:
    @staticmethod
    def get_extractor(extension):
        extractors = {
            'txt': TxtTextExtractor(),
            'docx': DocxTextExtractor(),
            'xlsx': XlsxTextExtractor(),
            'csv': CsvTextExtractor(),
            'pdf': PdfTextExtractor(),
            'jpg': ImageTextExtractor(),
            'jpeg': ImageTextExtractor(),
            'png': ImageTextExtractor(),
        }
        return extractors.get(extension)

In [19]:
repository = DocumentRepository()
nlp_model = NLPModel()

class DocumentService:
    @staticmethod
    def process_text(text):
        embedding = nlp_model.encode(text)
        repository.insert_document(text, embedding)
        DocumentService.fine_tune_model()
        return "Text processed and stored successfully!"

    @staticmethod
    def process_file(file):
        ext = file.filename.rsplit('.', 1)[-1].lower()
        extractor = TextExtractorFactory.get_extractor(ext)
        if extractor:
            text = extractor.extract_text(file)
            return DocumentService.process_text(text)
        return "Invalid or empty file."

    @staticmethod
    def process_link(url):
        text = DocumentService.extract_text_from_link(url)
        if text and len(text) > 0:
            embedding = nlp_model.encode(text)
            repository.insert_document(text, embedding)
            DocumentService.fine_tune_model()
            return "Link content processed and stored successfully!"
        return "No content retrieved from the URL."

    @staticmethod
    def extract_text_from_link(url):
        try:
            response = requests.get(url)
            if response.status_code == 200:
                soup = BeautifulSoup(response.content, 'html.parser')
                paragraphs = soup.find_all('p')
                text = "\n".join([para.get_text() for para in paragraphs])
                return text
            else:
                return "Failed to retrieve the webpage."
        except Exception as e:
            return f"An error occurred: {e}"

    @staticmethod
    def search_answer(question):
        question_embedding = nlp_model.encode(question)
        documents = repository.find_all_documents()
        similarities = []

        for doc in documents:
            doc_embedding = doc['embedding']
            similarity = util.pytorch_cos_sim(question_embedding, torch.tensor(doc_embedding))[0][0]
            similarities.append((doc['text'], similarity))

        similarities.sort(key=lambda x: x[1], reverse=True)
        return similarities[0][0] if similarities else "No relevant information found."

    @staticmethod
    def fine_tune_model():
        texts = repository.get_all_texts()
        if texts:
            nlp_model.fine_tune(texts)
            return "Model fine-tuned successfully!"
        return "No data available for fine-tuning."

In [20]:
app = Flask(__name__)

@app.route('/')
def index():
    return render_template('index.html')

@app.route('/process_file', methods=['POST'])
def process_file():
    if 'file' not in request.files:
        return jsonify({"message": "No file uploaded."}), 400
    file = request.files['file']
    return jsonify({"message": DocumentService.process_file(file)})

@app.route('/process_text', methods=['POST'])
def process_text():
    text = request.json.get("text", "")
    if text:
        response = DocumentService.process_text(text)  # Fine-tuning automático aqui
        return jsonify({"message": response}), 200
    return jsonify({"message": "No text provided."}), 400

@app.route("/process_link", methods=["POST"])
def process_link():
    url = request.json.get("url", "")
    if url:
        response = DocumentService.process_link(url)  # Fine-tuning automático aqui
        return jsonify({"message": response}), 200
    return jsonify({"message": "No URL provided."}), 400

@app.route("/search_answer", methods=["POST"])
def search_answer():
    question = request.json.get("question", "")
    if question:
        answer = DocumentService.search_answer(question)
        return jsonify({"answer": answer}), 200
    return jsonify({"answer": "No question provided."}), 400

In [None]:
if __name__ == "__main__":
    app.run(debug=True, port=3978)

 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on http://127.0.0.1:3978
INFO:werkzeug:[33mPress CTRL+C to quit[0m
INFO:werkzeug: * Restarting with stat


In [21]:
!jupyter nbconvert --to script smart_boy_bot.ipynb

This application is used to convert notebook files (*.ipynb)
        to various other formats.


Options
The options below are convenience aliases to configurable class-options,
as listed in the "Equivalent to" description-line of the aliases.
To see all configurable class-options for some <cmd>, use:
    <cmd> --help-all

--debug
    set log level to logging.DEBUG (maximize logging output)
    Equivalent to: [--Application.log_level=10]
--show-config
    Show the application's configuration (human-readable format)
    Equivalent to: [--Application.show_config=True]
--show-config-json
    Show the application's configuration (json format)
    Equivalent to: [--Application.show_config_json=True]
--generate-config
    generate default config file
    Equivalent to: [--JupyterApp.generate_config=True]
-y
    Answer yes to any questions instead of prompting.
    Equivalent to: [--JupyterApp.answer_yes=True]
--execute
    Execute the notebook prior to export.
    Equivalent to: [--ExecutePr

In [22]:
!pip freeze > requirements.txt