<a href="https://colab.research.google.com/github/bhaveshkolhe/TASK/blob/main/TASK.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [30]:
!pip install flask python-dotenv requests beautifulsoup4 sentence-transformers faiss-cpu langchain langchain-community Flask-Limiter transformers




In [32]:
import os
import requests
from bs4 import BeautifulSoup
from langchain.document_loaders import WebBaseLoader  # (if needed elsewhere)
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA

# Instead of OpenAI, we'll use a local LLM via HuggingFacePipeline
from transformers import pipeline
from langchain.llms import HuggingFacePipeline

from flask import Flask, request, jsonify
from flask_limiter import Limiter
from flask_limiter.util import get_remote_address

from sentence_transformers import SentenceTransformer
from langchain.schema import Document
from langchain.docstore import InMemoryDocstore

from langchain.embeddings import HuggingFaceEmbeddings

import numpy as np
import faiss

# --- Helper Function to Fetch Data ---
def fetch_data_from_url(url):
    response = requests.get(url)
    response.raise_for_status()  # Raise an error if the request fails
    soup = BeautifulSoup(response.text, 'html.parser')
    # Extract text content from all <p> tags
    texts = [p.get_text() for p in soup.find_all('p')]
    return "\n".join(texts)

# --- Fetch and Process Data ---
url = "https://brainlox.com/courses/category/technical"
data = fetch_data_from_url(url)

# Use RecursiveCharacterTextSplitter to chunk the data
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100,
    separators=["\n\n", "\n", " ", ""]
)
texts = text_splitter.split_text(data)

# Use SentenceTransformer for initial embeddings
model_name = "all-MiniLM-L6-v2"
model = SentenceTransformer(model_name)
embeddings = model.encode(texts).tolist()

# Create Document objects from the texts
documents = [Document(page_content=text) for text in texts]

# --- Create FAISS Vector Store ---
dimension = len(embeddings[0])
faiss_index = faiss.IndexFlatL2(dimension)
faiss_index.add(np.array(embeddings).astype("float32"))

# Create a document store and mapping from FAISS index positions to document IDs
docstore = InMemoryDocstore({str(i): doc for i, doc in enumerate(documents)})
index_to_docstore_id = {i: str(i) for i in range(len(documents))}

# Create an embeddings function for query-time embeddings
embedding_function = HuggingFaceEmbeddings(model_name=model_name)

# Initialize the FAISS vector store with all required parameters
vectorstore = FAISS(faiss_index, embedding_function, docstore, index_to_docstore_id)

# --- Initialize a Local LLM using HuggingFacePipeline ---
# Here we use a local text-generation pipeline with a model like "distilgpt2".
generator = pipeline("text-generation", model="distilgpt2", max_length=256)
llm = HuggingFacePipeline(pipeline=generator)

# Build the RetrievalQA chain using the FAISS retriever
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vectorstore.as_retriever()
)

# --- Create Flask App with Rate Limiting ---
app = Flask(__name__)

@app.route('/ask', methods=['POST'])
def ask():
    data = request.get_json()
    if not data or 'question' not in data:
        return jsonify({"error": "No question provided"}), 400

    question = data['question']
    try:
        result = qa_chain.run(question)
    except Exception as e:
        return jsonify({"error": str(e)}), 500
    return jsonify({"answer": result})

if __name__ == '__main__':
    app.run(debug=True)


Device set to use cpu


 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
INFO:werkzeug: * Restarting with stat
