In [1]:
import requests
from bs4 import BeautifulSoup
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Fetch the NLTK data used by preprocess_text below: the 'punkt' resource for
# word_tokenize and the 'stopwords' corpus for stop-word filtering.
# NOTE(review): newer NLTK releases may look for 'punkt_tab' instead of
# 'punkt' -- confirm against the installed NLTK version.
for resource in ('punkt', 'stopwords'):
    nltk.download(resource)

def scrape_webpages(urls, timeout=10):
    """Download each URL and extract the text content of its HTML.

    Args:
        urls: Iterable of URL strings to fetch.
        timeout: Seconds to wait for each request before giving up.
            Without a timeout ``requests.get`` may block indefinitely on an
            unresponsive server.

    Returns:
        A list of ``(url, text)`` tuples, one per input URL, in input order.

    Raises:
        requests.exceptions.RequestException: If a request times out, the
            connection fails, or the server answers with an HTTP error
            status (4xx/5xx).
    """
    documents = []
    for url in urls:
        response = requests.get(url, timeout=timeout)
        # Fail loudly on 4xx/5xx instead of scraping the error page and
        # passing it downstream as if it were the real document.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        documents.append((url, soup.get_text()))
    return documents

def preprocess_text(text):
    """Normalise raw page text into a space-separated string of content words.

    Steps, in order: replace non-ASCII runs with a space, strip URLs, strip
    @mentions and '#' characters, strip punctuation, strip digits, tokenize,
    and drop English stop words (case-insensitive match).

    Args:
        text: Raw text to clean.

    Returns:
        The remaining tokens joined by single spaces.
    """
    # Replace any run of non-ASCII characters with a single space.
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)
    # Remove URLs. 'http\S+' already covers 'https...', and no anchors are
    # used, so no regex flags are needed.
    text = re.sub(r'http\S+|www\S+', '', text)
    # Remove @mentions entirely and drop '#' while keeping the hashtag word.
    # (The previous pattern '\@w+' matched '@' followed by literal 'w's, so
    # mentions were never removed.)
    text = re.sub(r'@\w+|#', '', text)
    # Remove everything that is neither a word character nor whitespace.
    text = re.sub(r'[^\w\s]', '', text)
    # Remove digits.
    text = re.sub(r'\d+', '', text)

    # Load the stop-word list once per call and use a set for O(1) lookups,
    # instead of re-reading the corpus for every token.
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in word_tokenize(text) if word.lower() not in stop_words]
    return ' '.join(tokens)

def preprocess_documents(documents):
    """Apply ``preprocess_text`` to the text of every ``(url, text)`` pair.

    Args:
        documents: Iterable of ``(url, text)`` tuples.

    Returns:
        A new list of ``(url, cleaned_text)`` tuples in the same order.
    """
    return [(source, preprocess_text(raw)) for source, raw in documents]

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Aravind\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Aravind\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.schema import Document

# Choose the embedding device at runtime: use the GPU when PyTorch reports
# one, otherwise fall back to CPU so the notebook also runs on machines
# without CUDA (a hard-coded 'cuda' fails there).
try:
    import torch
    _device = 'cuda' if torch.cuda.is_available() else 'cpu'
except ImportError:
    _device = 'cpu'

model_kwargs = {'device': _device}
# No model name is given, so HuggingFaceEmbeddings uses its library default.
embeddings = HuggingFaceEmbeddings(model_kwargs=model_kwargs)

def process_documents(documents):
    """Split ``(url, text)`` pairs into chunks tagged with their source URL.

    Each text is wrapped in a ``Document`` whose metadata is
    ``{"source": url}`` and split with a ``RecursiveCharacterTextSplitter``
    configured for 1000-character chunks with a 150-character overlap.

    Args:
        documents: Iterable of ``(url, text)`` tuples.

    Returns:
        A flat list of the chunks produced for all documents, in input order.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=150,
        length_function=len,
    )
    return [
        piece
        for source, body in documents
        for piece in splitter.split_documents(
            [Document(page_content=body, metadata={"source": source})]
        )
    ]

def store_documents(chunks, persist_directory="test_index"):
    """Embed the chunks into a Chroma vector store and persist it to disk.

    Uses the module-level ``embeddings`` object to embed the chunks.

    Args:
        chunks: Document chunks to index, e.g. the output of
            ``process_documents``.
        persist_directory: Directory in which Chroma stores the index.
            Defaults to ``"test_index"``, the previously hard-coded location.

    Returns:
        The Chroma vector store built from ``chunks``.
    """
    db = Chroma.from_documents(
        chunks,
        embedding=embeddings,
        persist_directory=persist_directory,
    )
    # NOTE(review): recent Chroma releases persist automatically and may
    # deprecate this call -- confirm against the installed version.
    db.persist()
    return db

  from tqdm.autonotebook import tqdm, trange


ImportError: Could not import sentence_transformers python package. Please install it with `pip install sentence-transformers`.