In [1]:
# !pip install --upgrade "protobuf>3.20"
# !pip install networkx

In [1]:
import os
import random
import re
import threading
import time
import uuid
import warnings
from threading import Thread

warnings.filterwarnings("ignore")

import chromadb
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from chromadb.config import Settings
from datasets import load_dataset
from googlesearch import search

chroma_client = chromadb.PersistentClient(path="./")

  from .autonotebook import tqdm as notebook_tqdm


## Vanilla RAG Pipeline using Chroma Database

**User Query:** The process starts with a query from the user.

**Retrieval Step:** Relevant documents or knowledge snippets are retrieved from a database, the internet, or a document store using the query. A common retrieval method is similarity search over embeddings.

**Candidate Selection:** The top retrieved documents are selected based on relevance to the query.

**Augmentation:** The selected documents are combined with the query to form an augmented input.

**Generation with LLM:** The augmented input is passed to a Large Language Model (LLM), which generates a response using both the query and the retrieved documents.

**Response Delivery:** The generated answer is provided back to the user, enriched with relevant information from the retrieval step.



In [2]:
# Load the fine-tuned model and its tokenizer from disk when the saved-model directory exists.
# `model_exists` records the outcome so that later cells can check it before generating text.
# NOTE(review): torch, BartTokenizer and BartForConditionalGeneration are not imported anywhere
# in the visible notebook; if the directory exists this branch presumably fails with NameError.
# Confirm and add the missing imports.
# NOTE(review): the directory name says "bert-sentiment" while Bart seq2seq classes are used to
# load it; confirm the checkpoint type.
model_exists = False
if os.path.exists('../Saved_Models/Sky/fine-tuned-bert-sentiment_2024_10_04_0'):   
    # Release cached GPU memory before loading, then pick the GPU when one is available.
    torch.cuda.empty_cache()
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model_name = '../Saved_Models/Sky/fine-tuned-bert-sentiment_2024_10_04_0'
    tokenizer = BartTokenizer.from_pretrained(model_name)
    model = BartForConditionalGeneration.from_pretrained(model_name)
    model = model.to(device)
    model_exists = True
else:
    # Without the checkpoint, `device`, `tokenizer` and `model` stay undefined.
    print("Model not available")
# data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

Model not available


#### Data Cleaning

In [3]:
# Strip LaTeX markup from scraped text so that only plain prose is stored.

def clean_latex(text):
    """Remove LaTeX markup from ``text`` and normalise its whitespace.

    Args:
        text: Raw text that may contain LaTeX commands and braces.
            ``None`` or an empty string is treated as "no text".

    Returns:
        The text with ``\\displaystyle``, ``\\text{...}`` groups, remaining
        lower-case LaTeX commands and curly braces removed, ``\\dots`` turned
        into ``...``, and runs of whitespace collapsed to single spaces.
    """
    if not text:
        return ""

    # Remove LaTeX commands like \displaystyle and whole \text{...} groups.
    cleaned_text = re.sub(r'\\displaystyle|\\text\{.*?\}', '', text)

    # Replace \dots with a literal ellipsis BEFORE the generic command strip
    # below; otherwise the generic pattern deletes \dots first and this
    # replacement can never match.
    cleaned_text = re.sub(r'\\dots(?![a-z])', '...', cleaned_text)

    # Remove any remaining lower-case LaTeX commands and curly braces.
    cleaned_text = re.sub(r'\\[a-z]+|{|}', '', cleaned_text)

    # Collapse the whitespace left behind by the removals.
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()

    return cleaned_text

In [4]:
# Thread subclass whose join() hands back the value returned by the target callable.

class ThreadWithReturnValue(Thread):
    """A ``Thread`` that remembers what its target returned.

    ``join()`` returns the target's return value, or ``None`` when there is no
    target, the target has not finished, or the target raised an exception.
    """

    def __init__(self, group=None, target=None, name=None,
                 args=(), kwargs=None, Verbose=None, *, daemon=None):
        """Create the thread.

        Args:
            group, target, name, args, kwargs, daemon: Forwarded to
                ``threading.Thread``.
            Verbose: Accepted for backward compatibility only; it is not used.
        """
        # A fresh dict per instance avoids sharing one mutable default.
        if kwargs is None:
            kwargs = {}
        Thread.__init__(self, group, target, name, args, kwargs, daemon=daemon)
        self._return = None

    def run(self):
        """Call the target and store its result for ``join()``."""
        if self._target is not None:
            self._return = self._target(*self._args, **self._kwargs)

    def join(self, timeout=None):
        """Wait for the thread and return the stored result.

        Args:
            timeout: Optional number of seconds to wait, positional or keyword,
                exactly as for ``Thread.join``.

        Returns:
            The value returned by the target, or ``None`` if none was stored.
        """
        Thread.join(self, timeout)
        return self._return

In [5]:
def create_chunks(cleaned_data, chunk_size=2000, overlap=500):
    """Split text into overlapping chunks and give every chunk a unique id.

    The first chunk covers ``chunk_size`` characters. Every later chunk starts
    ``overlap`` characters before its nominal start, so consecutive chunks
    share context and later chunks can be up to ``chunk_size + overlap`` long.

    Args:
        cleaned_data: The text to split. Empty or ``None`` yields no chunks.
        chunk_size: Nominal number of characters per chunk (default 2000).
        overlap: Number of characters each later chunk repeats from the
            previous one (default 500).

    Returns:
        A tuple ``(final_chunks, ids_list)`` of two equally long lists: the
        chunk strings and one string id per chunk.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is not in
            the range ``0..chunk_size``.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap <= chunk_size:
        raise ValueError("overlap must be between 0 and chunk_size")
    if not cleaned_data:
        return ([], [])

    # One random identifier per document, suffixed with the chunk index, keeps
    # ids unique within and across calls.
    document_id = uuid.uuid4().hex

    final_chunks = []
    ids_list = []
    for index, start in enumerate(range(0, len(cleaned_data), chunk_size)):
        # The first chunk has nothing before it to overlap with.
        chunk_start = max(start - overlap, 0)
        final_chunks.append(cleaned_data[chunk_start:start + chunk_size])
        ids_list.append("{}-{}".format(document_id, index))
    return (final_chunks, ids_list)

In [6]:
# Data mining function: scrapes the readable text of a website and splits it into chunks.
def data_mining(website, timeout=10):
    """Download a web page and return its text as chunks with ids.

    Args:
        website: URL of the page to scrape.
        timeout: Seconds to wait for the HTTP request (default 10).

    Returns:
        The ``(chunks, ids)`` tuple produced by ``create_chunks``. An empty
        pair ``([], [])`` is returned when the site is on the skip list or the
        request fails, so callers can always unpack the result.
    """
    # Ignore gov, linkedin and reddit websites.
    skipped_domains = (".gov", "linkedin.com", "reddit.com")
    if any(domain in website for domain in skipped_domains):
        return ([], [])

    try:
        response = requests.get(website, timeout=timeout)
        # Do not index error pages (403/404/5xx bodies) as if they were content.
        response.raise_for_status()
    except requests.RequestException as err:
        # This runs inside a worker thread, so report the failure and return an
        # empty result instead of letting the thread die.
        print("Skipping {}: {}".format(website, err))
        return ([], [])

    soup = BeautifulSoup(response.content, 'html5lib')

    # Remove the page furniture which carries no useful information.
    for tag in soup(['nav', 'header', 'footer', 'script', 'style', 'aside']):
        tag.decompose()

    # Collect header, paragraph, list-item and bold text. Joining with a space
    # keeps the last word of one element from fusing with the first word of the
    # next; clean_latex collapses any surplus whitespace afterwards.
    text_parts = [
        tag.get_text()
        for tag in soup.find_all(['h1', 'h2', 'h3', 'h4', 'p', 'li', 'strong'])
    ]
    filtered_content = " ".join(text_parts)

    remove_latex = clean_latex(filtered_content)   # Strip LaTeX markup
    chunks = create_chunks(remove_latex)           # Split the text into chunks
    return chunks


In [7]:
def get_google_search_links(query):
    """Get links from google: run ``search(query)`` and return every result as a list."""
    result_links = list(search(query))
    return result_links

In [8]:
#Data Cleaning using PySpark (Uncomment only when the data is in millions)

# spark.stop()
# spark = SparkSession.builder \
#     .master("local") \
#     .config("spark.driver.memory", "1g") \
#     .config("spark.executor.memory", "1g") \
#     .appName("WebScrapingWithPySpark") \
#     .getOrCreate()

# cleaned_data_udf = udf(clean_latex, StringType())
# # create_chunks_udf = udf(create_chunks, ArrayType(StringType()), ArrayType(StringType()))
# schema = StructType([
#     StructField("chunks", ArrayType(StringType())),
#     StructField("ids", ArrayType(StringType()))
# ])
# create_chunks_udf = udf(create_chunks, schema)


In [9]:
# Open the Chroma collection named "my_collection" on the client, creating it if it does not exist yet.
collection = chroma_client.get_or_create_collection(name="my_collection")

In [10]:
# Example questions that can be sent through the pipeline.
query = [
    "How to make crossiant?",
    "What is Big Data?",
    "Which of the following is a Characteristic of BigDdata?",
    "What is Hadoop?",
    "What is the Function of RAM?",
]

In [16]:
# Pick the question to run through the pipeline: the second entry of the `query` list.
q = query[1]

In [17]:
############################### Data Fetching ################################
def fetch_data(q, max_sites=5):
    """Search the web for ``q`` and scrape the top results in parallel.

    Args:
        q: The search query.
        max_sites: Maximum number of search results to scrape (default 5).

    Returns:
        A list with one ``data_mining`` result per website that produced one.
        Workers that ended without a result (``join()`` returned ``None``,
        e.g. because the worker raised) are left out so callers can index
        every entry safely.
    """
    website_links = get_google_search_links(q)          # Get the result links

    # One worker thread per website, so slow sites do not block each other.
    threads_list = [
        ThreadWithReturnValue(target=data_mining, args=(website,))
        for website in website_links[:max_sites]
    ]
    for thread in threads_list:                          # Start the threads
        thread.start()

    fetched_data = []
    for thread in threads_list:                          # Wait and collect results
        result = thread.join()
        if result is not None:
            fetched_data.append(result)
    return fetched_data

#### RAG Pipeline

In [18]:
# Run the pipeline for the selected query `q`:
#   1. best-effort refresh of the vector store with freshly scraped chunks,
#   2. retrieval of the closest chunks from the collection,
#   3. answer generation with the fine-tuned model when it is available.
start_time = time.time()

try:
    fetched_data = fetch_data(q)

    # Flatten the per-website (chunks, ids) pairs into two parallel lists.
    final_chunks = []
    final_ids = []
    for site_result in fetched_data:
        if not site_result:   # a worker that produced nothing yields None
            continue
        site_chunks, site_ids = site_result
        final_chunks.extend(site_chunks)
        final_ids.extend(site_ids)

    # Only touch the collection when something was actually fetched.
    if final_chunks:
        collection.add(
            documents=final_chunks,
            ids=final_ids
        )
    else:
        print("No new documents were fetched; using the documents already stored in the collection")
except Exception as httperr:
    # Fetching is deliberately best effort: on any failure fall back to whatever
    # the persistent collection already holds, but report the real cause instead
    # of guessing at it.
    print("Cannot fetch the data currently ({}: {}); using the documents already stored in the collection".format(
        type(httperr).__name__, httperr))

results = collection.query(
    query_texts=q, # Chroma will embed this for you
    n_results=2 # how many results to return
)
print("Distances : ",results['distances'])

# Documents retrieved for the (single) query; empty when nothing was found.
retrieved_docs = results["documents"][0] if results["documents"] else []

if model_exists and retrieved_docs:
    # Prompt = best-matching chunk as context, followed by the question.
    input_ids = tokenizer.encode("<context>{}agent_1:{}".format(retrieved_docs[0],q), return_tensors='pt',max_length=512, truncation=True).to(device)
    # NOTE(review): temperature/top_k/top_p are sampling controls; with beam
    # search and no do_sample=True they are presumably ignored. Confirm against
    # the model's generation config and either enable sampling or drop them.
    output = model.generate(
        input_ids,
        max_length=128,
        num_beams=10,
        early_stopping=True,
        no_repeat_ngram_size=2,  # Prevent repeating n-grams
        num_return_sequences=1,  # Number of sequences to return
        temperature=0.7,  # Sampling temperature
        top_k=50,  # Top-K sampling
        top_p=0.9  # Top-p (nucleus) sampling
    )
    resp = tokenizer.decode(output[0], skip_special_tokens=True)
    print(resp)
elif model_exists:
    print("No documents were retrieved from the chroma vector database, so no answer can be generated")
else:
    print("LLM Model is unavailable")
    print("Retrieved Chunk from the chroma vector database :",retrieved_docs)

end_time = time.time()
execution_time = end_time - start_time
print(f"Execution time: {execution_time} seconds")

Cannot fetch the data currently since google doesnt allow data fetching through public wifi
Distances :  [[0.4907780587673187, 0.49605846405029297]]
LLM Model is unavailable
Retrieved Chunk from the chroma vector database : ['What Is Big Data? Big Data ExplainedBig data has become more than a buzzword as information has grown more complex and vast in quantity and organizations struggling to gather, curate, understand, and use data effectively. It also describes challenges in IT, business, as well as emerging analytics technologies. But where did the term come from, how can you use big data at your organization, and how can you advance your big data analytics strategies? We’ll address these questions and provide tips to get started using your big data.History of big dataIn the 1960s, the United States created a large data center to store millions of tax records. This data center was the first real use case of digital data management. Through the 1990s and 2000s, leaders in the data spac