# RAG

In [8]:
!pip install pypdf2

Collecting pypdf2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/232.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m225.3/232.6 kB[0m [31m6.8 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pypdf2
Successfully installed pypdf2-3.0.1


In [9]:
from google.colab import drive
import os
from google import generativeai as genai
import time
import textwrap
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from io import StringIO, open , BytesIO  #create RAM files
from pdfminer.high_level import extract_text_to_fp
from pdfminer.layout import LAParams
from PyPDF2 import PdfReader

# Generator

In [12]:
# Mount Google Drive and read the API key from a text file stored there,
# so the key itself never appears in the notebook source.
drive.mount('/content/gdrive')
with open('/content/gdrive/MyDrive/Informatica/Machine Learning/api-key.txt', 'r') as file:
    api_key = file.read().strip()  # strip() removes surrounding whitespace such as a trailing newline

# Configure the generativeai SDK with the key and create the model handle
# that call_llm() uses for every request.
genai.configure(api_key = api_key)
client = genai.GenerativeModel('gemini-1.5-flash')

def format_response(text, width=80):
    """
    Wrap text to a fixed column width so long model answers are easier to read.

    Args:
        text: the string to wrap.
        width: maximum line length in characters. Defaults to 80, the value
            that was previously hard-coded.

    Returns:
        The wrapped text as one string with lines separated by newlines.
        Newlines already present in ``text`` are replaced by spaces (the
        textwrap default), so the result is re-flowed as a single paragraph.
    """
    return textwrap.fill(text, width=width)

#send a request to the model via the api
def call_llm(text, system_message= None):
    """
    Send a prompt to the configured model and return its answer wrapped for display.

    Args:
        text: the prompt. A single string is sent as-is; any other iterable of
            strings is joined with newlines into one prompt.
        system_message: optional instructions. When given, they are sent as a
            leading user turn prefixed with "System: ", followed by a canned
            model acknowledgement, before the actual prompt.

    Returns:
        The reply text passed through format_response(), or None when anything
        in the request fails (the exception message is printed, not raised).
    """
    try:
        # A str is itself an iterable of characters, so '\n'.join() on it would
        # put every character on its own line. Only join real sequences of lines.
        if isinstance(text, str):
            prompt = text
        else:
            prompt = '\n'.join(text)

        # Conversation turns: 'role' says who is speaking, 'parts' holds the
        # content of that turn (passed as a list).
        messages = []

        if system_message:
            messages.append({'role': 'user', 'parts': [f"System: {system_message}"]})
            messages.append({'role': 'model', 'parts': ["Understood. How can I help?"]})

        messages.append({'role': 'user', 'parts': [prompt]})

        #generate response
        response = client.generate_content(
            contents=messages,
            generation_config={
                'temperature': 0.7,  # Balanced creativity
                'max_output_tokens': 2048,  # Maximum length
                'top_p': 0.9,
                'top_k': 40
            },
            # Ask the API not to block output in any of these harm categories.
            safety_settings={
                'HARM_CATEGORY_HARASSMENT': 'BLOCK_NONE',
                'HARM_CATEGORY_HATE_SPEECH': 'BLOCK_NONE',
                'HARM_CATEGORY_SEXUALLY_EXPLICIT': 'BLOCK_NONE',
                'HARM_CATEGORY_DANGEROUS_CONTENT': 'BLOCK_NONE'
            }
        )

        return format_response(response.text)

    except Exception as e:
        # Best-effort: report the failure and let the caller see None
        # instead of aborting the notebook run.
        print(f'Exception {str(e)}')
        return None

# In-memory knowledge base: a list of plain-text sentences about Retrieval
# Augmented Generation, one record per string.
# NOTE(review): nothing visible in this section reads db_records; presumably
# it is the corpus searched by the retrieval step later in the notebook - confirm.
db_records = [
    "Retrieval Augmented Generation (RAG) represents a sophisticated hybrid approach in the field of artificial intelligence, particularly within the realm of natural language processing (NLP).",
    "It innovatively combines the capabilities of neural network-based language models with retrieval systems to enhance the generation of text, making it more accurate, informative, and contextually relevant.",
    "This methodology leverages the strengths of both generative and retrieval architectures to tackle complex tasks that require not only linguistic fluency but also factual correctness and depth of knowledge.",
    "At the core of Retrieval Augmented Generation (RAG) is a generative model, typically a transformer-based neural network, similar to those used in models like GPT (Generative Pre-trained Transformer) or BERT (Bidirectional Encoder Representations from Transformers).",
    "This component is responsible for producing coherent and contextually appropriate language outputs based on a mixture of input prompts and additional information fetched by the retrieval component.",
    "Complementing the language model is the retrieval system, which is usually built on a database of documents or a corpus of texts.",
    "This system uses techniques from information retrieval to find and fetch documents that are relevant to the input query or prompt.",
    "The mechanism of relevance determination can range from simple keyword matching to more complex semantic search algorithms which interpret the meaning behind the query to find the best matches.",
    "This component merges the outputs from the language model and the retrieval system.",
    "It effectively synthesizes the raw data fetched by the retrieval system into the generative process of the language model.",
    "The integrator ensures that the information from the retrieval system is seamlessly incorporated into the final text output, enhancing the model's ability to generate responses that are not only fluent and grammatically correct but also rich in factual details and context-specific nuances.",
    "When a query or prompt is received, the system first processes it to understand the requirement or the context.",
    "Based on the processed query, the retrieval system searches through its database to find relevant documents or information snippets.",
    "This retrieval is guided by the similarity of content in the documents to the query, which can be determined through various techniques like vector embeddings or semantic similarity measures.",
    "The retrieved documents are then fed into the language model.",
    "In some implementations, this integration happens at the token level, where the model can access and incorporate specific pieces of information from the retrieved texts dynamically as it generates each part of the response.",
    "The language model, now augmented with direct access to retrieved information, generates a response.",
    "This response is not only influenced by the training of the model but also by the specific facts and details contained in the retrieved documents, making it more tailored and accurate.",
    "By directly incorporating information from external sources, Retrieval Augmented Generation (RAG) models can produce responses that are more factual and relevant to the given query.",
    "This is particularly useful in domains like medical advice, technical support, and other areas where precision and up-to-date knowledge are crucial.",
    "Retrieval Augmented Generation (RAG) systems can dynamically adapt to new information since they retrieve data in real-time from their databases.",
    "This allows them to remain current with the latest knowledge and trends without needing frequent retraining.",
    "With access to a wide range of documents, Retrieval Augmented Generation (RAG) systems can provide detailed and nuanced answers that a standalone language model might not be capable of generating based solely on its pre-trained knowledge.",
    "While Retrieval Augmented Generation (RAG) offers substantial benefits, it also comes with its challenges.",
    "These include the complexity of integrating retrieval and generation systems, the computational overhead associated with real-time data retrieval, and the need for maintaining a large, up-to-date, and high-quality database of retrievable texts.",
    "Furthermore, ensuring the relevance and accuracy of the retrieved information remains a significant challenge, as does managing the potential for introducing biases or errors from the external sources.",
    "In summary, Retrieval Augmented Generation represents a significant advancement in the field of artificial intelligence, merging the best of retrieval-based and generative technologies to create systems that not only understand and generate natural language but also deeply comprehend and utilize the vast amounts of information available in textual form.",
    "A RAG vector store is a database or dataset that contains vectorized data points."
]

# Baseline run: the bare query is sent to the model with no retrieved context.
# In the recorded output the model reads "rag" as cloth and describes a shop.
query = 'define a rag store'


# call_llm returns the wrapped answer text, or None if the request raised.
response = call_llm(query)
print(response)


Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
A rag store is a retail establishment that sells used or discarded clothing and
textiles.  These items are often sold at low prices and may include clothing,
linens, rags, and other fabrics suitable for cleaning, crafting, or repurposing.
The quality of the goods can vary widely.


In [11]:
# Extract the PDF's text into an in-memory string buffer (no .txt file is written).
buffer = StringIO()
laparams = LAParams()  # pdfminer layout-analysis parameters, left at their defaults

# Read the whole PDF into memory so it can be handed around as a file-like object.
with open('/content/Blockchain & Web3-booklet 2024.pdf', 'rb') as file:
    pdfFile = BytesIO(file.read())  # in-memory copy of the PDF bytes


# Extract the raw text; pdfminer writes it into `buffer`.
extract_text_to_fp(pdfFile, buffer, laparams=laparams)


# Take the accumulated text out of the buffer as one string.
# Note: print() emits the entire document text.
content = buffer.getvalue()
print(content)



Osservatorio  
Blockchain & Web3 
Web3: Why, What 
and When? 
Gennaio 2024
2
PARTNER
ADVISORY BOARDSPONSOR
Osservatorio Blockchain & Web3 
Ricerca 2023-2024
Indice IndiceIndice 
Introduzione  .............................................................................................................................. 4
di Alessandro Perego e Donatella Sciuto
Executive Summary  ............................................................................................................... 7
di Valeria Portale, Francesco Bruschi e Giacomo Vella
Glossario  ................................................................................................................................... 18
Nota Metodologica  .............................................................................................................. 22
Report  ...................................................................................................................................... 26
Osservatori On Demand  ....