Install the required libraries

In [None]:
!pip install pymupdf python-docx openai==0.28 numpy


In [None]:
import os
import openai
import numpy as np
import docx
import fitz


Set the OpenAI API key

In [None]:
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment.
load_dotenv()

openai.api_key = os.getenv("OPENAI_API_KEY")

# Reject an empty or whitespace-only value as well as a missing one: a blank
# `OPENAI_API_KEY=` entry is not None, so it would slip past an `is None`
# check and only fail later, at the first API call.
if not (openai.api_key or "").strip():
    raise ValueError("OpenAI API Key not found. Please set it in the .env file.")


Functions to load text

In [None]:
def load_text_from_txt(file_path):
    """
    Read a plain-text file decoded as UTF-8.

    Parameters:
    file_path (str): Path of the text file to read.

    Returns:
    str: The complete contents of the file.
    """
    with open(file_path, encoding='utf-8') as handle:
        return handle.read()

def load_text_from_docx(file_path):
    """
    Extract the text of a .docx file.

    Parameters:
    file_path (str): Path of the .docx file to open with python-docx.

    Returns:
    str: The `.text` of every entry in the document's `paragraphs`,
         joined with newline characters.
    """
    document = docx.Document(file_path)
    return '\n'.join(paragraph.text for paragraph in document.paragraphs)

def load_text_from_pdf(file_path):
    """
    Extract the text of every page of a PDF file.

    Parameters:
    file_path (str): Path of the PDF file to open with PyMuPDF (`fitz`).

    Returns:
    str: The result of `get_text()` for each page, in page order, joined
         with newline characters.
    """
    # The context manager closes the document (and its file handle) even if
    # text extraction raises; previously the document was never closed.
    with fitz.open(file_path) as pdf:
        return '\n'.join(page.get_text() for page in pdf)


Functions to vectorize text

In [None]:
def vectorize_text(text, model="text-embedding-ada-002"):
    """
    Convert text into a vector using OpenAI's embedding endpoint.

    Parameters:
    text (str): The input text to be vectorized.
    model (str): Name of the embedding model to request. Defaults to
        "text-embedding-ada-002", the model this notebook has always used.

    Returns:
    np.array: A numpy array built from the first embedding in the response.
    """
    response = openai.Embedding.create(input=text, model=model)
    # A single input was sent, so only the first entry of `data` is read.
    return np.array(response['data'][0]['embedding'])

def search_text(query, text):
    """
    Compute the cosine similarity between the vectorized query and the vectorized text.

    Parameters:
    query (str): The user's query.
    text (str): The text to be searched.

    Returns:
    float: The cosine similarity between the two embeddings, or 0.0 when
           either embedding has zero length.
    """
    query_vector = vectorize_text(query)
    text_vector = vectorize_text(text)

    # A bare dot product equals the cosine only for unit-length vectors;
    # dividing by the norms makes the score a true cosine for any embedding.
    norm_product = np.linalg.norm(query_vector) * np.linalg.norm(text_vector)
    if norm_product == 0:
        # The cosine is undefined for a zero vector; report "no similarity".
        return 0.0
    return float(np.dot(text_vector, query_vector) / norm_product)


Function to generate text using the OpenAI Chat Completion API

In [None]:
def generate_text(retrieved_text, user_input, model="gpt-3.5-turbo", max_tokens=200, temperature=0.7):
    """
    Generate a response based on retrieved text and user input using OpenAI's chat completion endpoint.

    Parameters:
    retrieved_text (str): The relevant text retrieved from the document.
    user_input (str): The user's input question.
    model (str): Chat model to request. Defaults to "gpt-3.5-turbo".
    max_tokens (int): Upper bound on generated tokens. Defaults to 200.
    temperature (float): Sampling temperature. Defaults to 0.7.

    Returns:
    str: The content of the first returned choice, stripped of surrounding whitespace.
    """
    system_prompt = (
        "You are an AI specialized in answering questions based on provided text data. "
        "Limit your response to the context of the provided text."
    )
    # The context goes in the user turn together with the question. Sending it
    # as an "assistant" message after the question presents the document to the
    # model as its own previous reply rather than as material to answer from.
    user_prompt = f"Context:\n{retrieved_text}\n\nQuestion: {user_input}"

    response = openai.ChatCompletion.create(
        model=model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        max_tokens=max_tokens,
        temperature=temperature,
    )
    return response.choices[0]['message']['content'].strip()


Main function to handle the RAG application

In [None]:
def rag_application(file_path, query, similarity_threshold=0.5):
    """
    Handle the Retrieval-Augmented Generation (RAG) application.

    Loads the file, scores the whole text against the query, and generates an
    answer only when the score is strictly above the threshold.

    Parameters:
    file_path (str): The path to the file containing the text data (.txt, .docx, or .pdf).
    query (str): The user's query.
    similarity_threshold (float): Score that must be exceeded for an answer
        to be generated. Defaults to 0.5.

    Returns:
    str: The generated text based on the query, or a message if no relevant information is found.

    Raises:
    ValueError: If the file extension is not .txt, .docx, or .pdf.
    """
    no_match_message = "No relevant information found in the text."

    # Map each supported (lower-cased) extension to its loader.
    loaders = {
        ".txt": load_text_from_txt,
        ".docx": load_text_from_docx,
        ".pdf": load_text_from_pdf,
    }
    ext = os.path.splitext(file_path)[1].lower()
    loader = loaders.get(ext)
    if loader is None:
        raise ValueError("Unsupported file type")

    text = loader(file_path)

    # A document with no extractable text has nothing to retrieve; return the
    # "not found" message instead of sending empty input to the API.
    if not text.strip():
        return no_match_message

    similarity = search_text(query, text)
    if similarity > similarity_threshold:
        return generate_text(text, query)
    return no_match_message


In [None]:
# Set the file path (update this path to your file's location).
# A raw string keeps every Windows backslash literal; the previous literal
# contained the invalid escape sequence "\B", which triggers a warning on
# recent Python versions. The resulting path value is unchanged.
file_path = r"C:\Users\pc\OneDrive\Bureau\ragapprroject\NLP models.pdf"
# Convenience wrapper: answer a query against the notebook-level `file_path`
def rag_generate(query):
    """
    Run the RAG application on the module-level `file_path` for one query.

    Parameters:
    query (str): The user's query.

    Returns:
    str: Whatever `rag_application(file_path, query)` returns, i.e. the generated
         text or the message reporting that no relevant information was found.
    """
    return rag_application(file_path, query)


In [None]:
# Example question used for the single-query demonstration.
prompt = "What is the subject of this text?"

In [None]:
# Run the pipeline on `prompt`; the bare `response` expression on the last
# line makes the notebook display the result as the cell output.
response = rag_generate(prompt)
response

In [None]:
# Batch demonstration: ask several questions against the same document.
# ("NPL" in the first question was a typo for "NLP".)
prompts = ["What is NLP?", "How does machine learning work?", "What is deep learning?"]

# Loop-local names differ from `prompt` / `response` so that re-running the
# earlier single-query cells is not affected by this cell's leftover state.
for batch_prompt in prompts:
    print(rag_generate(batch_prompt))
