Install the required libraries

In [1]:
!pip install pymupdf python-docx openai==0.28 numpy




In [2]:
import os
import openai
import numpy as np
import docx
import fitz
from sklearn.metrics.pairwise import cosine_similarity
from dotenv import load_dotenv
from typing import List, Tuple

Set the OpenAI API key

In [3]:
# Read variables from a local .env file (if present) into the process environment.
load_dotenv()

openai.api_key = os.getenv("OPENAI_API_KEY")

# An unset variable yields None and an empty entry (e.g. `OPENAI_API_KEY=`) yields "";
# reject both here instead of failing later on the first API call.
if not openai.api_key:
    raise ValueError("OpenAI API Key not found. Please set it in the .env file.")


Functions to load and preprocess text

In [4]:
def load_text(file_path: str) -> str:
    """Read a document from disk and return its contents as a single string.

    The file type is chosen from the extension, compared case-insensitively:
    ``.txt`` is read as UTF-8, ``.docx`` yields its paragraph texts joined by
    newlines, and ``.pdf`` yields the text of each page joined by newlines.

    Raises:
        ValueError: If the extension is not one of the three above.
    """
    _, suffix = os.path.splitext(file_path)
    suffix = suffix.lower()

    if suffix == ".txt":
        with open(file_path, 'r', encoding='utf-8') as handle:
            return handle.read()

    if suffix == ".docx":
        document = docx.Document(file_path)
        paragraph_texts = [paragraph.text for paragraph in document.paragraphs]
        return '\n'.join(paragraph_texts)

    if suffix == ".pdf":
        # Collect the page texts while the PDF is still open, join afterwards.
        with fitz.open(file_path) as pdf_doc:
            page_texts = [page.get_text() for page in pdf_doc]
        return '\n'.join(page_texts)

    raise ValueError(f"Unsupported file type: {suffix}")

# Function to clean and preprocess text
def preprocess_text(text: str) -> str:
    """Lower-case ``text`` and turn every newline character into a space."""
    lowered = text.lower()
    # Splitting on '\n' and re-joining with ' ' swaps each newline for one space.
    return ' '.join(lowered.split('\n'))


Function to vectorize text

In [5]:
def vectorize_text(text: str) -> np.ndarray:
    """Embed ``text`` with OpenAI's ``text-embedding-ada-002`` model.

    Returns:
        The first embedding in the API response as a NumPy array, or ``None``
        (after printing the error) if the call raises an ``OpenAIError``.
    """
    try:
        api_result = openai.Embedding.create(
            model="text-embedding-ada-002",
            input=text,
        )
        embedding = api_result['data'][0]['embedding']
        return np.array(embedding)
    except openai.error.OpenAIError as err:
        print(f"Error in vectorizing text: {err}")
        return None


Functions to chunk text and find the most relevant chunk

In [6]:
# Function to chunk text
def chunk_text(text: str, chunk_size: int = 1000) -> List[str]:
    """Split ``text`` into consecutive, non-overlapping character chunks.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk; must be positive.

    Returns:
        The chunks in order. Every chunk has ``chunk_size`` characters except
        possibly the last; an empty ``text`` gives an empty list.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    # A negative step would make range() empty and silently return no chunks,
    # and a zero step raises an unrelated-looking range() error; fail clearly.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")
    return [text[start:start + chunk_size] for start in range(0, len(text), chunk_size)]

# Function to find the most relevant chunk
def find_most_relevant_chunk(query: str, chunks: List[str]) -> Tuple[str, float]:
    """Return the chunk whose embedding is most similar to the query's.

    Args:
        query: The question to embed and compare against.
        chunks: Candidate text chunks.

    Returns:
        ``(chunk, similarity)`` for the chunk with the highest cosine
        similarity to the query, with the similarity as a plain ``float``.
        Returns ``("", 0.0)`` when the query cannot be embedded or when no
        chunk can be embedded.
    """
    query_vector = vectorize_text(query)
    if query_vector is None:
        return "", 0.0

    # Keep each chunk paired with its own vector. Dropping failed embeddings
    # from the vectors alone would shift their positions relative to `chunks`
    # and make the argmax index point at the wrong chunk.
    embedded = []
    for chunk in chunks:
        vector = vectorize_text(chunk)
        if vector is not None:
            embedded.append((chunk, vector))

    if not embedded:
        return "", 0.0

    valid_chunks, valid_vectors = zip(*embedded)
    similarities = cosine_similarity([query_vector], list(valid_vectors))[0]
    best_index = int(np.argmax(similarities))
    return valid_chunks[best_index], float(similarities[best_index])


Function to generate a response using the OpenAI Chat Completions API

In [7]:
# Function to generate response
def generate_response(context: str, query: str) -> str:
    """Answer ``query`` with ``gpt-3.5-turbo``, grounded in ``context``.

    Args:
        context: The retrieved text the answer should be based on.
        query: The user's question.

    Returns:
        The stripped text of the first completion choice, or a fixed apology
        string (after printing the error) if the call raises an
        ``OpenAIError``.
    """
    # Supply the retrieved text as input alongside the question. Sending it
    # as an "assistant" message after the question would present it to the
    # model as an answer it had already given rather than as source material.
    user_message = f"Context:\n{context}\n\nQuestion: {query}"
    try:
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": "You are an AI specialized in answering questions based on provided text data. Limit your response to the context of the provided text."},
                {"role": "user", "content": user_message},
            ],
            max_tokens=200,
            temperature=0.7
        )
        return response.choices[0]['message']['content'].strip()
    except openai.error.OpenAIError as e:
        print(f"Error in generating response: {e}")
        return "Sorry, I couldn't generate a response at this time."

Main function to handle the RAG application

In [13]:
def rag_application(file_path: str, query: str, similarity_threshold: float = 0.5) -> str:
    """Run the full pipeline: load, preprocess, chunk, retrieve, then answer.

    The document at ``file_path`` is loaded, preprocessed and chunked; the
    chunk most similar to ``query`` is retrieved, and a response is generated
    from it only when its similarity is strictly greater than
    ``similarity_threshold``.

    Returns:
        The generated response, a fixed "no relevant information" message
        when the best similarity does not exceed the threshold, or a fixed
        error message (after printing the exception) if any step raises.
    """
    try:
        chunks = chunk_text(preprocess_text(load_text(file_path)))
        best_chunk, score = find_most_relevant_chunk(query, chunks)

        # Guard clause: stop unless the best match clears the threshold.
        if not score > similarity_threshold:
            return "No relevant information found in the text."
        return generate_response(best_chunk, query)
    except Exception as err:
        print(f"Error in RAG application: {err}")
        return "An error occurred while processing your request."

In [14]:
# Path of the document to query (update this path to your file's location).
# NOTE: this is an absolute, machine-specific Windows path and will not exist
# on another machine as written.
file_path = "C:\\Users\\pc\\OneDrive\\Bureau\\ragapprroject\\NLP models.pdf"

In [15]:
# Example question to ask about the document.
prompt = "What is the subject of this text?"

In [16]:
# Run the RAG pipeline on the document for the example prompt; the bare
# `response` on the last line makes the notebook display the answer.
response = rag_application(file_path, prompt)
response

'The subject of the text is about the implementation of fine-tuning and reinforcement learning from human feedback in order to adapt models for real-world applications like chatbots. The text discusses pre-training objectives, data collection, data cleaning, and tokenization as part of this implementation process.'

In [17]:
# Several example questions, each answered independently against the same document.
prompts = [
    "What is NLP?",
    "How does Pre-training work?",
    "What are Challenges with large language models (LLMs)?",
    "What is Retrieval augmented generation (RAG)?",
]

for prompt in prompts:
    response = rag_application(file_path, prompt)
    print(f"Query: {prompt}\nResponse: {response}\n")

Query: What is NLP?
Response: NLP stands for Natural Language Processing. It involves the use of computational techniques to analyze, understand, and generate human language data. In the context provided, NLP is discussed in relation to pre-trained models that enhance performance, scalability, and innovation in AI applications. The text suggests that leveraging these models can lead to advancements in AI development and deployment.

Query: How does Pre-training work?
Response: Pre-training works by training a model to predict the next word in a sequence of text. This initial training helps the model understand language patterns and concepts from a diverse set of text data. The collected data is cleaned to remove noise and inconsistencies, and then tokenized to break down the text into smaller units for processing. This pre-training process forms the foundation for the model to understand language, but further steps like fine-tuning and reinforcement learning are necessary to adapt the 