We will start by extracting sentences from the paper "Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks"

In [1]:
!pip install PyPDF2 nltk pandas



In [2]:
import PyPDF2
import nltk
import pandas as pd
import re

# Download NLTK data files (first time only)
nltk.download('punkt')

def extract_text_from_pdf(pdf_path):
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        text = ""
        for page in reader.pages:
            text += page.extract_text()
    return text

def clean_text(text):
    # Remove unwanted characters and split by sentences
    text = re.sub(r'\s+', ' ', text)
    sentences = nltk.sent_tokenize(text)
    return sentences

# Extract text from the PDF
pdf_path = 'RAG.pdf'
text = extract_text_from_pdf(pdf_path)

# Clean and split text into sentences
sentences = clean_text(text)

# Save sentences into a CSV file
df = pd.DataFrame(sentences, columns=['sentence'])
df.to_csv('sentences.csv', index=False)

print("Sentences extracted and saved to sentences.csv")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Yazan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Sentences extracted and saved to sentences.csv


Now we will generate embeddings using MiniLM 

In [3]:
from sentence_transformers import SentenceTransformer
import numpy as np

# Load the dataset
data = pd.read_csv('sentences.csv')
sentences = data['sentence'].tolist()

# Load the model
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Generate embeddings
embeddings = model.encode(sentences)

# Save embeddings to a file for further use
np.save('embeddings.npy', embeddings)


  from tqdm.autonotebook import tqdm, trange


In [4]:
from sklearn.decomposition import PCA

# Load embeddings
embeddings = np.load('embeddings.npy')

# Apply PCA for dimensionality reduction
pca = PCA(n_components=128)
reduced_embeddings = pca.fit_transform(embeddings)

# Save reduced embeddings for further use
np.save('reduced_embeddings.npy', reduced_embeddings)


In [5]:
!pip install Levenshtein



In [6]:
!pip install matplotlib_venn



In [7]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from Levenshtein import distance as levenshtein_distance
import matplotlib.pyplot as plt
from matplotlib_venn import venn2

# Load embeddings
embeddings = np.load('embeddings.npy')
reduced_embeddings = np.load('reduced_embeddings.npy')

# Create a search query
search_query = "How does retrieval-augmented generation work?"

# Generate embedding for the search query
query_embedding = model.encode([search_query])

# Compute similarity for original embeddings
similarities_original = cosine_similarity(query_embedding, embeddings)
top_10_indices_original = similarities_original.argsort()[0][-10:][::-1]
top_10_sentences_original = [sentences[i] for i in top_10_indices_original]

# Compute similarity for reduced embeddings
reduced_query_embedding = pca.transform(query_embedding)
similarities_reduced = cosine_similarity(reduced_query_embedding, reduced_embeddings)
top_10_indices_reduced = similarities_reduced.argsort()[0][-10:][::-1]
top_10_sentences_reduced = [sentences[i] for i in top_10_indices_reduced]

# Evaluate using Levenshtein distance
def evaluate_similarity(query, results):
    distances = [levenshtein_distance(query, result) for result in results]
    return distances

distances_original = evaluate_similarity(search_query, top_10_sentences_original)
distances_reduced = evaluate_similarity(search_query, top_10_sentences_reduced)

# Create a pivot table
evaluation_df_original = pd.DataFrame({
    'Sentence': top_10_sentences_original,
    'Levenshtein Distance': distances_original,
    'Type': 'Original'
})

evaluation_df_reduced = pd.DataFrame({
    'Sentence': top_10_sentences_reduced,
    'Levenshtein Distance': distances_reduced,
    'Type': 'Reduced'
})

evaluation_df = pd.concat([evaluation_df_original, evaluation_df_reduced])

# Create a pivot table to summarize results
pivot_table = evaluation_df.pivot_table(index='Sentence', columns='Type', values='Levenshtein Distance', aggfunc='first').fillna('-')
print(pivot_table)


Type                                               Original Reduced
Sentence                                                           
As shown in Table 6, learned retrieval improves...     51.0    51.0
H Retrieval Collapse In preliminary experiments...    186.0   186.0
In many real-world applications, retrieval supe...        -   140.0
Learned Retrieval There is signiﬁcant work on l...    154.0   154.0
Our work uniﬁes previous successes in incorpora...        -   168.0
REALM: Retrieval-augmented language model pre-t...     32.0       -
Retrieval-Augmented Generation for Knowledge-In...    496.0       -
Some work optimizes the retrieval module to aid...    179.0   179.0
We conducted an thorough investigation of the l...    187.0   187.0
We endow pre-trained, parametric-memory generat...    152.0   152.0
We explore a general-purpose ﬁne-tuning recipe ...    147.0   147.0
[46] also found spurious retrieval results when...    101.0   101.0


In [9]:
from collections import Counter
intersection_sentences = set(top_10_sentences_original) & set(top_10_sentences_reduced)
def get_word_counts(sentences):
    words = ' '.join(sentences).lower().split()
    return Counter(words)

intersection_words = get_word_counts(intersection_sentences)
print("\nCommon Words in Intersection Sentences:")
for word, count in intersection_words.items():
    if count > 1:  # Only show words that appear more than once
        print(f'{word}: {count}')


Common Words in Intersection Sentences:
learned: 3
retrieval: 9
work: 2
on: 2
learning: 2
to: 7
retrieve: 2
documents: 2
in: 6
with: 2
pre-trained,: 2
language: 2
models: 3
[: 5
we: 6
for: 4
some: 2
such: 2
as: 5
generation: 4
the: 7
component: 2
and: 3
of: 2
a: 6
downstream: 2
approach: 2
general-purpose: 2
ﬁne-tuning: 2
retrieval-augmented: 2
which: 2
non-parametric: 2
results: 2
tasks.: 2
