In [1]:
documents = [
    "This is a list which containing sample documents.",
    "Keywords are important for keyword-based search.",
    "Document analysis involves extracting keywords.",
    "Keyword-based search relies on sparse embeddings.",
    "Understanding document structure aids in keyword extraction.",
    "Efficient keyword extraction enhances search accuracy.",
    "Semantic similarity improves document retrieval performance.",
    "Machine learning algorithms can optimize keyword extraction methods."
]

In [None]:
%pip install sentence-transformers

In [None]:
from sentence_transformers import SentenceTransformer
embeddingModel = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

In [5]:
docEmbeddings = embeddingModel.encode(documents)

In [6]:
len(docEmbeddings[0])

384

In [7]:
query = "Natural language processing techniques enhance keyword extraction efficiency"
queryEmbedding = embeddingModel.encode(query)

In [9]:
docEmbeddings.shape

(8, 384)

In [8]:
queryEmbedding.shape

(384,)

In [14]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
similarities = cosine_similarity(np.array([queryEmbedding]),docEmbeddings)

In [15]:
similarities

array([[0.19311799, 0.5493834 , 0.596101  , 0.5637232 , 0.71312535,
        0.8027224 , 0.39504266, 0.7680669 ]], dtype=float32)

In [22]:
mostSimilarDocIdx = np.argmax(similarities)
print(mostSimilarDocIdx)
mostSimilarDoc = documents[mostSimilarDocIdx]
print("Q:", query)
print(mostSimilarDoc)

5
Q: Natural language processing techniques enhance keyword extraction efficiency
Efficient keyword extraction enhances search accuracy.


In [24]:
sortedIdx = np.argsort(similarities[0])
sortedIdx = sortedIdx[::-1]
sortedIdx

array([5, 7, 4, 2, 3, 1, 6, 0])

In [29]:
rankedDocs = [(documents[i], similarities[0][i]) for i in sortedIdx]
rankedDocs

[('Efficient keyword extraction enhances search accuracy.',
  np.float32(0.8027224)),
 ('Machine learning algorithms can optimize keyword extraction methods.',
  np.float32(0.7680669)),
 ('Understanding document structure aids in keyword extraction.',
  np.float32(0.71312535)),
 ('Document analysis involves extracting keywords.', np.float32(0.596101)),
 ('Keyword-based search relies on sparse embeddings.', np.float32(0.5637232)),
 ('Keywords are important for keyword-based search.', np.float32(0.5493834)),
 ('Semantic similarity improves document retrieval performance.',
  np.float32(0.39504266)),
 ('This is a list which containing sample documents.', np.float32(0.19311799))]

In [32]:
print("Ranked Documents:")
for rank, (document, similarity) in enumerate(rankedDocs, start=1):
    print(f"Rank {rank}: Document - '{document}', Similarity Score - {similarity}")

Ranked Documents:
Rank 1: Document - 'Efficient keyword extraction enhances search accuracy.', Similarity Score - 0.8027223944664001
Rank 2: Document - 'Machine learning algorithms can optimize keyword extraction methods.', Similarity Score - 0.7680668830871582
Rank 3: Document - 'Understanding document structure aids in keyword extraction.', Similarity Score - 0.7131253480911255
Rank 4: Document - 'Document analysis involves extracting keywords.', Similarity Score - 0.5961009860038757
Rank 5: Document - 'Keyword-based search relies on sparse embeddings.', Similarity Score - 0.5637232065200806
Rank 6: Document - 'Keywords are important for keyword-based search.', Similarity Score - 0.5493834018707275
Rank 7: Document - 'Semantic similarity improves document retrieval performance.', Similarity Score - 0.39504265785217285
Rank 8: Document - 'This is a list which containing sample documents.', Similarity Score - 0.19311799108982086


In [34]:
print("Top 4 Documents:")
for rank, (document, similarity) in enumerate(rankedDocs[:4], start=1):
    print(f"Rank {rank}: Document - '{document}', Similarity Score - {similarity}")

Top 4 Documents:
Rank 1: Document - 'Efficient keyword extraction enhances search accuracy.', Similarity Score - 0.8027223944664001
Rank 2: Document - 'Machine learning algorithms can optimize keyword extraction methods.', Similarity Score - 0.7680668830871582
Rank 3: Document - 'Understanding document structure aids in keyword extraction.', Similarity Score - 0.7131253480911255
Rank 4: Document - 'Document analysis involves extracting keywords.', Similarity Score - 0.5961009860038757


### RERANKING USING BM25


In [None]:
%pip install rank_bm25

In [37]:
from rank_bm25 import BM25Okapi
topKDocs = [doc[0] for doc in rankedDocs[:4]]
topKDocs

['Efficient keyword extraction enhances search accuracy.',
 'Machine learning algorithms can optimize keyword extraction methods.',
 'Understanding document structure aids in keyword extraction.',
 'Document analysis involves extracting keywords.']

In [39]:
topKDocsTokenized = [doc.split() for doc in topKDocs]
topKDocsTokenized

[['Efficient', 'keyword', 'extraction', 'enhances', 'search', 'accuracy.'],
 ['Machine',
  'learning',
  'algorithms',
  'can',
  'optimize',
  'keyword',
  'extraction',
  'methods.'],
 ['Understanding',
  'document',
  'structure',
  'aids',
  'in',
  'keyword',
  'extraction.'],
 ['Document', 'analysis', 'involves', 'extracting', 'keywords.']]

In [41]:
tokenizedQuery = query.split()
tokenizedQuery

['Natural',
 'language',
 'processing',
 'techniques',
 'enhance',
 'keyword',
 'extraction',
 'efficiency']

In [42]:
bm25 = BM25Okapi(topKDocsTokenized)
bm25

<rank_bm25.BM25Okapi at 0x7ba035e925a0>

In [44]:
relevenceScores = bm25.get_scores(tokenizedQuery)
relevenceScores

array([0.1907998 , 0.16686672, 0.17803252, 0.        ])

In [45]:
sortedIdx2 = np.argsort(relevenceScores)
sortedIdx2 = sortedIdx2[::-1]
sortedIdx2

array([0, 2, 1, 3])

In [48]:
reRankedDocs = [(topKDocs[i], relevenceScores[i]) for i in sortedIdx2]
reRankedDocs

[('Efficient keyword extraction enhances search accuracy.',
  np.float64(0.19079979534096053)),
 ('Understanding document structure aids in keyword extraction.',
  np.float64(0.1780325227902643)),
 ('Machine learning algorithms can optimize keyword extraction methods.',
  np.float64(0.1668667199671815)),
 ('Document analysis involves extracting keywords.', np.float64(0.0))]

In [49]:
print("Top 4 Documents:")
for rank, (document, similarity) in enumerate(rankedDocs[:4], start=1):
    print(f"Rank {rank}: Document - '{document}', Similarity Score - {similarity}")

Top 4 Documents:
Rank 1: Document - 'Efficient keyword extraction enhances search accuracy.', Similarity Score - 0.8027223944664001
Rank 2: Document - 'Machine learning algorithms can optimize keyword extraction methods.', Similarity Score - 0.7680668830871582
Rank 3: Document - 'Understanding document structure aids in keyword extraction.', Similarity Score - 0.7131253480911255
Rank 4: Document - 'Document analysis involves extracting keywords.', Similarity Score - 0.5961009860038757


In [52]:
print("Top 4 ReRanked Documents:")
for rank, (document, similarity) in enumerate(reRankedDocs[:4], start=1):
    print(f"Rank {rank}: Document - '{document}', Relevence Score - {relevenceScores}")

Top 4 ReRanked Documents:
Rank 1: Document - 'Efficient keyword extraction enhances search accuracy.', Relevence Score - [0.1907998  0.16686672 0.17803252 0.        ]
Rank 2: Document - 'Understanding document structure aids in keyword extraction.', Relevence Score - [0.1907998  0.16686672 0.17803252 0.        ]
Rank 3: Document - 'Machine learning algorithms can optimize keyword extraction methods.', Relevence Score - [0.1907998  0.16686672 0.17803252 0.        ]
Rank 4: Document - 'Document analysis involves extracting keywords.', Relevence Score - [0.1907998  0.16686672 0.17803252 0.        ]


### USING CROSS ENCODER

In [53]:
from sentence_transformers import CrossEncoder
crossEncoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

config.json:   0%|          | 0.00/794 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

In [54]:
topKDocs

['Efficient keyword extraction enhances search accuracy.',
 'Machine learning algorithms can optimize keyword extraction methods.',
 'Understanding document structure aids in keyword extraction.',
 'Document analysis involves extracting keywords.']

In [55]:
query

'Natural language processing techniques enhance keyword extraction efficiency'

In [57]:
pairs = []
for doc in topKDocs:
  pairs.append([query, doc])
pairs

[['Natural language processing techniques enhance keyword extraction efficiency',
  'Efficient keyword extraction enhances search accuracy.'],
 ['Natural language processing techniques enhance keyword extraction efficiency',
  'Machine learning algorithms can optimize keyword extraction methods.'],
 ['Natural language processing techniques enhance keyword extraction efficiency',
  'Understanding document structure aids in keyword extraction.'],
 ['Natural language processing techniques enhance keyword extraction efficiency',
  'Document analysis involves extracting keywords.']]

In [58]:
crossEncoderRelevenceScores = crossEncoder.predict(pairs)
crossEncoderRelevenceScores

array([ 2.881895 ,  0.4075726, -3.6697335, -3.3118227], dtype=float32)

In [61]:
scoredDocs = zip(crossEncoderRelevenceScores, topKDocs)
scoredDocs

<zip at 0x7ba0351c3c80>

In [62]:
crossEncoderRerankedDocs = sorted(scoredDocs, reverse= True)

In [63]:
crossEncoderRerankedDocs

[(np.float32(2.881895),
  'Efficient keyword extraction enhances search accuracy.'),
 (np.float32(0.4075726),
  'Machine learning algorithms can optimize keyword extraction methods.'),
 (np.float32(-3.3118227), 'Document analysis involves extracting keywords.'),
 (np.float32(-3.6697335),
  'Understanding document structure aids in keyword extraction.')]

In [64]:
reRankedDocs

[('Efficient keyword extraction enhances search accuracy.',
  np.float64(0.19079979534096053)),
 ('Understanding document structure aids in keyword extraction.',
  np.float64(0.1780325227902643)),
 ('Machine learning algorithms can optimize keyword extraction methods.',
  np.float64(0.1668667199671815)),
 ('Document analysis involves extracting keywords.', np.float64(0.0))]