In [None]:

documents = [
    "This is a list which containing sample documents.",
    "Keywords are important for keyword-based search.",
    "Document analysis involves extracting keywords.",
    "Keyword-based search relies on sparse embeddings.",
    "Understanding document structure aids in keyword extraction.",
    "Efficient keyword extraction enhances search accuracy.",
    "Semantic similarity improves document retrieval performance.",
    "Machine learning algorithms can optimize keyword extraction methods."
]




In [None]:
!pip install sentence_transformers

Collecting sentence_transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl (227 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.1/227.1 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11.0->sentence_transformers)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.11.0->sentence_transformers)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.11.0->sentence_transformers)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.11.0->sentence_transformers)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.11.0->sentence_transform

In [None]:
## Load pretrainer sentence Transformer model
model_name="sentence-transformers/paraphrase-xlm-r-multilingual-v1"

In [None]:
from sentence_transformers import SentenceTransformer

In [None]:
model=SentenceTransformer(model_name)

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.78k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/718 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/550 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
##length of docs
len(documents)

8

In [None]:
## creating embedding of the documents
doc_embeddings=model.encode(documents)

In [None]:
len(doc_embeddings)

8

In [None]:
## shape of embedding
len(doc_embeddings[0])

768

In [None]:
## Query
query="Natural language processing techniques enhance keyword extraction efficiency."

In [None]:
## Getting Query for embeddings
query_embedding=model.encode(query)
query_embedding.shape

(768,)

In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
## Calculating the similarity score
similarities=cosine_similarity(np.array([query_embedding]),doc_embeddings)

In [None]:
similarities

array([[0.16948149, 0.4580228 , 0.5675695 , 0.441233  , 0.6316118 ,
        0.75214136, 0.550352  , 0.74481666]], dtype=float32)

In [None]:
## getting the most similar document index.
most_similar_index=np.argmax(similarities)
most_similar_index

5

In [None]:
sorted_indices=np.argsort(similarities)[0][::-1]
sorted_indices

array([5, 7, 4, 2, 6, 1, 3, 0])

In [None]:
ranked_doc=[(documents[i],similarities[0][i]) for i in sorted_indices]

In [None]:
ranked_doc

[('Efficient keyword extraction enhances search accuracy.', 0.75214136),
 ('Machine learning algorithms can optimize keyword extraction methods.',
  0.74481666),
 ('Understanding document structure aids in keyword extraction.', 0.6316118),
 ('Document analysis involves extracting keywords.', 0.5675695),
 ('Semantic similarity improves document retrieval performance.', 0.550352),
 ('Keywords are important for keyword-based search.', 0.4580228),
 ('Keyword-based search relies on sparse embeddings.', 0.441233),
 ('This is a list which containing sample documents.', 0.16948149)]

In [None]:

print("Top 4 Documents:")
for rank, (documents, similarities) in enumerate(ranked_doc[:4], start=1):
    print(f"Rank {rank}: Document - '{documents}', Similarity Score - {similarities}")

Top 4 Documents:
Rank 1: Document - 'Efficient keyword extraction enhances search accuracy.', Similarity Score - 0.7521413564682007
Rank 2: Document - 'Machine learning algorithms can optimize keyword extraction methods.', Similarity Score - 0.7448166608810425
Rank 3: Document - 'Understanding document structure aids in keyword extraction.', Similarity Score - 0.6316118240356445
Rank 4: Document - 'Document analysis involves extracting keywords.', Similarity Score - 0.5675694942474365


In [None]:
!pip install rank_bm25

Collecting rank_bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Installing collected packages: rank_bm25
Successfully installed rank_bm25-0.2.2


In [None]:
from rank_bm25 import BM25Okapi

In [None]:
top_4_documents=[doc[0] for doc in ranked_doc[:4]]

In [None]:
top_4_documents

['Efficient keyword extraction enhances search accuracy.',
 'Machine learning algorithms can optimize keyword extraction methods.',
 'Understanding document structure aids in keyword extraction.',
 'Document analysis involves extracting keywords.']

In [None]:
tokenized_top_4_documents=[doc.split() for doc in top_4_documents]

In [None]:
tokenized_top_4_documents

[['Efficient', 'keyword', 'extraction', 'enhances', 'search', 'accuracy.'],
 ['Machine',
  'learning',
  'algorithms',
  'can',
  'optimize',
  'keyword',
  'extraction',
  'methods.'],
 ['Understanding',
  'document',
  'structure',
  'aids',
  'in',
  'keyword',
  'extraction.'],
 ['Document', 'analysis', 'involves', 'extracting', 'keywords.']]

In [None]:
tokenized_query=query.split()

In [None]:
tokenized_query

['Natural',
 'language',
 'processing',
 'techniques',
 'enhance',
 'keyword',
 'extraction',
 'efficiency.']

In [None]:
bm25=BM25Okapi(tokenized_top_4_documents)

In [None]:
bm25

<rank_bm25.BM25Okapi at 0x7bfe156b78b0>

In [None]:
bm25_scores=bm25.get_scores(tokenized_query)
bm25_scores

array([0.1907998 , 0.16686672, 0.17803252, 0.        ])

In [None]:
sorted_indices2=np.argsort(bm25_scores)[::-1]
sorted_indices2

array([0, 2, 1, 3])

In [None]:

reranked_documents = [(top_4_documents[i], bm25_scores[i]) for i in sorted_indices2]

In [None]:

print("Rerank of top 4 Documents:")
for rank, (document, similarity) in enumerate(reranked_documents, start=1):
    print(f"Rank {rank}: Document - '{document}', Similarity Score - {similarity}")

Rerank of top 4 Documents:
Rank 1: Document - 'Efficient keyword extraction enhances search accuracy.', Similarity Score - 0.19079979534096053
Rank 2: Document - 'Understanding document structure aids in keyword extraction.', Similarity Score - 0.1780325227902643
Rank 3: Document - 'Machine learning algorithms can optimize keyword extraction methods.', Similarity Score - 0.1668667199671815
Rank 4: Document - 'Document analysis involves extracting keywords.', Similarity Score - 0.0
