In [13]:
%%writefile semantic_search.py
import pandas as pd
import numpy as np
import re
import string
import nltk
nltk.download('punkt_tab')

from sentence_transformers import SentenceTransformer, util
from rake_nltk import Rake
from helper import *

class SemanticSearchManager:
    """Semantic search over an in-memory collection of articles.

    Articles are cleaned, tagged with RAKE key phrases and embedded with the
    ``all-MiniLM-L6-v2`` sentence-transformers model; queries are then ranked
    against those embeddings by cosine similarity.
    """

    def __init__(self):
        self.model = SentenceTransformer('all-MiniLM-L6-v2')
        self.rake = Rake()

    def insert_date(self, texts:list[str]):
        """Index ``texts``, replacing any previously indexed collection.

        Each text is cleaned with ``text_preprocessing``; key phrases and the
        embeddings are computed from the cleaned text. Results are stored on
        ``self.data`` (columns ``article`` and ``key_words``) and
        ``self.embeddings``.

        Args:
            texts: Raw article strings.
        """
        self.data = pd.DataFrame({"article": texts})
        self.data['article'] = self.data['article'].apply(self.text_preprocessing)
        self.data['key_words'] = self.data['article'].apply(self.extract_keywords)
        self.embeddings = self.model.encode(self.data['article'].to_list(), convert_to_tensor=True)

    # Correctly spelled alias; ``insert_date`` stays available for existing callers.
    insert_data = insert_date

    def text_preprocessing(self, text):
        """Return ``text`` with links, hashtags, mentions, punctuation and digits removed.

        Args:
            text: Raw article string.

        Returns:
            The cleaned string.
        """
        # Order matters: links, hashtags and mentions are recognised by their
        # punctuation ('://', '.', '#', '@'), so they must be removed BEFORE
        # punctuation is stripped; otherwise they can no longer be detected and
        # their letters stay in the text.
        # NOTE(review): assumes the helper module's remove_* functions are
        # pattern-based cleaners as their names suggest - confirm in helper.py.
        text = remove_links(text)
        text = remove_hashtags_mentions(text)
        text = remove_punctuations(text)
        text = remove_digits(text)
        return text

    def similarity_search(self, query:str, top_k:int = 3):
        """Return the indexed articles most similar to ``query``.

        Args:
            query: Free-text search query.
            top_k: Maximum number of results. Fewer are returned when fewer
                articles are indexed.

        Returns:
            A list of ``{"article": ..., "key_words": ...}`` dicts in the order
            produced by ``topk`` on the similarity scores.
        """
        query_embedding = self.model.encode(query, convert_to_tensor=True)
        similarities = util.pytorch_cos_sim(query_embedding, self.embeddings)
        # topk raises if k exceeds the number of scored articles, so cap it
        # (the default top_k=3 would otherwise fail on a 1- or 2-article index).
        k = min(top_k, similarities.shape[-1])
        top_indices = similarities.topk(k=k)[1].tolist()[0]
        results = []
        for idx in top_indices:
            row = self.data.iloc[idx]
            results.append({"article": row["article"], "key_words": row["key_words"]})
        return results

    def extract_keywords(self, text):
        """Return the RAKE-ranked key phrases extracted from ``text``."""
        self.rake.extract_keywords_from_text(text)
        return self.rake.get_ranked_phrases()


Overwriting semantic_search.py


In [4]:
import pandas as pd
import numpy as np
import re
import string
import nltk
nltk.download('punkt_tab')
import string
from sentence_transformers import SentenceTransformer, util
from rake_nltk import Rake

[nltk_data] Downloading package punkt_tab to C:\Users\Ahmed-
[nltk_data]     Basem\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [5]:
# Small demo corpus: one sentence per topic.
articles = [
    "Artificial Intelligence is revolutionizing technology.",
    "Climate change is a pressing global issue.",
    "Advances in quantum computing are remarkable.",
    "Vaccines are crucial for combating pandemics.",
    "Space exploration inspires innovation.",
]
# Single-column frame; later cells add derived columns to it.
data = pd.DataFrame(articles, columns=["article"])

In [6]:
def remove_punctuations(text):
  """Return ``text`` with every character of ``string.punctuation`` deleted."""
  # A translation table mapping a character to None deletes that character.
  deletion_table = str.maketrans(dict.fromkeys(string.punctuation))
  return text.translate(deletion_table)

def remove_digits(text):
  """Return ``text`` with the ASCII digits ``0``-``9`` deleted."""
  # A translation table mapping a character to None deletes that character.
  deletion_table = str.maketrans(dict.fromkeys(string.digits))
  return text.translate(deletion_table)

def remove_links(text):
  """Return ``text`` with URL-like substrings deleted.

  The pattern is: an optional ``http://`` / ``https://`` scheme, a run of
  lowercase letters, digits, dots or hyphens, a dot, a 2-6 character suffix of
  lowercase letters or dots, and an optional path. Because the scheme is
  optional, bare dotted lowercase tokens such as ``example.com`` match too.
  """
  link_pattern = re.compile(r"(https?:\/\/)?([\da-z\.-]+)\.([a-z\.]{2,6})([\/\w\.-]*)")
  return link_pattern.sub("", text)

def remove_hashtags_mentions(text):
  """Return ``text`` with ``@mentions`` and ``#hashtags`` deleted.

  Mentions are ``@`` followed by word characters; hashtags are ``#`` followed
  by any run of non-whitespace characters. Mentions are removed first.
  """
  for pattern in (r"@[\w]+", r"#\S+"):
    text = re.sub(pattern, "", text)
  return text

In [12]:
def text_preprocessing(text):
    """Return ``text`` with links, hashtags, mentions, punctuation and digits removed.

    Args:
        text: Raw article string.

    Returns:
        The cleaned string.
    """
    # Order matters: the link, hashtag and mention patterns rely on '://', '.',
    # '#' and '@'. Stripping punctuation first would erase those markers, so
    # the patterns could never match and the link/tag letters would remain.
    text = remove_links(text)
    text = remove_hashtags_mentions(text)
    text = remove_punctuations(text)
    text = remove_digits(text)
    return text
# Clean every article in place, element by element.
data['article'] = data['article'].map(text_preprocessing)

In [49]:
# Load the pretrained sentence encoder, then embed all articles in one batch.
model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = model.encode(
    data['article'].tolist(),
    convert_to_tensor=True,
)

In [50]:
# Module-level RAKE extractor built with default settings; extract_keywords() reuses it.
rake = Rake()

def extract_keywords(text):
    """Return the ranked key phrases that the shared ``rake`` extractor finds in ``text``."""
    extractor = rake
    # RAKE is stateful: feed it the text first, then read back the ranking.
    extractor.extract_keywords_from_text(text)
    ranked_phrases = extractor.get_ranked_phrases()
    return ranked_phrases

In [51]:
# One list of ranked key phrases per article.
data['key_words'] = data['article'].map(extract_keywords)

In [52]:
# Preview the first five rows (articles plus their key phrases).
data.head(n=5)

Unnamed: 0,article,key_words
0,Artificial Intelligence is revolutionizing tec...,"[revolutionizing technology, artificial intell..."
1,Climate change is a pressing global issue.,"[pressing global issue, climate change]"
2,Advances in quantum computing are remarkable.,"[quantum computing, remarkable, advances]"
3,Vaccines are crucial for combating pandemics.,"[combating pandemics, vaccines, crucial]"
4,Space exploration inspires innovation.,[space exploration inspires innovation]


In [53]:
def similarity_search(query:str, top_k:int = 3):
    """Return the articles in ``data`` most similar to ``query``.

    The query is embedded with the module-level ``model`` and compared with
    the precomputed ``embeddings`` by cosine similarity.

    Args:
        query: Free-text search query.
        top_k: Maximum number of results. Fewer are returned when fewer
            articles were embedded.

    Returns:
        A list of ``{"article": ..., "key_words": ...}`` dicts in the order
        produced by ``topk`` on the similarity scores.
    """
    query_embedding = model.encode(query, convert_to_tensor=True)
    similarities = util.pytorch_cos_sim(query_embedding, embeddings)
    # topk raises if k exceeds the number of scored articles, so cap it
    # (the default top_k=3 would otherwise fail on a 1- or 2-article corpus).
    k = min(top_k, similarities.shape[-1])
    top_indices = similarities.topk(k=k)[1].tolist()[0]
    results = []
    for idx in top_indices:
        row = data.iloc[idx]
        results.append({"article": row["article"], "key_words": row["key_words"]})
    return results

In [54]:
# Demo: rank the corpus against a short free-text query.
query = "technology and AI"
results = similarity_search(query=query)
print("Semantic Search Results:", results)

Semantic Search Results: [{'article': 'Artificial Intelligence is revolutionizing technology.', 'key_words': ['revolutionizing technology', 'artificial intelligence']}, {'article': 'Space exploration inspires innovation.', 'key_words': ['space exploration inspires innovation']}, {'article': 'Advances in quantum computing are remarkable.', 'key_words': ['quantum computing', 'remarkable', 'advances']}]
