In [114]:
!pip install numpy pandas faiss-gpu nltk fuzzywuzzy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [115]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [116]:
!pip install -U sentence-transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [117]:
import torch 
import pandas as pd
import numpy as np
import faiss
from fuzzywuzzy import fuzz
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from transformers import AutoTokenizer, AutoModel
import tensorflow as tf
import tensorflow_hub as hub

In [118]:
module_url = "https://tfhub.dev/google/universal-sentence-encoder/4" #@param ["https://tfhub.dev/google/universal-sentence-encoder/4", "https://tfhub.dev/google/universal-sentence-encoder-large/5"]
# Load the TF Hub module selected above and keep it in `use_model` for embed().
use_model = hub.load(module_url)
print(f"module {module_url} loaded")

module https://tfhub.dev/google/universal-sentence-encoder/4 loaded


In [119]:
def embed(input):
  """Run the module-level `use_model` on `input` and return its result.

  `input` is passed through unchanged; the model was loaded from a
  universal-sentence-encoder handle, so presumably it expects a list of
  strings (TODO confirm against the TF Hub model card).
  """
  embeddings = use_model(input)
  return embeddings

In [120]:
# Fetch the NLTK stop-word lists; FuzzySemanticSearch.preprocess_text reads
# them through stopwords.words('english').
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [121]:
# word_tokenize needs the Punkt tokenizer data. Recent NLTK releases look it up
# as 'punkt_tab' and raise LookupError if only 'punkt' is present; the installs
# above are unpinned, so fetch both to keep the notebook runnable on a fresh
# environment as well as on older NLTK versions.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [122]:
# Toy corpus: five near-identical sentences in a single 'text' column.
dataset = pd.Series(
    [
        'The quick brown fox jumps over the lazy fox',
        'A quick brown dog jumps over the lazy fox',
        'The quick brown fox jumps over the lazy cat',
        'The quick brown cat jumps over the lazy dog',
        'A quick brown fox jumps over the lazy dog',
    ],
    name='text',
).to_frame()

In [123]:
# Display the toy corpus defined above.
dataset

Unnamed: 0,text
0,The quick brown fox jumps over the lazy fox
1,A quick brown dog jumps over the lazy fox
2,The quick brown fox jumps over the lazy cat
3,The quick brown cat jumps over the lazy dog
4,A quick brown fox jumps over the lazy dog


In [129]:
import tensorflow_hub as hub

class FuzzySemanticSearch:
    """Semantic search over a text corpus, annotated with a fuzzy string score.

    Each row of ``dataset['text']`` is lower-cased, stripped of stop words and
    non-alphabetic tokens, embedded with the TensorFlow Hub model at
    ``model_name`` and stored in a flat FAISS inner-product index.  ``search``
    retrieves the nearest rows for a query and attaches a fuzzywuzzy
    ``token_sort_ratio`` score to each of them.

    Attributes:
        dataset: DataFrame with a ``'text'`` column.
        model_name: handle passed to ``hub.load``.
        embed: the loaded model; called with a list of strings.
        vectors: corpus embeddings, one row per dataset row.
        index: ``faiss.IndexFlatIP`` sized to ``vectors.shape[-1]``.
    """

    def __init__(self, dataset, model_name):
        """Load the encoder, embed the corpus and populate the index.

        Raises:
            ValueError: if ``dataset['text']`` has no rows.
        """
        self.dataset = dataset
        self.model_name = model_name
        self.embed = hub.load(model_name)
        self.vectors = self.get_embedding()
        self.index = faiss.IndexFlatIP(self.vectors.shape[-1])
        # Populate the index right away so search() never runs against an empty
        # index; build_index() is idempotent, so an explicit call afterwards is
        # harmless.
        self.build_index()

    def _get_stop_words(self):
        """Return the English stop-word set, building it on first use only."""
        cached = getattr(self, '_stop_words', None)
        if cached is None:
            cached = frozenset(stopwords.words('english'))
            self._stop_words = cached
        return cached

    def preprocess_text(self, text):
        """Normalise ``text`` for embedding and fuzzy matching.

        Tokenises with ``word_tokenize``, keeps purely alphabetic tokens that
        are not English stop words, lower-cases them and joins them with single
        spaces.  May return an empty string.
        """
        stop_words = self._get_stop_words()
        kept = []
        for token in word_tokenize(text):
            lowered = token.lower()
            if token.isalpha() and lowered not in stop_words:
                kept.append(lowered)
        return ' '.join(kept)

    def encode_text(self, text):
        """Embed one string; returns the model output for ``[text]`` as a NumPy array."""
        return self.embed([text]).numpy()

    def get_embedding(self):
        """Embed every preprocessed row of ``dataset['text']``.

        Returns:
            The per-row embeddings concatenated along axis 0.

        Raises:
            ValueError: if the dataset has no rows (nothing to embed).
        """
        embeddings = [
            self.encode_text(self.preprocess_text(text))
            for text in self.dataset['text']
        ]
        if not embeddings:
            raise ValueError("dataset['text'] is empty; there is nothing to embed")
        return np.concatenate(embeddings, axis=0)

    def build_index(self):
        """(Re)build the FAISS index from ``self.vectors``.

        The index is cleared first, so calling this more than once (e.g. when a
        notebook cell is re-run) does not store duplicate vectors.
        """
        self.index.reset()
        # FAISS expects C-contiguous float32 input.
        self.index.add(np.ascontiguousarray(self.vectors, dtype=np.float32))

    def search(self, query, k=5, threshold=0):
        """Return up to ``k`` corpus rows closest to ``query``.

        Args:
            query: free-text query; preprocessed like the corpus rows.
            k: number of neighbours requested from the index.
            threshold: minimum ``fuzz.token_sort_ratio`` score (0-100) between
                the preprocessed query and the preprocessed row for a hit to
                be kept.

        Returns:
            A list of ``{'text', 'score', 'distance'}`` dicts ordered best
            first.  ``'distance'`` is the inner product reported by the
            ``IndexFlatIP`` index, so larger means more similar.  The list is
            shorter than ``k`` when the index holds fewer vectors or when hits
            fall below ``threshold``.
        """
        preprocessed_query = self.preprocess_text(query)
        embedding = self.encode_text(preprocessed_query)
        distances, indices = self.index.search(embedding.reshape(1, -1), k)
        results = []
        for distance, index in zip(distances[0], indices[0]):
            # FAISS pads missing neighbours with index -1; without this guard
            # iloc[-1] would silently report the last corpus row as a hit.
            if index < 0:
                continue
            text = self.dataset.iloc[index]['text']
            score = fuzz.token_sort_ratio(preprocessed_query, self.preprocess_text(text))
            if score >= threshold:
                results.append({'text': text, 'score': score, 'distance': distance})
        # Highest inner product first; the fuzzy score breaks ties.
        results.sort(key=lambda hit: (hit['distance'], hit['score']), reverse=True)
        return results




In [130]:
# The search class loads its own encoder from this handle (the "large/5"
# variant), independently of the `use_model` loaded from `module_url` earlier.
model_name = 'https://tfhub.dev/google/universal-sentence-encoder-large/5'
fuzzy_semantic_search = FuzzySemanticSearch(dataset=dataset, model_name=model_name)

In [131]:
# Add the corpus embeddings to the FAISS index so that search() has vectors to query.
fuzzy_semantic_search.build_index()

In [132]:
# Query the index with k and threshold left at their defaults (5 and 0).
# NOTE(review): 'fax' is presumably a deliberate misspelling of 'fox' - confirm.
query = 'fax'
results = fuzzy_semantic_search.search(query=query)

[{'text': 'The quick brown fox jumps over the lazy cat', 'score': 12, 'distance': 0.087689355}, {'text': 'The quick brown fox jumps over the lazy fox', 'score': 12, 'distance': 0.082839005}, {'text': 'A quick brown fox jumps over the lazy dog', 'score': 12, 'distance': 0.053157043}, {'text': 'A quick brown dog jumps over the lazy fox', 'score': 12, 'distance': 0.051227365}, {'text': 'The quick brown cat jumps over the lazy dog', 'score': 6, 'distance': 0.037477486}]


In [133]:
# Print one hit per line: each is a dict with 'text', 'score' and 'distance'.
for i in results:
  print(i)

{'text': 'The quick brown fox jumps over the lazy cat', 'score': 12, 'distance': 0.087689355}
{'text': 'The quick brown fox jumps over the lazy fox', 'score': 12, 'distance': 0.082839005}
{'text': 'A quick brown fox jumps over the lazy dog', 'score': 12, 'distance': 0.053157043}
{'text': 'A quick brown dog jumps over the lazy fox', 'score': 12, 'distance': 0.051227365}
{'text': 'The quick brown cat jumps over the lazy dog', 'score': 6, 'distance': 0.037477486}
