In [25]:
# Example pair shared by the vectorization and embedding cells below.
sentences = [
    "I love machine learning",
    "I enjoy studying machine learning",
]
sentence1, sentence2 = sentences


# **BASIC VECTORIZATION TECHNIQUES (Frequency-Based)**

**1 One-Hot Encoding (OHE)**

In [26]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# binary=True stores 1 when a term occurs in a sentence and 0 otherwise,
# regardless of how often it occurs.
# Note: the printed vocabulary has no "i" even though both sentences start
# with "I" -- the vectorizer's default token pattern only keeps tokens of
# two or more characters.
vectorizer = CountVectorizer(binary=True)
vectors = vectorizer.fit_transform(sentences)
print("Vocabulary:", vectorizer.get_feature_names_out())
print("OHE vectors:\n", vectors.toarray())

# Slice each sentence as a single-row matrix so the inputs stay 2-D.
similarity = cosine_similarity(vectors[0:1], vectors[1:2])
print("OHE Similarity:", similarity[0, 0])


Vocabulary: ['enjoy' 'learning' 'love' 'machine' 'studying']
OHE vectors:
 [[0 1 1 1 0]
 [1 1 0 1 1]]
OHE Similarity: 0.5773502691896258


**2 Bag of Words (BoW)**

In [27]:
# Plain bag of words: each column holds the raw count of a vocabulary term.
# The matrix printed here matches the binary (OHE) one because no word is
# repeated within either example sentence.
vectorizer = CountVectorizer()
vectors = vectorizer.fit_transform(sentences)
print("BoW vectors:\n", vectors.toarray())

# Slice each sentence as a single-row matrix so the inputs stay 2-D.
similarity = cosine_similarity(vectors[0:1], vectors[1:2])
print("BoW Similarity:", similarity[0, 0])


BoW vectors:
 [[0 1 1 1 0]
 [1 1 0 1 1]]
BoW Similarity: 0.5773502691896258


**3 N-grams (Bi-grams)**

In [32]:
# ngram_range=(2, 2) builds the vocabulary from pairs of adjacent words only,
# so word order starts to matter and single words are not features.
vectorizer = CountVectorizer(ngram_range=(2, 2))
vectors = vectorizer.fit_transform(sentences)
print("Bi-gram vocabulary:", vectorizer.get_feature_names_out())
print("Bi-gram vectors:\n", vectors.toarray())

# Slice each sentence as a single-row matrix so the inputs stay 2-D.
similarity = cosine_similarity(vectors[0:1], vectors[1:2])
print("N-gram Similarity:", similarity[0, 0])


Bi-gram vocabulary: ['enjoy studying' 'love machine' 'machine learning' 'studying machine']
Bi-gram vectors:
 [[0 1 1 0]
 [1 0 1 1]]
N-gram Similarity: 0.408248290463863


**4 TF-IDF**

In [29]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF down-weights terms shared by both sentences ("machine", "learning")
# relative to terms unique to one sentence; the printed weights show this.
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(sentences)
print("TF-IDF vectors:\n", vectors.toarray())

# Slice each sentence as a single-row matrix so the inputs stay 2-D.
similarity = cosine_similarity(vectors[0:1], vectors[1:2])
print("TF-IDF Similarity:", similarity[0, 0])


TF-IDF vectors:
 [[0.         0.50154891 0.70490949 0.50154891 0.        ]
 [0.57615236 0.40993715 0.         0.40993715 0.57615236]]
TF-IDF Similarity: 0.4112070550676187


# **VECTOR EMBEDDING TECHNIQUES (Semantic Meaning-Based)**

**1 Word2Vec**

In [33]:
import gensim.downloader as api
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity


# Load the pretrained "word2vec-google-news-300" vectors through gensim's
# downloader (presumably fetched and cached on first use -- this can be a
# large download; confirm before running on a constrained machine).
# NOTE: the name `model` is rebound to a SentenceTransformer in a later cell,
# so anything relying on the Word2Vec model must run before that cell.
model = api.load("word2vec-google-news-300")

def sentence_vector(sentence, keyed_vectors=None):
    """Return the mean word vector of ``sentence``.

    The sentence is lower-cased and split on whitespace; tokens that are not
    in the vocabulary are skipped. Punctuation is not stripped, so a token
    such as "helpful." is looked up exactly as written.

    Args:
        sentence: Text to embed.
        keyed_vectors: Object supporting ``word in kv``, ``kv[word]`` and a
            ``vector_size`` attribute (assumed to be gensim KeyedVectors --
            confirm). Defaults to the notebook-level ``model``, which must
            still be bound to the Word2Vec model when this is called (a later
            cell rebinds ``model`` to a SentenceTransformer).

    Returns:
        A 1-D array holding the element-wise mean of the word vectors that
        were found, or an all-zero vector of length ``vector_size`` when no
        token is in the vocabulary.
    """
    if keyed_vectors is None:
        # Fall back to the global so existing one-argument calls keep working.
        keyed_vectors = model

    tokens = sentence.lower().split()
    known_vectors = [keyed_vectors[token] for token in tokens if token in keyed_vectors]

    if not known_vectors:
        # np.mean over an empty list yields NaN (with a RuntimeWarning), which
        # then breaks cosine_similarity; a zero vector gives similarity 0.
        return np.zeros(keyed_vectors.vector_size)

    return np.mean(known_vectors, axis=0)

# Embed each example sentence as the average of its word vectors.
vec1, vec2 = (sentence_vector(text) for text in (sentence1, sentence2))

# cosine_similarity works on 2-D inputs, so wrap each vector as a single row.
similarity = cosine_similarity([vec1], [vec2])
print("Word2Vec Similarity:", similarity[0, 0])


Word2Vec Similarity: 0.83297575


**2 LLM-Based (Sentence Transformer)**

In [34]:
from sentence_transformers import SentenceTransformer, util

# Load the pretrained all-MiniLM-L6-v2 sentence encoder. This rebinds `model`,
# which held the Word2Vec vectors in the previous section.
model = SentenceTransformer("all-MiniLM-L6-v2")

# One embedding row per entry of `sentences`.
embeddings = model.encode(sentences)

# cos_sim returns a tensor; .item() unwraps the single score to a float.
similarity = util.cos_sim(a=embeddings[0], b=embeddings[1])
print("LLM-Based Similarity:", similarity.item())

LLM-Based Similarity: 0.7824221849441528


In [23]:
from sentence_transformers import SentenceTransformer, util


# A second example pair: different wording, related meaning.
# NOTE: this rebinds sentence1/sentence2; the `sentences` list defined at the
# top of the notebook still holds the original pair.
sentence1 = "The customer service was excellent and very helpful."
sentence2 = "The support team was great and solved my issue quickly"

# Encode both sentences in a single batch with the MiniLM sentence encoder.
model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = model.encode([sentence1, sentence2])

# cos_sim returns a tensor; .item() below unwraps the single score to a float.
cosine_score = util.cos_sim(a=embeddings[0], b=embeddings[1])

print(f"Sentence 1: {sentence1}")
print(f"Sentence 2: {sentence2}")

# One row per sentence; show only the first five components of each row.
print(f"Embeding shape {embeddings.shape}")
print(embeddings[0, :5])
print(embeddings[1, :5])

print(f"Similarity Score: {cosine_score.item():.4f}")

Sentence 1: The customer service was excellent and very helpful.
Sentence 2: The support team was great and solved my issue quickly
Embeding shape (2, 384)
[-0.10404737  0.02839851  0.02214675 -0.03633233 -0.08666304]
[-0.06493975 -0.02756827  0.00640762 -0.02886446  0.04612743]
Similarity Score: 0.3724


# **FAQ Question Matching System**

In [15]:
# Small in-memory FAQ knowledge base: one {"question", "answer"} dict per
# entry, built from (question, answer) pairs to keep the data compact.
faqs = [
    {"question": question, "answer": answer}
    for question, answer in (
        (
            "How can I reset my password?",
            "Go to settings and click on 'Reset Password'.",
        ),
        (
            "How do I change my email address?",
            "You can change your email from profile settings.",
        ),
        (
            "What is the refund policy?",
            "Refunds are processed within 5–7 working days.",
        ),
        (
            "How to contact customer support?",
            "You can contact support via email or chat.",
        ),
    )
]


In [17]:
# Only the question text is embedded; the answer is looked up by position
# once the best-matching question is known.
# Uses whichever `model` is currently bound -- it must be the
# SentenceTransformer, not the Word2Vec model loaded earlier.
faq_questions = [entry["question"] for entry in faqs]
faq_embeddings = model.encode(faq_questions)


In [24]:
# `util` is used below, so import it here rather than relying on an earlier cell.
from sentence_transformers import SentenceTransformer, util

# Re-create the encoder so this cell does not depend on what `model` was last
# bound to. faq_embeddings must have been produced with this same model name.
model = SentenceTransformer('all-MiniLM-L6-v2')

user_query = "I forgot my password and cannot login"
query_embedding = model.encode(user_query)

# Row 0 holds the query's cosine similarity to every FAQ question.
similarities = util.cos_sim(query_embedding, faq_embeddings)

# faq_embeddings is computed in a separate cell; fail loudly if it is stale
# or empty instead of silently matching the wrong FAQ entry.
assert len(faqs) > 0, "faqs is empty; nothing to match against"
assert similarities.shape[1] == len(faqs), (
    "faq_embeddings is out of sync with faqs; re-run the FAQ embedding cell"
)

# argmax picks the highest-scoring FAQ (the first one on ties, which is what
# the previous strict '>' loop did as well).
best_index = int(similarities[0].argmax())
best_score = similarities[0][best_index].item()
best_faq = faqs[best_index]

print("User Question:", user_query)
print("Matched FAQ:", best_faq["question"])
print("Answer:", best_faq["answer"])
print("Similarity Score:", round(best_score, 4))


def similarity_label(score):
    """Map a similarity score to a coarse relevance label.

    Cut-offs are exclusive: a score must be strictly greater than a threshold
    to earn that threshold's label.

    Args:
        score: Similarity value to classify.

    Returns:
        "Highly Relevant" above 0.8, "Relevant" above 0.6, "Weak Match"
        above 0.4, otherwise "No Good Match".
    """
    # Ordered from strictest to loosest; the first cut-off exceeded wins.
    cutoffs = (
        (0.8, "Highly Relevant"),
        (0.6, "Relevant"),
        (0.4, "Weak Match"),
    )
    for threshold, label in cutoffs:
        if score > threshold:
            return label
    return "No Good Match"

# Turn the best FAQ's similarity score into a human-readable relevance label.
print("Match Type:", similarity_label(best_score))

User Question: I forgot my password and cannot login
Matched FAQ: How can I reset my password?
Answer: Go to settings and click on 'Reset Password'.
Similarity Score: 0.8119
Match Type: Highly Relevant


In [1]:
pip install sentence-transformers

