<a href="https://colab.research.google.com/github/WaryFriend456/NLP/blob/main/nlplab_p4_22BD1A660W_12_03_2025.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

text1 = "Machine learning is a field of artificial intelligence."
text2 = "Deep learning is a branch of artificial intelligence and machine learning."

vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform([text1, text2])

cosine_sim = cosine_similarity(tfidf_matrix[0], tfidf_matrix[1])
print(f"Cosine Similarity (TF-IDF): {cosine_sim[0][0]:.4f}")

Cosine Similarity (TF-IDF): 0.6416


In [25]:
!pip install gensim



In [26]:
import numpy as np
import gensim.downloader as api
from sklearn.metrics.pairwise import cosine_similarity

glove_model = api.load("glove-wiki-gigaword-50")
def get_embedding(text, model):
  words = text.lower().split()
  word_vectors = [model[word] for word in words if word in model]
  if not word_vectors:
    return np.zeros(model.vector_size)
  return np.mean(word_vectors, axis=0)

embedding1 = get_embedding(text1, glove_model)
embedding2 = get_embedding(text2, glove_model)

cosine_sim = cosine_similarity([embedding1], [embedding2])
print(f"Cosine Similarity (Word Embeddings - GloVe): {cosine_sim[0][0]:.4f}")

Cosine Similarity (Word Embeddings - GloVe): 0.9705


In [27]:
import numpy as np
from scipy.spatial.distance import euclidean, cityblock, hamming
from scipy.stats import pearsonr

euclidean_distance = euclidean(embedding1, embedding2)
print(f"Euclidean Distance: {euclidean_distance:.4f}")

manhattan_distance = cityblock(embedding1, embedding2)
print(f"Manhattan Distance: {manhattan_distance:.4f}")

pearson_correlation, _ = pearsonr(embedding1, embedding2)
print(f"Pearson Correlation: {pearson_correlation:.4f}")

embedding1_binary = np.where(embedding1 > np.mean(embedding1), 1, 0)
embedding2_binary = np.where(embedding2 > np.mean(embedding2), 1, 0)

min_len = min(len(embedding1_binary), len(embedding2_binary))
hamming_distance = hamming(embedding1_binary[:min_len], embedding2_binary[:min_len])


print(f"Hamming Distance: {hamming_distance:.4f}")


Euclidean Distance: 0.9626
Manhattan Distance: 5.4426
Pearson Correlation: 0.9703
Hamming Distance: 0.1000


In [28]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import jaccard_score

vectorizer = CountVectorizer(binary=True)
count_matrix = vectorizer.fit_transform([text1, text2])
jaccard_sim = jaccard_score(count_matrix[0].toarray().flatten(), count_matrix[1].toarray().flatten())
print(f"Jaccard Similarity: {jaccard_sim:.4f}")

Jaccard Similarity: 0.6000


In [29]:
import nltk
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    tokens = word_tokenize(text.lower())
    tokens = [lemmatizer.lemmatize(token) for token in tokens if token.isalnum() and token not in stop_words]
    return tokens

tokens1 = preprocess_text(text1)
tokens2 = preprocess_text(text2)

vocab = set(tokens1 + tokens2)
word_to_index = {word: i for i, word in enumerate(vocab)}

def get_embedding(tokens, vocab_size):
  embedding = np.zeros(vocab_size)
  for token in tokens:
      if token in word_to_index:
          embedding[word_to_index[token]] += 1
  return embedding

embedding1 = get_embedding(tokens1, len(vocab))
embedding2 = get_embedding(tokens2, len(vocab))


cosine_sim = cosine_similarity([embedding1], [embedding2])
print(f"Cosine Similarity (Word Embeddings - NLTK): {cosine_sim[0][0]:.4f}")


Cosine Similarity (Word Embeddings - NLTK): 0.7454


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
