In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Toy corpus: two sentences that differ in exactly one word.
docs = [
    "Text of first document.",
    "Text of second document.",
]

# Fit the TF-IDF vocabulary on the corpus and vectorize it in the same call.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(docs)

# Compare the first document's row with the second's; the score is read
# from position [0][0] of the returned similarity result.
first_row, second_row = tfidf_matrix[0], tfidf_matrix[1]
cos_sim = cosine_similarity(first_row, second_row)
print("Cosine Similarity:", cos_sim[0][0])


Cosine Similarity: 0.6029748160380572


In [2]:
def jaccard_similarity(doc1, doc2):
    """Return the Jaccard similarity of two documents' word sets.

    Each document is lowercased and split on whitespace. Punctuation is not
    stripped, so "document." and "document" count as different words.

    Args:
        doc1: First document, as a string.
        doc2: Second document, as a string.

    Returns:
        float: Size of the intersection of the two word sets divided by the
        size of their union, in the range [0.0, 1.0]. When neither document
        contains any word, 1.0 is returned (two empty word sets are
        identical) instead of raising ZeroDivisionError.
    """
    words_doc1 = set(doc1.lower().split())
    words_doc2 = set(doc2.lower().split())
    union = words_doc1 | words_doc2
    if not union:
        # Both documents are empty or whitespace-only: avoid dividing 0 by 0.
        return 1.0
    return len(words_doc1 & words_doc2) / len(union)

# The same two sample sentences as the TF-IDF cell, now scored by word overlap.
doc1, doc2 = "Text of first document.", "Text of second document."
jac_sim = jaccard_similarity(doc1, doc2)
print("Jaccard Similarity:", jac_sim)


Jaccard Similarity: 0.6


Compare and Analyze

    Cosine Similarity works well for longer texts where word frequency matters.

    Jaccard Similarity is more effective for short texts or phrase matching.

    Cosine Similarity over TF-IDF vectors weights each word by how informative it is, while Jaccard is a simple word-overlap measure that ignores word frequency.

    Word Mover's Distance (WMD), if used, would capture semantic meaning, but it is computationally expensive.

 Discuss NLP Applications

    Information Retrieval: Cosine Similarity is widely used in search engines.

    Text Clustering: Jaccard Similarity can help in grouping similar documents.

    Plagiarism Detection: A mix of both can provide better accuracy.

In [3]:
import re
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Two longer sample paragraphs on the same topic (NLP), worded differently.
# They replace the one-sentence doc1/doc2 defined earlier and are the inputs
# for the preprocessing and similarity cells that follow. The triple-quoted
# literals keep their embedded line breaks.
doc1 = """Natural Language Processing (NLP) is a subfield of artificial intelligence that deals with 
the interaction between computers and human language. It enables computers to understand, interpret, 
and generate human language in a valuable way. NLP techniques are used in applications such as chatbots, 
text summarization, sentiment analysis, and machine translation."""

doc2 = """NLP, a branch of AI, focuses on making computers understand and process human language. 
It is widely applied in fields like automatic translation, sentiment detection, chatbot development, 
and text summarization. The advancements in deep learning have significantly improved NLP models, 
enhancing their ability to process and generate human-like text."""


In [4]:

### **Preprocessing**
def preprocess_text(text):
    """Normalize raw text for word-level comparison.

    The text is lowercased, then every character that is neither a regex
    word character (letters, digits, underscore) nor whitespace is deleted.

    Args:
        text: The raw input string.

    Returns:
        str: The lowercased text with punctuation removed; whitespace,
        including line breaks, is left as it was.
    """
    lowered = text.lower()
    # Punctuation is removed outright, not replaced by a space, so a
    # hyphenated word such as "human-like" collapses to "humanlike".
    return re.sub(r'[^\w\s]', '', lowered)

# Run both paragraphs through the same cleaning step before comparing them.
doc1_clean, doc2_clean = map(preprocess_text, (doc1, doc2))



In [5]:
### **1. Cosine Similarity using TF-IDF**
# Vectorize the two cleaned paragraphs together so they share one vocabulary.
cleaned_docs = [doc1_clean, doc2_clean]
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(cleaned_docs)

# Row 0 holds doc1_clean and row 1 holds doc2_clean; the score is read from
# position [0][0] of the returned similarity result.
row_doc1, row_doc2 = tfidf_matrix[0], tfidf_matrix[1]
cos_sim = cosine_similarity(row_doc1, row_doc2)
print("Cosine Similarity (TF-IDF):", cos_sim[0][0])



Cosine Similarity (TF-IDF): 0.4005821117789597


In [6]:
### **2. Jaccard Similarity using word sets**
def jaccard_similarity(doc1, doc2):
    """Return the Jaccard similarity of two documents' word sets.

    Each document is lowercased and split on whitespace. Lowercasing keeps
    this definition in agreement with the earlier jaccard_similarity in this
    notebook, which it replaces when this cell runs; it has no effect on
    text that is already lowercase. Punctuation is not stripped here.

    Args:
        doc1: First document, as a string.
        doc2: Second document, as a string.

    Returns:
        float: Size of the intersection of the two word sets divided by the
        size of their union, in the range [0.0, 1.0]. When neither document
        contains any word, 1.0 is returned (two empty word sets are
        identical) instead of raising ZeroDivisionError.
    """
    words_doc1 = set(doc1.lower().split())
    words_doc2 = set(doc2.lower().split())
    union = words_doc1 | words_doc2
    if not union:
        # Both documents are empty or whitespace-only: avoid dividing 0 by 0.
        return 1.0
    return len(words_doc1 & words_doc2) / len(union)

# Score the two cleaned paragraphs by the share of vocabulary they have in common.
cleaned_pair = (doc1_clean, doc2_clean)
jac_sim = jaccard_similarity(*cleaned_pair)
print("Jaccard Similarity:", jac_sim)


Jaccard Similarity: 0.2727272727272727
