# Zeeshan Ali (01-134212-197)
# NLP Assignment #03


# Jaccard Similarity (For Two Documents)

In [11]:
# Load both documents, lower-cased so the comparison is case-insensitive.
with open('doc1.txt', 'r') as f:
    doc1 = f.read().lower()

with open('doc2.txt', 'r') as f:
    doc2 = f.read().lower()

# Tokens are whitespace-separated, so punctuation stays attached to the
# neighbouring word ("data." and "data" count as different tokens).
set1 = set(doc1.split())
set2 = set(doc2.split())

# Jaccard similarity = |A & B| / |A | B| over the sets of distinct tokens.
intersection = len(set1 & set2)
union = len(set1 | set2)

# Two empty documents give an empty union; report 0 rather than divide by zero.
jaccard = intersection / union if union != 0 else 0
print("Jaccard Similarity:", jaccard)

Jaccard Similarity: 0.5


# Cosine Similarity (For Two Documents)

In [12]:
from collections import Counter

# Whitespace tokens of the two documents loaded into doc1 / doc2.
words1 = doc1.split()
words2 = doc2.split()

# Term frequencies, counted in a single pass per document.
counts1 = Counter(words1)
counts2 = Counter(words2)

# Shared vocabulary in first-seen order (doc1's words, then new words from doc2).
# dict.fromkeys de-duplicates while preserving order, avoiding a list scan per word.
all_words = list(dict.fromkeys(words1 + words2))

# Word count vectors aligned on the shared vocabulary; a Counter yields 0 for
# a word the document does not contain.
vector1 = [counts1[word] for word in all_words]
vector2 = [counts2[word] for word in all_words]

dot_product = sum(a * b for a, b in zip(vector1, vector2))
magnitude1 = sum(a * a for a in vector1) ** 0.5
magnitude2 = sum(b * b for b in vector2) ** 0.5

# An empty document has zero magnitude; report 0 rather than divide by zero.
cosine = dot_product / (magnitude1 * magnitude2) if magnitude1 != 0 and magnitude2 != 0 else 0
print("Cosine Similarity:", cosine)


Cosine Similarity: 0.6761234037828131


# Jaccard Similarity Extension (Multiple Documents)

In [22]:
import pandas as pd

# Rank four documents against a fixed query by Jaccard similarity.
query = "artificial intelligence"

# Read every document lower-cased so matching against the query ignores case.
documents = []
for path in ('document1.txt', 'document2.txt', 'document3.txt', 'document4.txt'):
    with open(path, 'r') as f:
        documents.append(f.read().lower())
document1, document2, document3, document4 = documents

# Distinct whitespace-separated query terms.
query_tokens = set(query.lower().split())

# Collect (1-based document number, score) pairs.
jaccard_scores = []
for doc_number, text in enumerate(documents, start=1):
    tokens = set(text.split())
    shared = query_tokens.intersection(tokens)
    combined = query_tokens.union(tokens)

    # An empty union means both the query and the document had no tokens.
    if len(combined) != 0:
        score = len(shared) / len(combined)
    else:
        score = 0
    jaccard_scores.append((doc_number, score))

# Tabulate the scores and show the best-matching documents first.
df = pd.DataFrame(jaccard_scores, columns=['Document', 'Jaccard Similarity'])
df_sorted = df.sort_values(by='Jaccard Similarity', ascending=False)

print(df_sorted)


   Document  Jaccard Similarity
0         1            0.035714
3         4            0.035088
1         2            0.032787
2         3            0.019608


# Cosine Similarity Extension (Multiple Documents)

In [28]:
from collections import Counter

# Rank four documents against a fixed query by cosine similarity of raw term counts.
query = "medical field"

# Read every document lower-cased so matching against the query ignores case.
documents = []
for path in ('document1.txt', 'document2.txt', 'document3.txt', 'document4.txt'):
    with open(path, 'r') as f:
        documents.append(f.read().lower())
document1, document2, document3, document4 = documents

# Query term frequencies (whitespace tokens).
query_tokens = query.lower().split()
query_word_counts = Counter(query_tokens)

# The query vector is the same for every document, so its norm is computed once.
magnitude_query = sum(count ** 2 for count in query_word_counts.values()) ** 0.5

cosine_scores = []
for i, doc in enumerate(documents):
    doc_tokens = doc.split()
    doc_word_counts = Counter(doc_tokens)

    # Only words that occur in the query can contribute to the dot product;
    # a Counter yields 0 for query words the document does not contain.
    dot_product = sum(
        count * doc_word_counts[word] for word, count in query_word_counts.items()
    )
    # Words absent from a document add 0 to its squared norm, so summing over
    # the document's own counts equals summing over the union vocabulary.
    magnitude_doc = sum(count ** 2 for count in doc_word_counts.values()) ** 0.5

    # An empty query or document has zero magnitude; report 0 rather than divide by zero.
    cosine = dot_product / (magnitude_query * magnitude_doc) if magnitude_query and magnitude_doc else 0
    cosine_scores.append((i + 1, cosine))

# Tabulate the scores and show the best-matching documents first.
df = pd.DataFrame(cosine_scores, columns=['Document', 'Cosine Similarity'])
df_sorted = df.sort_values(by='Cosine Similarity', ascending=False)

print(df_sorted)


   Document  Cosine Similarity
3         4           0.208013
2         3           0.071429
0         1           0.000000
1         2           0.000000


# What If Query Is Not Known?
## When no specific query is provided, clustering methods such as k-means can organize the dataset into groups of similar documents. Each cluster is a group of closely related documents, where closeness is measured on their word content (here, their TF-IDF vectors).

In [30]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

# Read the four documents as-is (no manual lower-casing in this cell).
documents = []
for path in ('document1.txt', 'document2.txt', 'document3.txt', 'document4.txt'):
    with open(path, 'r') as f:
        documents.append(f.read())
document1, document2, document3, document4 = documents

# Step 1: turn the documents into TF-IDF vectors, dropping English stop words
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)

# Step 2: cluster the vectors with k-means; the fixed seed keeps runs repeatable
num_clusters = 2
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
kmeans.fit(tfidf_matrix)

# Report the cluster label assigned to each document (documents numbered from 1)
for i, cluster in enumerate(kmeans.labels_):
    print(f"Document {i+1} is in Cluster {cluster}")


Document 1 is in Cluster 0
Document 2 is in Cluster 0
Document 3 is in Cluster 1
Document 4 is in Cluster 1
