In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import euclidean_distances, cosine_similarity
import numpy as np
import matplotlib.pyplot as plt

# Corpus to rank: each string is one document.
documents = [
    "It is going to rain today",
    "Today Rama is not going outside to watch rain",
    "I am going to watch the movie tomorrow with Rama",
    "Tomorrow Rama is going to watch the rain at sea shore"
]

# Free-text query that every document is scored against.
# NOTE(review): no stemming step is visible in this script, so "watching"
# presumably will not match "watch" in the documents -- confirm.
query = "Rama watching the rain"

# TF-IDF vectorization with English stop words removed.
# The vocabulary and IDF weights are learned from the documents only; the
# query is then transformed with the same fitted vectorizer so that its
# columns line up with those of X.
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(documents).toarray()
query_vec = vectorizer.transform([query]).toarray()

# Latent Semantic Analysis (LSA): reduce the TF-IDF vectors to 2 components.
# random_state is fixed so repeated runs give the same projection.
# The query is projected with the model fitted on the documents.
lsa = TruncatedSVD(n_components=2, random_state=42)
X_lsa = lsa.fit_transform(X)
query_vec_lsa = lsa.transform(query_vec)

# Compare the query with every document in the reduced LSA space.
# Each result is flattened to a 1-D array with one entry per document.
# Euclidean distance: smaller means closer. Cosine similarity: larger means
# more similar.
euclidean_distances_lsa = euclidean_distances(query_vec_lsa, X_lsa).flatten()
cosine_similarities_lsa = cosine_similarity(query_vec_lsa, X_lsa).flatten()

# Jaccard and Dice similarity functions
def jaccard_similarity(query_vec, doc_vec):
    """Return the Jaccard index of two term vectors.

    Each vector is read as a set of terms: a position belongs to the set
    when its entry is truthy (non-zero). The score is the number of
    positions present in both vectors divided by the number present in at
    least one of them.

    Args:
        query_vec: 1-D array-like of term weights for the query.
        doc_vec: 1-D array-like of term weights for the document, with the
            same length as ``query_vec``.

    Returns:
        The ratio ``|A & B| / |A | B|``, or ``0`` when neither vector has
        any non-zero entry.
    """
    in_either = np.logical_or(query_vec, doc_vec).sum()
    # Guard against 0/0 when both vectors are empty.
    if not in_either:
        return 0
    in_both = np.logical_and(query_vec, doc_vec).sum()
    return in_both / in_either

def dice_similarity(query_vec, doc_vec):
    """Return the Dice coefficient of two term vectors.

    Each vector is read as a set of terms: a position belongs to the set
    when its entry is non-zero. The score is
    ``2 * |A & B| / (|A| + |B|)`` and therefore always lies in ``[0, 1]``.

    Args:
        query_vec: 1-D array-like of term weights for the query.
        doc_vec: 1-D array-like of term weights for the document, with the
            same length as ``query_vec``.

    Returns:
        The Dice coefficient, or ``0`` when neither vector has any
        non-zero entry.
    """
    intersection = np.logical_and(query_vec, doc_vec).sum()
    # Use set sizes (non-zero counts), not the sum of the weights: the
    # numerator is a count, so summing TF-IDF weights in the denominator
    # would mix units and could push the score above 1.
    total = np.count_nonzero(query_vec) + np.count_nonzero(doc_vec)
    return 2 * intersection / total if total else 0

# Score every document row of X against the query with the two set-based
# measures. The single query row is flattened once and reused.
query_terms = query_vec.flatten()
jaccard_scores = [jaccard_similarity(query_terms, row) for row in X]
dice_scores = [dice_similarity(query_terms, row) for row in X]

# One record per document, in the order:
# (text, euclidean distance, cosine similarity, jaccard, dice).
metrics = list(
    zip(
        documents,
        euclidean_distances_lsa,
        cosine_similarities_lsa,
        jaccard_scores,
        dice_scores,
    )
)

# Function to print top 2 relevant documents
def print_top_2(docs, measure, key, reverse=False):
    """Print the two best-ranked documents for one measure.

    Args:
        docs: Iterable of records whose first element is the document text
            and whose remaining elements are its scores.
        measure: Display name of the measure; used in the heading and next
            to each printed value.
        key: Callable that takes a record (as a tuple) and returns the
            score to rank by and to print.
        reverse: If true, rank from highest to lowest score; otherwise from
            lowest to highest.

    Returns:
        None. The ranking is written to standard output.
    """
    ranked = sorted(docs, key=key, reverse=reverse)
    print(f"Top 2 relevant documents using {measure}:")
    for rank, record in enumerate(ranked[:2], start=1):
        text, *values = record
        print(f"{rank}. Document: '{text}'")
        # Rebuild the record as a tuple before handing it to ``key``.
        score = key((text, *values))
        print(f"   {measure}: {score:.4f}\n")

# Report the two best documents for each measure. Every entry is
# (display name, position of the score in a metrics record, rank descending?).
# Euclidean is a distance, so it is ranked ascending; the others are
# similarities and are ranked descending.
for measure_name, column, descending in (
    ("Euclidean Distance", 1, False),
    ("Cosine Similarity", 2, True),
    ("Jaccard Similarity", 3, True),
    ("Dice Similarity", 4, True),
):
    print_top_2(
        metrics,
        measure_name,
        # Bind the column as a default so the lambda keeps its own index.
        key=lambda record, column=column: record[column],
        reverse=descending,
    )

# Visualization: grouped bar chart with one group per document and one bar
# per measure.
# Tick labels are derived from the number of records rather than hard-coded,
# so the chart still works when documents are added or removed.
labels = [f"D{i}" for i in range(1, len(metrics) + 1)]
# NOTE: 'Euclidean' is a distance (lower is better) while the other three are
# similarities (higher is better); keep that in mind when reading the chart.
scores = {
    'Euclidean': [m[1] for m in metrics],
    'Cosine': [m[2] for m in metrics],
    'Jaccard': [m[3] for m in metrics],
    'Dice': [m[4] for m in metrics]
}

x = np.arange(len(labels))
width = 0.2

fig, axis = plt.subplots()
for i, (name, score) in enumerate(scores.items()):
    # Offset each measure so the group of bars stays centred on its tick,
    # whatever the number of measures (equals 1.5 for the four used here).
    axis.bar(x + (i - (len(scores) - 1) / 2) * width, score, width, label=name)

axis.set_ylabel('Scores')
axis.set_title('Similarity scores by document and measure')
axis.set_xticks(x)
axis.set_xticklabels(labels)
axis.legend()

plt.tight_layout()
plt.show()