In [2]:
import os
import math

def read_keywords(input):
    """Read keyword/frequency files and build aligned document vectors.

    Each file is read line by line; a line is used only when it splits on
    whitespace into exactly two fields, ``<keyword> <frequency>``. Any other
    line (blank, one field, three or more fields) is skipped.

    Args:
        input: Iterable of file paths, one per document. The name shadows
            the ``input`` builtin inside this function; it is kept so that
            existing keyword-argument callers keep working.

    Returns:
        A ``(keywords, vectors)`` tuple. ``keywords`` lists every distinct
        keyword in order of first appearance across the files. ``vectors``
        holds one list per document, with the frequency of each keyword at
        the matching position and 0 where the document lacks the keyword.
        If a keyword is repeated within one document, the last frequency
        read for it is the one stored.

    Raises:
        OSError: If a file cannot be opened.
        ValueError: If a frequency field is not a valid integer.
    """
    keywords = []
    # keyword -> column position. A dict makes both the "seen before?" test
    # and the later position lookup O(1) instead of scanning the list.
    position = {}
    documents = []

    # First pass: parse every file and collect the unique keywords.
    for fname in input:
        entries = []
        with open(fname, 'r') as f:
            for line in f:
                parts = line.split()
                if len(parts) != 2:
                    continue
                word, freq = parts[0], int(parts[1])
                entries.append((word, freq))
                if word not in position:
                    position[word] = len(keywords)
                    keywords.append(word)
        documents.append(entries)

    # Second pass: the vocabulary is complete, so every vector can now be
    # laid out in the same fixed keyword order.
    vectors = []
    for entries in documents:
        vec = [0] * len(keywords)
        for word, freq in entries:
            vec[position[word]] = freq
        vectors.append(vec)

    return keywords, vectors


def sim(vec1, vec2):
    """Return the similarity score of two numeric vectors.

    The score is twice the dot product divided by the sum of both vectors'
    squared magnitudes. The dot product pairs elements with ``zip``, so it
    stops at the shorter vector, while each squared magnitude covers its
    whole vector. When the denominator is zero the result is 0.
    """
    dot = 0
    for left, right in zip(vec1, vec2):
        dot += left * right

    squared_total = sum(x ** 2 for x in vec1) + sum(y ** 2 for y in vec2)
    if squared_total == 0:
        # Both vectors are all zeros (or empty): avoid dividing by zero.
        return 0
    return 2 * dot / squared_total


def single_pass_clustering(vectors, threshold=0.6):
    """Cluster document vectors in a single pass over the input.

    The first document seeds the first cluster. Each later document is
    compared (via ``sim``) with every existing cluster centroid; it joins
    the most similar cluster when that similarity is strictly greater than
    ``threshold``, otherwise it starts a new cluster. Progress is printed
    to standard output for every document considered.

    Args:
        vectors: Sequence of equal-length numeric vectors, one per document.
        threshold: Similarity a document must exceed to join an existing
            cluster.

    Returns:
        A list of clusters, each a list of 0-based document indices in the
        order they were assigned. An empty ``vectors`` yields ``[]``.
    """
    if len(vectors) == 0:
        # Nothing to seed the first cluster with.
        return []

    clusters = [[0]]  # first document in first cluster
    centroids = [list(vectors[0])]

    for doc_id in range(1, len(vectors)):
        doc = vectors[doc_id]
        sims = [sim(doc, centroid) for centroid in centroids]
        max_sim = max(sims)
        # index() returns the first match, so ties go to the oldest cluster.
        best_cluster = sims.index(max_sim)

        print(f"\nConsider D{doc_id+1},")
        for i, s in enumerate(sims):
            print(f"Sim(D{doc_id+1}, C{i+1}) = {s:.2f}")
        print(f"Smax(D{doc_id+1}, C{best_cluster+1}) = {max_sim:.2f}")

        if max_sim > threshold:
            # Update the centroid as the mean of ALL members. Weighting the
            # old centroid by the current member count keeps it a true mean;
            # a plain (old + new) / 2 would over-weight the newest document
            # once the cluster holds more than one member.
            size = len(clusters[best_cluster])
            centroids[best_cluster] = [
                (c * size + x) / (size + 1)
                for c, x in zip(centroids[best_cluster], doc)
            ]
            clusters[best_cluster].append(doc_id)
        else:
            print("New cluster created")
            clusters.append([doc_id])
            centroids.append(list(doc))

    return clusters


if __name__ == "__main__":
    # Script entry point: ask for the document count and file names, cluster
    # the documents, and print the resulting clusters.
    no = int(input("Enter number of documents for clustering: "))

    # Validate the count BEFORE asking for file names, so the user is not
    # prompted for names that will never be used.
    if no <= 1:
        print("Number of documents should be greater than 1!")
        # SystemExit is what exit() raises; raising it directly does not
        # depend on the interactive-shell helper being installed.
        raise SystemExit(0)

    filenames = [input(f"Enter document {i+1} name (without .txt): ") + ".txt" for i in range(no)]

    keywords, vectors = read_keywords(filenames)
    clusters = single_pass_clustering(vectors)

    print("\nFinal Clusters:")
    for idx, cluster in enumerate(clusters, start=1):
        # Documents are reported 1-based (D1, D2, ...) to match the prompts.
        members = " ".join(f"D{doc_id+1}" for doc_id in cluster)
        print(f"C{idx} = {{ {members} }}")


Enter number of documents for clustering: 2
Enter document 1 name (without .txt): doc1
Enter document 2 name (without .txt): doc2

Consider D2,
Sim(D2, C1) = 0.00
Smax(D2, C1) = 0.00
New cluster created

Final Clusters:
C1 = { D1 }
C2 = { D2 }
