### Clustering Techniques


In [1]:
# Toy corpus for the clustering demos below: six short sentences,
# two each about sports, technology, and food.
docs = [
    "Football is the most popular sport in the world",
    "Basketball and football are team sports",
    "Artificial intelligence and machine learning are transforming technology",
    "Python and Java are widely used programming languages",
    "Pizza and burgers are popular fast food items",
    "I love cooking pasta and trying new recipes"
]

In [2]:
import numpy as np
import re
from collections import Counter

# Preprocess: lowercase, remove punctuation, tokenize
def clean_and_tokenize(text):
    """Normalize a sentence and break it into word tokens.

    The text is lowercased, every character that is neither a word
    character (letter, digit, underscore) nor whitespace is deleted, and
    the remainder is split on runs of whitespace.

    Parameters
    ----------
    text : str
        Raw input sentence.

    Returns
    -------
    list of str
        Lowercase tokens in their original order; empty for blank input.
    """
    lowered = text.lower()
    # Punctuation is removed outright (not replaced by a space), so "it's" -> "its".
    without_punctuation = re.sub(r"[^\w\s]", "", lowered)
    return without_punctuation.split()

# Tokenize every document with the shared preprocessing function.
tokenized_docs = list(map(clean_and_tokenize, docs))

# Vocabulary: each distinct token in the corpus, sorted so that the
# column order of the count vectors is stable.
vocab = sorted(set().union(*tokenized_docs))

def to_vector(tokens, vocab):
    """Build a bag-of-words count vector for one tokenized document.

    Parameters
    ----------
    tokens : iterable of str
        Tokens of a single document.
    vocab : sequence of str
        Vocabulary terms; position ``j`` of the result counts ``vocab[j]``.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``len(vocab)`` with the occurrence count of
        each vocabulary term in ``tokens``. Tokens that are not in
        ``vocab`` are ignored.
    """
    frequency = Counter(tokens)
    # A Counter yields 0 for terms it has never seen, so no default is needed.
    row = [frequency[term] for term in vocab]
    return np.array(row)

# Document-term count matrix: one row per document, one column per vocabulary term.
vectors = np.vstack([to_vector(tokens, vocab) for tokens in tokenized_docs])

#### K-Means Clustering 

In [3]:
def k_means(data, k=3, iterations=50, seed=42):
    """Cluster the rows of ``data`` into ``k`` groups with Lloyd's algorithm.

    Initial centers are ``k`` distinct rows of ``data`` drawn at random.
    Each round assigns every row to its nearest center (Euclidean
    distance, ties go to the lowest cluster id) and then moves each
    center to the mean of its assigned rows. A center that attracts no
    rows keeps its previous position.

    Parameters
    ----------
    data : array-like of shape (n_samples, n_features)
        Points to cluster.
    k : int, default 3
        Number of clusters; must satisfy ``1 <= k <= n_samples``.
    iterations : int, default 50
        Maximum number of assign/update rounds.
    seed : int or None, default 42
        Seed for the random choice of initial centers. ``None`` gives a
        different initialization on every call.

    Returns
    -------
    labels : list of int
        Cluster id of each row, consistent with the returned ``centers``.
    centers : numpy.ndarray of shape (k, n_features)
        Final cluster centers.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1 or larger than the number of rows.
    """
    data = np.asarray(data)
    n_samples = len(data)
    if not 1 <= k <= n_samples:
        raise ValueError(
            f"k must be between 1 and the number of samples ({n_samples}), got {k}"
        )

    rng = np.random.default_rng(seed)
    centers = data[rng.choice(n_samples, k, replace=False)]

    def assign(current_centers):
        # Nearest-center id for every row; argmin keeps the first center on ties.
        return np.array([
            np.argmin([np.linalg.norm(x - c) for c in current_centers])
            for x in data
        ])

    for _ in range(iterations):
        # Assign step
        labels = assign(centers)

        # Update step: an empty cluster keeps its old center instead of becoming NaN.
        new_centers = np.array([
            data[labels == i].mean(axis=0) if np.any(labels == i) else centers[i]
            for i in range(k)
        ])

        if np.allclose(new_centers, centers):
            break
        centers = new_centers
    else:
        # The loop ended without converging (or never ran, iterations == 0):
        # recompute the labels so they always match the centers being returned.
        labels = assign(centers)

    # Plain Python ints print cleanly regardless of the installed NumPy version.
    return labels.tolist(), centers

# Run K-Means on the document vectors and report each document's cluster id.
labels_km, centers_km = k_means(data=vectors, k=3)
print("K-Means Cluster assignments:", labels_km)


K-Means Cluster assignments: [1, 2, 2, 2, 2, 0]


#### K-Medoids Clustering

In [4]:
def k_medoids(data, k=3, max_iter=50, seed=42):
    """Cluster the rows of ``data`` around ``k`` medoids (representative rows).

    Initial medoids are ``k`` distinct row indices drawn at random. Each
    round assigns every row to its nearest medoid (Euclidean distance,
    ties go to the lowest cluster id) and then replaces each medoid by
    the cluster member with the smallest total distance to the other
    members. A cluster that ends up empty keeps its previous medoid.

    Parameters
    ----------
    data : array-like of shape (n_samples, n_features)
        Points to cluster.
    k : int, default 3
        Number of clusters; must satisfy ``1 <= k <= n_samples``.
    max_iter : int, default 50
        Maximum number of assign/update rounds.
    seed : int or None, default 42
        Seed for the random choice of initial medoids. ``None`` gives a
        different initialization on every call.

    Returns
    -------
    clusters : dict[int, list[int]]
        Maps each cluster id to the row indices assigned to it,
        consistent with the returned ``medoids``.
    medoids : list of int
        Row index of the medoid of each cluster, in cluster-id order.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1 or larger than the number of rows.

    Notes
    -----
    The full pairwise distance matrix is computed once up front, which
    needs O(n_samples ** 2) memory.
    """
    data = np.asarray(data)
    n_samples = len(data)
    if not 1 <= k <= n_samples:
        raise ValueError(
            f"k must be between 1 and the number of samples ({n_samples}), got {k}"
        )

    rng = np.random.default_rng(seed)
    # Keep medoids as a list of plain ints from the start so the return
    # type does not depend on how many rounds were executed.
    medoids = [int(m) for m in rng.choice(n_samples, k, replace=False)]

    # Distances never change between rounds, so compute them a single time.
    pairwise = np.array([
        [np.linalg.norm(data[p] - data[q]) for q in range(n_samples)]
        for p in range(n_samples)
    ])

    def assign(current_medoids):
        # Group row indices by nearest medoid; argmin keeps the first medoid on ties.
        groups = {i: [] for i in range(k)}
        for idx in range(n_samples):
            nearest = int(np.argmin(pairwise[idx, current_medoids]))
            groups[nearest].append(idx)
        return groups

    for _ in range(max_iter):
        # Assign points
        clusters = assign(medoids)

        # Update medoids
        new_medoids = []
        for i in range(k):
            members = clusters[i]
            if not members:
                new_medoids.append(medoids[i])
                continue
            # Pick the member minimizing the total distance to its cluster.
            totals = [sum(pairwise[p, q] for q in members) for p in members]
            new_medoids.append(members[int(np.argmin(totals))])

        if new_medoids == medoids:
            break
        medoids = new_medoids
    else:
        # The loop ended without converging (or never ran, max_iter == 0):
        # recompute the clusters so they always match the medoids being returned.
        clusters = assign(medoids)

    return clusters, medoids

# Run K-Medoids on the same vectors; the result maps cluster id -> document indices.
clusters_km, medoid_idx = k_medoids(data=vectors, k=3)
print("K-Medoids Clusters:", clusters_km)


K-Medoids Clusters: {0: [5], 1: [0], 2: [1, 2, 3, 4]}


#### Text Shingling and Jaccard Similarity

In [5]:
def make_shingles(tokens, k=2):
    """Collect the distinct k-token shingles (word n-grams) of a document.

    Parameters
    ----------
    tokens : sequence of str
        Tokens of one document, in order.
    k : int, default 2
        Number of consecutive tokens per shingle.

    Returns
    -------
    set of tuple
        Every distinct run of ``k`` consecutive tokens. The set is empty
        when the document has fewer than ``k`` tokens.
    """
    shingles = set()
    # Slide a window of width k across the token sequence.
    for start in range(len(tokens) - k + 1):
        window = tokens[start:start + k]
        shingles.add(tuple(window))
    return shingles

# One set of word bigrams (2-token shingles) per document.
shingles_list = [make_shingles(tokens, k=2) for tokens in tokenized_docs]

def jaccard(a, b):
    """Return the Jaccard similarity ``|a & b| / |a | b|`` of two collections.

    Parameters
    ----------
    a, b : set, frozenset, or any iterable of hashable items
        Collections to compare. Non-set iterables are converted to sets
        first, so duplicates are ignored.

    Returns
    -------
    float
        A value in ``[0.0, 1.0]``. Two empty collections have no defined
        ratio; ``0.0`` is returned for that case.
    """
    if not isinstance(a, (set, frozenset)):
        a = set(a)
    if not isinstance(b, (set, frozenset)):
        b = set(b)

    shared = len(a & b)
    # Inclusion-exclusion gives the union size without building the union set.
    total = len(a) + len(b) - shared
    return shared / total if total else 0.0

# Pairwise Jaccard similarity between the documents' shingle sets:
# entry (i, j) compares document i with document j, and an off-diagonal
# entry is non-zero only when the two documents share at least one shingle.
sim_matrix = np.array([
    [jaccard(row_shingles, col_shingles) for col_shingles in shingles_list]
    for row_shingles in shingles_list
])

print("Jaccard Similarity Matrix:\n", np.round(sim_matrix, 2))


Jaccard Similarity Matrix:
 [[1. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 1.]]
