# Text Similarity Metrics

Exercise notebook

Course: Algorytmy Tekstowe at AGH University

## Preprocessing and vectorization

1. Preprocessing: Convert the text documents to lowercase and remove all punctuation marks (using regular expressions, for example).
2. Vocabulary creation: Create a vocabulary by taking all unique words from all text documents.
3. Word frequency vectors: Create two vectors, each representing the frequency of each word in the vocabulary in each text document.

In [5]:
import re
from collections import Counter

def preprocess(text: str) -> str:
    """Normalize *text* for word-level comparison.

    The text is lowercased and every character that is neither a word
    character (``\\w``) nor whitespace (``\\s``) is dropped.  Underscores
    count as word characters for ``\\w`` and are therefore kept.
    """
    non_word_or_space = re.compile(r'[^\w\s]')
    return non_word_or_space.sub('', text.lower())

def text_to_vec(docs: list[str]) -> list[list[int]]:
    """Turn documents into word-frequency vectors over a shared vocabulary.

    Each document is normalized with ``preprocess`` and split on whitespace.
    The vocabulary is the set of all words seen in any document, sorted
    alphabetically so that the component order of the returned vectors is
    reproducible from run to run.

    Args:
        docs: The text documents to vectorize.

    Returns:
        One vector per document, in input order.  Component ``i`` is the
        number of times the ``i``-th vocabulary word occurs in that document.
        All vectors have the same length (the vocabulary size).
    """
    # Tokenize every document exactly once and reuse the result for both the
    # vocabulary and the counts.
    tokenized = [preprocess(doc).split() for doc in docs]

    # Sorting pins the component order; plain set iteration order of strings
    # may differ between interpreter runs.
    vocab = sorted(set().union(*tokenized))

    freq_vecs = []
    for words in tokenized:
        word_counts = Counter(words)
        # Counter yields 0 for words that are absent from this document.
        freq_vecs.append([word_counts[word] for word in vocab])

    return freq_vecs

In [6]:
# Tests
text_a = "The quick brown fox jumped over the lazy dog."
text_b = "The lazy dog was jumped over by the quick brown fox."
vec_a, vec_b = text_to_vec([text_a, text_b])

# The component order of the vectors is not part of the contract, so compare
# the counts as sorted lists.  Unlike a set comparison, this also checks how
# many times each count occurs.
assert sorted(vec_a) == sorted([1, 1, 1, 2, 1, 1, 1, 1, 0, 0])
assert sorted(vec_b) == sorted([1, 1, 1, 2, 1, 1, 1, 1, 1, 1])

## Cosine similarity

$$
\begin{equation}
    \cos(\theta) = \frac{\mathbf{A} \cdot \mathbf{B}}{\|\mathbf{A}\| \|\mathbf{B}\|}= \frac{\sum\limits_{i=1}^{n} A_i B_i}{\sqrt{\sum\limits_{i=1}^{n} A_i^2} \sqrt{\sum\limits_{i=1}^{n} B_i^2}}
    \qquad\begin{aligned}
    &\text{where:} \\
    &\mathbf{A}\text{ and }\mathbf{B} \text{ are the two vectors being compared}\\
    &n \text{ is the dimensionality of the vectors}\\
    &\theta \text{ represents the angle between two vectors } \mathbf{A} \text{ and } \mathbf{B} \text{ in a high-dimensional space}
    \end{aligned}
\end{equation}
$$

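The dot product of $\mathbf{A}$ and $\mathbf{B}$ is divided by the product of their Euclidean lengths, which normalizes the result to the range [-1, 1]. A value of 1 indicates that the two vectors point in the same direction (for texts: the same relative word frequencies), a value of 0 indicates that they are orthogonal (for texts: no words in common), and a value of -1 indicates that they point in opposite directions. Word-frequency vectors have no negative components, so for two texts the result always lies in [0, 1].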


In [17]:
import math

def cosine_similarity(text_a: str, text_b: str) -> float:
    """Return the cosine similarity of the word-frequency vectors of two texts.

    Both texts are vectorized with ``text_to_vec`` and the dot product of the
    two vectors is divided by the product of their Euclidean norms.

    Args:
        text_a: First text.
        text_b: Second text.

    Returns:
        The cosine of the angle between the two frequency vectors.  If either
        text contains no words, its vector has zero length and the angle is
        undefined; 0.0 is returned in that case instead of dividing by zero.
    """
    vec_a, vec_b = text_to_vec([text_a, text_b])
    dot_product = sum(a * b for a, b in zip(vec_a, vec_b))

    norm_a = math.sqrt(sum(a * a for a in vec_a))
    norm_b = math.sqrt(sum(b * b for b in vec_b))

    # A zero norm means the text has no words at all.
    if norm_a == 0 or norm_b == 0:
        return 0.0

    return dot_product / (norm_a * norm_b)

In [18]:
# Tests
dist = cosine_similarity(text_a, text_b)
# Reference value for the two sample sentences, rounded to five decimals.
assert abs(dist - 0.91986) < 0.0001

## Dice coefficient / Sørensen-Dice Index

$$
\begin{equation}
    \text{Dice}(A, B) = \frac{2 |A \cap B|}{|A| + |B|} 
    \qquad\begin{aligned}
    &\text{where:} \\
    &A \text{ and } B \text{ represent the two sets being compared} \\
    &|A| \text{ and } |B| \text{ represent the cardinality (number of elements) of the sets} \\
    &\text{and } |A \cap B| \text{ represents the size of the intersection of the two sets}
    \end{aligned}
\end{equation}
$$


In [19]:
def dice(text_a: str, text_b: str) -> float:
    """Return the Sørensen-Dice coefficient of the word sets of two texts.

    Both texts are normalized with ``preprocess`` and split on whitespace;
    the coefficient is computed on the resulting sets of distinct words as
    ``2 * |A ∩ B| / (|A| + |B|)``.

    Args:
        text_a: First text.
        text_b: Second text.

    Returns:
        A value in [0, 1].  If neither text contains any word, the formula
        would divide by zero; 0.0 is returned in that case.
    """
    words_a = set(preprocess(text_a).split())
    words_b = set(preprocess(text_b).split())

    # Denominator of the Dice formula: the sum of both cardinalities (a word
    # shared by both sets is counted twice), not the size of the union.
    total_size = len(words_a) + len(words_b)
    if total_size == 0:
        return 0.0

    return 2 * len(words_a & words_b) / total_size

# Show the Dice coefficient of the two sample sentences as the cell output.
dice(text_a, text_b)

0.8888888888888888

In [20]:
# Tests
dist = dice(text_a, text_b)
# Reference value for the two sample sentences, rounded to five decimals.
assert abs(dist - 0.88888) < 0.0001

## Euclidean distance

$$
\begin{equation}
    d(x,y) = \sqrt{\sum_{i=1}^{n}(x_i-y_i)^2}
    \qquad\begin{aligned}
    &\text{where:} \\
    &d(x,y) \text{ is the Euclidean distance} \\
    &x_i, y_i \text{ are the values of the i-th dimension of vectors } x \text{ and } y \\
    &n \text{ is the number of dimensions in the vectors}
    \end{aligned}
\end{equation}
$$

In [25]:
def euclidean_distance(text_a: str, text_b: str) -> float:
    """Return the Euclidean distance between two texts' frequency vectors.

    Both texts are vectorized together with ``text_to_vec``; the result is
    the square root of the sum of squared component-wise differences.
    """
    vec_a, vec_b = text_to_vec([text_a, text_b])
    squared_diffs = ((a - b) ** 2 for a, b in zip(vec_a, vec_b))
    return math.sqrt(sum(squared_diffs))

In [26]:
# Tests

dist = euclidean_distance(text_a, text_b)
# Reference value for the two sample sentences (approximately sqrt(2)).
assert abs(dist - 1.4142135) < 0.0001

## LCS - Longest Common Subsequence

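The longest sequence of elements that appears in both sequences in the same relative order. The elements do not have to be adjacent, which distinguishes it from the longest common *substring*, where they must be contiguous.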

In [15]:
from typing import Any, Sequence

def lcs(seq_a: Sequence[Any], seq_b: Sequence[Any]) -> int:
    """Return the length of the longest common subsequence of two sequences.

    Uses the dynamic-programming recurrence over prefixes of both sequences:
    a matching pair of elements extends the best result for the two shorter
    prefixes by one; otherwise the better of the two results obtained by
    dropping the last element of either prefix is carried over.  Only the
    previous and the current row of the table are kept.
    """
    rows, cols = len(seq_a), len(seq_b)

    # previous[c] is the LCS length of the seq_a prefix processed so far and
    # the first c elements of seq_b.
    previous = [0] * (cols + 1)
    for r in range(rows):
        item_a = seq_a[r]
        current = [0]
        for c in range(cols):
            if item_a == seq_b[c]:
                current.append(previous[c] + 1)
            else:
                current.append(max(current[c], previous[c + 1]))
        previous = current

    return previous[-1]

def word_lcs(text_a: str, text_b: str) -> int:
    """Return the length of the longest common subsequence of words.

    The texts are split on whitespace only; no lowercasing or punctuation
    removal is applied, so words are compared exactly as written.
    """
    return lcs(text_a.split(), text_b.split())


In [16]:
# Tests
# "anana" is a common subsequence of both strings.
assert lcs("banana", "ananas") == 5
# word_lcs compares raw whitespace-separated tokens, so "fox" != "fox." and
# "dog." != "dog"; one longest common subsequence is "The jumped over the".
assert word_lcs(text_a, text_b) == 4

# Davies-Bouldin

In [28]:
from typing import List
from math import sqrt

def centroid(points: List[List[float]]) -> List[float]:
    """Return the coordinate-wise arithmetic mean of *points*.

    Raises ``IndexError`` when *points* is empty, because the first point is
    inspected to determine the dimensionality.
    """
    # Zero-dimensional points have an empty centroid.
    if len(points[0]) == 0:
        return []

    count = len(points)
    means = []
    # zip(*points) walks the points one coordinate axis at a time.
    for column in zip(*points):
        total = 0.0
        for value in column:
            total += value
        means.append(total / count)
    return means