# Text Similarity Metrics

Exercise notebook

Course: Algorytmy Tekstowe at AGH University

# 1. Zaimplementuj przynajmniej 3 "metryki" spośród wymienionych: cosinusowa, LCS, DICE, euklidesowa, Levenshteina.

## Preprocessing and vectorization

1. Preprocessing: Convert the text documents to lowercase and remove all punctuation marks (using regular expressions, for example).
2. Vocabulary creation: Create a vocabulary by taking all unique words from all text documents.
3. Word frequency vectors: Create two vectors, each representing the frequency of each word in the vocabulary in each text document.

In [1]:
import re
from collections import Counter

def preprocess(text: str) -> str:
    # Convert the text to lowercase
    text = text.lower()
    
    # Remove all punctuation marks
    text = re.sub(r'[^\w\s]', '', text)
    
    return text

def text_to_vec(docs: list[str]) -> list[list[int]]:
    # Create vocabulary
    vocab = set()
    for doc in docs:
        doc = preprocess(doc)
        words = doc.split()
        vocab.update(words)
    
    # Create word frequency vectors
    freq_vecs = []
    for doc in docs:
        doc = preprocess(doc)
        words = doc.split()
        word_counts = Counter(words)
        freq_vec = [word_counts[word] for word in vocab]
        freq_vecs.append(freq_vec)
    
    return freq_vecs

In [2]:
# Tests
text_a = "The quick brown fox jumped over the lazy dog."
text_b = "The lazy dog was jumped over by the quick brown fox."
vec_a, vec_b = text_to_vec([text_a, text_b])


assert(set(vec_a) == set([1, 1, 1, 2, 1, 1, 1, 1, 0, 0]))
assert(set(vec_b) == set([1, 1, 1, 2, 1, 1, 1, 1, 1, 1]))

## Cosine similarity

$$
\begin{equation}
    \cos(\theta) = \frac{\mathbf{A} \cdot \mathbf{B}}{\|\mathbf{A}\| \|\mathbf{B}\|}= \frac{\sum\limits_{i=1}^{n} A_i B_i}{\sqrt{\sum\limits_{i=1}^{n} A_i^2} \sqrt{\sum\limits_{i=1}^{n} B_i^2}}
    \qquad\begin{aligned}
    &\text{where:} \\
    &\mathbf{A}\text{ and }\mathbf{B} \text{ are the two vectors being compared}\\
    &n \text{ is the dimensionality of the vectors}\\
    &\theta \text{ represents the angle between two vectors } \mathbf{A} \text{ and } \mathbf{B} \text{ in a high-dimensional space}
    \end{aligned}
\end{equation}
$$

The dot product of $\mathbf{A}$ and $\mathbf{B}$ is divided by the product of their Euclidean lengths to normalize the result to a range of [-1, 1]. A value of 1 indicates that the two vectors are identical, while a value of -1 indicates that they are completely dissimilar.


In [3]:
import math

def cosine_similarity(text_a: str, text_b: str) -> float:
    freq_vecs = text_to_vec([text_a, text_b])
    dot_product = sum(a * b for a, b in zip(freq_vecs[0], freq_vecs[1]))
    
    norm_a = math.sqrt(sum(a ** 2 for a in freq_vecs[0]))
    norm_b = math.sqrt(sum(b ** 2 for b in freq_vecs[1]))
    
    return dot_product / (norm_a * norm_b)

In [4]:
# Tests
dist = cosine_similarity(text_a, text_b)
assert(abs(dist - 0.91986) < 0.0001)

## Dice coefficient / Sørensen-Dice Index

$$
\begin{equation}
    \text{Dice}(A, B) = \frac{2 |A \cap B|}{|A| + |B|} 
    \qquad\begin{aligned}
    &\text{where:} \\
    &A \text{ and } B \text{ represent the two sets being compared} \\
    &|A| \text{ and } |B| \text{ represent the cardinality (number of elements) of the sets} \\
    &\text{and } |A \cap B| \text{ represents the size of the intersection of the two sets}
    \end{aligned}
\end{equation}
$$


In [5]:
def dice(text_a: str, text_b: str) -> float:
    text_a = preprocess(text_a)
    text_b = preprocess(text_b)
    
    set_a = set(text_a.split())
    set_b = set(text_b.split())
    
    intersection_ = len(set_a.intersection(set_b))
    union_ = len(set_a) + len(set_b)
    
    return 2 * intersection_ / union_

dice(text_a, text_b)

0.8888888888888888

In [6]:
# Tests
dist = dice(text_a, text_b)
assert(abs(dist - 0.88888) < 0.0001)

## Euclidean distance

$$
\begin{equation}
    d(x,y) = \sqrt{\sum_{i=1}^{n}(x_i-y_i)^2}
    \qquad\begin{aligned}
    &\text{where:} \\
    &d(x,y) \text{ is the Euclidean distance} \\
    &x_i, y_i \text{ are the values of the i-th dimension of vectors } x \text{ and } y \\
    &n \text{ is the number of dimensions in the vectors}
    \end{aligned}
\end{equation}
$$

In [7]:
def euclidean_distance(text_a: str, text_b: str) -> float:
    x, y = text_to_vec([text_a, text_b])

    dist = 0
    for i in range(len(x)):
        dist += (x[i] - y[i]) ** 2

    return math.sqrt(dist)

In [8]:
# Tests

dist = euclidean_distance(text_a, text_b)
assert(abs(dist - 1.4142135) < 0.0001)

## LCS - Longest Common Subsequence

Longest, common, continuous subsequence of two sequences, aka "the longest substring".

In [9]:
from typing import Any, Sequence

def lcs(seq_a: Sequence[Any], seq_b: Sequence[Any]) -> int:
    n = len(seq_a)
    m = len(seq_b)
    dp = [[0] * (m+1) for _ in range(n+1)]
    for i in range(n):
        for j in range(m):
            if seq_a[i] == seq_b[j]:
                dp[i+1][j+1] = dp[i][j] + 1
            else:
                dp[i+1][j+1] = max(dp[i+1][j], dp[i][j+1])
    return dp[-1][-1]

def word_lcs(text_a: str, text_b: str) -> int:
    # Split the texts into words
    seq_a = text_a.split()
    seq_b = text_b.split()

    return lcs(seq_a, seq_b)


In [10]:
# Tests
assert lcs("banana", "ananas") == 5
assert word_lcs(text_a, text_b) == 4

# 2. Zaimplementuj przynajmniej 1 sposoby oceny jakości klasteryzacji (np. indeks Daviesa-Bouldina).

# Davies-Bouldin

#### Centroid klastra #### 

średnia pozycja wszystkich punktów należących do klastra.
Dla klastra o n punktach i d wymiarach, gdzie $ \mathbf{x}_i $ oznacza i-ty punkt w klastrze:

$  c = \frac{1}{n}\sum_{i=1}^{n} \mathbf{x}_i $

#### Odległość między centroidami ####

odległość euklidesowa pomiędzy centroidami dwóch różnych klastrów.
Dla klastrów $ C_i $ i $ C_j $ :

$  \Delta_{ij} =|| C_{i} - C_{j}||_{2} $

#### Odległość wewnętrzna ####

średnia odległość punktów w klastrze od jego centroidu. Dla klastra $ C_i $ :

$ s_{i} = \frac{1}{n_{i}}\sum_{x \epsilon C_{i}}^{} ||x - c_{i}||_{2} $

#### Współczynnik Daviesa-Bouldina ####

$ R_{i} = \frac{s_{i} + s_{j}}{ \Delta_{ij}} $


gdzie $ j $ jest klastrem różny od $ i $ i dla którego wartość $ \frac{s_i+s_j}{\Delta_{ij}} $ jest maksymalna.
Ostateczna wartość współczynnika Daviesa-Bouldina to średnia wartość $ R_i $ dla wszystkich klastrów.



In [11]:
import numpy as np

def get_cluster_matrices(data_points, labels):
    k = max(labels) + 1
    cluster_matrices = [[] for _ in range(k)]

    for i, label in enumerate(labels):
        cluster_matrices[label].append(data_points[i])

    return [np.vstack(cluster_matrices[i]) for i in range(k)]

def get_centroids(cluster_matrices):
    return [np.mean(matrix, axis=0) for matrix in cluster_matrices]


def get_avg_distances(cluster_matrices, centroids):
    avg_distances = []
    for matrix, centroid in zip(cluster_matrices, centroids):
        distance = np.linalg.norm(matrix - centroid, axis=1)
        avg_distance = np.mean(distance)
        avg_distances.append(avg_distance)
    return avg_distances

def davies_bouldin(data_points, labels):
    cluster_matrices = get_cluster_matrices(data_points, labels)
    centroids = get_centroids(cluster_matrices)
    avg_distance = get_avg_distances(cluster_matrices, centroids)

    k = len(centroids)
    R = np.zeros((k, k))
    
    for i in range(k):
        for j in range(i + 1, k):
            dist = avg_distance[i] + avg_distance[j]
            delta = np.linalg.norm(centroids[i] - centroids[j])
            R[i, j] = dist / delta
            R[j, i] = R[i, j]

    return np.mean(np.max(R, axis=1))

# 3. Stwórz stoplistę najczęściej występujących słów i zastosuj ją jako pre-processing dla nazw. Algorytmy klasteryzacji powinny działać na dwóch wariantach: z pre-processingiem i bez pre-processingu.

In [12]:
def stoplist(text, frequency= 200):
    words = text.split()
    counted = {word: words.count(word) for word in set(words)}
    return {word for word in counted if counted[word] >= frequency}

# 4. Wykonaj klasteryzację zawartości załączonego pliku (lines.txt) przy użyciu  metryk zaimplementowanych w pkt. 1. Każda linia to adres pocztowy firmy, różne sposoby zapisu tego samego adresu powinny się znaleźć w jednym klastrze

In [None]:
file = open('lines.txt')
text = file.read()
texts = text.split('\n')
file.close()

def calculate_similarity(texts, metric='cosine_similarity'):
    similarities = []
    for i in range(len(texts)):
        for j in range(i+1, len(texts)):
            # skip empty or stop-word-only texts
            if not texts[i] or not texts[j] or len(texts[i]) == 0 or len(texts[j])==0:
                continue
            similarity = metric(texts[i], texts[j])
            similarities.append(similarity)
    return similarities

from scipy.cluster.hierarchy import fclusterdata

def clusterf(metric='cosine_similarity', t=1):
    similarities = calculate_similarity(texts, metric)
    T = fclusterdata(np.array(similarities).reshape(-1, 1), t=t, criterion='distance', method='complete')
    clusters = [[] for i in range(max(T))]
    text_clusters = None
    if texts is not None:
        text_clusters = [[] for i in range(max(T))]
        for i in range(len(T)):
            clusters[T[i]-1].append(texts[i])
            text_clusters[T[i]-1].append(texts[i])
    return clusters, text_clusters

def get_result(metric):
    clusters, text_clusters = clusterf(metric)
    for i, cluster in enumerate(clusters):
        print(f'Cluster {i+1}:')
        for text in cluster:
            print(text)
        print('-------------')

## Cosine

### preprocessing 

In [None]:
get_result(cosine_similarity)

### bez preprocessingu

In [49]:
get_result(5, 'cosine', False)




Cluster 1:
''PA INTERIOR'' LTD BOLSHAYA LUBYANKA STREET, 16/4 MOSCOW, 101000, RUSSIA INN/KPP 7704550148//770801001 495-984-8611
''SSONTEX''SP.ZO.O.IMPORT-EXPORT UL:PRZECLAWSKA 5 03-879 WARSZAWA,POLAND NIP 113-01-17-669 TEL./FAX.:0048(022)217 6532--
'MASTER PLUS CO.,LTD.' 143000,RUSSIA,MO,ODINSOVO, MOJAISKOE, SHOSSE,153G TEL:+7495 7273939
"2TIGERS GROUP LIMITED"  ROOM 504 JINSHAZHOU SHANGSHUI ROAD,  GUANGZHOU 510160
"A-LIFT",JSC 1 PROSPEKT MARSHALA ZHUKOVA,MOSCOW 123308,RUSSIA  T: +7(495)784-7961
"ARIVIST", 198035,RUSSIA,SAINT-PETERSBURG, GAPSALSKAYA STR.,5,OFFICE 1-3; TEL:+78123277732,FAX:+781 23277729.VOLOKNO@YAHOO.COM
"ARIVIST", 198035,RUSSIA,SAINT-PETERSBURG,  GAPSALSKAYA STR.,5,OFFICE1-3; TEL.:+78123277732,FAX + 78123277729.
"AVANPORT"  LLC INN: 7839413675 KPP: 783901001 190020, SAINT PETERSBURG, QUAY OF THE OBVODNY CHANNEL, 134-136-138, BUILD. 101, LIT. A"
"AVANPORT"  LLC INN: 7839413675 KPP: 783901001 190020, SAINT PETERSBURG, QUAY OF  THE OBVODNY CHANNEL,134-136-138,  BUILD. 

## Dice

### preprocessing

In [50]:
get_result(5, 'dice', True)




Cluster 1:
"ALISA" LTD, 1/5 Derbenevskaya str., Moscow, Russia Tel./Fax: (495) 987-13-07 postal code: 115114
"ALLIANCE-TRADE" LLC INN: 7816391055 / KPP: 784601001 190020, Saint Petersburg, quay of the Obvodny channel, 138, bulk 1, liter.B
"ARIVIST", 198035,RUSSIA,SAINT-PETERSBURG, GAPSALSKAYA STR.,5,OFFICE 1-3; TEL.:+78123277732,FAX+ 78123277729.
"AVENTA"LTD. ADDRESS:129 110,MOSCOW,PROSPEKT MIRA,H.52,STR.3,POM,III Tel: 8-968-808-80-11
"ENS" LTD ADDRESS: STAROPETROVSKIYPASSAGE, BLD 7A,  CONSTRUCTION 3 125130, MOSCOW, RUSSIA TEL: (499) 130-7336
"EXPRESS CO. LTD." RUSSIA 155101 IVANOVSKAYA REGION, LEZHNEVSKIY RAION, D. KOROVIHA, CENTRALNAYA STR. 4ARUSSIA
"FILLOGISTIK" PP ZHOVKIVSKA 22 STREET 79019 LVOV,UKRAINE TEL:+380322458030 FAX:+380322458030
"FMG SHIPPING AND FORWARDING, LTD."190020 ST.PETERSBURG, RUSSIA BUMAZHNAYA STR., 18, OFF. 310
"FRUITIMPEX" LLC TEL. (495) 926-74-49
"KM" LTD 197183 RUSSIA,ST. PETERSBURG,SABIROVSKAYA STR.,50,OFF,115 OLEG TEREKHOV ZIP CODE:197183 PHONE:7911239-3

### bez preprocessingu 

In [51]:
get_result(5, 'dice', False)




Cluster 1:
"ALISA" LTD, 1/5 Derbenevskaya str., Moscow, Russia Tel./Fax: (495) 987-13-07 postal code: 115114
"ALLIANCE-TRADE" LLC INN: 7816391055 / KPP: 784601001 190020, Saint Petersburg, quay of the Obvodny channel, 138, bulk 1, liter.B
"ARIVIST", 198035,RUSSIA,SAINT-PETERSBURG, GAPSALSKAYA STR.,5,OFFICE 1-3; TEL.:+78123277732,FAX+ 78123277729.
"AVENTA"LTD. ADDRESS:129 110,MOSCOW,PROSPEKT MIRA,H.52,STR.3,POM,III Tel: 8-968-808-80-11
"ENS" LTD ADDRESS: STAROPETROVSKIYPASSAGE, BLD 7A,  CONSTRUCTION 3 125130, MOSCOW, RUSSIA TEL: (499) 130-7336
"EXPRESS CO. LTD." RUSSIA 155101 IVANOVSKAYA REGION, LEZHNEVSKIY RAION, D. KOROVIHA, CENTRALNAYA STR. 4ARUSSIA
"FILLOGISTIK" PP ZHOVKIVSKA 22 STREET 79019 LVOV,UKRAINE TEL:+380322458030 FAX:+380322458030
"FMG SHIPPING AND FORWARDING, LTD."190020 ST.PETERSBURG, RUSSIA BUMAZHNAYA STR., 18, OFF. 310
"FRUITIMPEX" LLC TEL. (495) 926-74-49
"KM" LTD 197183 RUSSIA,ST. PETERSBURG,SABIROVSKAYA STR.,50,OFF,115 OLEG TEREKHOV ZIP CODE:197183 PHONE:7911239-3

## LCS

### preprocessing 

In [52]:
get_result(5, 'lcs', True)




Cluster 1:
"ALISA" LTD, 1/5 Derbenevskaya str., Moscow, Russia Tel./Fax: (495) 987-13-07 postal code: 115114
"ALLIANCE-TRADE" LLC INN: 7816391055 / KPP: 784601001 190020, Saint Petersburg, quay of the Obvodny channel, 138, bulk 1, liter.B
"ARIVIST", 198035,RUSSIA,SAINT-PETERSBURG, GAPSALSKAYA STR.,5,OFFICE 1-3; TEL.:+78123277732,FAX+ 78123277729.
"ARIVIST", 198035,RUSSIA,SAINT-PETERSBURG,  GAPSALSKAYA STR.,5,OFFICE1-3; TEL.:+78123277732,FAX + 78123277729.
"ARIVIST", 198035, RUSSIA,  SAINT-PETERSBURG,  GAPSALSKAYA STR.,5,OFFICE1-3;  TEL.+78123277732, FAX  +78123277729.
"AVANPORT"  LLC INN: 7839413675 KPP: 783901001 190020, SAINT PETERSBURG, QUAY OF  THE OBVODNY CHANNEL,134-136-138,  BUILD. 101, LIT. A"
"AVENTA"LTD. ADDRESS:129 110,MOSCOW,PROSPEKT MIRA,H.52,STR.3,POM,III Tel: 8-968-808-80-11
"AVSON-GROUP" COMPANY LIMITED 190008, ST. PETERSBURG,  RIMSKOGO-KORSAKOVA,  73/33, LIT. A-1
"BIO plus LTD" INN 7805166210 198303, S-Petersburg, Leninskiy Pr.,110/1, lit B, app.53-H, RUSSIA
"CARGOI

### bez preprocessingu

In [53]:
get_result(5, 'lcs', False)




Cluster 1:
"ARIVIST", 198035,RUSSIA,SAINT-PETERSBURG,  GAPSALSKAYA STR.,5,OFFICE1-3; TEL.:+78123277732,FAX+ 78123277729.
"ARIVIST", 198035, RUSSIA,  SAINT-PETERSBURG,  GAPSALSKAYA STR.,5,OFFICE1-3;  TEL.+78123277732, FAX  +78123277729.
"ARIVIST", 198035, RUSSIA, SAINT-PETERSBURG. GAPSALSKAYA STR.,5. OFFICE1-3; TEL.+78123277732, FAX+78123277729. VOLOKNO@YAHOO.COM
"AVANPORT"  LLC INN: 7839413675 KPP: 783901001 190020, SAINT PETERSBURG, QUAY OF  THE OBVODNY CHANNEL,134-136-138,  BUILD. 101, LIT. A"
"Avson-group" Company Limited  190008, St. Petersburg, Rimskogo-Korsakova, 73/33, lit. A-1
"CARGOIMPORT" LTD VNUKOVSKAYA STREET, BUILDING 2, R.43-H-1, SAINT-PETERSBURG, RUSSIA .
"ELECTROGROUP" (OOO),190068.RUSSIA,SAINT-PETERSBURG,UL.BOLSHAYA PODYACHESKAYA,5,LIT.A,POM.4-N
"ENS" LTD ADDRESS: STAROPETROVSKIYPASSAGE, BLD 7A,  CONSTRUCTION 3 125130, MOSCOW, RUSSIA TEL: (499) 130-7336
"EXPRESS CO. LTD."  RUSSIA 155101 IVANOVSKAYA REGION, LEZHNEVSKIY RAION, D. KOROVIHA, CENTRALNAYA STR. 4ARUSSIA
"F

# 5. Porównaj jakość wyników sposobami zaimplementowanymi w pkt. 2.

In [71]:
import pandas as pd


def get_document_vectors(lines, words):
    return np.array([np.bincount([words.get(token, -1) \
    for token in re.split(r'\W+', line.lower()) if token \
    and token in words], minlength=len(words)) for line in lines])

def get_document_term_matrix(lines, stopwords=None):
    if stopwords is None:
        stopwords = set()

    words = {token: i for i, token in \
             enumerate(sorted(set(token for line in lines \
                for token in re.split(r'\W+', line.lower()) \
                if token and token not in stopwords)))}

    vectors = get_document_vectors(lines, words)

    return csr_matrix(vectors / norm(vectors, axis=1)[:, None])


def evaluate_metrics(n_clusters, metrics):
    results = []
    for metric in metrics:
        for preprocessing in [True, False]:
            preprocessed_label = 'preprocessed' if preprocessing else 'not preprocessed'
            preprocessed_lines, lines = prepare_data(preprocessing)
            similarity_matrix = np.zeros((len(preprocessed_lines), len(preprocessed_lines)))

            for i in range(len(preprocessed_lines)):
                for j in range(i+1, len(preprocessed_lines)):
                    if metric == 'cosine':
                        similarity_matrix[i, j] = cosine_similarity(preprocessed_lines[i], preprocessed_lines[j])
                    elif metric == 'dice':
                        similarity_matrix[i, j] = dice(preprocessed_lines[i], preprocessed_lines[j])
                    elif metric == 'euclidean':
                        similarity_matrix[i, j] = euclidean_distance(preprocessed_lines[i], preprocessed_lines[j])
                    elif metric == 'lcs':
                        similarity_matrix[i, j] = word_lcs(preprocessed_lines[i], preprocessed_lines[j])
                    similarity_matrix[j, i] = similarity_matrix[i, j]

            clustering = AgglomerativeClustering(n_clusters=n_clusters, affinity='precomputed', linkage='complete')
            labels = clustering.fit_predict(similarity_matrix)
            db_index = davies_bouldin(preprocessed_lines, labels)

            result = {'metric': metric, 'preprocessing': preprocessed_label, 'davies_bouldin_index': db_index}
            results.append(result)
    df_results = pd.DataFrame(results)
    return df_results

In [72]:
metrics = ['cosine', 'dice', 'euclidean', 'lcs']
df = evaluate_metrics(5, metrics)
print(df)

TypeError: cannot perform reduce with flexible type