# Задание 1 (5 балла)

Имплементируйте алгоритм Леска (описание есть в семинаре) и оцените качество его работы на датасете `data/corpus_wsd_50k.txt`

В качестве метрики близости вы должны попробовать два подхода:

1) Jaccard score на множествах слов (определений и контекста)
2) Cosine distance на эмбедингах sentence_transformers

В качестве метрики используйте accuracy (% правильных ответов). Предсказывайте только многозначные слова в датасете

Контекст вы можете определить самостоятельно (окно вокруг целевого слова или все предложение). Также можете поэкспериментировать с предобработкой для обоих методов.

In [87]:
corpus_wsd = []
corpus = open('corpus_wsd_50k.txt').read().split('\n\n')
for sent in corpus:
    corpus_wsd.append([s.split('\t') for s in sent.split('\n')])


In [88]:
import nltk
nltk.download('wordnet')
from nltk.corpus import wordnet as wn
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from string import punctuation

from sklearn.metrics.pairwise import cosine_distances, cosine_similarity
from sentence_transformers import SentenceTransformer

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Alex\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [89]:
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
embed = model.encode



In [203]:
ps = PorterStemmer()
def stemmer(sentence):
    tokenized_sentence = nltk.word_tokenize(sentence)
    stemmed_sentence = ' '.join([ps.stem(token) for token in tokenized_sentence if token not in punctuation])
    return stemmed_sentence

def definitions(word):
    defs = [d.definition() for d in wn.synsets(word)]
    return defs

def auto_context(i, sent):
    res_context = list()
    for j, word in enumerate(sent):
        if j == i:
            res_context.append("_")
        else:
            res_context.append(ps.stem(word[1]))
    return res_context

In [204]:
def jaccard(j_definition, j_context):
    intersection = (set(j_definition) & set(j_context))
    union = (set(j_definition) | set(j_context))
    j_res =  len(intersection) / len(union)    
    return (j_res)

def result_w_jaccard (word, context):
    res_def = list()
    res_score = list()
    max_index = 0
    for definition in definitions(word):
        res_def.append(definition)
        res_score.append (jaccard(stemmer(definition), context))
    max_score = max(res_score)
    for i, score in enumerate(res_score):
        if score == max_score:     
            max_index = i
    return res_def[max_index]

In [205]:
def cosine(c_definition, c_context):
    emb_context = embed(" ".join(c_context))
    emb_def = embed(" ".join(c_definition))
    c_res = cosine_similarity(emb_context.reshape(1,-1), emb_def.reshape(1, -1))
    return (c_res)

def result_w_cosine (word, context):
    res_def = list()
    res_score = list()
    max_index = 0
    for definition in definitions(word):
        res_def.append(definition)
        res_score.append (cosine(stemmer(definition), context))
    max_score = max(res_score)
    for i, score in enumerate(res_score):
        if score == max_score:     
            max_index = i
    return res_def[max_index]

In [206]:
def accuracy (results):
    jac_accuracy = 0
    cos_accuracy = 0
    length = len(results)
    for result in results:
        target, jac_res, cos_res = result
        if jac_res == target:
            jac_accuracy+=1
        if cos_res == target: 
            cos_accuracy+=1
    jac_accuracy = (jac_accuracy*100)/length
    cos_accuracy = (cos_accuracy*100)/length
    print ("jaccard: ", round(jac_accuracy), "%\ncosine: ", round(cos_accuracy), "%")

In [207]:
context = 'The sign was damaged in an _.'
stemmed_con = stemmer(context)
result = result_w_jaccard ('accident', stemmed_con)
result2 = result_w_cosine ('accident', stemmed_con)

print(result, result2)

an unfortunate mishap; especially one causing damage or injury an unfortunate mishap; especially one causing damage or injury


In [208]:
ress = list()
for sentence in corpus_wsd[:40]:
    for num, word in enumerate (sentence):
        if word[0] == '':
            continue
        else:
            target = wn.lemma_from_key(word[0]).synset().definition()
            context = auto_context(num, sentence)
            jac_res = result_w_jaccard(word[1], context)
            cos_res = result_w_cosine(word[1], context)
            ress.append ((target, jac_res, cos_res))
accuracy(ress)

jaccard:  18 %
cosine:  29 %


# Задание 2 (5 балла)
Попробуйте разные алгоритмы кластеризации на датасете - `https://github.com/nlpub/russe-wsi-kit/blob/initial/data/main/wiki-wiki/train.csv`

Используйте код из семинара как основу. Используйте ARI как метрику качества.

Попробуйте все 4 алгоритма кластеризации, про которые говорилось на семинаре. Для каждого из алгоритмов попробуйте настраивать гиперпараметры (посмотрите их в документации). Прогоните как минимум 5 экспериментов (не обязательно успешных) с разными параметрами на каждый алгоритме кластеризации и оцените: качество кластеризации, скорость работы, интуитивность параметров.

Помимо этого также выберите 1 дополнительный алгоритм кластеризации отсюда - https://scikit-learn.org/stable/modules/clustering.html , опишите своими словами принцип его работы  и проделайте аналогичные эксперименты. 

In [249]:
import pandas as pd
from sklearn.cluster import KMeans, DBSCAN, AffinityPropagation, AgglomerativeClustering, OPTICS
import numpy as np
from sklearn.metrics import adjusted_rand_score

In [213]:
df = pd.read_csv('https://raw.githubusercontent.com/nlpub/russe-wsi-kit/initial/data/main/wiki-wiki/train.csv', sep='\t')

# DBSCAN

In [221]:
grouped_df = df.groupby('word')[['word', 'context', 'gold_sense_id']]

ARI = []

for key, _ in grouped_df:
    texts = grouped_df.get_group(key)['context'].values

    X = np.zeros((len(texts), 768))

    for i, text in enumerate(texts):
        X[i] = embed(text)

    cluster = DBSCAN(min_samples=1, eps=0.1)
    
    cluster.fit(X)
    labels = np.array(cluster.labels_)+1 

    ARI.append(adjusted_rand_score(grouped_df.get_group(key)['gold_sense_id'], labels))
    
print(np.mean(ARI)) 

0.001053019960000099


In [222]:
ARI = []

for key, _ in grouped_df:
    texts = grouped_df.get_group(key)['context'].values

    X = np.zeros((len(texts), 768))

    for i, text in enumerate(texts):
        X[i] = embed(text)

    cluster = DBSCAN(min_samples=3, eps=0.1)
    
    cluster.fit(X)
    labels = np.array(cluster.labels_)+1 

    ARI.append(adjusted_rand_score(grouped_df.get_group(key)['gold_sense_id'], labels))
    
print(np.mean(ARI)) 

-0.0021290615824144776


In [223]:
ARI = []

for key, _ in grouped_df:
    texts = grouped_df.get_group(key)['context'].values

    X = np.zeros((len(texts), 768))

    for i, text in enumerate(texts):
        X[i] = embed(text)

    cluster = DBSCAN(min_samples=5, eps=0.5)
    
    cluster.fit(X)
    labels = np.array(cluster.labels_)+1 

    ARI.append(adjusted_rand_score(grouped_df.get_group(key)['gold_sense_id'], labels))
    
print(np.mean(ARI)) 

-0.011271692824715207


In [224]:
ARI = []

for key, _ in grouped_df:
    texts = grouped_df.get_group(key)['context'].values

    X = np.zeros((len(texts), 768))

    for i, text in enumerate(texts):
        X[i] = embed(text)

    cluster = DBSCAN(min_samples=1, eps=1)
    
    cluster.fit(X)
    labels = np.array(cluster.labels_)+1 

    ARI.append(adjusted_rand_score(grouped_df.get_group(key)['gold_sense_id'], labels))
    
print(np.mean(ARI)) 

0.0


In [227]:
ARI = []

for key, _ in grouped_df:
    texts = grouped_df.get_group(key)['context'].values

    X = np.zeros((len(texts), 768))

    for i, text in enumerate(texts):
        X[i] = embed(text)

    cluster = DBSCAN(min_samples=5, eps=0.5, algorithm='ball_tree')
    
    cluster.fit(X)
    labels = np.array(cluster.labels_)+1 

    ARI.append(adjusted_rand_score(grouped_df.get_group(key)['gold_sense_id'], labels))
    
print(np.mean(ARI)) 

-0.011271692824715207


# AFFINITY PROPAGATION

In [228]:
ARI = []

for key, _ in grouped_df:
    texts = grouped_df.get_group(key)['context'].values

    X = np.zeros((len(texts), 768))

    for i, text in enumerate(texts):
        X[i] = embed(text)

    cluster = AffinityPropagation(damping=0.5)
    
    cluster.fit(X)
    labels = np.array(cluster.labels_)+1 

    ARI.append(adjusted_rand_score(grouped_df.get_group(key)['gold_sense_id'], labels))
    
print(np.mean(ARI)) 

0.042740969848549505


In [234]:
ARI = []

for key, _ in grouped_df:
    texts = grouped_df.get_group(key)['context'].values

    X = np.zeros((len(texts), 768))

    for i, text in enumerate(texts):
        X[i] = embed(text)

    cluster = AffinityPropagation(damping=0.6)
    
    cluster.fit(X)
    labels = np.array(cluster.labels_)+1 

    ARI.append(adjusted_rand_score(grouped_df.get_group(key)['gold_sense_id'], labels))
    
print(np.mean(ARI)) 

0.042740969848549505


In [233]:
ARI = []

for key, _ in grouped_df:
    texts = grouped_df.get_group(key)['context'].values

    X = np.zeros((len(texts), 768))

    for i, text in enumerate(texts):
        X[i] = embed(text)

    cluster = AffinityPropagation(damping=0.9, convergence_iter=5)
    
    cluster.fit(X)
    labels = np.array(cluster.labels_)+1 

    ARI.append(adjusted_rand_score(grouped_df.get_group(key)['gold_sense_id'], labels))
    
print(np.mean(ARI)) 

0.04916074877739414


In [232]:
ARI = []

for key, _ in grouped_df:
    texts = grouped_df.get_group(key)['context'].values

    X = np.zeros((len(texts), 768))

    for i, text in enumerate(texts):
        X[i] = embed(text)

    cluster = AffinityPropagation(damping=0.7, max_iter=100)
    
    cluster.fit(X)
    labels = np.array(cluster.labels_)+1 

    ARI.append(adjusted_rand_score(grouped_df.get_group(key)['gold_sense_id'], labels))
    
print(np.mean(ARI)) 

0.04154515818974152




In [231]:
ARI = []

for key, _ in grouped_df:
    texts = grouped_df.get_group(key)['context'].values

    X = np.zeros((len(texts), 768))

    for i, text in enumerate(texts):
        X[i] = embed(text)

    cluster = AffinityPropagation(damping=0.8)
    
    cluster.fit(X)
    labels = np.array(cluster.labels_)+1 

    ARI.append(adjusted_rand_score(grouped_df.get_group(key)['gold_sense_id'], labels))
    
print(np.mean(ARI)) 

0.04154515818974152


# AGGLOMERATIVE CLUSTERING

In [237]:
ARI = []

for key, _ in grouped_df:
    texts = grouped_df.get_group(key)['context'].values

    X = np.zeros((len(texts), 768))

    for i, text in enumerate(texts):
        X[i] = embed(text)

    cluster = AgglomerativeClustering()
    
    cluster.fit(X)
    labels = np.array(cluster.labels_)+1 

    ARI.append(adjusted_rand_score(grouped_df.get_group(key)['gold_sense_id'], labels))
    
print(np.mean(ARI)) 

-0.011976265536517934


In [239]:
ARI = []

for key, _ in grouped_df:
    texts = grouped_df.get_group(key)['context'].values

    X = np.zeros((len(texts), 768))

    for i, text in enumerate(texts):
        X[i] = embed(text)

    cluster = AgglomerativeClustering(metric='l1', linkage='average')
    
    cluster.fit(X)
    labels = np.array(cluster.labels_)+1 

    ARI.append(adjusted_rand_score(grouped_df.get_group(key)['gold_sense_id'], labels))
    
print(np.mean(ARI)) 

0.0030178341081673436


In [242]:
ARI = []

for key, _ in grouped_df:
    texts = grouped_df.get_group(key)['context'].values

    X = np.zeros((len(texts), 768))

    for i, text in enumerate(texts):
        X[i] = embed(text)

    cluster = AgglomerativeClustering(metric='l2', linkage='average')
    
    cluster.fit(X)
    labels = np.array(cluster.labels_)+1 

    ARI.append(adjusted_rand_score(grouped_df.get_group(key)['gold_sense_id'], labels))
    
print(np.mean(ARI)) 

0.0030178341081673436


In [241]:
ARI = []

for key, _ in grouped_df:
    texts = grouped_df.get_group(key)['context'].values

    X = np.zeros((len(texts), 768))

    for i, text in enumerate(texts):
        X[i] = embed(text)

    cluster = AgglomerativeClustering(metric='manhattan', linkage='average')
    
    cluster.fit(X)
    labels = np.array(cluster.labels_)+1 

    ARI.append(adjusted_rand_score(grouped_df.get_group(key)['gold_sense_id'], labels))
    
print(np.mean(ARI)) 

0.0030178341081673436


In [240]:
ARI = []

for key, _ in grouped_df:
    texts = grouped_df.get_group(key)['context'].values

    X = np.zeros((len(texts), 768))

    for i, text in enumerate(texts):
        X[i] = embed(text)

    cluster = AgglomerativeClustering(metric='cosine', linkage='average')
    
    cluster.fit(X)
    labels = np.array(cluster.labels_)+1 

    ARI.append(adjusted_rand_score(grouped_df.get_group(key)['gold_sense_id'], labels))
    
print(np.mean(ARI)) 

0.0030178341081673436


# KMEANS

In [243]:
ARI = []

for key, _ in grouped_df:
    texts = grouped_df.get_group(key)['context'].values

    X = np.zeros((len(texts), 768))

    for i, text in enumerate(texts):
        X[i] = embed(text)

    cluster = KMeans(3)
    
    cluster.fit(X)
    labels = np.array(cluster.labels_)+1 

    ARI.append(adjusted_rand_score(grouped_df.get_group(key)['gold_sense_id'], labels))
    
print(np.mean(ARI)) 

0.05765905738505665


In [244]:
ARI = []

for key, _ in grouped_df:
    texts = grouped_df.get_group(key)['context'].values

    X = np.zeros((len(texts), 768))

    for i, text in enumerate(texts):
        X[i] = embed(text)

    cluster = KMeans(3, algorithm='elkan')
    
    cluster.fit(X)
    labels = np.array(cluster.labels_)+1 

    ARI.append(adjusted_rand_score(grouped_df.get_group(key)['gold_sense_id'], labels))
    
print(np.mean(ARI)) 

0.06389873564638945


In [245]:
ARI = []

for key, _ in grouped_df:
    texts = grouped_df.get_group(key)['context'].values

    X = np.zeros((len(texts), 768))

    for i, text in enumerate(texts):
        X[i] = embed(text)

    cluster = KMeans(5)
    
    cluster.fit(X)
    labels = np.array(cluster.labels_)+1 

    ARI.append(adjusted_rand_score(grouped_df.get_group(key)['gold_sense_id'], labels))
    
print(np.mean(ARI)) 

0.05702711328872276


In [246]:
ARI = []

for key, _ in grouped_df:
    texts = grouped_df.get_group(key)['context'].values

    X = np.zeros((len(texts), 768))

    for i, text in enumerate(texts):
        X[i] = embed(text)

    cluster = KMeans(5, algorithm='elkan')
    
    cluster.fit(X)
    labels = np.array(cluster.labels_)+1 

    ARI.append(adjusted_rand_score(grouped_df.get_group(key)['gold_sense_id'], labels))
    
print(np.mean(ARI)) 

0.06956126092867393


In [247]:
ARI = []

for key, _ in grouped_df:
    texts = grouped_df.get_group(key)['context'].values

    X = np.zeros((len(texts), 768))

    for i, text in enumerate(texts):
        X[i] = embed(text)

    cluster = KMeans(1)
    
    cluster.fit(X)
    labels = np.array(cluster.labels_)+1 

    ARI.append(adjusted_rand_score(grouped_df.get_group(key)['gold_sense_id'], labels))
    
print(np.mean(ARI)) 

0.0


In [248]:
from IPython.display import Image
from IPython.core.display import HTML 

# OPTICS

Похож на DBSCAN.

В отличие от DBSCAN, сохраняет иерархию кластеров для переменного радиуса окрестности. Лучше подходит для использования с большими наборами данных.

In [256]:
Image(url="https://scikit-learn.org/stable/_images/sphx_glr_plot_optics_001.png",
     width=500, height=500)

In [250]:
ARI = []

for key, _ in grouped_df:
    texts = grouped_df.get_group(key)['context'].values

    X = np.zeros((len(texts), 768))

    for i, text in enumerate(texts):
        X[i] = embed(text)

    cluster = OPTICS(min_samples=6, eps=2)
    
    cluster.fit(X)
    labels = np.array(cluster.labels_)+1 

    ARI.append(adjusted_rand_score(grouped_df.get_group(key)['gold_sense_id'], labels))
    
print(np.mean(ARI)) 

0.008342972897728207


In [251]:
ARI = []

for key, _ in grouped_df:
    texts = grouped_df.get_group(key)['context'].values

    X = np.zeros((len(texts), 768))

    for i, text in enumerate(texts):
        X[i] = embed(text)

    cluster = OPTICS(min_samples=6, eps=2, metric = 'cityblock')
    
    cluster.fit(X)
    labels = np.array(cluster.labels_)+1 

    ARI.append(adjusted_rand_score(grouped_df.get_group(key)['gold_sense_id'], labels))
    
print(np.mean(ARI)) 

0.0022488775628999045


In [252]:
ARI = []

for key, _ in grouped_df:
    texts = grouped_df.get_group(key)['context'].values

    X = np.zeros((len(texts), 768))

    for i, text in enumerate(texts):
        X[i] = embed(text)

    cluster = OPTICS(min_samples=6, eps=2, metric = 'cosine')
    
    cluster.fit(X)
    labels = np.array(cluster.labels_)+1 

    ARI.append(adjusted_rand_score(grouped_df.get_group(key)['gold_sense_id'], labels))
    
print(np.mean(ARI)) 

0.00883068382727673


In [257]:
ARI = []

for key, _ in grouped_df:
    texts = grouped_df.get_group(key)['context'].values

    X = np.zeros((len(texts), 768))

    for i, text in enumerate(texts):
        X[i] = embed(text)

    cluster = OPTICS(min_samples=6, eps=2, metric = 'jaccard')
    
    cluster.fit(X)
    labels = np.array(cluster.labels_)+1 

    ARI.append(adjusted_rand_score(grouped_df.get_group(key)['gold_sense_id'], labels))
    
print(np.mean(ARI)) 



0.0




In [255]:
ARI = []

for key, _ in grouped_df:
    texts = grouped_df.get_group(key)['context'].values

    X = np.zeros((len(texts), 768))

    for i, text in enumerate(texts):
        X[i] = embed(text)

    cluster = OPTICS(min_samples=6, eps=2, metric = 'cosine')
    
    cluster.fit(X)
    labels = np.array(cluster.labels_)+1 

    ARI.append(adjusted_rand_score(grouped_df.get_group(key)['gold_sense_id'], labels))
    
print(np.mean(ARI)) 

0.00883068382727673
