In [None]:
# Target word stem; word forms starting with this stem are collected from the usages further down.
tw = "bærekraft"

# Mount the Disk


In [None]:
from google.colab import drive
import os


# Mount Google Drive at /content/drive (forcing a remount); the data and embedding paths below live under it.
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


# Helpers

In [None]:
import numpy as np
from sklearn.cluster import AffinityPropagation
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import jensenshannon

def affinity_prop(embeddings_r, embeddings_f):
    """Cluster two embedding sets jointly with Affinity Propagation.

    The sets are stacked row-wise (``embeddings_r`` first, ``embeddings_f``
    second), their pairwise cosine similarities are computed, and that matrix
    is handed to ``AffinityPropagation`` as a precomputed affinity with
    ``random_state=42``.

    Returns:
        The fitted ``labels_`` attribute: one cluster id per stacked row, in
        stacking order.
    """
    pooled = np.vstack((embeddings_r, embeddings_f))

    clusterer = AffinityPropagation(affinity='precomputed', random_state=42)
    clusterer.fit(cosine_similarity(pooled))
    return clusterer.labels_

def compute_binary_change(labels, embeddings1_len, label1='cr', label2='cf', k=0, n=1):
    """Derive a binary change decision from joint cluster labels.

    The first ``embeddings1_len`` entries of ``labels`` are tagged with
    ``label1`` and the remaining ones with ``label2``. The tags are grouped by
    cluster id (clusters appear in order of first occurrence), per-cluster tag
    counts are obtained from ``compute_sfd`` and the decision from
    ``classify_change_binary`` with thresholds ``k`` and ``n``.

    Returns:
        ``(binary, D, E, cluster_list)`` where ``D``/``E`` are the per-cluster
        counts of ``label1``/``label2`` and ``cluster_list`` holds the tags of
        every cluster.
    """
    origin_tags = [label1] * embeddings1_len + [label2] * (len(labels) - embeddings1_len)

    tags_by_cluster = {}
    for cluster_id, tag in zip(labels, origin_tags):
        tags_by_cluster.setdefault(cluster_id, []).append(tag)

    cluster_list = list(tags_by_cluster.values())
    D, E = compute_sfd(cluster_list, label1, label2)
    return classify_change_binary(D, E, k, n), D, E, cluster_list

def compute_sfd(clusters, c1_label, c2_label):
    """Count, per cluster, how many members carry each of the two labels.

    Args:
        clusters: iterable of clusters, each a sequence of labels.
        c1_label: label counted into the first result list.
        c2_label: label counted into the second result list.

    Returns:
        ``(D, E)``: two lists aligned with ``clusters`` holding the number of
        ``c1_label`` and ``c2_label`` members respectively.
    """
    per_cluster = [
        (
            sum(member == c1_label for member in cluster),
            sum(member == c2_label for member in cluster),
        )
        for cluster in clusters
    ]
    D = [first for first, _ in per_cluster]
    E = [second for _, second in per_cluster]
    return D, E

def classify_change_binary(D, E, k, n):
    """Return 1 if any cluster is (almost) exclusive to one side, else 0.

    A cluster triggers a change when one side has at most ``k`` members in it
    while the other side has at least ``n``. ``D`` and ``E`` are the
    per-cluster counts of the two sides and are compared pairwise.
    """
    one_sided = any(
        (d_count <= k and e_count >= n) or (e_count <= k and d_count >= n)
        for d_count, e_count in zip(D, E)
    )
    return 1 if one_sided else 0

def compute_jsd(labels, embeddings1_len):
    """Compare the cluster distributions of the two samples.

    ``labels`` is split at ``embeddings1_len``; for both parts the number of
    members in every cluster id (sorted unique ids of ``labels``) is counted,
    printed, and normalised to a probability vector.

    Returns:
        ``(jsd, prob1, prob2)`` where ``jsd`` is the value returned by
        ``scipy.spatial.distance.jensenshannon`` for the two vectors.
    """
    label_array = np.array(labels)
    cluster_ids = np.unique(label_array)

    def _cluster_histogram(part):
        # One count per cluster id, in the order of `cluster_ids`.
        return np.array([np.sum(part == cluster_id) for cluster_id in cluster_ids])

    counts1 = _cluster_histogram(label_array[:embeddings1_len])
    counts2 = _cluster_histogram(label_array[embeddings1_len:])

    print('counts1/ref', counts1)
    print('counts2/foc', counts2)

    prob1 = counts1 / counts1.sum()
    prob2 = counts2 / counts2.sum()

    return jensenshannon(prob1, prob2), prob1, prob2


In [None]:
from scipy.spatial.distance import cdist
import numpy as np

def compute_apd(embeddings_r, embeddings_f):
    """Return the mean cosine distance over all (row of r, row of f) pairs."""
    pairwise = cdist(embeddings_r, embeddings_f, 'cosine')
    return np.mean(pairwise)

In [None]:
import numpy as np
import torch
from tqdm import tqdm

def generate_and_save_word_embeddings(
    sentences,
    target_forms,
    model,
    tokenizer,
    filename="word_embeddings.npy",
    batch_size=16,
    device="cuda"
):
    """Embed the target word in every sentence and save the result as ``.npy``.

    Each sentence is lower-cased and tokenized; the sub-token sequences of all
    ``target_forms`` are searched in the token list, and the usage embedding is
    the mean over the matched token positions of the sum of entries 8, 9 and
    10 of ``outputs.hidden_states``. One row per sentence, in input order, is
    written to ``filename`` with ``np.save`` once all batches are processed.

    Args:
        sentences: list of usage strings.
        target_forms: surface forms of the target word to look for.
        model: model whose output exposes ``hidden_states``; it is moved to
            ``device`` and put in eval mode.
        tokenizer: tokenizer matching ``model``; ``return_offsets_mapping`` is
            requested in the call.
        filename: output path passed to ``np.save``.
        batch_size: number of sentences per forward pass.
        device: device the model and inputs are moved to.

    Returns:
        None. Nothing is written when ``sentences`` is empty.

    Notes:
        A sentence in which no target form is matched (for example because the
        form was cut off by truncation at 512 tokens) still gets a row so that
        rows stay aligned with ``sentences``; that row is NaN and a warning is
        emitted.
    """
    import warnings

    model = model.to(device)
    model.eval()

    tokenized_target_forms = [tokenizer.tokenize(form) for form in target_forms]
    all_embeddings = []

    for i in tqdm(range(0, len(sentences), batch_size)):
        batch = [s.lower() for s in sentences[i:i + batch_size]]
        inputs = tokenizer(
            batch,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=512,
            return_offsets_mapping=True,
            add_special_tokens=True
        )

        # The offsets are not used below; drop them so they are not forwarded to the model.
        inputs.pop("offset_mapping", None)
        inputs = inputs.to(device)

        with torch.no_grad():
            outputs = model(**inputs)
            hidden_states = outputs.hidden_states

        selected_layers = torch.stack(hidden_states[8:11]).sum(dim=0)
        input_ids = inputs["input_ids"]

        for index, sentence in enumerate(batch):
            tokens = tokenizer.convert_ids_to_tokens(input_ids[index])
            embeddings = selected_layers[index]

            # Collect every token position covered by any occurrence of any target form.
            matching_indices = []
            for target_token_seq in tokenized_target_forms:
                for j in range(len(tokens) - len(target_token_seq) + 1):
                    if tokens[j:j + len(target_token_seq)] == target_token_seq:
                        matching_indices.extend(range(j, j + len(target_token_seq)))

            matching_indices = sorted(set(matching_indices))
            if not matching_indices:
                # Keep the row for alignment; the mean over zero positions is NaN.
                warnings.warn(
                    f"No target form found in sentence {i + index}; "
                    "its embedding row will be NaN.",
                    stacklevel=2,
                )
            usage_embedding = embeddings[matching_indices].mean(dim=0)
            all_embeddings.append(usage_embedding.cpu())

            print("sentence", sentence)
            print("matched token indices", matching_indices)
            print("matched tokens", [tokens[idx] for idx in matching_indices])

    # Stack and save only after ALL batches are done. Doing this inside the
    # batch loop turns the list into a tensor after the first batch, so the
    # next `.append` fails.
    if not all_embeddings:
        return

    stacked_embeddings = torch.stack(all_embeddings)
    np.save(filename, stacked_embeddings.numpy())


In [None]:
import re

def get_word_forms_by_stem(usages_a, usages_b, stem):
    """Collect the distinct word forms that start with ``stem``.

    Both usage lists are concatenated, every text is lower-cased and split
    into word tokens, and the tokens beginning with ``stem`` are returned as a
    sorted list without duplicates.
    """
    word_pattern = re.compile(r'\b\w+\b')
    matching_forms = {
        word
        for usage in usages_a + usages_b
        for word in word_pattern.findall(usage.lower())
        if word.startswith(stem)
    }
    return sorted(matching_forms)

# Only preliminary - Do not run again!



## Load trial_data from excel

In [None]:
# Folder on the mounted Drive that holds the trial-data Excel workbooks.
folder_path_trial = '/content/drive/My Drive/MSc Computer Science/Master Thesis/trial_data/'

# Print the folder contents to confirm which workbooks are available.
for f in os.listdir(folder_path_trial):
    print(f)

diachronic.xlsx
speaker1.xlsx
speaker2.xlsx
speaker3.xlsx
speaker4.xlsx


In [None]:
import pandas as pd

def _usages_matching(frame, id_marker):
    """Return the 'text' values of the rows whose 'usage_id' contains ``id_marker``."""
    selected_rows = frame['usage_id'].str.contains(id_marker, na=False)
    return frame[selected_rows]['text'].tolist()


# Diachronic workbook: the two samples are identified by "tg1" / "tg2" in usage_id.
trial_dia = pd.read_excel(folder_path_trial+'diachronic.xlsx')
tg1_usages = _usages_matching(trial_dia, "tg1")
tg2_usages = _usages_matching(trial_dia, "tg2")

# Speaker workbooks: every file holds a "general" and a "speaker" sample.
trial_s1 = pd.read_excel(folder_path_trial+'speaker1.xlsx')
gs1_usages = _usages_matching(trial_s1, "general")
s1_usages = _usages_matching(trial_s1, "speaker")

trial_s2 = pd.read_excel(folder_path_trial+'speaker2.xlsx')
gs2_usages = _usages_matching(trial_s2, "general")
s2_usages = _usages_matching(trial_s2, "speaker")

trial_s3 = pd.read_excel(folder_path_trial+'speaker3.xlsx')
gs3_usages = _usages_matching(trial_s3, "general")
s3_usages = _usages_matching(trial_s3, "speaker")

trial_s4 = pd.read_excel(folder_path_trial+'speaker4.xlsx')
gs4_usages = _usages_matching(trial_s4, "general")
s4_usages = _usages_matching(trial_s4, "speaker")

In [None]:
# Number of usages per sample, one count per line, in the order the samples were loaded.
print(
    *map(len, (
        tg1_usages, tg2_usages,
        gs1_usages, s1_usages,
        gs2_usages, s2_usages,
        gs3_usages, s3_usages,
        gs4_usages, s4_usages,
    )),
    sep='\n',
)

30
30
30
30
30
30
30
30
30
30


## Word forms of corpus pairs

In [None]:
# Stem of the target word; every token starting with it counts as a word form.
tw = "bærekraft"

# Collect the word forms per corpus pair (both samples of a pair are searched together).
dia_word_forms = get_word_forms_by_stem(tg1_usages, tg2_usages, tw)
print('dia word forms', dia_word_forms)

s1_word_forms = get_word_forms_by_stem(gs1_usages, s1_usages, tw)
print('s1 word forms', s1_word_forms)

s2_word_forms = get_word_forms_by_stem(gs2_usages, s2_usages, tw)
print('s2 word forms', s2_word_forms)  # label typo fixed (was 'fomrs')

s3_word_forms = get_word_forms_by_stem(gs3_usages, s3_usages, tw)
print('s3 word forms', s3_word_forms)

s4_word_forms = get_word_forms_by_stem(gs4_usages, s4_usages, tw)
print('s4 word forms', s4_word_forms)

dia word forms ['bærekraft', 'bærekraftig', 'bærekraftige', 'bærekraftmål', 'bærekraftsmålene', 'bærekraftutfordringene']
s1 word forms ['bærekraft', 'bærekraftbegrepene', 'bærekraften', 'bærekraftig', 'bærekraftige', 'bærekraftindikator', 'bærekraftindikatorer', 'bærekraftperspektiv', 'bærekraftsindikatorer', 'bærekraftskriteriene', 'bærekraftskvalitet', 'bærekraftsmålet']
s2 word fomrs ['bærekraft', 'bærekraftig', 'bærekraftige']
s3 word forms ['bærekraft', 'bærekraften', 'bærekraftig', 'bærekraftige', 'bærekraftmål', 'bærekraftmålene', 'bærekraftsmål', 'bærekraftsmålene']
s4 word forms ['bærekraft', 'bærekraften', 'bærekraftig', 'bærekraftige', 'bærekraftighetsspørsmålene', 'bærekraftsinformasjon', 'bærekraftskrav', 'bærekraftsmål', 'bærekraftsmålene']


## Generate embeddings and save

### PT: Embeddings trial_data

In [None]:
from transformers import AutoTokenizer, AutoModel
import torch

# Pre-trained (PT) checkpoint, loaded by name.
model_name = "NbAiLab/nb-bert-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# output_hidden_states=True: the embedding helper reads outputs.hidden_states.
model = AutoModel.from_pretrained(model_name, output_hidden_states=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# (usages, word forms of the pair, output file) for every sample, embedded with the pre-trained model.
_pt_embedding_jobs = [
    (tg1_usages, dia_word_forms, "base_tg1_wembeddings.npy"),
    (tg2_usages, dia_word_forms, "base_tg2_wembeddings.npy"),
    (gs1_usages, s1_word_forms, "base_gs1_wembeddings.npy"),
    (s1_usages, s1_word_forms, "base_s1_wembeddings.npy"),
    (gs2_usages, s2_word_forms, "base_gs2_wembeddings.npy"),
    (s2_usages, s2_word_forms, "base_s2_wembeddings.npy"),
    (gs3_usages, s3_word_forms, "base_gs3_wembeddings.npy"),
    (s3_usages, s3_word_forms, "base_s3_wembeddings.npy"),
    (gs4_usages, s4_word_forms, "base_gs4_wembeddings.npy"),
    (s4_usages, s4_word_forms, "base_s4_wembeddings.npy"),
]

for _job_usages, _job_forms, _job_file in _pt_embedding_jobs:
    generate_and_save_word_embeddings(_job_usages, _job_forms, model, tokenizer, filename=_job_file)

### FT: Embeddings trial_data

In [None]:
# Directory on the mounted Drive from which the fine-tuned (FT) checkpoint is loaded.
model_dir = "/content/drive/MyDrive/nb-bert-finetuned"

In [None]:
from transformers import AutoModel, AutoTokenizer

# Load tokenizer and weights from model_dir; hidden states are requested because the embedding helper reads them.
ft_tokenizer = AutoTokenizer.from_pretrained(model_dir)
ft_model = AutoModel.from_pretrained(model_dir, output_hidden_states=True)
# Last expression of the cell, so the returned model is also shown as the cell output.
ft_model.eval()

Some weights of BertModel were not initialized from the model checkpoint at /content/drive/MyDrive/nb-bert-finetuned and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(119547, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=Fals

In [None]:
# (usages, word forms of the pair, output file) for every sample, embedded with the fine-tuned model.
_ft_embedding_jobs = [
    (tg1_usages, dia_word_forms, "ft_tg1_wembeddings.npy"),
    (tg2_usages, dia_word_forms, "ft_tg2_wembeddings.npy"),
    (gs1_usages, s1_word_forms, "ft_gs1_wembeddings.npy"),
    (s1_usages, s1_word_forms, "ft_s1_wembeddings.npy"),
    (gs2_usages, s2_word_forms, "ft_gs2_wembeddings.npy"),
    (s2_usages, s2_word_forms, "ft_s2_wembeddings.npy"),
    (gs3_usages, s3_word_forms, "ft_gs3_wembeddings.npy"),
    (s3_usages, s3_word_forms, "ft_s3_wembeddings.npy"),
    (gs4_usages, s4_word_forms, "ft_gs4_wembeddings.npy"),
    (s4_usages, s4_word_forms, "ft_s4_wembeddings.npy"),
]

for _job_usages, _job_forms, _job_file in _ft_embedding_jobs:
    generate_and_save_word_embeddings(_job_usages, _job_forms, ft_model, ft_tokenizer, filename=_job_file)

# Pre-trained Embeddings

## Load embeddings from Disk

In [None]:
import numpy as np

folder_path = '/content/drive/My Drive/MSc Computer Science/Master Thesis/pt-wembeddings'


def _load_pt(file_name):
    """Load one pre-trained embedding matrix; ``file_name`` is appended to ``folder_path``."""
    return np.load(folder_path + file_name)


# Diachronic pair
tg1_wembeddings = _load_pt('/base_tg1_wembeddings.npy')
tg2_wembeddings = _load_pt('/base_tg2_wembeddings.npy')

# Speaker pairs (speaker sample, general sample)
s1_wembeddings = _load_pt('/base_s1_wembeddings.npy')
gs1_wembeddings = _load_pt('/base_gs1_wembeddings.npy')

s2_wembeddings = _load_pt('/base_s2_wembeddings.npy')
gs2_wembeddings = _load_pt('/base_gs2_wembeddings.npy')

s3_wembeddings = _load_pt('/base_s3_wembeddings.npy')
gs3_wembeddings = _load_pt('/base_gs3_wembeddings.npy')

s4_wembeddings = _load_pt('/base_s4_wembeddings.npy')
gs4_wembeddings = _load_pt('/base_gs4_wembeddings.npy')

In [None]:
# Shapes of the two loaded diachronic embedding matrices.
tg1_wembeddings.shape, tg2_wembeddings.shape

((30, 768), (30, 768))

In [None]:
# Shapes of the speaker-1 and general-1 embedding matrices.
s1_wembeddings.shape, gs1_wembeddings.shape

((30, 768), (30, 768))

In [None]:
# Shapes of the speaker-2 and general-2 embedding matrices.
s2_wembeddings.shape, gs2_wembeddings.shape

((30, 768), (30, 768))

In [None]:
# Shapes of the speaker-3 and general-3 embedding matrices.
s3_wembeddings.shape, gs3_wembeddings.shape

((30, 768), (30, 768))

In [None]:
# Shapes of the speaker-4 and general-4 embedding matrices.
s4_wembeddings.shape, gs4_wembeddings.shape

((30, 768), (30, 768))

## LSC

In [None]:
# Diachronic pair: tg1 is stacked first and tg2 second before clustering.
d_labels = affinity_prop(tg1_wembeddings, tg2_wembeddings)

# len(tg1_wembeddings) marks where the first sample ends in the label vector.
# NOTE: binary, D, E, cluster_list and jsd are plain globals that the following cells overwrite.
binary, D, E, cluster_list = compute_binary_change(d_labels, len(tg1_wembeddings))
jsd, _, _ = compute_jsd(d_labels, len(tg1_wembeddings))

print('d binary', binary)
print('d jsd', jsd)

counts1/ref [1 8 4 7 1 1 0 0 0 6 2 0 0]
counts2/foc [0 1 4 1 0 3 2 2 1 7 5 3 1]
d binary 1
d jsd 0.4912630278113394


In [None]:
# Speaker 1 pair: the general sample (gs1) is stacked first, the speaker sample (s1) second.
s1_labels = affinity_prop(gs1_wembeddings, s1_wembeddings)

# Overwrites binary, D, E, cluster_list and jsd from the previous cell.
binary, D, E, cluster_list = compute_binary_change(s1_labels, len(gs1_wembeddings))
jsd, _, _ = compute_jsd(s1_labels, len(gs1_wembeddings))

print('s1 binary', binary)
print('s1 jsd', jsd)

counts1/ref [ 1  5  1 10  6  1  6  0]
counts2/foc [ 0  1  3  0  8  3 10  5]
s1 binary 1
s1 jsd 0.4871867736832577


In [None]:
# Speaker 2 pair: the general sample (gs2) is stacked first, the speaker sample (s2) second.
s2_labels = affinity_prop(gs2_wembeddings, s2_wembeddings)

# Overwrites binary, D, E, cluster_list and jsd from the previous cell.
binary, D, E, cluster_list = compute_binary_change(s2_labels, len(gs2_wembeddings))
jsd, _, _ = compute_jsd(s2_labels, len(gs2_wembeddings))

print('s2 binary', binary)
print('s2 jsd', jsd)

counts1/ref [7 2 6 1 3 2 0 2 2 2 3]
counts2/foc [2 1 0 2 2 2 1 7 4 2 7]
s2 binary 1
s2 jsd 0.395749719539728


In [None]:
# Speaker 3 pair: the general sample (gs3) is stacked first, the speaker sample (s3) second.
s3_labels = affinity_prop(gs3_wembeddings, s3_wembeddings)

# Overwrites binary, D, E, cluster_list and jsd from the previous cell.
binary, D, E, cluster_list = compute_binary_change(s3_labels, len(gs3_wembeddings))
jsd, _, _ = compute_jsd(s3_labels, len(gs3_wembeddings))

print('s3 binary', binary)
print('s3 jsd', jsd)

counts1/ref [6 1 2 2 2 3 0 4 6 0 4]
counts2/foc [0 0 0 1 1 4 1 6 6 9 2]
s3 binary 1
s3 jsd 0.4851532617696486


In [None]:
# Speaker 4 pair: the general sample (gs4) is stacked first, the speaker sample (s4) second.
s4_labels = affinity_prop(gs4_wembeddings, s4_wembeddings)

# Overwrites binary, D, E, cluster_list and jsd from the previous cell.
binary, D, E, cluster_list = compute_binary_change(s4_labels, len(gs4_wembeddings))
jsd, _, _ = compute_jsd(s4_labels, len(gs4_wembeddings))

print('s4 binary', binary)
print('s4 jsd', jsd)

counts1/ref [3 9 2 5 1 2 3 3 0 0 2]
counts2/foc [2 3 2 1 3 2 9 4 1 1 2]
s4 binary 1
s4 jsd 0.33358902600863366


In [None]:
# Mean pairwise cosine distance between the two samples of every pair (pre-trained embeddings).
print('dia', compute_apd(tg1_wembeddings, tg2_wembeddings))
print('s1', compute_apd(gs1_wembeddings, s1_wembeddings))
print('s2', compute_apd(gs2_wembeddings, s2_wembeddings))
print('s3', compute_apd(gs3_wembeddings, s3_wembeddings))
print('s4', compute_apd(gs4_wembeddings, s4_wembeddings))

dia 0.21523845138271708
s1 0.25743089279124476
s2 0.23038038709456088
s3 0.24082329318894116
s4 0.27179107343114395


# Fine-tuned Embeddings

## Load embeddings from Disk

In [None]:
import numpy as np

folder_path_ft = '/content/drive/MyDrive/MSc Computer Science/Master Thesis/ft-wembeddings/'


def _load_ft(file_name):
    """Load one fine-tuned embedding matrix; ``file_name`` is appended to ``folder_path_ft``."""
    return np.load(folder_path_ft + file_name)


# Diachronic pair
ft_tg1_wembeddings = _load_ft("ft_tg1_wembeddings.npy")
ft_tg2_wembeddings = _load_ft("ft_tg2_wembeddings.npy")

# Speaker pairs (general sample, speaker sample)
ft_gs1_wembeddings = _load_ft("ft_gs1_wembeddings.npy")
ft_s1_wembeddings = _load_ft("ft_s1_wembeddings.npy")

ft_gs2_wembeddings = _load_ft("ft_gs2_wembeddings.npy")
ft_s2_wembeddings = _load_ft("ft_s2_wembeddings.npy")

ft_gs3_wembeddings = _load_ft("ft_gs3_wembeddings.npy")
ft_s3_wembeddings = _load_ft("ft_s3_wembeddings.npy")

ft_gs4_wembeddings = _load_ft("ft_gs4_wembeddings.npy")
ft_s4_wembeddings = _load_ft("ft_s4_wembeddings.npy")

## LSC

In [None]:
# Diachronic pair, fine-tuned embeddings: tg1 is stacked first and tg2 second before clustering.
ft_d_labels = affinity_prop(ft_tg1_wembeddings, ft_tg2_wembeddings)

bc_d_ftw, D_d_ftw, E_d_ftw, cluster_list_d_ftw = compute_binary_change(ft_d_labels, len(ft_tg1_wembeddings))
jsd_d_ftw, _, _ = compute_jsd(ft_d_labels, len(ft_tg1_wembeddings))

# Print the values computed in THIS cell. The bare names `binary` and `jsd`
# are globals left over from the pre-trained section and would report the
# wrong (stale) result here.
print('d binary', bc_d_ftw)
print('d clusters', cluster_list_d_ftw)
print('d jsd', jsd_d_ftw)

counts1/ref [6 1 4 2 9 1 0 0 5 0 0 2]
counts2/foc [1 0 5 1 2 3 2 1 6 4 3 2]
d binary 1
d clusters [['cr', 'cr', 'cr', 'cr', 'cr', 'cr', 'cf'], ['cr', 'cr', 'cr', 'cr', 'cf', 'cf', 'cf', 'cf', 'cf'], ['cr', 'cr', 'cr', 'cr', 'cr', 'cf', 'cf', 'cf', 'cf', 'cf', 'cf'], ['cr', 'cr', 'cr', 'cr', 'cr', 'cr', 'cr', 'cr', 'cr', 'cf', 'cf'], ['cr'], ['cr', 'cr', 'cf', 'cf'], ['cr', 'cr', 'cf'], ['cr', 'cf', 'cf', 'cf'], ['cf', 'cf', 'cf'], ['cf', 'cf'], ['cf', 'cf', 'cf', 'cf'], ['cf']]
d jsd 0.33358902600863366


In [None]:
# Speaker 1 pair, fine-tuned embeddings: general sample stacked first, speaker sample second.
ft_s1_labels = affinity_prop(ft_gs1_wembeddings, ft_s1_wembeddings)

# len(ft_gs1_wembeddings) marks where the first sample ends in the label vector.
bc_s1_ftw, D_s1_ftw, E_s1_ftw, cluster_list_s1_ftw = compute_binary_change(ft_s1_labels, len(ft_gs1_wembeddings))
jsd_s1_ftw, _, _ = compute_jsd(ft_s1_labels, len(ft_gs1_wembeddings))

print('s1 binary', bc_s1_ftw)
print('s1 clusters', cluster_list_s1_ftw)
print('s1 jsd', jsd_s1_ftw)

counts1/ref [ 6 10  6  2  1  5  0]
counts2/foc [1 1 7 6 3 8 4]
s1 binary 1
s1 clusters [['cr', 'cr', 'cr', 'cr', 'cr', 'cr', 'cf'], ['cr', 'cr', 'cr', 'cr', 'cr', 'cr', 'cf', 'cf', 'cf', 'cf', 'cf', 'cf', 'cf'], ['cr', 'cr', 'cr', 'cr', 'cr', 'cr', 'cr', 'cr', 'cr', 'cr', 'cf'], ['cr', 'cr', 'cr', 'cr', 'cr', 'cf', 'cf', 'cf', 'cf', 'cf', 'cf', 'cf', 'cf'], ['cr', 'cf', 'cf', 'cf'], ['cr', 'cr', 'cf', 'cf', 'cf', 'cf', 'cf', 'cf'], ['cf', 'cf', 'cf', 'cf']]
s1 jsd 0.42788168652240177


In [None]:
# Speaker 2 pair, fine-tuned embeddings: general sample stacked first, speaker sample second.
ft_s2_labels = affinity_prop(ft_gs2_wembeddings, ft_s2_wembeddings)

# len(ft_gs2_wembeddings) marks where the first sample ends in the label vector.
bc_s2_ftw, D_s2_ftw, E_s2_ftw, cluster_list_s2_ftw = compute_binary_change(ft_s2_labels, len(ft_gs2_wembeddings))
jsd_s2_ftw, _, _ = compute_jsd(ft_s2_labels, len(ft_gs2_wembeddings))

print('s2 binary', bc_s2_ftw)
print('s2 clusters', cluster_list_s2_ftw)
print('s2 jsd', jsd_s2_ftw)

counts1/ref [6 2 1 5 3 4 4 0 2 3]
counts2/foc [2 3 2 3 3 7 2 1 3 4]
s2 binary 1
s2 clusters [['cr', 'cr', 'cf', 'cf', 'cf'], ['cr', 'cr', 'cr', 'cf', 'cf', 'cf'], ['cr', 'cr', 'cr', 'cr', 'cr', 'cr', 'cf', 'cf'], ['cr', 'cr', 'cr', 'cr', 'cf', 'cf'], ['cr', 'cr', 'cr', 'cr', 'cf', 'cf', 'cf', 'cf', 'cf', 'cf', 'cf'], ['cr', 'cr', 'cf', 'cf', 'cf'], ['cr', 'cr', 'cr', 'cf', 'cf', 'cf', 'cf'], ['cr', 'cr', 'cr', 'cr', 'cr', 'cf', 'cf', 'cf'], ['cr', 'cf', 'cf'], ['cf']]
s2 jsd 0.23055585937245307


In [None]:
# Speaker 3 pair, fine-tuned embeddings: general sample stacked first, speaker sample second.
ft_s3_labels = affinity_prop(ft_gs3_wembeddings, ft_s3_wembeddings)

# len(ft_gs3_wembeddings) marks where the first sample ends in the label vector.
bc_s3_ftw, D_s3_ftw, E_s3_ftw, cluster_list_s3_ftw = compute_binary_change(ft_s3_labels, len(ft_gs3_wembeddings))
jsd_s3_ftw, _, _ = compute_jsd(ft_s3_labels, len(ft_gs3_wembeddings))

print('s3 binary', bc_s3_ftw)
print('s3 clusters', cluster_list_s3_ftw)
print('s3 jsd', jsd_s3_ftw)

counts1/ref [6 4 1 2 4 6 2 0 5 0]
counts2/foc [4 0 0 1 0 1 5 4 7 8]
s3 binary 1
s3 clusters [['cr', 'cr', 'cr', 'cr', 'cr', 'cr', 'cf', 'cf', 'cf', 'cf'], ['cr', 'cr', 'cr', 'cr'], ['cr', 'cr', 'cr', 'cr', 'cr', 'cr', 'cf'], ['cr'], ['cr', 'cr', 'cf', 'cf', 'cf', 'cf', 'cf'], ['cr', 'cr', 'cr', 'cr', 'cr', 'cf', 'cf', 'cf', 'cf', 'cf', 'cf', 'cf'], ['cr', 'cr', 'cf'], ['cr', 'cr', 'cr', 'cr'], ['cf', 'cf', 'cf', 'cf', 'cf', 'cf', 'cf', 'cf'], ['cf', 'cf', 'cf', 'cf']]
s3 jsd 0.5437547106095686


In [None]:
# Speaker 4 pair, fine-tuned embeddings: general sample stacked first, speaker sample second.
ft_s4_labels = affinity_prop(ft_gs4_wembeddings, ft_s4_wembeddings)

# len(ft_gs4_wembeddings) marks where the first sample ends in the label vector.
bc_s4_ftw, D_s4_ftw, E_s4_ftw, cluster_list_s4_ftw = compute_binary_change(ft_s4_labels, len(ft_gs4_wembeddings))
jsd_s4_ftw, _, _ = compute_jsd(ft_s4_labels, len(ft_gs4_wembeddings))

print('s4 binary', bc_s4_ftw)
print('s4 clusters', cluster_list_s4_ftw)
print('s4 jsd', jsd_s4_ftw)

counts1/ref [4 5 8 1 2 5 0 0 2 3 0]
counts2/foc [3 3 3 3 2 9 1 1 3 1 1]
s4 binary 1
s4 clusters [['cr', 'cr', 'cr', 'cr', 'cr', 'cr', 'cr', 'cr', 'cf', 'cf', 'cf'], ['cr', 'cr', 'cr', 'cr', 'cf', 'cf', 'cf'], ['cr', 'cr', 'cr', 'cr', 'cr', 'cf', 'cf', 'cf', 'cf', 'cf', 'cf', 'cf', 'cf', 'cf'], ['cr', 'cr', 'cr', 'cf'], ['cr', 'cr', 'cr', 'cr', 'cr', 'cf', 'cf', 'cf'], ['cr', 'cr', 'cf', 'cf'], ['cr', 'cr', 'cf', 'cf', 'cf'], ['cr', 'cf', 'cf', 'cf'], ['cf'], ['cf'], ['cf']]
s4 jsd 0.297477431172989


In [None]:
# Mean pairwise cosine distance between the two samples of every pair (fine-tuned embeddings).
print('dia', compute_apd(ft_tg1_wembeddings, ft_tg2_wembeddings))
print('s1', compute_apd(ft_gs1_wembeddings, ft_s1_wembeddings))
print('s2', compute_apd(ft_gs2_wembeddings, ft_s2_wembeddings))
print('s3', compute_apd(ft_gs3_wembeddings, ft_s3_wembeddings))
print('s4', compute_apd(ft_gs4_wembeddings, ft_s4_wembeddings))

dia 0.22757938878637252
s1 0.27165569436746945
s2 0.23531247244830306
s3 0.2575774477257153
s4 0.2940779636004815
