In [2]:
from sklearn.cluster import AffinityPropagation
aff = AffinityPropagation(affinity='precomputed', random_state=42, damping=0.505)

# Mount the Disk


In [3]:
from google.colab import drive
import os


drive.mount('/content/drive', force_remount=True)
folder_path = '/content/drive/My Drive/MSc Computer Science/Master Thesis/trial_base_embeddings'

for f in os.listdir(folder_path):
    print(f)

Mounted at /content/drive
tr_gs1_base_embeddings.npy
tr_gs2_base_embeddings.npy
tr_gs3_base_embeddings.npy
tr_gs4_base_embeddings.npy
tr_s1_base_embeddings.npy
tr_s2_base_embeddings.npy
tr_s3_base_embeddings.npy
tr_s4_base_embeddings.npy
tr_tg1_base_embeddings.npy
tr_tg2_base_embeddings.npy


# Helpers

In [5]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def cluster_embeddings_binary(embeddings1, embeddings2, aff_model, label1='cr', label2='cf', k=0, n=2):
    labels = [label1] * len(embeddings1) + [label2] * len(embeddings2)
    all_embeddings = np.vstack([embeddings1, embeddings2])

    sim_matrix = cosine_similarity(all_embeddings)
    aff_model.fit(sim_matrix)

    clusters = {}
    for idx, cluster_id in enumerate(aff_model.labels_):
        if cluster_id not in clusters:
            clusters[cluster_id] = []
        clusters[cluster_id].append(labels[idx])

    cluster_list = list(clusters.values())

    D, E = compute_sfd(cluster_list, label1, label2)

    binary_change = classify_change_binary(D, E, k=k, n=n)
    return binary_change, D, E, cluster_list

def compute_sfd(clusters, c1_label, c2_label):
    D = []
    E = []

    for cluster in clusters:
        count_c1 = sum(1 for u in cluster if u == c1_label)
        count_c2 = sum(1 for u in cluster if u == c2_label)
        D.append(count_c1)
        E.append(count_c2)
    return D, E

def classify_change_binary(D, E, k, n):
    for d_i, e_i in zip(D, E):
        if (d_i <= k and e_i >= n) or (d_i >= n and e_i <= k):
            return 1
    return 0

In [6]:
import numpy as np
from sklearn.cluster import AffinityPropagation
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import jensenshannon
from sklearn.preprocessing import StandardScaler

def cluster_and_compare_jsd(aff, embeddings1, embeddings2):
    stacked_embeddings = np.vstack([embeddings1, embeddings2])
    sim_matrix = cosine_similarity(stacked_embeddings)

    aff.fit(sim_matrix)

    all_labels = aff.labels_
    n1 = embeddings1.shape[0]

    labels1 = all_labels[:n1]
    labels2 = all_labels[n1:]

    all_cluster_ids = np.unique(all_labels)

    counts1 = np.array([np.sum(labels1 == cid) for cid in all_cluster_ids])
    counts2 = np.array([np.sum(labels2 == cid) for cid in all_cluster_ids])
    print('counts1', counts1)
    print('counts2', counts2)

    prob1 = counts1 / counts1.sum()
    prob2 = counts2 / counts2.sum()

    jsd = jensenshannon(prob1, prob2)
    return jsd, prob1, prob2


In [7]:
from scipy.spatial.distance import cdist
import numpy as np

def compute_apd(embeddings_r, embeddings_f):
    cdistances = cdist(embeddings_r, embeddings_f, metric='cosine')
    return cdistances.mean()

In [8]:
import numpy as np
import torch
from tqdm import tqdm

def generate_and_save_embeddings(
    sentences,
    model,
    tokenizer,
    filename="embeddings.npy",
    batch_size=16,
    device="cuda"
):
    model = model.to(device)
    model.eval()

    token_lengths = [len(tokenizer.tokenize(sent)) for sent in sentences]
    max_length = min(512, max(token_lengths))
    print(f"[INFO] Using max_length = {max_length} (based on longest tokenized sentence)")

    all_embeddings = []

    for i in tqdm(range(0, len(sentences), batch_size)):
        batch = sentences[i:i+batch_size]

        inputs = tokenizer(
            batch,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=max_length
        ).to(device)

        with torch.no_grad():
            outputs = model(**inputs)
            hidden_states = outputs.hidden_states

        selected_layers = torch.stack(hidden_states[8:11]).sum(dim=0) # layers 8, 9, and 10
        attention_mask = inputs["attention_mask"].unsqueeze(-1)
        masked_embeddings = selected_layers * attention_mask
        sum_embeddings = masked_embeddings.sum(dim=1)
        token_counts = attention_mask.sum(dim=1)
        mean_embeddings = sum_embeddings / token_counts

        all_embeddings.append(mean_embeddings.cpu())

    all_embeddings = torch.cat(all_embeddings, dim=0)
    np.save(filename, all_embeddings.numpy())


# Only preliminary - Do not run again!



## Load trial_data from excel

In [None]:
folder_path_trial = '/content/drive/My Drive/MSc Computer Science/Master Thesis/trial_data/'

for f in os.listdir(folder_path_trial):
    print(f)

diachronic.xlsx
speaker1.xlsx
speaker2.xlsx
speaker3.xlsx
speaker4.xlsx


In [None]:
import pandas as pd

trial_dia = pd.read_excel(folder_path_trial+'diachronic.xlsx')
tg1_usages = trial_dia[trial_dia['usage_id'].str.contains("tg1", na=False)]['text'].tolist()
tg2_usages = trial_dia[trial_dia['usage_id'].str.contains("tg2", na=False)]['text'].tolist()

trial_s1 = pd.read_excel(folder_path_trial+'speaker1.xlsx')
gs1_usages = trial_s1[trial_s1['usage_id'].str.contains("general", na=False)]['text'].tolist()
s1_usages = trial_s1[trial_s1['usage_id'].str.contains("speaker", na=False)]['text'].tolist()

trial_s2 = pd.read_excel(folder_path_trial+'speaker2.xlsx')
gs2_usages = trial_s2[trial_s2['usage_id'].str.contains("general", na=False)]['text'].tolist()
s2_usages = trial_s2[trial_s2['usage_id'].str.contains("speaker", na=False)]['text'].tolist()

trial_s3 = pd.read_excel(folder_path_trial+'speaker3.xlsx')
gs3_usages = trial_s3[trial_s3['usage_id'].str.contains("general", na=False)]['text'].tolist()
s3_usages = trial_s3[trial_s3['usage_id'].str.contains("speaker", na=False)]['text'].tolist()

trial_s4 = pd.read_excel(folder_path_trial+'speaker4.xlsx')
gs4_usages = trial_s4[trial_s4['usage_id'].str.contains("general", na=False)]['text'].tolist()
s4_usages = trial_s4[trial_s4['usage_id'].str.contains("speaker", na=False)]['text'].tolist()

In [None]:
print(len(tg1_usages))
print(len(tg2_usages))
print(len(gs1_usages))
print(len(s1_usages))
print(len(gs2_usages))
print(len(s2_usages))
print(len(gs3_usages))
print(len(s3_usages))
print(len(gs4_usages))
print(len(s4_usages))

30
30
30
30
30
30
30
30
30
30


## Generate embeddings and save

### PT: Embeddings trial_data

In [None]:
from transformers import AutoTokenizer, AutoModel
import torch

model_name = "NbAiLab/nb-bert-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, output_hidden_states=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/746 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

In [None]:
generate_and_save_embeddings(tg1_usages, model, tokenizer, filename="tr_tg1_base_embeddings.npy")
generate_and_save_embeddings(tg2_usages, model, tokenizer, filename="tr_tg2_base_embeddings.npy")
generate_and_save_embeddings(gs1_usages, model, tokenizer, filename="tr_gs1_base_embeddings.npy")
generate_and_save_embeddings(s1_usages, model, tokenizer, filename="tr_s1_base_embeddings.npy")
generate_and_save_embeddings(gs2_usages, model, tokenizer, filename="tr_gs2_base_embeddings.npy")
generate_and_save_embeddings(s2_usages, model, tokenizer, filename="tr_s2_base_embeddings.npy")
generate_and_save_embeddings(gs3_usages, model, tokenizer, filename="tr_gs3_base_embeddings.npy")
generate_and_save_embeddings(s3_usages, model, tokenizer, filename="tr_s3_base_embeddings.npy")
generate_and_save_embeddings(gs4_usages, model, tokenizer, filename="tr_gs4_base_embeddings.npy")
generate_and_save_embeddings(s4_usages, model, tokenizer, filename="tr_s4_base_embeddings.npy")

[INFO] Using max_length = 83 (based on longest tokenized sentence)


100%|██████████| 2/2 [00:00<00:00,  2.61it/s]


[INFO] Using max_length = 98 (based on longest tokenized sentence)


100%|██████████| 2/2 [00:00<00:00, 11.42it/s]


[INFO] Using max_length = 92 (based on longest tokenized sentence)


100%|██████████| 2/2 [00:00<00:00, 15.74it/s]


[INFO] Using max_length = 56 (based on longest tokenized sentence)


100%|██████████| 2/2 [00:00<00:00, 24.89it/s]


[INFO] Using max_length = 125 (based on longest tokenized sentence)


100%|██████████| 2/2 [00:00<00:00, 11.34it/s]


[INFO] Using max_length = 177 (based on longest tokenized sentence)


100%|██████████| 2/2 [00:00<00:00, 10.92it/s]


[INFO] Using max_length = 96 (based on longest tokenized sentence)


100%|██████████| 2/2 [00:00<00:00, 16.57it/s]


[INFO] Using max_length = 153 (based on longest tokenized sentence)


100%|██████████| 2/2 [00:00<00:00, 11.34it/s]


[INFO] Using max_length = 107 (based on longest tokenized sentence)


100%|██████████| 2/2 [00:00<00:00, 13.72it/s]


[INFO] Using max_length = 78 (based on longest tokenized sentence)


100%|██████████| 2/2 [00:00<00:00, 20.02it/s]


###FT: Embeddings trial_data

In [None]:
model_dir = "/content/drive/MyDrive/nb-bert-finetuned"

In [None]:
from transformers import AutoModel, AutoTokenizer

ft_tokenizer = AutoTokenizer.from_pretrained(model_dir)
ft_model = AutoModel.from_pretrained(model_dir, output_hidden_states=True)
ft_model.eval()

Some weights of BertModel were not initialized from the model checkpoint at /content/drive/MyDrive/nb-bert-finetuned and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(119547, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=Fals

In [None]:
generate_and_save_embeddings(tg1_usages, ft_model, ft_tokenizer, filename="ft_tg1_base_embeddings.npy")
generate_and_save_embeddings(tg2_usages, ft_model, ft_tokenizer, filename="ft_tg2_base_embeddings.npy")
generate_and_save_embeddings(gs1_usages, ft_model, ft_tokenizer, filename="ft_gs1_base_embeddings.npy")
generate_and_save_embeddings(s1_usages, ft_model, ft_tokenizer, filename="ft_s1_base_embeddings.npy")
generate_and_save_embeddings(gs2_usages, ft_model, ft_tokenizer, filename="ft_gs2_base_embeddings.npy")
generate_and_save_embeddings(s2_usages, ft_model, ft_tokenizer, filename="ft_s2_base_embeddings.npy")
generate_and_save_embeddings(gs3_usages, ft_model, ft_tokenizer, filename="ft_gs3_base_embeddings.npy")
generate_and_save_embeddings(s3_usages, ft_model, ft_tokenizer, filename="ft_s3_base_embeddings.npy")
generate_and_save_embeddings(gs4_usages, ft_model, ft_tokenizer, filename="ft_gs4_base_embeddings.npy")
generate_and_save_embeddings(s4_usages, ft_model, ft_tokenizer, filename="ft_s4_base_embeddings.npy")

[INFO] Using max_length = 83 (based on longest tokenized sentence)


100%|██████████| 2/2 [00:00<00:00,  8.43it/s]


[INFO] Using max_length = 98 (based on longest tokenized sentence)


100%|██████████| 2/2 [00:00<00:00, 11.26it/s]


[INFO] Using max_length = 92 (based on longest tokenized sentence)


100%|██████████| 2/2 [00:00<00:00, 13.56it/s]


[INFO] Using max_length = 56 (based on longest tokenized sentence)


100%|██████████| 2/2 [00:00<00:00, 19.95it/s]


[INFO] Using max_length = 125 (based on longest tokenized sentence)


100%|██████████| 2/2 [00:00<00:00,  9.13it/s]


[INFO] Using max_length = 177 (based on longest tokenized sentence)


100%|██████████| 2/2 [00:00<00:00,  9.06it/s]


[INFO] Using max_length = 96 (based on longest tokenized sentence)


100%|██████████| 2/2 [00:00<00:00, 13.06it/s]


[INFO] Using max_length = 153 (based on longest tokenized sentence)


100%|██████████| 2/2 [00:00<00:00,  9.32it/s]


[INFO] Using max_length = 107 (based on longest tokenized sentence)


100%|██████████| 2/2 [00:00<00:00, 11.42it/s]


[INFO] Using max_length = 78 (based on longest tokenized sentence)


100%|██████████| 2/2 [00:00<00:00, 17.24it/s]


# Pre-trained Embeddings

## Load embeddings from Disk

In [10]:
import numpy as np

folder_path = '/content/drive/My Drive/MSc Computer Science/Master Thesis/trial_base_embeddings'

tg1_embeddings = np.load(folder_path+'/tr_tg1_base_embeddings.npy')
tg2_embeddings = np.load(folder_path+'/tr_tg2_base_embeddings.npy')

s1_embeddings = np.load(folder_path+'/tr_s1_base_embeddings.npy')
gs1_embeddings = np.load(folder_path+'/tr_gs1_base_embeddings.npy')

s2_embeddings = np.load(folder_path+'/tr_s2_base_embeddings.npy')
gs2_embeddings = np.load(folder_path+'/tr_gs2_base_embeddings.npy')

s3_embeddings = np.load(folder_path+'/tr_s3_base_embeddings.npy')
gs3_embeddings = np.load(folder_path+'/tr_gs3_base_embeddings.npy')

s4_embeddings = np.load(folder_path+'/tr_s4_base_embeddings.npy')
gs4_embeddings = np.load(folder_path+'/tr_gs4_base_embeddings.npy')

In [11]:
tg1_embeddings.shape, tg2_embeddings.shape

((30, 768), (30, 768))

In [12]:
s1_embeddings.shape, gs1_embeddings.shape

((30, 768), (30, 768))

In [13]:
s2_embeddings.shape, gs2_embeddings.shape

((30, 768), (30, 768))

In [14]:
s3_embeddings.shape, gs3_embeddings.shape

((30, 768), (30, 768))

In [15]:
s4_embeddings.shape, gs4_embeddings.shape

((30, 768), (30, 768))

## Cluster and graded LSC (JSD)

In [16]:
pt_d_jsd, _, _ = cluster_and_compare_jsd(aff, tg1_embeddings, tg2_embeddings)
pt_d_jsd
# np.float64(0.28762842937480565)

counts1 [6 3 6 1 1 7 2 4 0]
counts2 [6 9 2 0 0 5 3 4 1]


np.float64(0.28762842937480565)

In [17]:
pt_s1_jsd, _, _ = cluster_and_compare_jsd(aff, s1_embeddings, gs1_embeddings)
pt_s1_jsd
# np.float64(0.5024905442575683)

counts1 [ 2  2  2 10  7  6  0  1  0  0  0  0]
counts2 [0 1 1 3 3 5 1 2 5 1 3 5]


np.float64(0.5024905442575683)

In [18]:
pt_s2_jsd, _, _ = cluster_and_compare_jsd(aff, s2_embeddings, gs2_embeddings)
pt_s2_jsd
# np.float64(0.29100035275770453)

counts1 [ 1  6 10  3  4  3  3  0]
counts2 [0 5 7 1 8 2 3 4]


np.float64(0.29100035275770453)

In [19]:
pt_s3_jsd, _, _ = cluster_and_compare_jsd(aff, s3_embeddings, gs3_embeddings)
pt_s3_jsd
# np.float64(0.31448108089053484)

counts1 [ 5  4  8 10  1  0  2  0]
counts2 [4 1 7 5 6 1 5 1]


np.float64(0.31448108089053484)

In [20]:
pt_s4_jsd, _, _  = cluster_and_compare_jsd(aff, s4_embeddings, gs4_embeddings)
pt_s4_jsd
# np.float64(0.39782780640754384)

counts1 [ 2  5 14  4  1  0  0  0  4  0]
counts2 [1 3 5 6 0 4 1 1 7 2]


np.float64(0.39782780640754384)

## Cluster and binary LSC

In [21]:
binary_change_d, D_d, E_d, clusters_d = cluster_embeddings_binary(tg1_embeddings, tg2_embeddings, aff_model=aff, k=0, n=2)
binary_change_d, D_d, E_d, clusters_d

(0,
 [3, 6, 7, 6, 2, 4, 1, 1, 0],
 [9, 2, 5, 6, 3, 4, 0, 0, 1],
 [['cr', 'cr', 'cr', 'cf', 'cf', 'cf', 'cf', 'cf', 'cf', 'cf', 'cf', 'cf'],
  ['cr', 'cr', 'cr', 'cr', 'cr', 'cr', 'cf', 'cf'],
  ['cr', 'cr', 'cr', 'cr', 'cr', 'cr', 'cr', 'cf', 'cf', 'cf', 'cf', 'cf'],
  ['cr', 'cr', 'cr', 'cr', 'cr', 'cr', 'cf', 'cf', 'cf', 'cf', 'cf', 'cf'],
  ['cr', 'cr', 'cf', 'cf', 'cf'],
  ['cr', 'cr', 'cr', 'cr', 'cf', 'cf', 'cf', 'cf'],
  ['cr'],
  ['cr'],
  ['cf']])

(0,
 [3, 6, 7, 6, 2, 4, 1, 1, 0],
 [9, 2, 5, 6, 3, 4, 0, 0, 1],
 [['cr', 'cr', 'cr', 'cf', 'cf', 'cf', 'cf', 'cf', 'cf', 'cf', 'cf', 'cf'],
  ['cr', 'cr', 'cr', 'cr', 'cr', 'cr', 'cf', 'cf'],
  ['cr', 'cr', 'cr', 'cr', 'cr', 'cr', 'cr', 'cf', 'cf', 'cf', 'cf', 'cf'],
  ['cr', 'cr', 'cr', 'cr', 'cr', 'cr', 'cf', 'cf', 'cf', 'cf', 'cf', 'cf'],
  ['cr', 'cr', 'cf', 'cf', 'cf'],
  ['cr', 'cr', 'cr', 'cr', 'cf', 'cf', 'cf', 'cf'],
  ['cr'],
  ['cr'],
  ['cf']])

In [22]:
binary_change_s1, D_s1, E_s1, clusters_s1 = cluster_embeddings_binary(s1_embeddings, gs1_embeddings, aff_model=aff, k=0, n=2)
binary_change_s1, D_s1, E_s1, clusters_s1

(1,
 [2, 10, 7, 6, 2, 2, 1, 0, 0, 0, 0, 0],
 [0, 3, 3, 5, 1, 1, 2, 5, 5, 1, 3, 1],
 [['cr', 'cr'],
  ['cr',
   'cr',
   'cr',
   'cr',
   'cr',
   'cr',
   'cr',
   'cr',
   'cr',
   'cr',
   'cf',
   'cf',
   'cf'],
  ['cr', 'cr', 'cr', 'cr', 'cr', 'cr', 'cr', 'cf', 'cf', 'cf'],
  ['cr', 'cr', 'cr', 'cr', 'cr', 'cr', 'cf', 'cf', 'cf', 'cf', 'cf'],
  ['cr', 'cr', 'cf'],
  ['cr', 'cr', 'cf'],
  ['cr', 'cf', 'cf'],
  ['cf', 'cf', 'cf', 'cf', 'cf'],
  ['cf', 'cf', 'cf', 'cf', 'cf'],
  ['cf'],
  ['cf', 'cf', 'cf'],
  ['cf']])

(1,
 [2, 10, 7, 6, 2, 2, 1, 0, 0, 0, 0, 0],
 [0, 3, 3, 5, 1, 1, 2, 5, 5, 1, 3, 1],
 [['cr', 'cr'],
  ['cr',
   'cr',
   'cr',
   'cr',
   'cr',
   'cr',
   'cr',
   'cr',
   'cr',
   'cr',
   'cf',
   'cf',
   'cf'],
  ['cr', 'cr', 'cr', 'cr', 'cr', 'cr', 'cr', 'cf', 'cf', 'cf'],
  ['cr', 'cr', 'cr', 'cr', 'cr', 'cr', 'cf', 'cf', 'cf', 'cf', 'cf'],
  ['cr', 'cr', 'cf'],
  ['cr', 'cr', 'cf'],
  ['cr', 'cf', 'cf'],
  ['cf', 'cf', 'cf', 'cf', 'cf'],
  ['cf', 'cf', 'cf', 'cf', 'cf'],
  ['cf'],
  ['cf', 'cf', 'cf'],
  ['cf']])

In [23]:
binary_change_s2, D_s2, E_s2, clusters_s2 = cluster_embeddings_binary(s2_embeddings, gs2_embeddings, aff_model=aff, k=0, n=2)
binary_change_s2, D_s2, E_s2, clusters_s2

(1,
 [4, 6, 10, 1, 3, 3, 3, 0],
 [8, 5, 7, 0, 2, 3, 1, 4],
 [['cr', 'cr', 'cr', 'cr', 'cf', 'cf', 'cf', 'cf', 'cf', 'cf', 'cf', 'cf'],
  ['cr', 'cr', 'cr', 'cr', 'cr', 'cr', 'cf', 'cf', 'cf', 'cf', 'cf'],
  ['cr',
   'cr',
   'cr',
   'cr',
   'cr',
   'cr',
   'cr',
   'cr',
   'cr',
   'cr',
   'cf',
   'cf',
   'cf',
   'cf',
   'cf',
   'cf',
   'cf'],
  ['cr'],
  ['cr', 'cr', 'cr', 'cf', 'cf'],
  ['cr', 'cr', 'cr', 'cf', 'cf', 'cf'],
  ['cr', 'cr', 'cr', 'cf'],
  ['cf', 'cf', 'cf', 'cf']])

(1,
 [4, 6, 10, 1, 3, 3, 3, 0],
 [8, 5, 7, 0, 2, 3, 1, 4],
 [['cr', 'cr', 'cr', 'cr', 'cf', 'cf', 'cf', 'cf', 'cf', 'cf', 'cf', 'cf'],
  ['cr', 'cr', 'cr', 'cr', 'cr', 'cr', 'cf', 'cf', 'cf', 'cf', 'cf'],
  ['cr',
   'cr',
   'cr',
   'cr',
   'cr',
   'cr',
   'cr',
   'cr',
   'cr',
   'cr',
   'cf',
   'cf',
   'cf',
   'cf',
   'cf',
   'cf',
   'cf'],
  ['cr'],
  ['cr', 'cr', 'cr', 'cf', 'cf'],
  ['cr', 'cr', 'cr', 'cf', 'cf', 'cf'],
  ['cr', 'cr', 'cr', 'cf'],
  ['cf', 'cf', 'cf', 'cf']])

In [24]:
binary_change_s3, D_s3, E_s3, clusters_s3 = cluster_embeddings_binary(s3_embeddings, gs3_embeddings, aff_model=aff, k=0, n=2)
binary_change_s3, D_s3, E_s3, clusters_s3

(0,
 [8, 5, 4, 10, 1, 2, 0, 0],
 [7, 4, 1, 5, 6, 5, 1, 1],
 [['cr',
   'cr',
   'cr',
   'cr',
   'cr',
   'cr',
   'cr',
   'cr',
   'cf',
   'cf',
   'cf',
   'cf',
   'cf',
   'cf',
   'cf'],
  ['cr', 'cr', 'cr', 'cr', 'cr', 'cf', 'cf', 'cf', 'cf'],
  ['cr', 'cr', 'cr', 'cr', 'cf'],
  ['cr',
   'cr',
   'cr',
   'cr',
   'cr',
   'cr',
   'cr',
   'cr',
   'cr',
   'cr',
   'cf',
   'cf',
   'cf',
   'cf',
   'cf'],
  ['cr', 'cf', 'cf', 'cf', 'cf', 'cf', 'cf'],
  ['cr', 'cr', 'cf', 'cf', 'cf', 'cf', 'cf'],
  ['cf'],
  ['cf']])

(0,
 [8, 5, 4, 10, 1, 2, 0, 0],
 [7, 4, 1, 5, 6, 5, 1, 1],
 [['cr',
   'cr',
   'cr',
   'cr',
   'cr',
   'cr',
   'cr',
   'cr',
   'cf',
   'cf',
   'cf',
   'cf',
   'cf',
   'cf',
   'cf'],
  ['cr', 'cr', 'cr', 'cr', 'cr', 'cf', 'cf', 'cf', 'cf'],
  ['cr', 'cr', 'cr', 'cr', 'cf'],
  ['cr',
   'cr',
   'cr',
   'cr',
   'cr',
   'cr',
   'cr',
   'cr',
   'cr',
   'cr',
   'cf',
   'cf',
   'cf',
   'cf',
   'cf'],
  ['cr', 'cf', 'cf', 'cf', 'cf', 'cf', 'cf'],
  ['cr', 'cr', 'cf', 'cf', 'cf', 'cf', 'cf'],
  ['cf'],
  ['cf']])

In [25]:
binary_change_s4, D_s4, E_s4, clusters_s4 = cluster_embeddings_binary(s4_embeddings, gs4_embeddings, aff_model=aff, k=0, n=2)
binary_change_s4, D_s4, E_s4, clusters_s4

(1,
 [5, 14, 2, 4, 4, 1, 0, 0, 0, 0],
 [3, 5, 1, 6, 7, 0, 4, 1, 1, 2],
 [['cr', 'cr', 'cr', 'cr', 'cr', 'cf', 'cf', 'cf'],
  ['cr',
   'cr',
   'cr',
   'cr',
   'cr',
   'cr',
   'cr',
   'cr',
   'cr',
   'cr',
   'cr',
   'cr',
   'cr',
   'cr',
   'cf',
   'cf',
   'cf',
   'cf',
   'cf'],
  ['cr', 'cr', 'cf'],
  ['cr', 'cr', 'cr', 'cr', 'cf', 'cf', 'cf', 'cf', 'cf', 'cf'],
  ['cr', 'cr', 'cr', 'cr', 'cf', 'cf', 'cf', 'cf', 'cf', 'cf', 'cf'],
  ['cr'],
  ['cf', 'cf', 'cf', 'cf'],
  ['cf'],
  ['cf'],
  ['cf', 'cf']])

(1,
 [5, 14, 2, 4, 4, 1, 0, 0, 0, 0],
 [3, 5, 1, 6, 7, 0, 4, 1, 1, 2],
 [['cr', 'cr', 'cr', 'cr', 'cr', 'cf', 'cf', 'cf'],
  ['cr',
   'cr',
   'cr',
   'cr',
   'cr',
   'cr',
   'cr',
   'cr',
   'cr',
   'cr',
   'cr',
   'cr',
   'cr',
   'cr',
   'cf',
   'cf',
   'cf',
   'cf',
   'cf'],
  ['cr', 'cr', 'cf'],
  ['cr', 'cr', 'cr', 'cr', 'cf', 'cf', 'cf', 'cf', 'cf', 'cf'],
  ['cr', 'cr', 'cr', 'cr', 'cf', 'cf', 'cf', 'cf', 'cf', 'cf', 'cf'],
  ['cr'],
  ['cf', 'cf', 'cf', 'cf'],
  ['cf'],
  ['cf'],
  ['cf', 'cf']])

## APD

In [26]:
# Pre-trained embeddings
print('dia', compute_apd(tg1_embeddings, tg2_embeddings))
print('s1', compute_apd(gs1_embeddings, s1_embeddings))
print('s2', compute_apd(gs2_embeddings, s2_embeddings))
print('s3', compute_apd(gs3_embeddings, s3_embeddings))
print('s4', compute_apd(gs4_embeddings, s4_embeddings))

dia 0.20095693515997842
s1 0.21373258741378562
s2 0.21893470070994883
s3 0.20913882849119797
s4 0.2134432500436538


# Fine-tuned Embeddings

## Load embeddings from Disk

In [27]:
import numpy as np

folder_path_ft = '/content/drive/MyDrive/MSc Computer Science/Master Thesis/ft-trial-base-embeddings/'

ft_tg1_embeddings = np.load(folder_path_ft+"ft_tg1_base_embeddings.npy")
ft_tg2_embeddings = np.load(folder_path_ft+"ft_tg2_base_embeddings.npy")
ft_gs1_embeddings = np.load(folder_path_ft+"ft_gs1_base_embeddings.npy")
ft_s1_embeddings = np.load(folder_path_ft+"ft_s1_base_embeddings.npy")
ft_gs2_embeddings = np.load(folder_path_ft+"ft_gs2_base_embeddings.npy")
ft_s2_embeddings = np.load(folder_path_ft+"ft_s2_base_embeddings.npy")
ft_gs3_embeddings = np.load(folder_path_ft+"ft_gs3_base_embeddings.npy")
ft_s3_embeddings = np.load(folder_path_ft+"ft_s3_base_embeddings.npy")
ft_gs4_embeddings = np.load(folder_path_ft+"ft_gs4_base_embeddings.npy")
ft_s4_embeddings = np.load(folder_path_ft+"ft_s4_base_embeddings.npy")

## Cluster and binary LSC

In [28]:
bc_d_ft, D_d_ft, E_d_ft, clusters_d_ft = cluster_embeddings_binary(ft_tg1_embeddings, ft_tg2_embeddings, aff_model=aff)
bc_d_ft, D_d_ft, E_d_ft, clusters_d_ft

(0,
 [2, 7, 4, 4, 4, 1, 6, 1, 1],
 [8, 3, 5, 3, 4, 4, 1, 0, 2],
 [['cr', 'cr', 'cf', 'cf', 'cf', 'cf', 'cf', 'cf', 'cf', 'cf'],
  ['cr', 'cr', 'cr', 'cr', 'cr', 'cr', 'cr', 'cf', 'cf', 'cf'],
  ['cr', 'cr', 'cr', 'cr', 'cf', 'cf', 'cf', 'cf', 'cf'],
  ['cr', 'cr', 'cr', 'cr', 'cf', 'cf', 'cf'],
  ['cr', 'cr', 'cr', 'cr', 'cf', 'cf', 'cf', 'cf'],
  ['cr', 'cf', 'cf', 'cf', 'cf'],
  ['cr', 'cr', 'cr', 'cr', 'cr', 'cr', 'cf'],
  ['cr'],
  ['cr', 'cf', 'cf']])

(0,
 [2, 7, 4, 4, 4, 1, 6, 1, 1],
 [8, 3, 5, 3, 4, 4, 1, 0, 2],
 [['cr', 'cr', 'cf', 'cf', 'cf', 'cf', 'cf', 'cf', 'cf', 'cf'],
  ['cr', 'cr', 'cr', 'cr', 'cr', 'cr', 'cr', 'cf', 'cf', 'cf'],
  ['cr', 'cr', 'cr', 'cr', 'cf', 'cf', 'cf', 'cf', 'cf'],
  ['cr', 'cr', 'cr', 'cr', 'cf', 'cf', 'cf'],
  ['cr', 'cr', 'cr', 'cr', 'cf', 'cf', 'cf', 'cf'],
  ['cr', 'cf', 'cf', 'cf', 'cf'],
  ['cr', 'cr', 'cr', 'cr', 'cr', 'cr', 'cf'],
  ['cr'],
  ['cr', 'cf', 'cf']])

In [29]:
bc_s1_ft, D_s1_ft, E_s1_ft, clusters_s1_ft = cluster_embeddings_binary(ft_gs1_embeddings, ft_s1_embeddings, aff_model=aff)
bc_s1_ft, D_s1_ft, E_s1_ft, clusters_s1_ft

(1,
 [6, 3, 6, 4, 2, 1, 4, 2, 1, 1, 0],
 [10, 7, 0, 3, 5, 0, 0, 1, 2, 0, 2],
 [['cr',
   'cr',
   'cr',
   'cr',
   'cr',
   'cr',
   'cf',
   'cf',
   'cf',
   'cf',
   'cf',
   'cf',
   'cf',
   'cf',
   'cf',
   'cf'],
  ['cr', 'cr', 'cr', 'cf', 'cf', 'cf', 'cf', 'cf', 'cf', 'cf'],
  ['cr', 'cr', 'cr', 'cr', 'cr', 'cr'],
  ['cr', 'cr', 'cr', 'cr', 'cf', 'cf', 'cf'],
  ['cr', 'cr', 'cf', 'cf', 'cf', 'cf', 'cf'],
  ['cr'],
  ['cr', 'cr', 'cr', 'cr'],
  ['cr', 'cr', 'cf'],
  ['cr', 'cf', 'cf'],
  ['cr'],
  ['cf', 'cf']])

(1,
 [6, 3, 6, 4, 2, 1, 4, 2, 1, 1, 0],
 [10, 7, 0, 3, 5, 0, 0, 1, 2, 0, 2],
 [['cr',
   'cr',
   'cr',
   'cr',
   'cr',
   'cr',
   'cf',
   'cf',
   'cf',
   'cf',
   'cf',
   'cf',
   'cf',
   'cf',
   'cf',
   'cf'],
  ['cr', 'cr', 'cr', 'cf', 'cf', 'cf', 'cf', 'cf', 'cf', 'cf'],
  ['cr', 'cr', 'cr', 'cr', 'cr', 'cr'],
  ['cr', 'cr', 'cr', 'cr', 'cf', 'cf', 'cf'],
  ['cr', 'cr', 'cf', 'cf', 'cf', 'cf', 'cf'],
  ['cr'],
  ['cr', 'cr', 'cr', 'cr'],
  ['cr', 'cr', 'cf'],
  ['cr', 'cf', 'cf'],
  ['cr'],
  ['cf', 'cf']])

In [30]:
bc_s2_ft, D_s2_ft, E_s2_ft, clusters_s2_ft = cluster_embeddings_binary(ft_gs2_embeddings, ft_s2_embeddings, aff_model=aff)
bc_s2_ft, D_s2_ft, E_s2_ft, clusters_s2_ft

(1,
 [1, 5, 4, 4, 4, 10, 2, 0],
 [3, 0, 2, 5, 12, 6, 1, 1],
 [['cr', 'cf', 'cf', 'cf'],
  ['cr', 'cr', 'cr', 'cr', 'cr'],
  ['cr', 'cr', 'cr', 'cr', 'cf', 'cf'],
  ['cr', 'cr', 'cr', 'cr', 'cf', 'cf', 'cf', 'cf', 'cf'],
  ['cr',
   'cr',
   'cr',
   'cr',
   'cf',
   'cf',
   'cf',
   'cf',
   'cf',
   'cf',
   'cf',
   'cf',
   'cf',
   'cf',
   'cf',
   'cf'],
  ['cr',
   'cr',
   'cr',
   'cr',
   'cr',
   'cr',
   'cr',
   'cr',
   'cr',
   'cr',
   'cf',
   'cf',
   'cf',
   'cf',
   'cf',
   'cf'],
  ['cr', 'cr', 'cf'],
  ['cf']])

In [31]:
bc_s3_ft, D_s3_ft, E_s3_ft, clusters_s3_ft = cluster_embeddings_binary(ft_gs3_embeddings, ft_s3_embeddings, aff_model=aff)
bc_s3_ft, D_s3_ft, E_s3_ft, clusters_s3_ft

(1,
 [7, 3, 3, 5, 6, 4, 1, 1],
 [5, 0, 8, 10, 1, 3, 0, 3],
 [['cr', 'cr', 'cr', 'cr', 'cr', 'cr', 'cr', 'cf', 'cf', 'cf', 'cf', 'cf'],
  ['cr', 'cr', 'cr'],
  ['cr', 'cr', 'cr', 'cf', 'cf', 'cf', 'cf', 'cf', 'cf', 'cf', 'cf'],
  ['cr',
   'cr',
   'cr',
   'cr',
   'cr',
   'cf',
   'cf',
   'cf',
   'cf',
   'cf',
   'cf',
   'cf',
   'cf',
   'cf',
   'cf'],
  ['cr', 'cr', 'cr', 'cr', 'cr', 'cr', 'cf'],
  ['cr', 'cr', 'cr', 'cr', 'cf', 'cf', 'cf'],
  ['cr'],
  ['cr', 'cf', 'cf', 'cf']])

In [32]:
bc_s4_ft, D_s4_ft, E_s4_ft, clusters_s4_ft = cluster_embeddings_binary(ft_gs4_embeddings, ft_s4_embeddings, aff_model=aff)
bc_s4_ft, D_s4_ft, E_s4_ft, clusters_s4_ft

(1,
 [4, 5, 2, 7, 3, 4, 4, 1],
 [8, 0, 0, 3, 3, 0, 14, 2],
 [['cr', 'cr', 'cr', 'cr', 'cf', 'cf', 'cf', 'cf', 'cf', 'cf', 'cf', 'cf'],
  ['cr', 'cr', 'cr', 'cr', 'cr'],
  ['cr', 'cr'],
  ['cr', 'cr', 'cr', 'cr', 'cr', 'cr', 'cr', 'cf', 'cf', 'cf'],
  ['cr', 'cr', 'cr', 'cf', 'cf', 'cf'],
  ['cr', 'cr', 'cr', 'cr'],
  ['cr',
   'cr',
   'cr',
   'cr',
   'cf',
   'cf',
   'cf',
   'cf',
   'cf',
   'cf',
   'cf',
   'cf',
   'cf',
   'cf',
   'cf',
   'cf',
   'cf',
   'cf'],
  ['cr', 'cf', 'cf']])

(1,
 [4, 5, 2, 7, 3, 4, 4, 1],
 [8, 0, 0, 3, 3, 0, 14, 2],
 [['cr', 'cr', 'cr', 'cr', 'cf', 'cf', 'cf', 'cf', 'cf', 'cf', 'cf', 'cf'],
  ['cr', 'cr', 'cr', 'cr', 'cr'],
  ['cr', 'cr'],
  ['cr', 'cr', 'cr', 'cr', 'cr', 'cr', 'cr', 'cf', 'cf', 'cf'],
  ['cr', 'cr', 'cr', 'cf', 'cf', 'cf'],
  ['cr', 'cr', 'cr', 'cr'],
  ['cr',
   'cr',
   'cr',
   'cr',
   'cf',
   'cf',
   'cf',
   'cf',
   'cf',
   'cf',
   'cf',
   'cf',
   'cf',
   'cf',
   'cf',
   'cf',
   'cf',
   'cf'],
  ['cr', 'cf', 'cf']])

## Cluster and graded LSC (JSD)

In [37]:
d_ft_jsd, _, _  = cluster_and_compare_jsd(aff, ft_tg1_embeddings, ft_tg2_embeddings)
d_ft_jsd
# np.float64(0.3338064898588424)

counts1 [4 4 7 1 6 4 1 1 2]
counts2 [3 5 3 0 1 4 2 4 8]


np.float64(0.3338064898588424)

In [38]:
ft_s1_jsd, _, _  = cluster_and_compare_jsd(aff, ft_gs1_embeddings, ft_s1_embeddings)
ft_s1_jsd
# np.float64(0.44921871324465645

counts1 [3 1 2 4 1 6 0 4 2 6 1]
counts2 [ 7  0  1  0  0  0  2  3  5 10  2]


np.float64(0.44921871324465645)

In [39]:
ft_s2_jsd, _, _  = cluster_and_compare_jsd(aff, ft_gs2_embeddings, ft_s2_embeddings)
ft_s2_jsd
# np.float64(0.3616129244248057

counts1 [ 4  5  2  0  4  4  1 10]
counts2 [ 2  0  1  1 12  5  3  6]


np.float64(0.3616129244248057)

In [40]:
ft_s3_jsd, _, _= cluster_and_compare_jsd(aff, ft_gs3_embeddings, ft_s3_embeddings)
ft_s3_jsd
# np.float64(0.3546079707822841

counts1 [6 3 7 1 1 5 3 4]
counts2 [ 1  0  5  0  3 10  8  3]


np.float64(0.3546079707822841)

In [41]:
ft_s4_jsd, _, _ = cluster_and_compare_jsd(aff, ft_gs4_embeddings, ft_s4_embeddings)
ft_s4_jsd
# np.float64(0.45164293828621116

counts1 [5 2 4 7 1 4 4 3]
counts2 [ 0  0  0  3  2  8 14  3]


np.float64(0.45164293828621116)

## APD

In [33]:
# Fine-tuned embeddings
print('dia', compute_apd(ft_tg1_embeddings, ft_tg2_embeddings))
print('s1', compute_apd(ft_gs1_embeddings, ft_s1_embeddings))
print('s2', compute_apd(ft_gs2_embeddings, ft_s2_embeddings))
print('s3', compute_apd(ft_gs3_embeddings, ft_s3_embeddings))
print('s4', compute_apd(ft_gs4_embeddings, ft_s4_embeddings))

dia 0.2342694398817323
s1 0.2447261262153447
s2 0.2526628800535178
s3 0.24530404686352136
s4 0.25772409003511876


# Ranks

In [34]:
# pt_d_tuple = ('pt_dia', pt_d_jsd)
pt_s1_tuple = ('pt_s1', pt_s1_jsd)
pt_s2_tuple = ('pt_s2', pt_s2_jsd)
pt_s3_tuple = ('pt_s3', pt_s3_jsd)
pt_s4_tuple = ('pt_s4', pt_s4_jsd)

pt_tuples = [pt_s1_tuple, pt_s2_tuple, pt_s3_tuple, pt_s4_tuple]
pt_tuples

[('pt_s1', np.float64(0.5024905442575683)),
 ('pt_s2', np.float64(0.29100035275770453)),
 ('pt_s3', np.float64(0.31448108089053484)),
 ('pt_s4', np.float64(0.39782780640754384))]

In [35]:
import operator

pt_ranked = sorted(pt_tuples, key=operator.itemgetter(1), reverse=True)

for tup in pt_ranked:
    print(tup)

('pt_s1', np.float64(0.5024905442575683))
('pt_s4', np.float64(0.39782780640754384))
('pt_s3', np.float64(0.31448108089053484))
('pt_s2', np.float64(0.29100035275770453))


In [44]:
ft_s1_tuple = ('ft_s1', ft_s1_jsd)
ft_s2_tuple = ('ft_s2', ft_s2_jsd)
ft_s3_tuple = ('ft_s3', ft_s3_jsd)
ft_s4_tuple = ('ft_s4', ft_s4_jsd)

ft_tuples = [ft_s1_tuple, ft_s2_tuple, ft_s3_tuple, ft_s4_tuple]
ft_tuples

[('ft_s1', np.float64(0.44921871324465645)),
 ('ft_s2', np.float64(0.3616129244248057)),
 ('ft_s3', np.float64(0.3546079707822841)),
 ('ft_s4', np.float64(0.45164293828621116))]

In [43]:
import operator

ft_ranked = sorted(ft_tuples, key=operator.itemgetter(1), reverse=True)

for tup in ft_ranked:
    print(tup)

('ft_s4', np.float64(0.45164293828621116))
('ft_s1', np.float64(0.44921871324465645))
('ft_s2', np.float64(0.3616129244248057))
('ft_s3', np.float64(0.3546079707822841))
