In [None]:
!pip install mlrose-hiive joblib==1.2.0

Collecting mlrose-hiive
  Downloading mlrose_hiive-2.2.4.tar.gz (49 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/49.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.7/49.7 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting joblib==1.2.0
  Downloading joblib-1.2.0-py3-none-any.whl.metadata (5.3 kB)
Downloading joblib-1.2.0-py3-none-any.whl (297 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m298.0/298.0 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: mlrose-hiive
  Building wheel for mlrose-hiive (setup.py) ... [?25l[?25hdone
  Created wheel for mlrose-hiive: filename=mlrose_hiive-2.2.4-py3-none-any.whl size=98335 sha256=1e3853767aa6ac770987d86ac2b5a33d93b8c1e7c31f48d574acd33dad94efad
  Stored in directory: /root/.cache/pip/wheels/7d/37/ec/8f4f01ed6712f4784ba7bcb6c666a6a20e

In [None]:
from google.colab import drive
import os

# Mount Google Drive; force_remount=True remounts even when a mount already exists.
drive.mount('/content/drive', force_remount=True)

# Directory holding the annotation workbooks (note: no trailing slash here).
folder_path = '/content/drive/My Drive/MSc Computer Science/Master Thesis/Annotations'

# Print every entry of the annotation directory, one per line.
for entry_name in os.listdir(folder_path):
    print(entry_name)

Mounted at /content/drive
SA_Annotations_temporal-comparison.xlsx
SA_Annotations_temporal-comparison_r3.xlsx
Speaker1_annotation_r2.xlsx
Speaker1_annotation_r3.xlsx
Speaker_annotation_r4.xlsx
t_s1_annotation_r1.xlsx
t_s1_annotation_r2.xlsx
t_s2_annotations_r1.xlsx
t_s2_annotation_r2.xlsx
t_s2_annotation_r3.xlsx
t_s2_annotation_r4.xlsx
t_s2_annotation_r6.xlsx
t_s1_annotation_r4_do not use.xlsx
t_s3_annotation_r2.xlsx
t_s3_annotation_r3.xlsx
t_s4_annotation_r1.xlsx
t_s4_annotation_r2.xlsx
t_s4_annotation_r3.xlsx
t_s3_annotation_r4.xlsx
t_s3_annnotation_r1.xlsx
t_s3_annotation_r2_ny.xlsx
t_s2_annotation_r5.xlsx
t_s3_annotation_r3_ny.xlsx
t_s4_annotation_r4.xlsx
Speaker1_annotations.xlsx
t_s1_annotation_r4.xlsx
SA_Annotations_temporal-comparison_r4.xlsx
agreements_author.xlsx
agreements.xlsx
t_s3_annotation_r4_ny.xlsx


# Helpers

In [None]:
from scipy.spatial.distance import jensenshannon

def compute_sfd(clusters, c1_label, c2_label):
    """Count, for every cluster, how many members mention each of two labels.

    Args:
        clusters: Iterable of clusters; each cluster is an iterable of usage
            identifiers supporting the ``in`` operator (strings in this notebook).
        c1_label: Marker searched for inside each identifier (first group).
        c2_label: Marker searched for inside each identifier (second group).

    Returns:
        Tuple ``(D, E)`` of two lists with one entry per cluster: ``D[i]`` is the
        number of members of cluster ``i`` containing ``c1_label`` and ``E[i]`` the
        number containing ``c2_label``.
    """
    D, E = [], []
    for members in clusters:
        # Containment test (``label in usage``), not a suffix/equality match.
        D.append(sum(c1_label in usage for usage in members))
        E.append(sum(c2_label in usage for usage in members))
    return D, E

def classify_change_binary(D, E, k=0, n=1):
    """Binary change label derived from two per-cluster frequency lists.

    Args:
        D: Per-cluster counts for the first group.
        E: Per-cluster counts for the second group (paired with ``D`` via ``zip``).
        k: Upper bound for a cluster to count as (nearly) absent in a group.
        n: Lower bound for a cluster to count as present in a group.

    Returns:
        ``1`` if at least one cluster has ``<= k`` usages in one group and ``>= n``
        in the other, otherwise ``0``.
    """
    changed = any(
        (d_i <= k and e_i >= n) or (d_i >= n and e_i <= k)
        for d_i, e_i in zip(D, E)
    )
    return 1 if changed else 0

def normalize_distribution(dist):
    """Scale a frequency distribution so that its entries sum to 1.

    Args:
        dist: Iterable of numeric frequencies. Any iterable is accepted, including
            one-shot generators (the input is materialized once before summing).

    Returns:
        List of ``f / total`` for every frequency ``f``; an empty input gives ``[]``.

    Raises:
        ZeroDivisionError: If the input is non-empty but its frequencies sum to
            zero, because no probability distribution can be formed from it.
    """
    # Materialize first: summing a generator would exhaust it and silently return [].
    values = list(dist)
    total = sum(values)
    if values and total == 0:
        raise ZeroDivisionError(
            "cannot normalize a distribution whose frequencies sum to zero"
        )
    return [f / total for f in values]

def classify_change_graded(D, E):
    """Graded change score between two per-cluster frequency lists.

    Both lists are normalized with ``normalize_distribution`` and compared with
    SciPy's ``jensenshannon`` (called with its default arguments).

    Args:
        D: Per-cluster counts for the first group.
        E: Per-cluster counts for the second group.

    Returns:
        The value returned by ``jensenshannon`` for the two normalized lists.
    """
    return jensenshannon(normalize_distribution(D), normalize_distribution(E))

In [None]:
import mlrose_hiive as mlrose
import pandas as pd
import numpy as np

def create_fitness_fn(graph, shift=2.5):
    """Build the clustering loss for ``graph`` together with its vertex order.

    Edge weights are shifted by ``shift``; an edge then counts as "negative" when
    its shifted weight is below zero and as "positive" otherwise. The loss of a
    cluster assignment is the total absolute weight of negative edges inside a
    cluster plus the total weight of positive edges between clusters.

    The edge list, weights and vertex order are resolved once, when this function
    is called; later changes to ``graph`` are not seen by the returned closure.

    Args:
        graph: Object exposing ``get_edges()`` (iterable of ``(u, v)`` pairs) and
            ``get_edge_weight(u, v)``.
        shift: Value subtracted from every weight before its sign is inspected.
            # NOTE(review): 2.5 looks like the midpoint of a 1-4 annotation scale - confirm.

    Returns:
        Tuple ``(fitness, vertices)``. ``vertices`` lists every edge endpoint once,
        in first-seen edge order; ``fitness(state)`` expects ``state[i]`` to be the
        cluster id of ``vertices[i]`` and returns the loss described above.
    """
    edges = list(graph.get_edges())

    # First-seen order is deterministic; iterating a set of strings is not
    # stable across interpreter sessions, which made seeded runs irreproducible.
    vertices = list(dict.fromkeys(vertex for u, v in edges for vertex in (u, v)))
    index_of = {vertex: i for i, vertex in enumerate(vertices)}

    # Parse the weights once instead of on every fitness evaluation.
    shifted_edges = []
    for u, v in edges:
        original_weight = graph.get_edge_weight(u, v)

        # Edges without a usable numeric weight do not contribute to the loss.
        if pd.isna(original_weight):
            continue
        try:
            original_weight = float(original_weight)
        except (ValueError, TypeError):
            continue

        shifted_edges.append((index_of[u], index_of[v], original_weight - shift))

    def fitness(state):
        positive_between = 0
        negative_within = 0

        for i, j, shifted_weight in shifted_edges:
            same_cluster = state[i] == state[j]

            if same_cluster and shifted_weight < 0:
                negative_within += abs(shifted_weight)
            elif not same_cluster and shifted_weight >= 0:
                positive_between += shifted_weight

        return positive_between + negative_within

    return fitness, vertices

def optimize_with_mlrose(graph, num_clusters=3, max_iter=10000, seed=42):
    """Run one simulated-annealing search for a clustering with at most ``num_clusters`` clusters.

    Args:
        graph: Graph accepted by ``create_fitness_fn``.
        num_clusters: Number of distinct cluster ids a vertex may take
            (passed to mlrose as ``max_val``).
        max_iter: Iteration cap handed to ``mlrose.simulated_annealing``.
        seed: Seed applied to NumPy's global RNG before the search; ``None``
            leaves the RNG state untouched. The default reproduces the previously
            hard-coded seed.
            # NOTE(review): assumes mlrose draws from NumPy's global RNG, as the original seeding implied.

    Returns:
        Tuple ``(clusters, loss)``: the non-empty clusters as a list of sets of
        vertices, and the best loss reported by mlrose.
    """
    if seed is not None:
        np.random.seed(seed)

    fitness_fn, vertices = create_fitness_fn(graph)
    custom_fitness = mlrose.CustomFitness(fitness_fn)

    # maximize=False: the fitness is a loss to be minimized.
    problem = mlrose.DiscreteOpt(length=len(vertices),
                                 fitness_fn=custom_fitness,
                                 maximize=False,
                                 max_val=num_clusters)

    best_state, best_loss, _ = mlrose.simulated_annealing(problem,
                                                          schedule=mlrose.ExpDecay(),
                                                          max_attempts=100,
                                                          max_iters=max_iter,
                                                          init_state=None)

    # best_state[i] is the cluster id assigned to vertices[i].
    clusters = [set() for _ in range(num_clusters)]
    for vertex, cluster_id in zip(vertices, best_state):
        clusters[cluster_id].add(vertex)

    non_empty_clusters = [cluster for cluster in clusters if cluster]

    return non_empty_clusters, best_loss


def optimize_clusters_mlrose(graph, max_clusters=30, runs_per_k=20, max_iter=10000, seed=42):
    """Search over cluster counts and random restarts, keeping the lowest-loss clustering.

    For every ``k`` in ``2..max_clusters`` the annealing search is restarted
    ``runs_per_k`` times. Each restart receives its own seed (``seed`` plus a
    running counter): re-seeding every restart with the same value made all
    restarts for a given ``k`` repeat the same search.

    Args:
        graph: Graph accepted by ``create_fitness_fn``.
        max_clusters: Largest number of clusters to try (inclusive).
        runs_per_k: Number of restarts per cluster count.
        max_iter: Iteration cap for every individual search.
        seed: Base seed for the restarts; ``None`` disables seeding entirely.

    Returns:
        Tuple ``(best_clusters, best_loss)``; ``best_clusters`` is ``None`` when no
        run produced a loss below infinity (e.g. ``max_clusters < 2``).
    """
    best_clusters = None
    best_loss = float('inf')
    run_index = 0

    for k in range(2, max_clusters + 1):
        for _ in range(runs_per_k):
            run_seed = None if seed is None else seed + run_index
            run_index += 1

            clusters, loss = optimize_with_mlrose(graph, num_clusters=k,
                                                  max_iter=max_iter, seed=run_seed)
            if loss < best_loss:
                best_loss = loss
                best_clusters = clusters

    return best_clusters, best_loss

In [None]:
class Graph:
    """Undirected graph kept as adjacency lists, with optional per-edge weights."""

    def __init__(self, gdict=None):
        """Start from ``gdict`` (vertex -> list of neighbours) or from an empty graph."""
        # A caller-supplied dict is stored by reference, not copied.
        self.gdict = {} if gdict is None else gdict
        # Maps a sorted (u, v) tuple to the weight most recently stored for that edge.
        self.edge_weights = {}

    def add_vertex(self, vertices):
        """Register every vertex in ``vertices`` that is not known yet; no edges are added."""
        for vertex in vertices:
            self.gdict.setdefault(vertex, [])

    def add_edge(self, v, neighbors, weight=None):
        """Connect ``v`` with each vertex in ``neighbors``, in both directions.

        Unknown vertices are created on the fly and existing adjacency entries are
        not duplicated. When ``weight`` is not ``None`` it is stored for every
        created pair, replacing any weight stored earlier for that pair.
        """
        for neighbor in neighbors:
            # v -> neighbor
            forward = self.gdict.setdefault(v, [])
            if neighbor not in forward:
                forward.append(neighbor)

            # neighbor -> v, since the graph is undirected
            backward = self.gdict.setdefault(neighbor, [])
            if v not in backward:
                backward.append(v)

            if weight is not None:
                # Sorted key: (u, v) and (v, u) address the same weight.
                self.edge_weights[tuple(sorted([v, neighbor]))] = weight

    def get_edges(self):
        """Return each undirected edge once, as a sorted tuple, in first-seen order."""
        unique_pairs = {}
        for vertex, adjacent in self.gdict.items():
            for neighbor in adjacent:
                unique_pairs.setdefault(tuple(sorted([vertex, neighbor])), None)
        return list(unique_pairs)

    def get_vertices(self):
        """Return all vertices in insertion order."""
        return list(self.gdict)

    def get_edge_weight(self, u1, u2):
        """Return the weight stored for the edge between ``u1`` and ``u2``, or ``None``."""
        key = tuple(sorted([u1, u2]))
        return self.edge_weights.get(key)

    def add_edges_from_annotation_dict(self, annotation_dict, round_number):
        """Add one weighted edge per annotated pair stored under ``round_number``.

        ``annotation_dict`` maps a round to ``{(u1, u2): weight}``; a round that is
        not present is ignored.
        """
        if round_number not in annotation_dict:
            return
        for (u1, u2), weight in annotation_dict[round_number].items():
            self.add_edge(u1, [u2], weight)


In [None]:
def add_annotation_pair(annotation_dict, round_num, usage1, usage2, annotation):
    """Store one annotation for an unordered usage pair under its round.

    Args:
        annotation_dict: Mapping ``round -> {(usage_a, usage_b): annotation}``,
            updated in place.
        round_num: Round under which the annotation is filed.
        usage1: First usage identifier.
        usage2: Second usage identifier.
        annotation: Judgment to store; replaces an earlier value for the same pair.
    """
    round_annotations = annotation_dict.setdefault(round_num, {})

    # The key is the *sorted* pair, so (a, b) and (b, a) address the same entry.
    key = tuple(sorted([usage1, usage2]))
    round_annotations[key] = annotation


def add_annotations_from_df_to_dict(df, annotation_dict, round_number):
    """Copy the annotations of one round from a DataFrame into ``annotation_dict``.

    Args:
        df: DataFrame with the columns ``round``, ``usage_1_id``, ``usage_2_id``
            and ``anno``.
        annotation_dict: Mapping updated in place through ``add_annotation_pair``.
        round_number: Only rows whose ``round`` equals this value are copied.
    """
    for _, record in df.iterrows():
        # Skip rows belonging to other rounds (a missing round never matches).
        if record['round'] != round_number:
            continue
        add_annotation_pair(
            annotation_dict,
            round_number,
            record['usage_1_id'],
            record['usage_2_id'],
            record['anno'],
        )

def add_annotation_from_df_to_dict_and_graph(df, graph, annotation_dict, round_number):
    """Load one round of annotations into ``annotation_dict`` and mirror it into ``graph``.

    First the rows of ``df`` for ``round_number`` are stored in ``annotation_dict``;
    then every pair filed under that round is added to ``graph`` as a weighted edge.
    """
    add_annotations_from_df_to_dict(
        df=df,
        annotation_dict=annotation_dict,
        round_number=round_number,
    )
    graph.add_edges_from_annotation_dict(annotation_dict, round_number)


# Diachronic

In [None]:
# Fresh, empty containers for the diachronic comparison.
clusters_dict, annotation_dict = {}, {}
usage_graph = Graph()

In [None]:
# Join directory and file name explicitly so the result does not depend on whether
# folder_path ends with a slash (the file name no longer carries a leading '/').
filename = 'SA_Annotations_temporal-comparison_r4.xlsx'
df = pd.read_excel(os.path.join(folder_path, filename))
df

Unnamed: 0,round,usage_1_id,usage_1_text,usage_2_id,usage_2_text,anno,Unnamed: 6
0,1.0,u5543_tg2,,u456_tg1,,1,
1,1.0,u571_tg1,,u260_tg2,,3,
2,1.0,u260_tg2,,u122_tg1,,3,
3,1.0,u122_tg1,,u1728_tg1,,1,
4,1.0,u1728_tg1,,u5543_tg2,,1,
...,...,...,...,...,...,...,...
150,4.0,u2006_tg2,Vi har hele tiden vært opptatt av at det er øk...,u6067_tg2,"Ja, det er vel det samme, at man kan sikkert f...",3,
151,4.0,u5238_tg2,Jenters og gutters rett til utdanning er et vi...,u6033_tg2,Bærekraftig bruk av naturlige økosystemer er e...,2,
152,4.0,u1126_tg1,Jeg vil også understreke at det å nå de ambisi...,u3456_tg2,"Avslutningsvis vil jeg gjerne dra fram, som je...",2,
153,4.0,u456_tg1,Møtet vil sette fokus på nordisk bærekraftig u...,u571_tg1,Det skal legge opp til en økonomisk politikk s...,4,


In [None]:
# Feed annotation rounds 1-4 into the dictionary and the usage graph, in order.
for round_number in range(1, 5):
    add_annotation_from_df_to_dict_and_graph(df, usage_graph, annotation_dict, round_number)
annotation_dict

{1: {('u456_tg1', 'u5543_tg2'): 1,
  ('u260_tg2', 'u571_tg1'): 3,
  ('u122_tg1', 'u260_tg2'): 3,
  ('u122_tg1', 'u1728_tg1'): 1,
  ('u1728_tg1', 'u5543_tg2'): 1},
 2: {('u1728_tg1', 'u2619_tg1'): 2,
  ('u260_tg2', 'u2619_tg1'): 3,
  ('u102_tg1', 'u456_tg1'): 3,
  ('u102_tg1', 'u122_tg1'): 3,
  ('u1728_tg1', 'u3037_tg1'): 1,
  ('u122_tg1', 'u3037_tg1'): 2,
  ('u1126_tg1', 'u1728_tg1'): 2,
  ('u1126_tg1', 'u260_tg2'): 4,
  ('u1003_tg1', 'u456_tg1'): 4,
  ('u1003_tg1', 'u260_tg2'): 3,
  ('u1728_tg1', 'u914_tg1'): 2,
  ('u260_tg2', 'u914_tg1'): 3,
  ('u1728_tg1', 'u3016_tg1'): 1,
  ('u122_tg1', 'u3016_tg1'): 3,
  ('u1728_tg1', 'u419_tg1'): 1,
  ('u122_tg1', 'u419_tg1'): 1,
  ('u2771_tg1', 'u456_tg1'): 3,
  ('u122_tg1', 'u2771_tg1'): 3,
  ('u3033_tg1', 'u456_tg1'): 3,
  ('u3033_tg1', 'u5543_tg2'): 4,
  ('u1728_tg1', 'u2233_tg1'): 2,
  ('u122_tg1', 'u2233_tg1'): 3,
  ('u356_tg1', 'u456_tg1'): 1,
  ('u356_tg1', 'u5543_tg2'): 4,
  ('u2418_tg1', 'u456_tg1'): 2,
  ('u122_tg1', 'u2418_tg1'): 3,
 

In [None]:
# Search 2..30 clusters with 20 annealing runs each and keep the lowest-loss clustering.
dia_clusters, dia_best_loss = optimize_clusters_mlrose(
    usage_graph,
    max_clusters=30,
    runs_per_k=20,
)

print(f"Best clusters: {dia_clusters}")
print(f"Best loss: {dia_best_loss}")

Best clusters: [{'u5745_tg2'}, {'u204_tg2'}, {'u4931_tg2'}, {'u102_tg1', 'u122_tg1', 'u2298_tg1', 'u2932_tg1', 'u7308_tg2', 'u2418_tg1', 'u2233_tg1', 'u3016_tg1', 'u3456_tg2', 'u2771_tg1', 'u4467_tg2', 'u6033_tg2'}, {'u1143_tg2', 'u4597_tg2', 'u244_tg2', 'u4139_tg2', 'u712_tg2', 'u1728_tg1', 'u2006_tg2', 'u6067_tg2', 'u1628_tg2', 'u130_tg1', 'u912_tg2'}, {'u1791_tg2', 'u2465_tg1', 'u6074_tg2', 'u814_tg1', 'u2619_tg1', 'u914_tg1', 'u383_tg1', 'u1126_tg1', 'u260_tg2', 'u1828_tg2', 'u5865_tg2', 'u2661_tg1', 'u2872_tg1'}, {'u571_tg1', 'u456_tg1', 'u217_tg2', 'u108_tg1', 'u952_tg1', 'u2069_tg1', 'u767_tg2', 'u895_tg1', 'u1905_tg2', 'u4837_tg2', 'u1003_tg1', 'u839_tg2', 'u5238_tg2'}, {'u2232_tg1', 'u3033_tg1', 'u419_tg1', 'u5323_tg2', 'u356_tg1', 'u5543_tg2'}, {'u2253_tg2', 'u3037_tg1'}]
Best loss: 9.0


Best clusters: [{'u5745_tg2'}, {'u204_tg2'}, {'u4931_tg2'}, {'u102_tg1', 'u122_tg1', 'u2298_tg1', 'u2932_tg1', 'u7308_tg2', 'u2418_tg1', 'u2233_tg1', 'u3016_tg1', 'u3456_tg2', 'u2771_tg1', 'u4467_tg2', 'u6033_tg2'}, {'u1143_tg2', 'u4597_tg2', 'u244_tg2', 'u4139_tg2', 'u712_tg2', 'u1728_tg1', 'u2006_tg2', 'u6067_tg2', 'u1628_tg2', 'u130_tg1', 'u912_tg2'}, {'u1791_tg2', 'u2465_tg1', 'u6074_tg2', 'u814_tg1', 'u2619_tg1', 'u914_tg1', 'u383_tg1', 'u1126_tg1', 'u260_tg2', 'u1828_tg2', 'u5865_tg2', 'u2661_tg1', 'u2872_tg1'}, {'u571_tg1', 'u456_tg1', 'u217_tg2', 'u108_tg1', 'u952_tg1', 'u2069_tg1', 'u767_tg2', 'u895_tg1', 'u1905_tg2', 'u4837_tg2', 'u1003_tg1', 'u839_tg2', 'u5238_tg2'}, {'u2232_tg1', 'u3033_tg1', 'u419_tg1', 'u5323_tg2', 'u356_tg1', 'u5543_tg2'}, {'u2253_tg2', 'u3037_tg1'}]
Best loss: 9.0

In [None]:
# Hard-coded snapshot of the clustering printed by the optimisation cell above (loss 9.0),
# so the cells below can be run without repeating the slow search.
dia_clusters = [{'u5745_tg2'}, {'u204_tg2'}, {'u4931_tg2'}, {'u102_tg1', 'u122_tg1', 'u2298_tg1', 'u2932_tg1', 'u7308_tg2', 'u2418_tg1', 'u2233_tg1', 'u3016_tg1', 'u3456_tg2', 'u2771_tg1', 'u4467_tg2', 'u6033_tg2'}, {'u1143_tg2', 'u4597_tg2', 'u244_tg2', 'u4139_tg2', 'u712_tg2', 'u1728_tg1', 'u2006_tg2', 'u6067_tg2', 'u1628_tg2', 'u130_tg1', 'u912_tg2'}, {'u1791_tg2', 'u2465_tg1', 'u6074_tg2', 'u814_tg1', 'u2619_tg1', 'u914_tg1', 'u383_tg1', 'u1126_tg1', 'u260_tg2', 'u1828_tg2', 'u5865_tg2', 'u2661_tg1', 'u2872_tg1'}, {'u571_tg1', 'u456_tg1', 'u217_tg2', 'u108_tg1', 'u952_tg1', 'u2069_tg1', 'u767_tg2', 'u895_tg1', 'u1905_tg2', 'u4837_tg2', 'u1003_tg1', 'u839_tg2', 'u5238_tg2'}, {'u2232_tg1', 'u3033_tg1', 'u419_tg1', 'u5323_tg2', 'u356_tg1', 'u5543_tg2'}, {'u2253_tg2', 'u3037_tg1'}]


In [None]:
# Per-cluster counts: D_d counts usage ids containing "tg1", E_d those containing "tg2".
D_d, E_d = compute_sfd(dia_clusters, "tg1", "tg2")
D_d, E_d # expected: ([0, 0, 0, 8, 2, 8, 7, 4, 1], [1, 1, 1, 4, 9, 5, 6, 2, 1])

([0, 0, 0, 8, 2, 8, 7, 4, 1], [1, 1, 1, 4, 9, 5, 6, 2, 1])

In [None]:
# Binary change with the defaults k=0, n=1: 1 if some cluster has 0 usages in one group and >= 1 in the other.
classify_change_binary(D_d, E_d) # expected: 1

1

In [None]:
# Graded change: jensenshannon() applied to the normalized tg1 and tg2 cluster distributions.
dia_graded = classify_change_graded(D_d, E_d) # expected: np.float64(0.31346758098987804)
dia_graded

np.float64(0.31346758098987804)

# Speaker 1

In [None]:
import pandas as pd

# NOTE(review): this cell re-imports pandas and redefines folder_path (now with a
# trailing slash); both are already set up in earlier cells.
folder_path = '/content/drive/My Drive/MSc Computer Science/Master Thesis/Annotations/'
filename = 't_s1_annotation_r4.xlsx'

# Annotation workbook for speaker 1.
df_s1 = pd.read_excel(f"{folder_path}{filename}")
df_s1

Unnamed: 0,round,usage_1_id,usage_1_text,usage_2_id,usage_2_text,anno
0,1.0,u5543_general,,u35_speaker,,1.0
1,1.0,u35_speaker,,u28_speaker,,1.0
2,1.0,u28_speaker,,u244_general,,1.0
3,1.0,u244_general,,u23_speaker,,1.0
4,1.0,u23_speaker,,u8_speaker,,1.0
...,...,...,...,...,...,...
148,4.0,u1905_general,Disse er: Et grønt Norden: Det er et ønske om ...,u2006_general,Det er bred enighet om at den nåværende avtale...,3.0
149,4.0,u1143_general,"Om vi spiser mindre kjøtt, men betaler mer for...",u5865_general,Jeg er veldig stolt av å være med og sikre et ...,2.0
150,4.0,u1828_general,Stortinget satte som forutsetning for videre v...,u7_speaker,I et bærekraftperspektiv snakker man om økonom...,3.0
151,4.0,u204_general,Det skal lanseres en bærekraftig innvandringsp...,u712_general,Vi synes det er positivt at norsk industri skr...,2.0


In [None]:
# Fresh containers for speaker 1.
s1_clusters_dict, s1_annotation_dict = {}, {}
s1_usage_graph = Graph()

# Feed annotation rounds 1-4 into the dictionary and the usage graph, in order.
for round_number in range(1, 5):
    add_annotation_from_df_to_dict_and_graph(df_s1, s1_usage_graph, s1_annotation_dict, round_number)

# Search 2..30 clusters with 20 annealing runs each and keep the lowest-loss clustering.
s1_clusters, s1_best_loss = optimize_clusters_mlrose(
    s1_usage_graph,
    max_clusters=30,
    runs_per_k=20,
)

print(f"Best clusters: {s1_clusters}")
print(f"Best loss: {s1_best_loss}")

Best clusters: [{'u23_speaker', 'u4139_general', 'u839_general', 'u6_speaker', 'u166_speaker'}, {'u107_speaker', 'u28_speaker', 'u26_speaker', 'u5543_general', 'u59_speaker', 'u260_general', 'u195_speaker', 'u1791_general'}, {'u154_speaker', 'u217_general', 'u5865_general', 'u35_speaker', 'u3456_general', 'u4467_general', 'u6074_general', 'u50_speaker'}, {'u912_general', 'u767_general', 'u204_general', 'u4931_general'}, {'u62_speaker', 'u70_speaker', 'u5323_general', 'u1828_general', 'u1143_general', 'u108_speaker', 'u163_speaker', 'u1905_general', 'u7_speaker', 'u186_speaker', 'u2006_general', 'u244_general', 'u56_speaker', 'u114_speaker', 'u151_speaker', 'u4837_general', 'u55_speaker'}, {'u5238_general', 'u143_speaker', 'u2253_general', 'u8_speaker', 'u129_speaker', 'u4464_general', 'u1628_general', 'u189_speaker', 'u22_speaker'}, {'u57_speaker', 'u712_general', 'u139_speaker', 'u5745_general', 'u4597_general', 'u6067_general'}, {'u188_speaker', 'u173_speaker', 'u6033_general'}]
Best

In [None]:
# Hard-coded snapshot of the clustering printed by the optimisation cell above,
# so the cells below can be run without repeating the slow search.
s1_clusters = [{'u23_speaker', 'u4139_general', 'u839_general', 'u6_speaker', 'u166_speaker'}, {'u107_speaker', 'u28_speaker', 'u26_speaker', 'u5543_general', 'u59_speaker', 'u260_general', 'u195_speaker', 'u1791_general'}, {'u154_speaker', 'u217_general', 'u5865_general', 'u35_speaker', 'u3456_general', 'u4467_general', 'u6074_general', 'u50_speaker'}, {'u912_general', 'u767_general', 'u204_general', 'u4931_general'}, {'u62_speaker', 'u70_speaker', 'u5323_general', 'u1828_general', 'u1143_general', 'u108_speaker', 'u163_speaker', 'u1905_general', 'u7_speaker', 'u186_speaker', 'u2006_general', 'u244_general', 'u56_speaker', 'u114_speaker', 'u151_speaker', 'u4837_general', 'u55_speaker'}, {'u5238_general', 'u143_speaker', 'u2253_general', 'u8_speaker', 'u129_speaker', 'u4464_general', 'u1628_general', 'u189_speaker', 'u22_speaker'}, {'u57_speaker', 'u712_general', 'u139_speaker', 'u5745_general', 'u4597_general', 'u6067_general'}, {'u188_speaker', 'u173_speaker', 'u6033_general'}]


In [None]:
# Per-cluster counts: D_s1 counts usage ids containing "general", E_s1 those containing "speaker".
D_s1, E_s1 = compute_sfd(s1_clusters, "general", "speaker")
D_s1, E_s1
# expected: ([2, 3, 5, 4, 7, 4, 4, 1], [3, 5, 3, 0, 10, 5, 2, 2])

([2, 3, 5, 4, 7, 4, 4, 1], [3, 5, 3, 0, 10, 5, 2, 2])

In [None]:
# Binary change with the defaults k=0, n=1: 1 if some cluster has 0 usages in one group and >= 1 in the other.
classify_change_binary(D_s1, E_s1) # expected: 1

1

In [None]:
# Graded change: jensenshannon() applied to the normalized "general" and "speaker" cluster distributions.
s1_graded = classify_change_graded(D_s1, E_s1)
s1_graded # expected: np.float64(0.2648922456511161)

np.float64(0.2648922456511161)

# Speaker 2

In [None]:
import pandas as pd

# NOTE(review): this cell re-imports pandas and redefines folder_path (with a
# trailing slash); both are already set up in earlier cells.
folder_path = '/content/drive/My Drive/MSc Computer Science/Master Thesis/Annotations/'
filename = 't_s2_annotation_r5.xlsx'

# Annotation workbook for speaker 2.
df_s2 = pd.read_excel(f"{folder_path}{filename}")
df_s2

Unnamed: 0,round,usage_1_id,usage_1_text,usage_2_id,usage_2_text,anno,Unnamed: 6
0,1.0,u1143_general,,u70_speaker,,3.0,
1,1.0,u70_speaker,,u130_speaker,,4.0,
2,1.0,u130_speaker,,u5238_general,,2.0,
3,1.0,u5238_general,,u2278_general,,2.0,
4,1.0,u2278_general,,u28_speaker,,3.0,
...,...,...,...,...,...,...,...
134,5.0,u4931_general,Bærekraftig vekst og høyere sysselsetting er f...,u5323_general,Samtidig snakker de om miljøvern og bærekrafti...,2.0,
135,5.0,u4_speaker,Slike planer og oppfølgingen av disse er med p...,u6_speaker,"Når det gjelder fiskeriforvaltningen, er bærek...",3.0,
136,5.0,u1791_general,Nordisk Ministerråd har vedtatt «Ny nordisk da...,u35_speaker,Til slutt: For Kristelig Folkeparti er det pri...,2.0,
137,5.0,u130_speaker,De var for så vidt enig i at vi skal forvalte ...,u4139_general,Forvaltningen av de levende marine ressursene ...,4.0,


In [None]:
# Fresh containers for speaker 2.
s2_clusters_dict, s2_annotation_dict = {}, {}
s2_usage_graph = Graph()

# Feed annotation rounds 1-5 into the dictionary and the usage graph, in order.
for round_number in range(1, 6):
    add_annotation_from_df_to_dict_and_graph(df_s2, s2_usage_graph, s2_annotation_dict, round_number)

# Search 2..30 clusters with 20 annealing runs each and keep the lowest-loss clustering.
s2_clusters, s2_best_loss = optimize_clusters_mlrose(
    s2_usage_graph,
    max_clusters=30,
    runs_per_k=20,
)

print(f"Best clusters: {s2_clusters}")
print(f"Best loss: {s2_best_loss}")

Best clusters: [{'u260_general', 'u108_speaker', 'u1628_general', 'u53_speaker'}, {'u69_speaker', 'u131_speaker', 'u11_speaker', 'u6_speaker', 'u3_speaker', 'u1805_general', 'u4139_general', 'u4_speaker', 'u244_general', 'u89_speaker', 'u5323_general', 'u3679_general', 'u767_general', 'u130_speaker', 'u83_speaker', 'u26_speaker', 'u91_speaker', 'u70_speaker', 'u1828_general', 'u2006_general', 'u22_speaker'}, {'u712_general', 'u0_speaker', 'u2253_general', 'u5543_general', 'u4827_general', 'u1791_general'}, {'u35_speaker', 'u5238_general', 'u4464_general', 'u71_speaker'}, {'u103_speaker', 'u4597_general'}, {'u4467_general'}, {'u62_speaker', 'u125_speaker'}, {'u1905_general', 'u25_speaker'}, {'u217_general'}, {'u77_speaker', 'u4931_general'}, {'u135_speaker'}, {'u839_general'}, {'u912_general', 'u1143_general', 'u29_speaker', 'u4837_general'}, {'u64_speaker'}, {'u57_speaker', 'u75_speaker', 'u204_general', 'u2278_general', 'u3456_general', 'u27_speaker', 'u3436_general', 'u28_speaker'}]


Best clusters: [{'u260_general', 'u108_speaker', 'u1628_general', 'u53_speaker'}, {'u69_speaker', 'u131_speaker', 'u11_speaker', 'u6_speaker', 'u3_speaker', 'u1805_general', 'u4139_general', 'u4_speaker', 'u244_general', 'u89_speaker', 'u5323_general', 'u3679_general', 'u767_general', 'u130_speaker', 'u83_speaker', 'u26_speaker', 'u91_speaker', 'u70_speaker', 'u1828_general', 'u2006_general', 'u22_speaker'}, {'u712_general', 'u0_speaker', 'u2253_general', 'u5543_general', 'u4827_general', 'u1791_general'}, {'u35_speaker', 'u5238_general', 'u4464_general', 'u71_speaker'}, {'u103_speaker', 'u4597_general'}, {'u4467_general'}, {'u62_speaker', 'u125_speaker'}, {'u1905_general', 'u25_speaker'}, {'u217_general'}, {'u77_speaker', 'u4931_general'}, {'u135_speaker'}, {'u839_general'}, {'u912_general', 'u1143_general', 'u29_speaker', 'u4837_general'}, {'u64_speaker'}, {'u57_speaker', 'u75_speaker', 'u204_general', 'u2278_general', 'u3456_general', 'u27_speaker', 'u3436_general', 'u28_speaker'}]
Best loss: 6.5

In [None]:
# Hard-coded snapshot of the clustering printed by the optimisation cell above (loss 6.5),
# so the cells below can be run without repeating the slow search.
s2_clusters = [{'u260_general', 'u108_speaker', 'u1628_general', 'u53_speaker'}, {'u69_speaker', 'u131_speaker', 'u11_speaker', 'u6_speaker', 'u3_speaker', 'u1805_general', 'u4139_general', 'u4_speaker', 'u244_general', 'u89_speaker', 'u5323_general', 'u3679_general', 'u767_general', 'u130_speaker', 'u83_speaker', 'u26_speaker', 'u91_speaker', 'u70_speaker', 'u1828_general', 'u2006_general', 'u22_speaker'}, {'u712_general', 'u0_speaker', 'u2253_general', 'u5543_general', 'u4827_general', 'u1791_general'}, {'u35_speaker', 'u5238_general', 'u4464_general', 'u71_speaker'}, {'u103_speaker', 'u4597_general'}, {'u4467_general'}, {'u62_speaker', 'u125_speaker'}, {'u1905_general', 'u25_speaker'}, {'u217_general'}, {'u77_speaker', 'u4931_general'}, {'u135_speaker'}, {'u839_general'}, {'u912_general', 'u1143_general', 'u29_speaker', 'u4837_general'}, {'u64_speaker'}, {'u57_speaker', 'u75_speaker', 'u204_general', 'u2278_general', 'u3456_general', 'u27_speaker', 'u3436_general', 'u28_speaker'}]


In [None]:
# Per-cluster counts: D_s2 counts usage ids containing "general", E_s2 those containing "speaker".
D_s2, E_s2 = compute_sfd(s2_clusters, "general", "speaker")
D_s2, E_s2
# expected: ([2, 8, 5, 2, 1, 1, 0, 1, 1, 1, 0, 1, 3, 0, 4], [2, 13, 1, 2, 1, 0, 2, 1, 0, 1, 1, 0, 1, 1, 4])

([2, 8, 5, 2, 1, 1, 0, 1, 1, 1, 0, 1, 3, 0, 4],
 [2, 13, 1, 2, 1, 0, 2, 1, 0, 1, 1, 0, 1, 1, 4])

In [None]:
# Binary change with the defaults k=0, n=1: 1 if some cluster has 0 usages in one group and >= 1 in the other.
classify_change_binary(D_s2, E_s2) # expected: 1

1

In [None]:
# Graded change: jensenshannon() applied to the normalized "general" and "speaker" cluster distributions.
s2_graded = classify_change_graded(D_s2, E_s2)
s2_graded # expected: np.float64(0.3519419416573242)

np.float64(0.3519419416573242)

# Speaker 3

In [None]:
import pandas as pd

# NOTE(review): this cell re-imports pandas and redefines folder_path (with a
# trailing slash); both are already set up in earlier cells.
folder_path = '/content/drive/My Drive/MSc Computer Science/Master Thesis/Annotations/'
filename = 't_s3_annotation_r4.xlsx'

# Annotation workbook for speaker 3.
df_s3 = pd.read_excel(f"{folder_path}{filename}")
df_s3

Unnamed: 0,round,usage_1_id,usage_1_text,usage_2_id,usage_2_text,anno,Unnamed: 6
0,1,u839_general,,u6_speaker,,3.0,
1,1,u6_speaker,,u107_speaker,,3.0,
2,1,u107_speaker,,u912_general,,2.0,
3,1,u912_general,,u837_general,,3.0,
4,1,u837_general,,u163_speaker,,2.0,
...,...,...,...,...,...,...,...
223,4,u6_speaker,Og vi finner mange eksempler på det aller vikt...,u70_speaker,Vi ser fram til at stortingsmeldingen faktisk ...,3.0,
224,4,u163_speaker,Vi må investere i bærekraftig utvikling og fje...,u1763_general,"I Bergen, hvor jeg bor, har vi selvfølgelig vå...",3.0,
225,4,u108_speaker,Mitt spørsmål til finansministeren lyder: «Kap...,u154_speaker,"Vi trenger et bærekraftig samfunn, og det er e...",3.0,
226,4,u3679_general,Da må vi satse på fellesskapsløsninger som sik...,u56_speaker,"I det framtidige bærekraftige Norge, som alle ...",2.0,


In [None]:
# Fresh containers for speaker 3.
s3_clusters_dict, s3_annotation_dict = {}, {}
s3_usage_graph = Graph()

# Feed annotation rounds 1-4 into the dictionary and the usage graph, in order.
for round_number in range(1, 5):
    add_annotation_from_df_to_dict_and_graph(df_s3, s3_usage_graph, s3_annotation_dict, round_number)

# Search 2..30 clusters with 20 annealing runs each and keep the lowest-loss clustering.
s3_clusters, s3_best_loss = optimize_clusters_mlrose(
    s3_usage_graph,
    max_clusters=30,
    runs_per_k=20,
)

print(f"Best clusters: {s3_clusters}")
print(f"Best loss: {s3_best_loss}")

Best clusters: [{'u1307_general', 'u1828_general'}, {'u204_general', 'u1763_general', 'u3462_general', 'u2006_general', 'u2757_general', 'u59_speaker', 'u1791_general'}, {'u217_general', 'u114_speaker'}, {'u1273_general', 'u3456_general', 'u56_speaker', 'u195_speaker', 'u1805_general'}, {'u712_general', 'u70_speaker', 'u3436_general', 'u189_speaker', 'u28_speaker'}, {'u107_speaker', 'u2278_general', 'u163_speaker', 'u108_speaker', 'u6_speaker', 'u260_general', 'u50_speaker', 'u55_speaker', 'u4139_general', 'u2253_general', 'u129_speaker', 'u1143_general', 'u244_general', 'u2276_general', 'u8_speaker', 'u3679_general', 'u7_speaker', 'u23_speaker', 'u26_speaker', 'u154_speaker', 'u188_speaker', 'u143_speaker', 'u2787_general', 'u186_speaker', 'u839_general', 'u166_speaker', 'u22_speaker'}, {'u53_general', 'u57_speaker', 'u173_speaker', 'u62_speaker', 'u35_speaker', 'u837_general', 'u139_speaker', 'u1628_general', 'u1905_general', 'u912_general', 'u767_general', 'u151_speaker'}]
Best loss

In [None]:
# Hard-coded snapshot of the clustering printed by the optimisation cell above,
# so the cells below can be run without repeating the slow search.
s3_clusters = [{'u1307_general', 'u1828_general'}, {'u204_general', 'u1763_general', 'u3462_general', 'u2006_general', 'u2757_general', 'u59_speaker', 'u1791_general'}, {'u217_general', 'u114_speaker'}, {'u1273_general', 'u3456_general', 'u56_speaker', 'u195_speaker', 'u1805_general'}, {'u712_general', 'u70_speaker', 'u3436_general', 'u189_speaker', 'u28_speaker'}, {'u107_speaker', 'u2278_general', 'u163_speaker', 'u108_speaker', 'u6_speaker', 'u260_general', 'u50_speaker', 'u55_speaker', 'u4139_general', 'u2253_general', 'u129_speaker', 'u1143_general', 'u244_general', 'u2276_general', 'u8_speaker', 'u3679_general', 'u7_speaker', 'u23_speaker', 'u26_speaker', 'u154_speaker', 'u188_speaker', 'u143_speaker', 'u2787_general', 'u186_speaker', 'u839_general', 'u166_speaker', 'u22_speaker'}, {'u53_general', 'u57_speaker', 'u173_speaker', 'u62_speaker', 'u35_speaker', 'u837_general', 'u139_speaker', 'u1628_general', 'u1905_general', 'u912_general', 'u767_general', 'u151_speaker'}]


In [None]:
# Per-cluster counts: D_s3 counts usage ids containing "general", E_s3 those containing "speaker".
D_s3, E_s3 = compute_sfd(s3_clusters, "general", "speaker")
D_s3, E_s3 # expected: ([2, 6, 1, 3, 2, 10, 6], [0, 1, 1, 2, 3, 17, 6])

([2, 6, 1, 3, 2, 10, 6], [0, 1, 1, 2, 3, 17, 6])

In [None]:
# Binary change with the defaults k=0, n=1: 1 if some cluster has 0 usages in one group and >= 1 in the other.
classify_change_binary(D_s3, E_s3) # expected: 1

1

In [None]:
# Graded change: jensenshannon() applied to the normalized "general" and "speaker" cluster distributions.
s3_graded = classify_change_graded(D_s3, E_s3)
s3_graded # expected: np.float64(0.2734569376344687)

np.float64(0.2734569376344687)

# Speaker 4

In [None]:
import pandas as pd

# NOTE(review): this cell re-imports pandas and redefines folder_path (with a
# trailing slash); both are already set up in earlier cells.
folder_path = '/content/drive/My Drive/MSc Computer Science/Master Thesis/Annotations/'
filename = 't_s4_annotation_r4.xlsx'

# Annotation workbook for speaker 4.
df_s4 = pd.read_excel(f"{folder_path}{filename}")
df_s4

Unnamed: 0,round,usage_1_id,usage_1_text,usage_2_id,usage_2_text,anno,Unnamed: 6
0,1.0,u8_speaker,,u23_speaker,,2.0,
1,1.0,u23_speaker,,u1535_general,,3.0,
2,1.0,u1535_general,,u106_general,,1.0,
3,1.0,u106_general,,u8928_general,,2.0,
4,1.0,u8928_general,,u7359_general,,1.0,
...,...,...,...,...,...,...,...
159,4.0,u204_speaker,Turisme er sannsynligvis en virkelig voksende ...,u23_speaker,Regjeringen vil arbeide for sosial bærekraft g...,2.0,--
160,4.0,u143_speaker,"Det skal være bærekraftig, hvis man skal nå kl...",u1535_general,Det var noe av hele tanken bak samhandlingsref...,2.0,
161,4.0,u108_speaker,"Skal vi ha bærekraftige kommuner, der fraflytt...",u1824_general,Ingen andre vil støtte bærekraftig bevegelsesf...,3.0,
162,4.0,u188_speaker,Særlig er jeg opptatt av at når vi skal bygge ...,u203_speaker,Kompromisset i seg selv inneholder elementer s...,3.0,


In [None]:
# Fresh containers for speaker 4.
s4_clusters_dict, s4_annotation_dict = {}, {}
s4_usage_graph = Graph()

# Feed annotation rounds 1-4 into the dictionary and the usage graph, in order.
for round_number in range(1, 5):
    add_annotation_from_df_to_dict_and_graph(df_s4, s4_usage_graph, s4_annotation_dict, round_number)

# Search 2..30 clusters with 20 annealing runs each and keep the lowest-loss clustering.
s4_clusters, s4_best_loss = optimize_clusters_mlrose(
    s4_usage_graph,
    max_clusters=30,
    runs_per_k=20,
)

print(f"Best clusters: {s4_clusters}")
print(f"Best loss: {s4_best_loss}")

Best clusters: [{'u488_general', 'u2286_general', 'u62_speaker', 'u106_general', 'u6912_general', 'u7359_general', 'u139_speaker', 'u108_speaker', 'u3257_general', 'u2615_general', 'u1824_general', 'u8279_general', 'u28_speaker', 'u22_speaker'}, {'u4506_general', 'u3611_general', 'u70_speaker', 'u8_speaker', 'u1424_general', 'u6_speaker', 'u520_general', 'u6873_general', 'u151_speaker', 'u434_general'}, {'u107_speaker', 'u3811_general', 'u194_speaker', 'u173_speaker', 'u154_speaker', 'u4012_general', 'u3657_general', 'u189_speaker'}, {'u203_speaker', 'u6924_general', 'u9674_general', 'u3582_general', 'u163_speaker', 'u8928_general', 'u59_speaker', 'u8935_general', 'u55_speaker', 'u129_speaker', 'u409_general', 'u35_speaker', 'u7_speaker', 'u23_speaker', 'u26_speaker', 'u188_speaker', 'u143_speaker', 'u1535_general', 'u166_speaker'}, {'u57_speaker', 'u1679_general', 'u4557_general', 'u204_speaker', 'u9863_general', 'u9654_general', 'u179_speaker', 'u9195_general', 'u50_speaker'}]
Best l

In [None]:
# Hard-coded snapshot of the clustering printed by the optimisation cell above,
# so the cells below can be run without repeating the slow search.
s4_clusters = [{'u488_general', 'u2286_general', 'u62_speaker', 'u106_general', 'u6912_general', 'u7359_general', 'u139_speaker', 'u108_speaker', 'u3257_general', 'u2615_general', 'u1824_general', 'u8279_general', 'u28_speaker', 'u22_speaker'}, {'u4506_general', 'u3611_general', 'u70_speaker', 'u8_speaker', 'u1424_general', 'u6_speaker', 'u520_general', 'u6873_general', 'u151_speaker', 'u434_general'}, {'u107_speaker', 'u3811_general', 'u194_speaker', 'u173_speaker', 'u154_speaker', 'u4012_general', 'u3657_general', 'u189_speaker'}, {'u203_speaker', 'u6924_general', 'u9674_general', 'u3582_general', 'u163_speaker', 'u8928_general', 'u59_speaker', 'u8935_general', 'u55_speaker', 'u129_speaker', 'u409_general', 'u35_speaker', 'u7_speaker', 'u23_speaker', 'u26_speaker', 'u188_speaker', 'u143_speaker', 'u1535_general', 'u166_speaker'}, {'u57_speaker', 'u1679_general', 'u4557_general', 'u204_speaker', 'u9863_general', 'u9654_general', 'u179_speaker', 'u9195_general', 'u50_speaker'}]


In [None]:
# Per-cluster counts: D_s4 counts usage ids containing "general", E_s4 those containing "speaker".
D_s4, E_s4 = compute_sfd(s4_clusters, "general", "speaker")
D_s4, E_s4 # expected: ([9, 6, 3, 7, 5], [5, 4, 5, 12, 4])

([9, 6, 3, 7, 5], [5, 4, 5, 12, 4])

In [None]:
# Binary change with the defaults k=0, n=1: 0 here because every cluster has at least one usage in both groups.
classify_change_binary(D_s4, E_s4) # expected: 0

0

In [None]:
# Graded change: jensenshannon() applied to the normalized "general" and "speaker" cluster distributions.
s4_graded = classify_change_graded(D_s4, E_s4)
s4_graded # expected: np.float64(0.17102009867888196)

np.float64(0.17102009867888196)

# Tuples - Downstream

In [None]:
# Pair each speaker id with its graded change score, in speaker order.
anno_tuples = list(zip(
    ('s1', 's2', 's3', 's4'),
    (s1_graded, s2_graded, s3_graded, s4_graded),
))
anno_tuples

[('s1', np.float64(0.2648922456511161)),
 ('s2', np.float64(0.3519419416573242)),
 ('s3', np.float64(0.2734569376344687)),
 ('s4', np.float64(0.17102009867888196))]