In [1]:
import os
import json
import torch
import numpy as np
from typing import List, Union

from torch.utils.data import Dataset
from transformers import RobertaConfig, RobertaTokenizer, RobertaModel

from umap import UMAP

from sklearn.manifold import TSNE
from sklearn.cluster import HDBSCAN
from sklearn.metrics import adjusted_mutual_info_score, adjusted_rand_score

from tqdm import tqdm

In [2]:
# Dataset location. Defaults to the original local path; set the SUPERSEG_DIR
# environment variable to point the notebook at another copy of the data
# without editing this cell.
superseg_dir = os.environ.get('SUPERSEG_DIR', '/mnt/d/Datasets/superseg')
train_json_file = os.path.join(superseg_dir, 'segmentation_file_train.json')
val_json_file = os.path.join(superseg_dir, 'segmentation_file_validation.json')

# Explicit UTF-8 so decoding does not depend on the platform's default locale.
with open(train_json_file, 'r', encoding='utf-8') as train_f:
    # The dialogue records live under dial_data -> superseg-v2 in the JSON file.
    train_ds = json.load(train_f)['dial_data']['superseg-v2']

In [3]:
class ClusteringPipeline:
    """Embed the utterances of one dialogue, reduce the embeddings and cluster them.

    The pipeline is duck-typed; the components only need to provide:

    * ``tokenizer.encode(text, return_tensors='pt', truncation=True)`` returning
      a tensor of token ids with a leading batch dimension,
    * ``feature_extractor(input_ids)`` returning an object with a
      ``last_hidden_state`` tensor,
    * ``reducer.fit_transform(X)`` returning the reduced feature matrix,
    * ``clustering.fit(X)`` exposing ``labels_`` afterwards.

    Attributes set by ``fit_predict``:
        topics: the ground-truth labels passed in (or ``None``).
        predicted_labels_: the ``labels_`` of the fitted clusterer.
        metrics_: dict returned by ``compute_metrics`` or ``None`` when no
            ground truth was supplied.
    """

    def __init__(self, tokenizer, feature_extractor, reducer, clustering):
        self.tokenizer = tokenizer
        self.feature_extractor = feature_extractor
        self.reducer = reducer
        self.clustering = clustering

    def fit_predict(self, dialog, topics=None):
        """Cluster the utterances of a single dialogue.

        Args:
            dialog: iterable of utterance strings.
            topics: optional ground-truth topic id per utterance. When given,
                clustering metrics are printed and stored in ``metrics_``;
                when ``None`` the metric step is skipped.

        Returns:
            The ``labels_`` attribute of the fitted clusterer (one label per
            utterance).

        Raises:
            ValueError: if ``dialog`` contains no utterances.
        """
        utterances = list(dialog)
        if not utterances:
            raise ValueError('dialog must contain at least one utterance')

        self.topics = topics

        # Hugging Face models expose `.device`; plain callables may not, in
        # which case the inputs are left where the tokenizer created them.
        device = getattr(self.feature_extractor, 'device', None)

        sentence_vectors = []
        with torch.no_grad():
            for utterance in utterances:
                # truncation=True keeps over-long utterances within the
                # encoder's maximum sequence length instead of crashing.
                input_ids = self.tokenizer.encode(
                    utterance, return_tensors='pt', truncation=True
                )
                if device is not None:
                    input_ids = input_ids.to(device)

                hidden_states = self.feature_extractor(input_ids).last_hidden_state
                # First batch item, first token position is used as the
                # sentence vector (presumably the <s>/CLS token for
                # RoBERTa-style tokenizers).
                # .cpu() makes the conversion work when the model runs on a GPU.
                sentence_vectors.append(hidden_states[0][0].detach().cpu().numpy())

        reduced_vector = self.reducer.fit_transform(np.array(sentence_vectors))

        self.clustering.fit(reduced_vector)
        self.predicted_labels_ = self.clustering.labels_

        self.metrics_ = None
        if topics is not None:
            self.metrics_ = self.compute_metrics(
                senteces=self.predicted_labels_, topics_id=topics
            )

        return self.predicted_labels_

    @staticmethod
    def preprocess_dialseg(data):
        """Split dialogue records into parallel utterance and topic-id lists.

        Args:
            data: iterable of records, each with a ``'turns'`` list whose
                items carry ``'utterance'`` and ``'topic_id'`` keys.

        Returns:
            ``(dialogs, topics)``: two lists with one inner list per record,
            holding the utterances and the topic ids of its turns.
        """
        dialogs = []
        topics = []
        for item in tqdm(data):
            turns = item['turns']
            dialogs.append([turn['utterance'] for turn in turns])
            topics.append([turn['topic_id'] for turn in turns])
        return dialogs, topics

    @staticmethod
    def compute_metrics(senteces, topics_id):
        """Print and return clustering agreement scores.

        Args:
            senteces: predicted cluster label per utterance (parameter name
                kept as-is for backward compatibility with keyword callers).
            topics_id: ground-truth topic id per utterance.

        Returns:
            Dict with keys ``'adjusted_rand_score'`` and
            ``'adjusted_mutual_info_score'``.
        """
        ari = adjusted_rand_score(labels_true=topics_id, labels_pred=senteces)
        print(f'Adjusted rand score: {ari}')
        ami = adjusted_mutual_info_score(labels_true=topics_id, labels_pred=senteces)
        print(f'Adjusted mutual indo score: {ami}')
        return {'adjusted_rand_score': ari, 'adjusted_mutual_info_score': ami}
           


In [22]:
# UMAP's embedding is stochastic; a fixed seed makes the reduced vectors (and
# therefore the clusters and scores below) repeatable across kernel restarts.
# NOTE(review): recent umap-learn versions run single-threaded when
# random_state is set — confirm this is acceptable for larger inputs.
SEED = 42

tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaModel.from_pretrained('roberta-base')
# Inference mode (disables dropout) so the embeddings are deterministic.
model.eval()

# Per-dialogue inputs are tiny, hence the very small neighbourhood size.
umap_reducer = UMAP(n_components=15, n_neighbors=3, random_state=SEED)

hdb = HDBSCAN(min_cluster_size=2)

pipeline = ClusteringPipeline(tokenizer=tokenizer, feature_extractor=model, reducer=umap_reducer, clustering=hdb)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [23]:
# preprocess_dialseg is a static method, so it is called on the class itself:
# it yields one list of utterances and one list of topic ids per dialogue.
dialogs, topics = ClusteringPipeline.preprocess_dialseg(data=train_ds)

100%|██████████| 6948/6948 [00:00<00:00, 241690.09it/s]


In [24]:
# Cluster one dialogue and compare the predicted labels with its topic ids.
index = 1
dialog_utterances, dialog_topics = dialogs[index], topics[index]
pred_labels = pipeline.fit_predict(dialog=dialog_utterances, topics=dialog_topics)
print("True lables: ", dialog_topics)
print("Predicted labels: ", pred_labels)

Adjusted rand score: 0.010494752623688156
Adjusted mutual indo score: 0.07993635285827581
True lables:  [0, 0, 0, 0, 0, 1, 1, 2, 2, 2, 2]
Predicted labels:  [1 1 0 0 0 2 0 2 0 2 0]
