In [16]:
import sys
import os
import pickle

import importlib
import clustering_utils
importlib.reload(clustering_utils)
import utils
importlib.reload(utils)
sys.path.append('../chat-intents/chatintents')
import chatintents
importlib.reload(chatintents)

import numpy as np
import pandas as pd
from hyperopt import hp
from sentence_transformers import SentenceTransformer
import json
import argparse
import yaml

import plotly.express as px
import matplotlib.pyplot as plt
import umap
import fast_hdbscan
import time
import torch
import re
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from clustering_utils import remove_duplicates, get_bert_topics, get_chat_intents_topics, get_clustering_metrics, get_most_common_company, save_input_with_params, get_clustering_metrics, plot_best_clusters, save_embeddings, perform_clustering, remove_stopwords
from clustering_utils import is_supervised_training, split_supervised_training_df
from utils import load_model, create_replace_no_tags_embeddings
from chatintents import ChatIntents

from nltk.corpus import stopwords
import nltk
from bertopic.representation import MaximalMarginalRelevance
import openai
from bertopic.representation import OpenAI
from sklearn.feature_extraction.text import CountVectorizer

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/bartekjezierski/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/bartekjezierski/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [17]:
params = {}
params['DROP_DUPLICATES'] = True
params['TOPIC_MODEL'] = "bert" # "chat_intents"
params['INPUT_DIR'] = '../glanos-data/clustering/input/'
params['OUTPUT_DIR'] = '../glanos-data/clustering/output/'
params['BAYESIAN_SEARCH_METRIC'] = 'topic_diversity+bert' # 'outliers', 'topic_overlap+bert', 'topic_diversity+bert'
params['REMOVE_STOPWORDS'] = True
params['KEEP_ONLY_BAD_LABELS'] = True
params['INFERENCE_LABELS'] = ['OTHER|ENTITY', 'OTHER', 'ENTITY', 'ENTITY|OTHER']
params['SEMI_SUPERVISED'] = True
params['PENALTY'] = 'label_count' # ['label_count', 'outside_range']
params['LABEL_RANGE'] = (28, 50) # applicable only if outside_range in PENALTY
params['OUTSIDE_RANGE_PENALTY'] = 10 # applicable only if outside_range in PENALTY
params['TOP_K_TOPICS'] = 10
params['N_NEIGHBORS_RANGE'] = range(50, 80)
params['N_COMPONENTS_RANGE'] = range(4, 15)
params['MIN_CLUSTER_SIZE_RANGE'] = range(25, 60)
params['MIN_SAMPLE_SIZE_RANGE'] = range(1, 4)
params['MAX_EVALS'] = 15 # best: 20, only relevant for the hyperparameter search


In [18]:
def run_bayesian_search(model, params, df=None, param_output_file_path=None):
    hspace = {
        "n_neighbors": hp.choice('n_neighbors', params['N_NEIGHBORS_RANGE']),
        "n_components": hp.choice('n_components', params['N_COMPONENTS_RANGE']),
        "min_cluster_size": hp.choice('min_cluster_size', params['MIN_CLUSTER_SIZE_RANGE']),
        "min_samples": hp.choice('min_samples', params['MIN_SAMPLE_SIZE_RANGE']), # the higher it is, the more strict will clustering be and have more outliers
        "random_state": 42
    }

    label_lower = params['LABEL_RANGE'][0]
    label_upper = params['LABEL_RANGE'][1]
    best_params, best_clusters, trials = model.bayesian_search(space=hspace,
                          label_lower=label_lower,
                          label_upper=label_upper,
                          params=params,
                          df=df
                          )
    if param_output_file_path:
        save_input_with_params(best_params, df, param_output_file_path)
    return best_params

In [19]:
from contextualized_topic_models.models.ctm import CombinedTM
from contextualized_topic_models.utils.data_preparation import TopicModelDataPreparation

def cluster(input_file_name, params, hf_model_name="", topic_model=params['TOPIC_MODEL']):
    params['HF_MODEL_NAME'] = hf_model_name
    input_file_path = params['INPUT_DIR'] + input_file_name
    print('Input file', input_file_path)
    
    if "data2" in input_file_path and "replace_no_tags" in input_file_path and 'seb' in input_file_path:
        dataset_file_path = '../glanos-data/clustering/big_consulting_export_replace_seb.tsv' 
    elif "data2" in input_file_path and "replace_no_tags" in input_file_path:
        dataset_file_path = '../glanos-data/clustering/big_consulting_export_replace_no_duplicates.tsv' 
    elif "data2" in input_file_path:
        dataset_file_path = '../glanos-data/clustering/big_consulting_export_data2_no_duplicates.tsv' 
    else:
        dataset_file_path = '../glanos-data/clustering/big_consulting_export.tsv'
        
    if not os.path.exists(input_file_path):
        if os.path.exists(input_file_path.replace('_params', '')):
            input_file_path = input_file_path.replace('_params', '')
        else:
            save_embeddings(hf_model_name, 
                        dataset_file_path, 
                        input_file_path, 
                        "snippet" if not "replace" in input_file_path else ("replace_no_tags" if "replace_no_tags" in input_file_path else "replace"),
                        params)

    with open(input_file_path, 'r') as f:
        data = json.load(f)

    if 'items' in data:
        df = pd.json_normalize(data['items'])
        df = df.dropna()
        if params['DROP_DUPLICATES']:
            df = remove_duplicates(df)

        params['SNIPPET_COLUMN_NAME'] = "snippet" if not "replace_no_tags" in input_file_path else "replace_no_tags"
        snippets = df[params['SNIPPET_COLUMN_NAME']].tolist()
        
        model_name = globals()["model"] if "model" in globals() else "sbert"
        if 'embedding' in df.columns:
            embeddings = df['embedding'].tolist()
        else:
            print('Embeddings snippets like ', snippets[0])
            if model_name == "sbert":
                embeddings = embed_sbert(snippets)
            elif model_name == "glove":
                embeddings = encode_glove(snippets)
            else:
                return "Invalid data format.", 400
            df['embedding'] = embeddings.tolist()

        if is_supervised_training(df, params):
            print('Supervised')
            training_df, inference_df = split_supervised_training_df(df, params)
            training_embeddings = training_df["embedding"].to_list()
            training_titles = training_df["snippet"].to_list()
            inference_embeddings = inference_df["embedding"].to_list()
            inference_titles = inference_df["snippet"].to_list()

            model = ChatIntents(training_embeddings, inference_embeddings, training_df, inference_df, model_name)
        else:
            model = ChatIntents(embeddings, embeddings, df, df, model_name)
            inference_df = df

        if 'parameters' in data:
            hdbscan_umap_params = data['parameters']
#             hdbscan_umap_params = {"min_cluster_size": 38, "min_samples": 3, "n_components": 5, "n_neighbors": 51, "outside_range_penalty": 2, "random_state": 42}
            if topic_model == "chat_intents":
                hdbscan_clustering = perform_clustering(model, 
                                                    hdbscan_umap_params['n_neighbors'], 
                                                    hdbscan_umap_params['n_components'], 
                                                    hdbscan_umap_params['min_cluster_size'],
                                                    hdbscan_umap_params['min_samples'] if 'min_samples' in hdbscan_umap_params else 1
                                                )
        else:
            hdbscan_umap_params = run_bayesian_search(model, 
                                params,
                                df=df, 
                                param_output_file_path=input_file_path.replace("_params", "").replace(".json", "_params.json")
                               )
            print()
            hdbscan_clustering = model.best_clusters
        
        params.update(hdbscan_umap_params)

        start_time_topic_modelling = time.time()
        if topic_model == "chat_intents":
            print("Topic modelling method: ChatIntents")
            cluster_topics = get_chat_intents_topics(model, inference_df)
            probabilities = hdbscan_clustering.probabilities_
            cluster_labels = hdbscan_clustering.labels_
#         elif topic_model == "bert":
        else:
            print("Topic modelling method: BERTopic")
            cluster_topics, inference_df, topic_model = get_bert_topics(df, params)
            probabilities = inference_df['Probability'].to_numpy()
            cluster_labels = inference_df['cluster_id'].to_numpy()

        print(f"Elapsed total time for topic modelling: {time.time() - start_time_topic_modelling} seconds")
        if len(inference_df) != len(cluster_labels):
            print(f"Cluster labels not collected correctly ({len(inference_df)} vs. {len(cluster_labels)})")

        label_count, cost = model.score_clusters(probabilities, cluster_labels)
        print(f"{model_name} results")
        print(f"Label count: {label_count}")
        print(f"Loss: {cost}")
        
            
        if "SHOW" in params and params["SHOW"]:
            if is_supervised_training(inference_df, params):
                print("is supervised")
                umap_model_supervised = umap.UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine', random_state=42).fit(training_embeddings)
                reduced_embeddings_supervised = umap_model_supervised.transform(inference_embeddings)
                fig = topic_model.visualize_documents(inference_titles, reduced_embeddings=reduced_embeddings_supervised, custom_labels=True, hide_annotations=True)
                fig.show()  
            else:
                df = plot_best_clusters(model, inference_df, cluster_labels, params, filename=input_file_name.replace('.json', ''))
#         df = plot_best_clusters(model, inference_df, hdbscan_clustering.labels_, params, filename=input_file_name.replace('.json', ''))
#         df['probability'] = hdbscan_clustering.probabilities_
#         label_count = df['cluster_id'].nunique()

        params.update({'label_count': label_count, 'cost': cost})

        print("dataset_file_path", dataset_file_path)
        get_clustering_metrics(inference_df, cluster_topics, params, input_file=dataset_file_path)

        return model, inference_df, cluster_topics, params, topic_model
    else:
        return jsonify({"error_msg": "Invalid data format."})


globals()["model"] = "sbert"

In [20]:
def print_results(input_file):
    if 'sbert_output_df' in globals():
        print("Baseline SBERT:")
        sbert_overlaps = get_clustering_metrics(sbert_output_df, sbert_cluster_topics, sbert_params, input_file=input_file)

    if 'baai_output_df' in globals():
        print("Baseline BAAI:")
        baai_overlaps = get_clustering_metrics(baai_output_df, baai_cluster_topics, baai_params, input_file=input_file)


    if 'similarity_baai_output_df' in globals():
        print("Similarity (SBERT):")
        similarity_baai_overlaps = get_clustering_metrics(similarity_baai_output_df, similarity_baai_cluster_topics, similarity_baai_params, input_file=input_file)

    if 'similarity_sbert_output_df' in globals():
        print("Similarity (BAAI):")
        similarity_sbert_overlaps = get_clustering_metrics(similarity_sbert_output_df, similarity_sbert_cluster_topics, similarity_sbert_params, input_file=input_file)

    print("Below only SBERT:")

    if 'similarity_ai_car_sbert_output_df' in globals():
        print("Similarity trained on ai+car dataset:")
        similarity_ai_car_sbert_overlaps = get_clustering_metrics(similarity_ai_car_sbert_output_df, similarity_ai_car_sbert_cluster_topics, similarity_ai_car_sbert_params, input_file=input_file)

    if 'similarity_distill_consistency_output_df' in globals():
        print("Similarity distill-consistency model trained on consulting dataset:")
        similarity_distill_consistency_overlaps = get_clustering_metrics(similarity_distill_consistency_output_df, similarity_distill_consistency_cluster_topics, similarity_distill_consistency_params, input_file=input_file)

    if 'similarity_ai_car_distill_consistency_output_df' in globals():
        print("Similarity distill-consistency model trained on ai+car dataset:")
        similarity_ai_car_distill_consistency_overlaps = get_clustering_metrics(similarity_ai_car_distill_consistency_output_df, similarity_ai_car_distill_consistency_cluster_topics, similarity_ai_car_distill_consistency_params, input_file=input_file)


    if 'classification_ai_car_sbert_output_df' in globals():
        print("Classification on ai+car:")
        classification_ai_car_sbert_overlaps = get_clustering_metrics(classification_ai_car_sbert_output_df, classification_ai_car_sbert_cluster_topics, classification_ai_car_sbert_params, input_file=input_file)

    if 'classification_output_df' in globals():
        print("Classification on consulting:")
        classification_overlaps = get_clustering_metrics(classification_output_df, classification_cluster_topics, classification_params, input_file=input_file)


    print("Combined training")

    if 'combined_ai_car_class_consulting_sim_output_df' in globals():
        print("First classification on ai+car dataset, then similarity on consulting dataset:")
        combined_ai_car_class_consulting_sim_overlaps = get_clustering_metrics(combined_ai_car_class_consulting_sim_output_df, combined_ai_car_class_consulting_sim_cluster_topics, combined_ai_car_class_consulting_sim_params, input_file=input_file)

    if 'combined_consulting_sim_ai_car_class_output_df' in globals():
        print("First similarity on consulting dataset, then classification on ai+car dataset:")
        combined_consulting_sim_ai_car_class_overlaps = get_clustering_metrics(combined_consulting_sim_ai_car_class_output_df, combined_consulting_sim_ai_car_class_cluster_topics, combined_consulting_sim_ai_car_class_params, input_file=input_file)

    if 'combined_consulting_sim_and_ai_car_class_output_df' in globals():
        print("Together similarity on consulting dataset, and classification on ai+car dataset:")
        combined_consulting_sim_and_ai_car_class_overlaps = get_clustering_metrics(combined_consulting_sim_and_ai_car_class_output_df, combined_consulting_sim_and_ai_car_class_cluster_topics, combined_consulting_sim_and_ai_car_class_params, input_file=input_file)

    if 'combined_ai_car_sim_and_class_output_df' in globals():
        print("Together similarity and classification on ai+car dataset:")
        combined_ai_car_sim_and_class_overlaps = get_clustering_metrics(combined_ai_car_sim_and_class_output_df, combined_ai_car_sim_and_class_cluster_topics, combined_ai_car_sim_and_class_params, input_file=input_file)

    if 'combined_ai_car_sim_class_output_df' in globals():
        print("First similarity, then classification on ai+car dataset:")
        combined_ai_car_sim_class_overlaps = get_clustering_metrics(combined_ai_car_sim_class_output_df, combined_ai_car_sim_class_cluster_topics, combined_ai_car_sim_class_params, input_file=input_file)

    if 'combined_ai_car_class_sim_output_df' in globals():
        print("First classification, then similarity on ai+car dataset:")
        combined_ai_car_class_sim_overlaps = get_clustering_metrics(combined_ai_car_class_sim_output_df, combined_ai_car_class_sim_cluster_topics, combined_ai_car_class_sim_params, input_file=input_file)

    if 'combined_consulting_output_df' in globals():
        print("Together classification and similarity on consulting dataset:")
        combined_consulting_overlaps = get_clustering_metrics(combined_consulting_output_df, combined_consulting_cluster_topics, combined_consulting_params, input_file=input_file)
        
    if 'classification_ai_car_data_aug_output_df' in globals():
        print("Classification on augmented dataset:")
        classification_ai_car_data_aug_overlaps = get_clustering_metrics(classification_ai_car_data_aug_output_df, classification_ai_car_data_aug_cluster_topics, classification_ai_car_data_aug_params, input_file=input_file)
        

# Experiment 1

In [24]:
# on replace_no_tags dataset
# diversity loss

data2_input_file = '../glanos-data/clustering/big_consulting_export_replace.tsv'
params['BAYESIAN_SEARCH_METRIC'] = 'topic_diversity+bert'
params["SHOW"] = False

# sbert_model, sbert_output_df, sbert_cluster_topics, sbert_params, sbert_topic_model = cluster('baseline_data2_semi_replace_no_tags_td_params.json', params, hf_model_name='sentence-transformers/all-MiniLM-L12-v2')
# baai_model, baai_output_df, baai_cluster_topics, baai_params, baai_topic_model = cluster('baai_baseline_data2_semi_replace_no_tags_td_params.json', params, hf_model_name='BAAI/bge-base-en-v1.5')

# similarity_sbert_model, similarity_sbert_output_df, similarity_sbert_cluster_topics, similarity_sbert_params, similarity_sbert_topic_model = cluster('similarity_data2_semi_replace_no_tags_td_params.json', params, hf_model_name="brjezierski/sentence-embeddings-similarity")
# similarity_baai_model, similarity_baai_output_df, similarity_baai_cluster_topics, similarity_baai_params, similarity_baai_topic_model = cluster('best_similarity_data2_semi_replace_no_tags_td_params.json', params, hf_model_name="brjezierski/sentence-embeddings-similarity-best-baai")
# similarity_ai_car_sbert_model, similarity_ai_car_sbert_output_df, similarity_ai_car_sbert_cluster_topics, similarity_ai_car_sbert_params, similarity_ai_car_sbert_topic_model = cluster('similarity_ai_car_data2_semi_replace_no_tags_td_params.json', params, hf_model_name='brjezierski/sentence-embeddings-similarity-ai-car')
# similarity_distill_consistency_model, similarity_distill_consistency_output_df, similarity_distill_consistency_cluster_topics, similarity_distill_consistency_params, similarity_distill_consistency_topic_model = cluster('similarity_distill_consistency_data2_semi_replace_no_tags_td_params.json', params, hf_model_name='brjezierski/sentence-embeddings-similarity-distill-consistency')
# similarity_ai_car_distill_consistency_model, similarity_ai_car_distill_consistency_output_df, similarity_ai_car_distill_consistency_cluster_topics, similarity_ai_car_distill_consistency_params, similarity_ai_car_distill_consistency_topic_model = cluster('similarity_ai_car_distill_consistency_data2_semi_replace_no_tags_td_params.json', params, hf_model_name='brjezierski/sentence-embeddings-similarity-ai_car-distill-consistency')


# classification_model, classification_output_df, classification_cluster_topics, classification_params, classification_topic_model = cluster('classification_data2_semi_replace_no_tags_td_params.json', params, hf_model_name="brjezierski/sentence-embeddings-classification")
# classification_ai_car_sbert_model, classification_ai_car_sbert_output_df, classification_ai_car_sbert_cluster_topics, classification_ai_car_sbert_params, classification_ai_car_sbert_topic_model = cluster('classification_ai_car_sbert_data2_semi_replace_no_tags_td_params.json', params, hf_model_name="brjezierski/sentence-embeddings-classification-ai_car-sbert")
# combined_ai_car_class_consulting_sim_model, combined_ai_car_class_consulting_sim_output_df, combined_ai_car_class_consulting_sim_cluster_topics, combined_ai_car_class_consulting_sim_params, combined_ai_car_class_consulting_sim_topic_model = cluster('combined_ai_car_class_consulting_sim_data2_semi_replace_no_tags_td_params.json', params, hf_model_name="brjezierski/sentence-embeddings-combined-ai_car-class-consulting-sim")
# combined_consulting_sim_ai_car_class_model, combined_consulting_sim_ai_car_class_output_df, combined_consulting_sim_ai_car_class_cluster_topics, combined_consulting_sim_ai_car_class_params, combined_consulting_sim_ai_car_class_topic_model = cluster('combined_consulting_sim_ai_car_class_data2_semi_replace_no_tags_td_params.json', params, hf_model_name="brjezierski/sentence-embeddings-combined-consulting-sim-ai_car-class")
# combined_consulting_sim_and_ai_car_class_model, combined_consulting_sim_and_ai_car_class_output_df, combined_consulting_sim_and_ai_car_class_cluster_topics, combined_consulting_sim_and_ai_car_class_params, combined_consulting_sim_and_ai_car_class_topic_model = cluster('combined_consulting_sim_and_ai_car_class_data2_semi_replace_no_tags_td_params.json', params, hf_model_name="brjezierski/sentence-embeddings-combined-consulting-sim-and-ai_car-class")
# combined_ai_car_sim_and_class_model, combined_ai_car_sim_and_class_output_df, combined_ai_car_sim_and_class_cluster_topics, combined_ai_car_sim_and_class_params, combined_ai_car_sim_and_class_topic_model = cluster('combined_ai_car_sim_and_class_data2_semi_replace_no_tags_td_params.json', params, hf_model_name="brjezierski/sentence-embeddings-combined-ai_car-sim-and-class")
# combined_ai_car_sim_class_model, combined_ai_car_sim_class_output_df, combined_ai_car_sim_class_cluster_topics, combined_ai_car_sim_class_params, combined_ai_car_sim_class_topic_model = cluster('combined_ai_car_sim_class_data2_semi_replace_no_tags_td_params.json', params, hf_model_name="brjezierski/sentence-embeddings-combined-ai_car-sim-class")
# combined_ai_car_class_sim_model, combined_ai_car_class_sim_output_df, combined_ai_car_class_sim_cluster_topics, combined_ai_car_class_sim_params, combined_ai_car_class_sim_topic_model = cluster('combined_ai_car_class_sim_data2_semi_replace_no_tags_td_params.json', params, hf_model_name="brjezierski/sentence-embeddings-combined-ai_car-class-sim")
# combined_consulting_model, combined_consulting_output_df, combined_consulting_cluster_topics, combined_consulting_params, combined_consulting_topic_model = cluster('combined_consulting_data2_semi_replace_no_tags_td_params.json', params, hf_model_name="brjezierski/combined-consulting")

ex1_classification_ai_car_data_aug_model, ex1_classification_ai_car_data_aug_output_df, ex1_classification_ai_car_data_aug_cluster_topics, ex1_classification_ai_car_data_aug_params, ex1_classification_ai_car_data_aug_topic_model = cluster('classification_ai_car_data_aug_data2_semi_replace_no_tags_td_params.json', params, hf_model_name="brjezierski/sentence-embeddings-classification-ai_car-data_aug")



Input file ../glanos-data/clustering/input/classification_ai_car_data_aug_data2_semi_replace_no_tags_td_params.json
Supervised
Topic modelling method: BERTopic
Fitting the BERT model on 1278 snippets


Batches:   0%|          | 0/40 [00:00<?, ?it/s]

2023-10-27 18:25:34,792 - BERTopic - Transformed documents to Embeddings
2023-10-27 18:25:37,268 - BERTopic - Reduced dimensionality
2023-10-27 18:25:37,283 - BERTopic - Clustered reduced embeddings


BERT model labelling 3940 snippets
inference_snippets 3940


Batches:   0%|          | 0/124 [00:00<?, ?it/s]

2023-10-27 18:26:13,425 - BERTopic - Reduced dimensionality
2023-10-27 18:26:13,475 - BERTopic - Predicted clusters


Elapsed total time for topic modelling: 55.39846897125244 seconds
sbert results
Label count: 8
Loss: 0.03807106598984772
dataset_file_path ../glanos-data/clustering/big_consulting_export_replace_no_duplicates.tsv
Overlap count of length 1: 1
Overlap count of length 2: 2
Overlap count of length 3: 0
Overlap count of length 4 or more: 0
Overlap loss:  0.28
Diversity loss:  1.06
Number of clusters: 8
Avg cluster size: 437.77777777777777
Median cluster size: 372.0 , standard deviation: 477.9167239820431
Median top company occurence: 44.776119402985074
Cost (outliers): 3.81%



In [27]:
ex1_classification_ai_car_data_aug_cluster_topics, ex3_classification_ai_car_data_aug_cluster_topics

([['fined',
   'state',
   'role',
   'issued',
   'consultancy',
   'linked',
   'welcomes opportunity',
   'concerns',
   'denying',
   'didn raise'],
  ['ai',
   'cloud',
   'announced',
   'technology',
   'partnership',
   'platform',
   'services',
   'new',
   'business',
   'digital'],
  ['results',
   'reported',
   'revenue',
   'net',
   'quarter',
   'fiscal',
   'year',
   'growth',
   'investment',
   'share'],
  ['served',
   'prior',
   'joining',
   'prior joining',
   'partner',
   'years',
   'practice',
   'roles',
   'spent',
   'audit'],
  ['acquisition',
   'acquire',
   'sustainability',
   'announced',
   'services',
   'domus',
   'green domus',
   'sustainability consultancy',
   'green',
   'company'],
  ['tax',
   'law',
   'laws',
   'trade',
   'tax laws',
   'pay',
   'scandal',
   'agreed',
   'information',
   'confidential'],
  ['tax',
   'government',
   'partners',
   'confidential',
   'treasury',
   'leave',
   'internal',
   'confidential governm

In [7]:
data2_input_file = '../glanos-data/clustering/big_consulting_export_replace_no_duplicates.tsv'
print_results(input_file=data2_input_file)


Baseline SBERT:
Overlap count of length 1: 11
Overlap count of length 2: 4
Overlap count of length 3: 2
Overlap count of length 4 or more: 0
Overlap loss:  1.84
Diversity loss:  1.15
Number of clusters: 8
Median cluster size: 137.0 , standard deviation: 392.4709189509743
Median top company occurence: 46.875
Cost (outliers): 3.81%

Baseline BAAI:
Overlap count of length 1: 9
Overlap count of length 2: 2
Overlap count of length 3: 1
Overlap count of length 4 or more: 1
Overlap loss:  2.03
Diversity loss:  1.15
Number of clusters: 8
Median cluster size: 142.0 , standard deviation: 499.85835863594895
Median top company occurence: 31.164209456024405
Cost (outliers): 3.81%

Similarity (SBERT):
Overlap count of length 1: 9
Overlap count of length 2: 4
Overlap count of length 3: 1
Overlap count of length 4 or more: 1
Overlap loss:  2.28
Diversity loss:  1.18
Number of clusters: 8
Median cluster size: 145.0 , standard deviation: 397.3414461795686
Median top company occurence: 28.205128205128204

# Experiment 2

In [6]:
# on replace_no_tags dataset
# overlap loss

data2_input_file = '../glanos-data/clustering/big_consulting_export_replace.tsv'
params['BAYESIAN_SEARCH_METRIC'] = 'topic_overlap+bert'

sbert_model, sbert_output_df, sbert_cluster_topics, sbert_params, sbert_topic_model = cluster('baseline_data2_semi_replace_no_tags_to_params.json', params, hf_model_name='sentence-transformers/all-MiniLM-L12-v2')
baai_model, baai_output_df, baai_cluster_topics, baai_params, baai_topic_model = cluster('baai_baseline_data2_semi_replace_no_tags_to_params.json', params, hf_model_name='BAAI/bge-base-en-v1.5')

similarity_sbert_model, similarity_sbert_output_df, similarity_sbert_cluster_topics, similarity_sbert_params, similarity_sbert_topic_model = cluster('similarity_data2_semi_replace_no_tags_to_params.json', params, hf_model_name="brjezierski/sentence-embeddings-similarity")
similarity_baai_model, similarity_baai_output_df, similarity_baai_cluster_topics, similarity_baai_params, similarity_baai_topic_model = cluster('best_similarity_data2_semi_replace_no_tags_to_params.json', params, hf_model_name="brjezierski/sentence-embeddings-similarity-best-baai")
similarity_ai_car_sbert_model, similarity_ai_car_sbert_output_df, similarity_ai_car_sbert_cluster_topics, similarity_ai_car_sbert_params, similarity_ai_car_sbert_topic_model = cluster('similarity_ai_car_data2_semi_replace_no_tags_to_params.json', params, hf_model_name='brjezierski/sentence-embeddings-similarity-ai-car')
similarity_distill_consistency_model, similarity_distill_consistency_output_df, similarity_distill_consistency_cluster_topics, similarity_distill_consistency_params, similarity_distill_consistency_topic_model = cluster('similarity_distill_consistency_data2_semi_replace_no_tags_to_params.json', params, hf_model_name='brjezierski/sentence-embeddings-similarity-distill-consistency')
similarity_ai_car_distill_consistency_model, similarity_ai_car_distill_consistency_output_df, similarity_ai_car_distill_consistency_cluster_topics, similarity_ai_car_distill_consistency_params, similarity_ai_car_distill_consistency_topic_model = cluster('similarity_ai_car_distill_consistency_data2_semi_replace_no_tags_to_params.json', params, hf_model_name='brjezierski/sentence-embeddings-similarity-ai_car-distill-consistency')


classification_model, classification_output_df, classification_cluster_topics, classification_params, classification_topic_model = cluster('classification_data2_semi_replace_no_tags_to_params.json', params, hf_model_name="brjezierski/sentence-embeddings-classification")
classification_ai_car_sbert_model, classification_ai_car_sbert_output_df, classification_ai_car_sbert_cluster_topics, classification_ai_car_sbert_params, classification_ai_car_sbert_topic_model = cluster('classification_ai_car_sbert_data2_semi_replace_no_tags_to_params.json', params, hf_model_name="brjezierski/sentence-embeddings-classification-ai_car-sbert")
combined_ai_car_class_consulting_sim_model, combined_ai_car_class_consulting_sim_output_df, combined_ai_car_class_consulting_sim_cluster_topics, combined_ai_car_class_consulting_sim_params, combined_ai_car_class_consulting_sim_topic_model = cluster('combined_ai_car_class_consulting_sim_data2_semi_replace_no_tags_to_params.json', params, hf_model_name="brjezierski/sentence-embeddings-combined-ai_car-class-consulting-sim")
combined_consulting_sim_ai_car_class_model, combined_consulting_sim_ai_car_class_output_df, combined_consulting_sim_ai_car_class_cluster_topics, combined_consulting_sim_ai_car_class_params, combined_consulting_sim_ai_car_class_topic_model = cluster('combined_consulting_sim_ai_car_class_data2_semi_replace_no_tags_to_params.json', params, hf_model_name="brjezierski/sentence-embeddings-combined-consulting-sim-ai_car-class")
combined_consulting_sim_and_ai_car_class_model, combined_consulting_sim_and_ai_car_class_output_df, combined_consulting_sim_and_ai_car_class_cluster_topics, combined_consulting_sim_and_ai_car_class_params, combined_consulting_sim_and_ai_car_class_topic_model = cluster('combined_consulting_sim_and_ai_car_class_data2_semi_replace_no_tags_to_params.json', params, hf_model_name="brjezierski/sentence-embeddings-combined-consulting-sim-and-ai_car-class")
combined_ai_car_sim_and_class_model, combined_ai_car_sim_and_class_output_df, combined_ai_car_sim_and_class_cluster_topics, combined_ai_car_sim_and_class_params, combined_ai_car_sim_and_class_topic_model = cluster('combined_ai_car_sim_and_class_data2_semi_replace_no_tags_to_params.json', params, hf_model_name="brjezierski/sentence-embeddings-combined-ai_car-sim-and-class")
combined_ai_car_sim_class_model, combined_ai_car_sim_class_output_df, combined_ai_car_sim_class_cluster_topics, combined_ai_car_sim_class_params, combined_ai_car_sim_class_topic_model = cluster('combined_ai_car_sim_class_data2_semi_replace_no_tags_to_params.json', params, hf_model_name="brjezierski/sentence-embeddings-combined-ai_car-sim-class")
combined_ai_car_class_sim_model, combined_ai_car_class_sim_output_df, combined_ai_car_class_sim_cluster_topics, combined_ai_car_class_sim_params, combined_ai_car_class_sim_topic_model = cluster('combined_ai_car_class_sim_data2_semi_replace_no_tags_to_params.json', params, hf_model_name="brjezierski/sentence-embeddings-combined-ai_car-class-sim")
combined_consulting_model, combined_consulting_output_df, combined_consulting_cluster_topics, combined_consulting_params, combined_consulting_topic_model = cluster('combined_consulting_data2_semi_replace_no_tags_to_params.json', params, hf_model_name="brjezierski/combined-consulting")
classification_ai_car_data_aug_model, classification_ai_car_data_aug_output_df, classification_ai_car_data_aug_cluster_topics, classification_ai_car_data_aug_params, classification_ai_car_data_aug_topic_model = cluster('classification_ai_car_data_aug_data2_semi_replace_no_tags_to_params.json', params, hf_model_name="brjezierski/sentence-embeddings-classification-ai_car-data_aug")



Input file ../glanos-data/clustering/input/baseline_data2_semi_replace_no_tags_to_params.json
Supervised
Topic modelling method: BERTopic
Fitting the BERT model on 1278 snippets


Batches:   0%|          | 0/40 [00:00<?, ?it/s]

2023-10-26 19:21:45,668 - BERTopic - Transformed documents to Embeddings
2023-10-26 19:21:50,092 - BERTopic - Reduced dimensionality
2023-10-26 19:21:50,116 - BERTopic - Clustered reduced embeddings


BERT model labelling 3940 snippets
inference_snippets 3940


Batches:   0%|          | 0/124 [00:00<?, ?it/s]

2023-10-26 19:22:34,941 - BERTopic - Reduced dimensionality
2023-10-26 19:22:34,999 - BERTopic - Predicted clusters


Elapsed total time for topic modelling: 67.38714694976807 seconds
sbert results
Label count: 9
Loss: 0.4012690355329949
dataset_file_path ../glanos-data/clustering/big_consulting_export_replace_no_duplicates.tsv
Overlap count of length 1: 6
Overlap count of length 2: 4
Overlap count of length 3: 0
Overlap count of length 4 or more: 0
Overlap loss:  0.61
Diversity loss:  1.15
Number of clusters: 9
Median cluster size: 175.0 , standard deviation: 462.2328417583502
Median top company occurence: 35.81002331002331
Cost (outliers): 40.13%

Input file ../glanos-data/clustering/input/baai_baseline_data2_semi_replace_no_tags_to_params.json
Supervised
Topic modelling method: BERTopic
Fitting the BERT model on 1278 snippets


Batches:   0%|          | 0/40 [00:00<?, ?it/s]

2023-10-26 19:23:02,939 - BERTopic - Transformed documents to Embeddings
2023-10-26 19:23:05,843 - BERTopic - Reduced dimensionality
2023-10-26 19:23:05,859 - BERTopic - Clustered reduced embeddings


BERT model labelling 3940 snippets
inference_snippets 3940


Batches:   0%|          | 0/124 [00:00<?, ?it/s]

2023-10-26 19:24:07,887 - BERTopic - Reduced dimensionality
2023-10-26 19:24:07,944 - BERTopic - Predicted clusters


Elapsed total time for topic modelling: 92.07394003868103 seconds
sbert results
Label count: 10
Loss: 0.3522842639593909
dataset_file_path ../glanos-data/clustering/big_consulting_export_replace_no_duplicates.tsv
Overlap count of length 1: 6
Overlap count of length 2: 3
Overlap count of length 3: 0
Overlap count of length 4 or more: 1
Overlap loss:  1.25
Diversity loss:  1.15
Number of clusters: 10
Median cluster size: 177.0 , standard deviation: 406.4347914111457
Median top company occurence: 46.835443037974684
Cost (outliers): 35.23%

Input file ../glanos-data/clustering/input/similarity_data2_semi_replace_no_tags_to_params.json
Supervised
Topic modelling method: BERTopic
Fitting the BERT model on 1278 snippets


Batches:   0%|          | 0/40 [00:00<?, ?it/s]

2023-10-26 19:24:25,902 - BERTopic - Transformed documents to Embeddings
2023-10-26 19:24:28,672 - BERTopic - Reduced dimensionality
2023-10-26 19:24:28,691 - BERTopic - Clustered reduced embeddings


BERT model labelling 3940 snippets
inference_snippets 3940


Batches:   0%|          | 0/124 [00:00<?, ?it/s]

2023-10-26 19:25:23,605 - BERTopic - Reduced dimensionality
2023-10-26 19:25:23,669 - BERTopic - Predicted clusters


Elapsed total time for topic modelling: 75.07845997810364 seconds
sbert results
Label count: 11
Loss: 0.4873096446700508
dataset_file_path ../glanos-data/clustering/big_consulting_export_replace_no_duplicates.tsv
Overlap count of length 1: 9
Overlap count of length 2: 7
Overlap count of length 3: 0
Overlap count of length 4 or more: 0
Overlap loss:  0.84
Diversity loss:  1.18
Number of clusters: 11
Median cluster size: 188.5 , standard deviation: 496.83017442806573
Median top company occurence: 28.63467261904762
Cost (outliers): 48.73%

Input file ../glanos-data/clustering/input/best_similarity_data2_semi_replace_no_tags_to_params.json
Supervised
Topic modelling method: BERTopic
Fitting the BERT model on 1278 snippets


Batches:   0%|          | 0/40 [00:00<?, ?it/s]

2023-10-26 19:25:50,982 - BERTopic - Transformed documents to Embeddings
2023-10-26 19:25:53,829 - BERTopic - Reduced dimensionality
2023-10-26 19:25:53,846 - BERTopic - Clustered reduced embeddings


BERT model labelling 3940 snippets
inference_snippets 3940


Batches:   0%|          | 0/124 [00:00<?, ?it/s]

2023-10-26 19:26:57,437 - BERTopic - Reduced dimensionality
2023-10-26 19:26:57,490 - BERTopic - Predicted clusters


Elapsed total time for topic modelling: 92.65459609031677 seconds
sbert results
Label count: 8
Loss: 0.4289340101522843
dataset_file_path ../glanos-data/clustering/big_consulting_export_replace_no_duplicates.tsv
Overlap count of length 1: 6
Overlap count of length 2: 4
Overlap count of length 3: 0
Overlap count of length 4 or more: 0
Overlap loss:  0.69
Diversity loss:  1.14
Number of clusters: 8
Median cluster size: 352.0 , standard deviation: 470.36788859071146
Median top company occurence: 26.50887573964497
Cost (outliers): 42.89%

Input file ../glanos-data/clustering/input/similarity_ai_car_data2_semi_replace_no_tags_to_params.json
Supervised
Topic modelling method: BERTopic
Fitting the BERT model on 1278 snippets


Batches:   0%|          | 0/40 [00:00<?, ?it/s]

2023-10-26 19:27:15,097 - BERTopic - Transformed documents to Embeddings
2023-10-26 19:27:18,107 - BERTopic - Reduced dimensionality
2023-10-26 19:27:18,125 - BERTopic - Clustered reduced embeddings


BERT model labelling 3940 snippets
inference_snippets 3940


Batches:   0%|          | 0/124 [00:00<?, ?it/s]

2023-10-26 19:28:09,720 - BERTopic - Reduced dimensionality
2023-10-26 19:28:09,790 - BERTopic - Predicted clusters


Elapsed total time for topic modelling: 71.70980715751648 seconds
sbert results
Label count: 12
Loss: 0.4289340101522843
dataset_file_path ../glanos-data/clustering/big_consulting_export_replace_no_duplicates.tsv
Overlap count of length 1: 15
Overlap count of length 2: 4
Overlap count of length 3: 2
Overlap count of length 4 or more: 0
Overlap loss:  1.31
Diversity loss:  1.18
Number of clusters: 12
Median cluster size: 171.0 , standard deviation: 424.8307285602044
Median top company occurence: 54.45544554455446
Cost (outliers): 42.89%

Input file ../glanos-data/clustering/input/similarity_distill_consistency_data2_semi_replace_no_tags_to_params.json
Supervised
Topic modelling method: BERTopic
Fitting the BERT model on 1278 snippets


Batches:   0%|          | 0/40 [00:00<?, ?it/s]

2023-10-26 19:28:27,548 - BERTopic - Transformed documents to Embeddings
2023-10-26 19:28:30,107 - BERTopic - Reduced dimensionality
2023-10-26 19:28:30,124 - BERTopic - Clustered reduced embeddings


BERT model labelling 3940 snippets
inference_snippets 3940


Batches:   0%|          | 0/124 [00:00<?, ?it/s]

2023-10-26 19:29:15,126 - BERTopic - Reduced dimensionality
2023-10-26 19:29:15,189 - BERTopic - Predicted clusters


Elapsed total time for topic modelling: 64.81055498123169 seconds
sbert results
Label count: 17
Loss: 0.33375634517766495
dataset_file_path ../glanos-data/clustering/big_consulting_export_replace_no_duplicates.tsv
Overlap count of length 1: 12
Overlap count of length 2: 7
Overlap count of length 3: 1
Overlap count of length 4 or more: 1
Overlap loss:  1.29
Diversity loss:  1.15
Number of clusters: 17
Median cluster size: 108.0 , standard deviation: 333.8803659477929
Median top company occurence: 43.27380952380952
Cost (outliers): 33.38%

Input file ../glanos-data/clustering/input/similarity_ai_car_distill_consistency_data2_semi_replace_no_tags_to_params.json
Supervised
Topic modelling method: BERTopic
Fitting the BERT model on 1278 snippets


Batches:   0%|          | 0/40 [00:00<?, ?it/s]

2023-10-26 19:29:32,502 - BERTopic - Transformed documents to Embeddings
2023-10-26 19:29:34,840 - BERTopic - Reduced dimensionality
2023-10-26 19:29:34,855 - BERTopic - Clustered reduced embeddings


BERT model labelling 3940 snippets
inference_snippets 3940


Batches:   0%|          | 0/124 [00:00<?, ?it/s]

2023-10-26 19:30:13,596 - BERTopic - Reduced dimensionality
2023-10-26 19:30:13,649 - BERTopic - Predicted clusters


Elapsed total time for topic modelling: 57.93068313598633 seconds
sbert results
Label count: 9
Loss: 0.04568527918781726
dataset_file_path ../glanos-data/clustering/big_consulting_export_replace_no_duplicates.tsv
Overlap count of length 1: 6
Overlap count of length 2: 4
Overlap count of length 3: 4
Overlap count of length 4 or more: 0
Overlap loss:  2.39
Diversity loss:  1.27
Number of clusters: 9
Median cluster size: 210.0 , standard deviation: 617.1361276088121
Median top company occurence: 31.74119241192412
Cost (outliers): 4.57%

Input file ../glanos-data/clustering/input/classification_data2_semi_replace_no_tags_to_params.json
Supervised
Topic modelling method: BERTopic
Fitting the BERT model on 1278 snippets


Batches:   0%|          | 0/40 [00:00<?, ?it/s]

2023-10-26 19:30:31,978 - BERTopic - Transformed documents to Embeddings
2023-10-26 19:30:34,632 - BERTopic - Reduced dimensionality
2023-10-26 19:30:34,650 - BERTopic - Clustered reduced embeddings


BERT model labelling 3940 snippets
inference_snippets 3940


Batches:   0%|          | 0/124 [00:00<?, ?it/s]

2023-10-26 19:31:14,562 - BERTopic - Reduced dimensionality
2023-10-26 19:31:14,618 - BERTopic - Predicted clusters


Elapsed total time for topic modelling: 60.4031879901886 seconds
sbert results
Label count: 9
Loss: 0.27715736040609135
dataset_file_path ../glanos-data/clustering/big_consulting_export_replace_no_duplicates.tsv
Overlap count of length 1: 7
Overlap count of length 2: 1
Overlap count of length 3: 1
Overlap count of length 4 or more: 0
Overlap loss:  0.75
Diversity loss:  1.10
Number of clusters: 9
Median cluster size: 276.5 , standard deviation: 306.5426560855764
Median top company occurence: 32.85588021212436
Cost (outliers): 27.72%

Input file ../glanos-data/clustering/input/classification_ai_car_sbert_data2_semi_replace_no_tags_to_params.json
Supervised
Topic modelling method: BERTopic
Fitting the BERT model on 1278 snippets


Batches:   0%|          | 0/40 [00:00<?, ?it/s]

2023-10-26 19:31:31,702 - BERTopic - Transformed documents to Embeddings
2023-10-26 19:31:34,188 - BERTopic - Reduced dimensionality
2023-10-26 19:31:34,202 - BERTopic - Clustered reduced embeddings


BERT model labelling 3940 snippets
inference_snippets 3940


Batches:   0%|          | 0/124 [00:00<?, ?it/s]

2023-10-26 19:32:07,557 - BERTopic - Reduced dimensionality
2023-10-26 19:32:07,609 - BERTopic - Predicted clusters


Elapsed total time for topic modelling: 52.457539796829224 seconds
sbert results
Label count: 7
Loss: 0.011421319796954314
dataset_file_path ../glanos-data/clustering/big_consulting_export_replace_no_duplicates.tsv
Overlap count of length 1: 4
Overlap count of length 2: 1
Overlap count of length 3: 1
Overlap count of length 4 or more: 0
Overlap loss:  0.86
Diversity loss:  1.11
Number of clusters: 7
Median cluster size: 257.5 , standard deviation: 482.22738412495823
Median top company occurence: 32.43432452503249
Cost (outliers): 1.14%

Input file ../glanos-data/clustering/input/combined_ai_car_class_consulting_sim_data2_semi_replace_no_tags_to_params.json
Supervised
Topic modelling method: BERTopic
Fitting the BERT model on 1278 snippets


Batches:   0%|          | 0/40 [00:00<?, ?it/s]

2023-10-26 19:32:24,582 - BERTopic - Transformed documents to Embeddings
2023-10-26 19:32:27,496 - BERTopic - Reduced dimensionality
2023-10-26 19:32:27,513 - BERTopic - Clustered reduced embeddings


BERT model labelling 3940 snippets
inference_snippets 3940


Batches:   0%|          | 0/124 [00:00<?, ?it/s]

2023-10-26 19:33:07,396 - BERTopic - Reduced dimensionality
2023-10-26 19:33:07,453 - BERTopic - Predicted clusters


Elapsed total time for topic modelling: 59.30124807357788 seconds
sbert results
Label count: 15
Loss: 0.25862944162436546
dataset_file_path ../glanos-data/clustering/big_consulting_export_replace_no_duplicates.tsv
Overlap count of length 1: 7
Overlap count of length 2: 3
Overlap count of length 3: 1
Overlap count of length 4 or more: 0
Overlap loss:  0.58
Diversity loss:  1.10
Number of clusters: 15
Median cluster size: 117.0 , standard deviation: 292.41355560233524
Median top company occurence: 32.47126436781609
Cost (outliers): 25.86%

Input file ../glanos-data/clustering/input/combined_consulting_sim_ai_car_class_data2_semi_replace_no_tags_to_params.json
Supervised
Topic modelling method: BERTopic
Fitting the BERT model on 1278 snippets


Batches:   0%|          | 0/40 [00:00<?, ?it/s]

2023-10-26 19:33:24,419 - BERTopic - Transformed documents to Embeddings
2023-10-26 19:33:26,973 - BERTopic - Reduced dimensionality
2023-10-26 19:33:26,986 - BERTopic - Clustered reduced embeddings


BERT model labelling 3940 snippets
inference_snippets 3940


Batches:   0%|          | 0/124 [00:00<?, ?it/s]

2023-10-26 19:34:01,924 - BERTopic - Reduced dimensionality
2023-10-26 19:34:01,972 - BERTopic - Predicted clusters


Elapsed total time for topic modelling: 53.80828285217285 seconds
sbert results
Label count: 7
Loss: 0.01903553299492386
dataset_file_path ../glanos-data/clustering/big_consulting_export_replace_no_duplicates.tsv
Overlap count of length 1: 6
Overlap count of length 2: 1
Overlap count of length 3: 0
Overlap count of length 4 or more: 0
Overlap loss:  0.36
Diversity loss:  1.10
Number of clusters: 7
Median cluster size: 260.5 , standard deviation: 437.01201356484466
Median top company occurence: 28.633197213547923
Cost (outliers): 1.90%

Input file ../glanos-data/clustering/input/combined_consulting_sim_and_ai_car_class_data2_semi_replace_no_tags_to_params.json
Supervised
Topic modelling method: BERTopic
Fitting the BERT model on 1278 snippets


Batches:   0%|          | 0/40 [00:00<?, ?it/s]

2023-10-26 19:34:19,314 - BERTopic - Transformed documents to Embeddings
2023-10-26 19:34:21,728 - BERTopic - Reduced dimensionality
2023-10-26 19:34:21,741 - BERTopic - Clustered reduced embeddings


BERT model labelling 3940 snippets
inference_snippets 3940


Batches:   0%|          | 0/124 [00:00<?, ?it/s]

2023-10-26 19:34:53,217 - BERTopic - Reduced dimensionality
2023-10-26 19:34:53,267 - BERTopic - Predicted clusters


Elapsed total time for topic modelling: 50.71807384490967 seconds
sbert results
Label count: 8
Loss: 0.3725888324873096
dataset_file_path ../glanos-data/clustering/big_consulting_export_replace_no_duplicates.tsv
Overlap count of length 1: 3
Overlap count of length 2: 4
Overlap count of length 3: 0
Overlap count of length 4 or more: 0
Overlap loss:  0.59
Diversity loss:  1.12
Number of clusters: 8
Median cluster size: 347.0 , standard deviation: 381.3035471402389
Median top company occurence: 30.701754385964914
Cost (outliers): 37.26%

Input file ../glanos-data/clustering/input/combined_ai_car_sim_and_class_data2_semi_replace_no_tags_to_params.json
Supervised
Topic modelling method: BERTopic
Fitting the BERT model on 1278 snippets


Batches:   0%|          | 0/40 [00:00<?, ?it/s]

2023-10-26 19:35:11,203 - BERTopic - Transformed documents to Embeddings
2023-10-26 19:35:13,651 - BERTopic - Reduced dimensionality
2023-10-26 19:35:13,665 - BERTopic - Clustered reduced embeddings


BERT model labelling 3940 snippets
inference_snippets 3940


Batches:   0%|          | 0/124 [00:00<?, ?it/s]

2023-10-26 19:35:45,421 - BERTopic - Reduced dimensionality
2023-10-26 19:35:45,473 - BERTopic - Predicted clusters


Elapsed total time for topic modelling: 51.22251582145691 seconds
sbert results
Label count: 8
Loss: 0.4482233502538071
dataset_file_path ../glanos-data/clustering/big_consulting_export_replace_no_duplicates.tsv
Overlap count of length 1: 4
Overlap count of length 2: 4
Overlap count of length 3: 0
Overlap count of length 4 or more: 0
Overlap loss:  0.62
Diversity loss:  1.15
Number of clusters: 8
Median cluster size: 256.0 , standard deviation: 503.63253309835113
Median top company occurence: 31.874999999999996
Cost (outliers): 44.82%

Input file ../glanos-data/clustering/input/combined_ai_car_sim_class_data2_semi_replace_no_tags_to_params.json
Supervised
Topic modelling method: BERTopic
Fitting the BERT model on 1278 snippets


Batches:   0%|          | 0/40 [00:00<?, ?it/s]

2023-10-26 19:36:02,800 - BERTopic - Transformed documents to Embeddings
2023-10-26 19:36:05,509 - BERTopic - Reduced dimensionality
2023-10-26 19:36:05,524 - BERTopic - Clustered reduced embeddings


BERT model labelling 3940 snippets
inference_snippets 3940


Batches:   0%|          | 0/124 [00:00<?, ?it/s]

2023-10-26 19:36:46,816 - BERTopic - Reduced dimensionality
2023-10-26 19:36:46,866 - BERTopic - Predicted clusters


Elapsed total time for topic modelling: 60.810145139694214 seconds
sbert results
Label count: 6
Loss: 0.014213197969543147
dataset_file_path ../glanos-data/clustering/big_consulting_export_replace_no_duplicates.tsv
Overlap count of length 1: 2
Overlap count of length 2: 0
Overlap count of length 3: 1
Overlap count of length 4 or more: 0
Overlap loss:  0.75
Diversity loss:  1.08
Number of clusters: 6
Median cluster size: 349.0 , standard deviation: 461.90426607729887
Median top company occurence: 28.08022922636103
Cost (outliers): 1.42%

Input file ../glanos-data/clustering/input/combined1_data2_semi_replace_no_tags_to_params.json
Supervised
Topic modelling method: BERTopic
Fitting the BERT model on 1278 snippets


Batches:   0%|          | 0/40 [00:00<?, ?it/s]

2023-10-26 19:37:03,759 - BERTopic - Transformed documents to Embeddings
2023-10-26 19:37:06,470 - BERTopic - Reduced dimensionality
2023-10-26 19:37:06,487 - BERTopic - Clustered reduced embeddings


BERT model labelling 3940 snippets
inference_snippets 3940


Batches:   0%|          | 0/124 [00:00<?, ?it/s]

2023-10-26 19:37:58,634 - BERTopic - Reduced dimensionality
2023-10-26 19:37:58,691 - BERTopic - Predicted clusters


Elapsed total time for topic modelling: 71.2484118938446 seconds
sbert results
Label count: 8
Loss: 0.014467005076142132
dataset_file_path ../glanos-data/clustering/big_consulting_export_replace_no_duplicates.tsv
Overlap count of length 1: 7
Overlap count of length 2: 2
Overlap count of length 3: 1
Overlap count of length 4 or more: 0
Overlap loss:  0.97
Diversity loss:  1.14
Number of clusters: 8
Median cluster size: 159.0 , standard deviation: 450.22161895319897
Median top company occurence: 27.77777777777778
Cost (outliers): 1.45%

Input file ../glanos-data/clustering/input/combined_ai_car_class_sim_data2_semi_replace_no_tags_to_params.json
Supervised
Topic modelling method: BERTopic
Fitting the BERT model on 1278 snippets


Batches:   0%|          | 0/40 [00:00<?, ?it/s]

2023-10-26 19:38:15,927 - BERTopic - Transformed documents to Embeddings
2023-10-26 19:38:18,640 - BERTopic - Reduced dimensionality
2023-10-26 19:38:18,659 - BERTopic - Clustered reduced embeddings


BERT model labelling 3940 snippets
inference_snippets 3940


Batches:   0%|          | 0/124 [00:00<?, ?it/s]

2023-10-26 19:39:07,063 - BERTopic - Reduced dimensionality
2023-10-26 19:39:07,124 - BERTopic - Predicted clusters


Elapsed total time for topic modelling: 67.91767597198486 seconds
sbert results
Label count: 15
Loss: 0.40939086294416244
dataset_file_path ../glanos-data/clustering/big_consulting_export_replace_no_duplicates.tsv
Overlap count of length 1: 12
Overlap count of length 2: 8
Overlap count of length 3: 1
Overlap count of length 4 or more: 0
Overlap loss:  1.00
Diversity loss:  1.17
Number of clusters: 15
Median cluster size: 97.5 , standard deviation: 388.9142739730698
Median top company occurence: 37.643493761140824
Cost (outliers): 40.94%

Input file ../glanos-data/clustering/input/combined_consulting_data2_semi_replace_no_tags_to_params.json
Supervised
Topic modelling method: BERTopic
Fitting the BERT model on 1278 snippets


Batches:   0%|          | 0/40 [00:00<?, ?it/s]

2023-10-26 19:39:23,976 - BERTopic - Transformed documents to Embeddings
2023-10-26 19:39:26,486 - BERTopic - Reduced dimensionality
2023-10-26 19:39:26,499 - BERTopic - Clustered reduced embeddings


BERT model labelling 3940 snippets
inference_snippets 3940


Batches:   0%|          | 0/124 [00:00<?, ?it/s]

2023-10-26 19:39:58,335 - BERTopic - Reduced dimensionality
2023-10-26 19:39:58,391 - BERTopic - Predicted clusters


Elapsed total time for topic modelling: 50.70572519302368 seconds
sbert results
Label count: 14
Loss: 0.3426395939086294
dataset_file_path ../glanos-data/clustering/big_consulting_export_replace_no_duplicates.tsv
Overlap count of length 1: 9
Overlap count of length 2: 5
Overlap count of length 3: 3
Overlap count of length 4 or more: 0
Overlap loss:  1.38
Diversity loss:  1.19
Number of clusters: 14
Median cluster size: 140.0 , standard deviation: 358.67230478839895
Median top company occurence: 40.0
Cost (outliers): 34.26%

Input file ../glanos-data/clustering/input/classification_ai_car_data_aug_data2_semi_replace_no_tags_to_params.json
Supervised
Fitting the BERT model on 1278 snippets                                        
  0%|                                   | 0/15 [00:11<?, ?trial/s, best loss=?]

Batches:   0%|          | 0/40 [00:00<?, ?it/s]

2023-10-26 19:40:15,768 - BERTopic - Transformed documents to Embeddings
2023-10-26 19:40:18,931 - BERTopic - Reduced dimensionality
2023-10-26 19:40:18,944 - BERTopic - Clustered reduced embeddings


BERT model labelling 3940 snippets                                             
inference_snippets                                                             
3940                                                                           
  0%|                                   | 0/15 [00:20<?, ?trial/s, best loss=?]

Batches:   0%|          | 0/124 [00:00<?, ?it/s]

2023-10-26 19:40:54,989 - BERTopic - Reduced dimensionality
2023-10-26 19:40:55,035 - BERTopic - Predicted clusters
job exception: 'cluster_id'



  0%|                                   | 0/15 [00:56<?, ?trial/s, best loss=?]


KeyError: 'cluster_id'

In [12]:
# TODO run
params['BAYESIAN_SEARCH_METRIC'] = 'topic_overlap+bert'



Input file ../glanos-data/clustering/input/classification_ai_car_data_aug_data2_semi_replace_no_tags_to_params.json
Supervised
Fitting the BERT model on 1278 snippets                                        
  0%|                                   | 0/15 [00:11<?, ?trial/s, best loss=?]

Batches:   0%|          | 0/40 [00:00<?, ?it/s]

2023-10-26 21:04:35,163 - BERTopic - Transformed documents to Embeddings
2023-10-26 21:04:39,002 - BERTopic - Reduced dimensionality
2023-10-26 21:04:39,021 - BERTopic - Clustered reduced embeddings


BERT model labelling 3940 snippets                                             
inference_snippets                                                             
3940                                                                           
  0%|                                   | 0/15 [00:22<?, ?trial/s, best loss=?]

Batches:   0%|          | 0/124 [00:00<?, ?it/s]

2023-10-26 21:05:18,426 - BERTopic - Reduced dimensionality
2023-10-26 21:05:18,477 - BERTopic - Predicted clusters


Loss: 1.3333333333333335 (penalty: 0.2222222222222222), Label_count: 9, Metric: topic_overlap+bert
Fitting the BERT model on 1278 snippets                                        
  7%|▌        | 1/15 [01:12<14:08, 60.60s/trial, best loss: 1.3333333333333335]

Batches:   0%|          | 0/40 [00:00<?, ?it/s]

2023-10-26 21:05:35,818 - BERTopic - Transformed documents to Embeddings
2023-10-26 21:05:38,319 - BERTopic - Reduced dimensionality
2023-10-26 21:05:38,334 - BERTopic - Clustered reduced embeddings


BERT model labelling 3940 snippets                                             
inference_snippets                                                             
3940                                                                           
  7%|▌        | 1/15 [01:21<14:08, 60.60s/trial, best loss: 1.3333333333333335]

Batches:   0%|          | 0/124 [00:00<?, ?it/s]

2023-10-26 21:06:17,162 - BERTopic - Reduced dimensionality
2023-10-26 21:06:17,214 - BERTopic - Predicted clusters


Loss: 1.4166666666666665 (penalty: 0.2222222222222222), Label_count: 9, Metric: topic_overlap+bert
Fitting the BERT model on 1278 snippets                                        
 13%|█▏       | 2/15 [02:10<12:53, 59.50s/trial, best loss: 1.3333333333333335]

Batches:   0%|          | 0/40 [00:00<?, ?it/s]

2023-10-26 21:06:33,959 - BERTopic - Transformed documents to Embeddings
2023-10-26 21:06:36,661 - BERTopic - Reduced dimensionality
2023-10-26 21:06:36,677 - BERTopic - Clustered reduced embeddings


BERT model labelling 3940 snippets                                             
inference_snippets                                                             
3940                                                                           
 13%|█▏       | 2/15 [02:19<12:53, 59.50s/trial, best loss: 1.3333333333333335]

Batches:   0%|          | 0/124 [00:00<?, ?it/s]

2023-10-26 21:07:29,901 - BERTopic - Reduced dimensionality
2023-10-26 21:07:29,952 - BERTopic - Predicted clusters


Loss: 0.7 (penalty: 0.4), Label_count: 5, Metric: topic_overlap+bert           
Fitting the BERT model on 1278 snippets                                        
 20%|████▊                   | 3/15 [03:23<13:06, 65.54s/trial, best loss: 0.7]

Batches:   0%|          | 0/40 [00:00<?, ?it/s]

2023-10-26 21:07:47,098 - BERTopic - Transformed documents to Embeddings
2023-10-26 21:07:49,764 - BERTopic - Reduced dimensionality
2023-10-26 21:07:49,780 - BERTopic - Clustered reduced embeddings


BERT model labelling 3940 snippets                                             
inference_snippets                                                             
3940                                                                           
 20%|████▊                   | 3/15 [03:32<13:06, 65.54s/trial, best loss: 0.7]

Batches:   0%|          | 0/124 [00:00<?, ?it/s]

2023-10-26 21:08:42,028 - BERTopic - Reduced dimensionality
2023-10-26 21:08:42,079 - BERTopic - Predicted clusters


Loss: 0.7 (penalty: 0.4), Label_count: 5, Metric: topic_overlap+bert           
Fitting the BERT model on 1278 snippets                                        
 27%|██████▍                 | 4/15 [04:35<12:29, 68.14s/trial, best loss: 0.7]

Batches:   0%|          | 0/40 [00:00<?, ?it/s]

2023-10-26 21:08:58,858 - BERTopic - Transformed documents to Embeddings
2023-10-26 21:09:02,389 - BERTopic - Reduced dimensionality
2023-10-26 21:09:02,402 - BERTopic - Clustered reduced embeddings


BERT model labelling 3940 snippets                                             
inference_snippets                                                             
3940                                                                           
 27%|██████▍                 | 4/15 [04:45<12:29, 68.14s/trial, best loss: 0.7]

Batches:   0%|          | 0/124 [00:00<?, ?it/s]

2023-10-26 21:09:33,653 - BERTopic - Reduced dimensionality
2023-10-26 21:09:33,699 - BERTopic - Predicted clusters


Loss: 0.6000000000000001 (penalty: 0.4), Label_count: 5, Metric: topic_overlap+bert
Fitting the BERT model on 1278 snippets                                        
 33%|███      | 5/15 [05:27<10:21, 62.18s/trial, best loss: 0.6000000000000001]

Batches:   0%|          | 0/40 [00:00<?, ?it/s]

2023-10-26 21:09:50,604 - BERTopic - Transformed documents to Embeddings
2023-10-26 21:09:52,916 - BERTopic - Reduced dimensionality
2023-10-26 21:09:52,929 - BERTopic - Clustered reduced embeddings


BERT model labelling 3940 snippets                                             
inference_snippets                                                             
3940                                                                           
 33%|███      | 5/15 [05:35<10:21, 62.18s/trial, best loss: 0.6000000000000001]

Batches:   0%|          | 0/124 [00:00<?, ?it/s]

2023-10-26 21:10:22,464 - BERTopic - Reduced dimensionality
2023-10-26 21:10:22,510 - BERTopic - Predicted clusters


Loss: 0.5 (penalty: 0.3333333333333333), Label_count: 6, Metric: topic_overlap+bert
Fitting the BERT model on 1278 snippets                                        
 40%|█████████▌              | 6/15 [06:16<08:38, 57.64s/trial, best loss: 0.5]

Batches:   0%|          | 0/40 [00:00<?, ?it/s]

2023-10-26 21:10:39,175 - BERTopic - Transformed documents to Embeddings
2023-10-26 21:10:41,847 - BERTopic - Reduced dimensionality
2023-10-26 21:10:41,862 - BERTopic - Clustered reduced embeddings


BERT model labelling 3940 snippets                                             
inference_snippets                                                             
3940                                                                           
 40%|█████████▌              | 6/15 [06:24<08:38, 57.64s/trial, best loss: 0.5]

Batches:   0%|          | 0/124 [00:00<?, ?it/s]

2023-10-26 21:11:33,156 - BERTopic - Reduced dimensionality
2023-10-26 21:11:33,211 - BERTopic - Predicted clusters


Loss: 0.6000000000000001 (penalty: 0.4), Label_count: 5, Metric: topic_overlap+bert
Fitting the BERT model on 1278 snippets                                        
 47%|███████████▏            | 7/15 [07:27<08:15, 61.91s/trial, best loss: 0.5]

Batches:   0%|          | 0/40 [00:00<?, ?it/s]

2023-10-26 21:11:50,603 - BERTopic - Transformed documents to Embeddings
2023-10-26 21:11:53,297 - BERTopic - Reduced dimensionality
2023-10-26 21:11:53,313 - BERTopic - Clustered reduced embeddings


BERT model labelling 3940 snippets                                             
inference_snippets                                                             
3940                                                                           
 47%|███████████▏            | 7/15 [07:36<08:15, 61.91s/trial, best loss: 0.5]

Batches:   0%|          | 0/124 [00:00<?, ?it/s]

2023-10-26 21:12:47,705 - BERTopic - Reduced dimensionality
2023-10-26 21:12:47,759 - BERTopic - Predicted clusters


Loss: 1.25 (penalty: 0.2222222222222222), Label_count: 9, Metric: topic_overlap+bert
Fitting the BERT model on 1278 snippets                                        
 53%|████████████▊           | 8/15 [08:41<07:41, 65.94s/trial, best loss: 0.5]

Batches:   0%|          | 0/40 [00:00<?, ?it/s]

2023-10-26 21:13:05,198 - BERTopic - Transformed documents to Embeddings
2023-10-26 21:13:09,034 - BERTopic - Reduced dimensionality
2023-10-26 21:13:09,050 - BERTopic - Clustered reduced embeddings


BERT model labelling 3940 snippets                                             
inference_snippets                                                             
3940                                                                           
 53%|████████████▊           | 8/15 [08:52<07:41, 65.94s/trial, best loss: 0.5]

Batches:   0%|          | 0/124 [00:00<?, ?it/s]

2023-10-26 21:13:52,027 - BERTopic - Reduced dimensionality
2023-10-26 21:13:52,079 - BERTopic - Predicted clusters


Loss: 0.5714285714285714 (penalty: 0.2857142857142857), Label_count: 7, Metric: topic_overlap+bert
Fitting the BERT model on 1278 snippets                                        
 60%|██████████████▍         | 9/15 [09:45<06:32, 65.43s/trial, best loss: 0.5]

Batches:   0%|          | 0/40 [00:00<?, ?it/s]

2023-10-26 21:14:09,036 - BERTopic - Transformed documents to Embeddings
2023-10-26 21:14:11,556 - BERTopic - Reduced dimensionality
2023-10-26 21:14:11,571 - BERTopic - Clustered reduced embeddings


BERT model labelling 3940 snippets                                             
inference_snippets                                                             
3940                                                                           
 60%|██████████████▍         | 9/15 [09:54<06:32, 65.43s/trial, best loss: 0.5]

Batches:   0%|          | 0/124 [00:00<?, ?it/s]

2023-10-26 21:14:52,631 - BERTopic - Reduced dimensionality
2023-10-26 21:14:52,679 - BERTopic - Predicted clusters


Loss: 1.7142857142857144 (penalty: 0.2857142857142857), Label_count: 7, Metric: topic_overlap+bert
Fitting the BERT model on 1278 snippets                                        
 67%|███████████████▎       | 10/15 [10:46<05:19, 63.94s/trial, best loss: 0.5]

Batches:   0%|          | 0/40 [00:00<?, ?it/s]

2023-10-26 21:15:09,103 - BERTopic - Transformed documents to Embeddings
2023-10-26 21:15:11,583 - BERTopic - Reduced dimensionality
2023-10-26 21:15:11,598 - BERTopic - Clustered reduced embeddings


BERT model labelling 3940 snippets                                             
inference_snippets                                                             
3940                                                                           
 67%|███████████████▎       | 10/15 [10:54<05:19, 63.94s/trial, best loss: 0.5]

Batches:   0%|          | 0/124 [00:00<?, ?it/s]

2023-10-26 21:15:50,659 - BERTopic - Reduced dimensionality
2023-10-26 21:15:50,712 - BERTopic - Predicted clusters


Loss: 0.53125 (penalty: 0.25), Label_count: 8, Metric: topic_overlap+bert      
Fitting the BERT model on 1278 snippets                                        
 73%|████████████████▊      | 11/15 [11:44<04:08, 62.13s/trial, best loss: 0.5]

Batches:   0%|          | 0/40 [00:00<?, ?it/s]

2023-10-26 21:16:07,230 - BERTopic - Transformed documents to Embeddings
2023-10-26 21:16:09,832 - BERTopic - Reduced dimensionality
2023-10-26 21:16:09,846 - BERTopic - Clustered reduced embeddings


BERT model labelling 3940 snippets                                             
inference_snippets                                                             
3940                                                                           
 73%|████████████████▊      | 11/15 [11:52<04:08, 62.13s/trial, best loss: 0.5]

Batches:   0%|          | 0/124 [00:00<?, ?it/s]

2023-10-26 21:16:57,963 - BERTopic - Reduced dimensionality
2023-10-26 21:16:58,094 - BERTopic - Predicted clusters


Loss: 0.7 (penalty: 0.4), Label_count: 5, Metric: topic_overlap+bert           
Fitting the BERT model on 1278 snippets                                        
 80%|██████████████████▍    | 12/15 [12:53<03:11, 63.73s/trial, best loss: 0.5]

Batches:   0%|          | 0/40 [00:00<?, ?it/s]

2023-10-26 21:17:16,701 - BERTopic - Transformed documents to Embeddings
2023-10-26 21:17:20,526 - BERTopic - Reduced dimensionality
2023-10-26 21:17:20,541 - BERTopic - Clustered reduced embeddings


BERT model labelling 3940 snippets                                             
inference_snippets                                                             
3940                                                                           
 80%|██████████████████▍    | 12/15 [13:03<03:11, 63.73s/trial, best loss: 0.5]

Batches:   0%|          | 0/124 [00:00<?, ?it/s]

2023-10-26 21:18:04,924 - BERTopic - Reduced dimensionality
2023-10-26 21:18:04,972 - BERTopic - Predicted clusters


Loss: 0.7 (penalty: 0.4), Label_count: 5, Metric: topic_overlap+bert           
Fitting the BERT model on 1278 snippets                                        
 87%|███████████████████▉   | 13/15 [13:58<02:09, 64.68s/trial, best loss: 0.5]

Batches:   0%|          | 0/40 [00:00<?, ?it/s]

2023-10-26 21:18:21,863 - BERTopic - Transformed documents to Embeddings
2023-10-26 21:18:24,374 - BERTopic - Reduced dimensionality
2023-10-26 21:18:24,388 - BERTopic - Clustered reduced embeddings


BERT model labelling 3940 snippets                                             
inference_snippets                                                             
3940                                                                           
 87%|███████████████████▉   | 13/15 [14:07<02:09, 64.68s/trial, best loss: 0.5]

Batches:   0%|          | 0/124 [00:00<?, ?it/s]

2023-10-26 21:19:07,692 - BERTopic - Reduced dimensionality
2023-10-26 21:19:07,779 - BERTopic - Predicted clusters


Loss: 1.6428571428571428 (penalty: 0.2857142857142857), Label_count: 7, Metric: topic_overlap+bert
Fitting the BERT model on 1278 snippets                                        
 93%|█████████████████████▍ | 14/15 [15:01<01:04, 64.11s/trial, best loss: 0.5]

Batches:   0%|          | 0/40 [00:00<?, ?it/s]

2023-10-26 21:19:25,293 - BERTopic - Transformed documents to Embeddings
2023-10-26 21:19:27,779 - BERTopic - Reduced dimensionality
2023-10-26 21:19:27,794 - BERTopic - Clustered reduced embeddings


BERT model labelling 3940 snippets                                             
inference_snippets                                                             
3940                                                                           
 93%|█████████████████████▍ | 14/15 [15:10<01:04, 64.11s/trial, best loss: 0.5]

Batches:   0%|          | 0/124 [00:00<?, ?it/s]

2023-10-26 21:20:03,605 - BERTopic - Reduced dimensionality
2023-10-26 21:20:03,652 - BERTopic - Predicted clusters


Loss: 1.75 (penalty: 0.2857142857142857), Label_count: 7, Metric: topic_overlap+bert
100%|███████████████████████| 15/15 [15:45<00:00, 63.05s/trial, best loss: 0.5]
best:
{'min_cluster_size': 33, 'min_samples': 2, 'n_components': 4, 'n_neighbors': 64, 'random_state': 42}
label count: 6

Topic modelling method: BERTopic
Fitting the BERT model on 1278 snippets


Batches:   0%|          | 0/40 [00:00<?, ?it/s]

2023-10-26 21:20:41,219 - BERTopic - Transformed documents to Embeddings
2023-10-26 21:20:44,623 - BERTopic - Reduced dimensionality
2023-10-26 21:20:44,636 - BERTopic - Clustered reduced embeddings


BERT model labelling 3940 snippets
inference_snippets 3940


Batches:   0%|          | 0/124 [00:00<?, ?it/s]

2023-10-26 21:21:14,640 - BERTopic - Reduced dimensionality
2023-10-26 21:21:14,687 - BERTopic - Predicted clusters


Elapsed total time for topic modelling: 50.21725106239319 seconds
sbert results
Label count: 6
Loss: 0.003045685279187817
dataset_file_path ../glanos-data/clustering/big_consulting_export_replace_no_duplicates.tsv
Overlap count of length 1: 0
Overlap count of length 2: 1
Overlap count of length 3: 0
Overlap count of length 4 or more: 0
Overlap loss:  0.17
Diversity loss:  1.03
Number of clusters: 6
Median cluster size: 586.0 , standard deviation: 554.6841944345949
Median top company occurence: 33.666666666666664
Cost (outliers): 0.30%



In [None]:
data2_input_file = '../glanos-data/clustering/big_consulting_export_replace_no_duplicates.tsv'
print_results(input_file=data2_input_file)


# Experiment 3

In [23]:
# on 'snippet' column
params['BAYESIAN_SEARCH_METRIC'] = 'topic_diversity+bert'
params['MAX_EVALS'] = 15
params["SHOW"] = False

# sbert_model, sbert_output_df, sbert_cluster_topics, sbert_params, sbert_topic_model = cluster('baseline_data2_semi_td_temp_params.json', params, hf_model_name='sentence-transformers/all-MiniLM-L12-v2')
# baai_model, baai_output_df, baai_cluster_topics, baai_params, baai_topic_model = cluster('baai_baseline_data2_semi_td_params.json', params, hf_model_name='BAAI/bge-base-en-v1.5')

# similarity_sbert_model, similarity_sbert_output_df, similarity_sbert_cluster_topics, similarity_sbert_params, similarity_sbert_topic_model = cluster('similarity_data2_semi_td_params.json', params, hf_model_name="brjezierski/sentence-embeddings-similarity")
# similarity_baai_model, similarity_baai_output_df, similarity_baai_cluster_topics, similarity_baai_params, similarity_baai_topic_model = cluster('best_similarity_data2_semi_td_params.json', params, hf_model_name="brjezierski/sentence-embeddings-similarity-best-baai")
# similarity_ai_car_sbert_model, similarity_ai_car_sbert_output_df, similarity_ai_car_sbert_cluster_topics, similarity_ai_car_sbert_params, similarity_ai_car_sbert_topic_model = cluster('similarity_ai_car_data2_semi_td_params.json', params, hf_model_name='brjezierski/sentence-embeddings-similarity-ai-car')
# similarity_distill_consistency_model, similarity_distill_consistency_output_df, similarity_distill_consistency_cluster_topics, similarity_distill_consistency_params, similarity_distill_consistency_topic_model = cluster('similarity_distill_consistency_data2_semi_td_params.json', params, hf_model_name='brjezierski/sentence-embeddings-similarity-distill-consistency')
# similarity_ai_car_distill_consistency_model, similarity_ai_car_distill_consistency_output_df, similarity_ai_car_distill_consistency_cluster_topics, similarity_ai_car_distill_consistency_params, similarity_ai_car_distill_consistency_topic_model = cluster('similarity_ai_car_distill_consistency_data2_semi_td_params.json', params, hf_model_name='brjezierski/sentence-embeddings-similarity-ai_car-distill-consistency')

# classification_model, classification_output_df, classification_cluster_topics, classification_params, classification_topic_model = cluster('classification_data2_semi_td_params.json', params, hf_model_name="brjezierski/sentence-embeddings-classification")
# classification_ai_car_sbert_model, classification_ai_car_sbert_output_df, classification_ai_car_sbert_cluster_topics, classification_ai_car_sbert_params, classification_ai_car_sbert_topic_model = cluster('classification_ai_car_sbert_data2_semi_td_params.json', params, hf_model_name="brjezierski/sentence-embeddings-classification-ai_car-sbert")
# combined_ai_car_class_consulting_sim_model, combined_ai_car_class_consulting_sim_output_df, combined_ai_car_class_consulting_sim_cluster_topics, combined_ai_car_class_consulting_sim_params, combined_ai_car_class_consulting_sim_topic_model = cluster('combined_ai_car_class_consulting_sim_data2_semi_td_params.json', params, hf_model_name="brjezierski/sentence-embeddings-combined-ai_car-class-consulting-sim")
# combined_consulting_sim_ai_car_class_model, combined_consulting_sim_ai_car_class_output_df, combined_consulting_sim_ai_car_class_cluster_topics, combined_consulting_sim_ai_car_class_params, combined_consulting_sim_ai_car_class_topic_model = cluster('combined_consulting_sim_ai_car_class_data2_semi_td_params.json', params, hf_model_name="brjezierski/sentence-embeddings-combined-consulting-sim-ai_car-class")
# combined_consulting_sim_and_ai_car_class_model, combined_consulting_sim_and_ai_car_class_output_df, combined_consulting_sim_and_ai_car_class_cluster_topics, combined_consulting_sim_and_ai_car_class_params, combined_consulting_sim_and_ai_car_class_topic_model = cluster('combined_consulting_sim_and_ai_car_class_data2_semi_td_params.json', params, hf_model_name="brjezierski/sentence-embeddings-combined-consulting-sim-and-ai_car-class")
# combined_ai_car_sim_and_class_model, combined_ai_car_sim_and_class_output_df, combined_ai_car_sim_and_class_cluster_topics, combined_ai_car_sim_and_class_params, combined_ai_car_sim_and_class_topic_model = cluster('combined_ai_car_sim_and_class_data2_semi_td_params.json', params, hf_model_name="brjezierski/sentence-embeddings-combined-ai_car-sim-and-class")
# combined_ai_car_sim_class_model, combined_ai_car_sim_class_output_df, combined_ai_car_sim_class_cluster_topics, combined_ai_car_sim_class_params, combined_ai_car_sim_class_topic_model = cluster('combined_ai_car_sim_class_data2_semi_td_params.json', params, hf_model_name="brjezierski/sentence-embeddings-combined-ai_car-sim-class")
# combined_ai_car_class_sim_model, combined_ai_car_class_sim_output_df, combined_ai_car_class_sim_cluster_topics, combined_ai_car_class_sim_params, combined_ai_car_class_sim_topic_model = cluster('combined_ai_car_class_sim_data2_semi_td_params.json', params, hf_model_name="brjezierski/sentence-embeddings-combined-ai_car-class-sim")
# combined_consulting_model, combined_consulting_output_df, combined_consulting_cluster_topics, combined_consulting_params, combined_consulting_topic_model = cluster('combined_consulting_data2_semi_td_params.json', params, hf_model_name="brjezierski/combined-consulting")

ex3_classification_ai_car_data_aug_model, ex3_classification_ai_car_data_aug_output_df, ex3_classification_ai_car_data_aug_cluster_topics, ex3_classification_ai_car_data_aug_params, ex3_classification_ai_car_data_aug_topic_model = cluster('classification_ai_car_data_aug_data2_semi_td_params.json', params, hf_model_name="brjezierski/sentence-embeddings-classification-ai_car-data_aug")


Input file ../glanos-data/clustering/input/classification_ai_car_data_aug_data2_semi_td_params.json
Supervised
Topic modelling method: BERTopic
Fitting the BERT model on 1278 snippets


Batches:   0%|          | 0/40 [00:00<?, ?it/s]

2023-10-27 18:24:28,329 - BERTopic - Transformed documents to Embeddings
2023-10-27 18:24:30,865 - BERTopic - Reduced dimensionality
2023-10-27 18:24:30,881 - BERTopic - Clustered reduced embeddings


BERT model labelling 3940 snippets
inference_snippets 3940


Batches:   0%|          | 0/124 [00:00<?, ?it/s]

2023-10-27 18:25:17,313 - BERTopic - Reduced dimensionality
2023-10-27 18:25:17,366 - BERTopic - Predicted clusters


Elapsed total time for topic modelling: 67.36825299263 seconds
sbert results
Label count: 7
Loss: 0.006852791878172589
dataset_file_path ../glanos-data/clustering/big_consulting_export_data2_no_duplicates.tsv
Overlap count of length 1: 5
Overlap count of length 2: 2
Overlap count of length 3: 0
Overlap count of length 4 or more: 0
Overlap loss:  0.46
Diversity loss:  1.13
Number of clusters: 7
Avg cluster size: 544.5714285714286
Median cluster size: 169.0 , standard deviation: 706.8539669636473
Median top company occurence: 28.50678733031674
Cost (outliers): 0.69%



In [22]:
classification_ai_car_data_aug_cluster_topics

[['ibm',
  'ai',
  'consulting',
  'technology',
  'infosys',
  'services',
  'announced',
  'cloud',
  'wipro',
  'business'],
 ['mr',
  'ernst',
  'ernst young',
  'young',
  'prior',
  'partner',
  'llp',
  'years',
  'young llp',
  'joining'],
 ['pwc',
  'contract',
  'tax',
  'million',
  'secured',
  'contracts',
  'federal',
  'based',
  'secures',
  'department'],
 ['accenture',
  'domus',
  'sustainability',
  'green domus',
  'acquisition',
  'green',
  'acquire',
  'based',
  'digital',
  'announced'],
 ['fti',
  'appointed',
  'managing director',
  'director',
  'managing',
  'fti consulting',
  'tax',
  'jobs',
  'pwc',
  'joins'],
 ['patent',
  'application',
  'filed',
  'tata consultancy',
  'tata',
  'consultancy services',
  'consultancy',
  'services',
  'patents',
  'method'],
 ['microsoft',
  'salesforce',
  'leave',
  'employees',
  'ibm',
  'jobs',
  '000',
  'pwc',
  'reportedly',
  'layoffs']]

In [None]:
data2_input_file = '../glanos-data/clustering/big_consulting_export_replace_no_duplicates.tsv'
print_results(input_file=data2_input_file)

# Experiment 4

In [15]:
# on replace_no_tags dataset
# overlap loss

data2_input_file = '../glanos-data/clustering/big_consulting_export_replace.tsv'
params['BAYESIAN_SEARCH_METRIC'] = 'topic_overlap+chat_intents'
params['MAX_EVALS'] = 15

sbert_model, sbert_output_df, sbert_cluster_topics, sbert_params, sbert_topic_model = cluster('baseline_data2_semi_replace_no_tags_to_ci_params.json', params, hf_model_name='sentence-transformers/all-MiniLM-L12-v2')
baai_model, baai_output_df, baai_cluster_topics, baai_params, baai_topic_model = cluster('baai_baseline_data2_semi_replace_no_tags_to_ci_params.json', params, hf_model_name='BAAI/bge-base-en-v1.5')

similarity_sbert_model, similarity_sbert_output_df, similarity_sbert_cluster_topics, similarity_sbert_params, similarity_sbert_topic_model = cluster('similarity_data2_semi_replace_no_tags_to_ci_params.json', params, hf_model_name="brjezierski/sentence-embeddings-similarity")
similarity_baai_model, similarity_baai_output_df, similarity_baai_cluster_topics, similarity_baai_params, similarity_baai_topic_model = cluster('best_similarity_data2_semi_replace_no_tags_to_ci_params.json', params, hf_model_name="brjezierski/sentence-embeddings-similarity-best-baai")
similarity_ai_car_sbert_model, similarity_ai_car_sbert_output_df, similarity_ai_car_sbert_cluster_topics, similarity_ai_car_sbert_params, similarity_ai_car_sbert_topic_model = cluster('similarity_ai_car_data2_semi_replace_no_tags_to_ci_params.json', params, hf_model_name='brjezierski/sentence-embeddings-similarity-ai-car')
similarity_distill_consistency_model, similarity_distill_consistency_output_df, similarity_distill_consistency_cluster_topics, similarity_distill_consistency_params, similarity_distill_consistency_topic_model = cluster('similarity_distill_consistency_data2_semi_replace_no_tags_to_ci_params.json', params, hf_model_name='brjezierski/sentence-embeddings-similarity-distill-consistency')
similarity_ai_car_distill_consistency_model, similarity_ai_car_distill_consistency_output_df, similarity_ai_car_distill_consistency_cluster_topics, similarity_ai_car_distill_consistency_params, similarity_ai_car_distill_consistency_topic_model = cluster('similarity_ai_car_distill_consistency_data2_semi_replace_no_tags_to_ci_params.json', params, hf_model_name='brjezierski/sentence-embeddings-similarity-ai_car-distill-consistency')


classification_model, classification_output_df, classification_cluster_topics, classification_params, classification_topic_model = cluster('classification_data2_semi_replace_no_tags_to_ci_params.json', params, hf_model_name="brjezierski/sentence-embeddings-classification")
classification_ai_car_sbert_model, classification_ai_car_sbert_output_df, classification_ai_car_sbert_cluster_topics, classification_ai_car_sbert_params, classification_ai_car_sbert_topic_model = cluster('classification_ai_car_sbert_data2_semi_replace_no_tags_to_ci_params.json', params, hf_model_name="brjezierski/sentence-embeddings-classification-ai_car-sbert")
combined_ai_car_class_consulting_sim_model, combined_ai_car_class_consulting_sim_output_df, combined_ai_car_class_consulting_sim_cluster_topics, combined_ai_car_class_consulting_sim_params, combined_ai_car_class_consulting_sim_topic_model = cluster('combined_ai_car_class_consulting_sim_data2_semi_replace_no_tags_to_ci_params.json', params, hf_model_name="brjezierski/sentence-embeddings-combined-ai_car-class-consulting-sim")
combined_consulting_sim_ai_car_class_model, combined_consulting_sim_ai_car_class_output_df, combined_consulting_sim_ai_car_class_cluster_topics, combined_consulting_sim_ai_car_class_params, combined_consulting_sim_ai_car_class_topic_model = cluster('combined_consulting_sim_ai_car_class_data2_semi_replace_no_tags_to_ci_params.json', params, hf_model_name="brjezierski/sentence-embeddings-combined-consulting-sim-ai_car-class")
combined_consulting_sim_and_ai_car_class_model, combined_consulting_sim_and_ai_car_class_output_df, combined_consulting_sim_and_ai_car_class_cluster_topics, combined_consulting_sim_and_ai_car_class_params, combined_consulting_sim_and_ai_car_class_topic_model = cluster('combined_consulting_sim_and_ai_car_class_data2_semi_replace_no_tags_to_ci_params.json', params, hf_model_name="brjezierski/sentence-embeddings-combined-consulting-sim-and-ai_car-class")
combined_ai_car_sim_and_class_model, combined_ai_car_sim_and_class_output_df, combined_ai_car_sim_and_class_cluster_topics, combined_ai_car_sim_and_class_params, combined_ai_car_sim_and_class_topic_model = cluster('combined_ai_car_sim_and_class_data2_semi_replace_no_tags_to_ci_params.json', params, hf_model_name="brjezierski/sentence-embeddings-combined-ai_car-sim-and-class")
combined_ai_car_sim_class_model, combined_ai_car_sim_class_output_df, combined_ai_car_sim_class_cluster_topics, combined_ai_car_sim_class_params, combined_ai_car_sim_class_topic_model = cluster('combined_ai_car_sim_class_data2_semi_replace_no_tags_to_ci_params.json', params, hf_model_name="brjezierski/sentence-embeddings-combined-ai_car-sim-class")
combined_ai_car_class_sim_model, combined_ai_car_class_sim_output_df, combined_ai_car_class_sim_cluster_topics, combined_ai_car_class_sim_params, combined_ai_car_class_sim_topic_model = cluster('combined_ai_car_class_sim_data2_semi_replace_no_tags_to_ci_params.json', params, hf_model_name="brjezierski/sentence-embeddings-combined-ai_car-class-sim")
combined_consulting_model, combined_consulting_output_df, combined_consulting_cluster_topics, combined_consulting_params, combined_consulting_topic_model = cluster('combined_consulting_data2_semi_replace_no_tags_to_ci_params.json', params, hf_model_name="brjezierski/combined-consulting")

classification_ai_car_data_aug_model, classification_ai_car_data_aug_output_df, classification_ai_car_data_aug_cluster_topics, classification_ai_car_data_aug_params, classification_ai_car_data_aug_topic_model = cluster('classification_ai_car_data_aug_data2_semi_replace_no_tags_to_ci_params.json', params, hf_model_name="brjezierski/sentence-embeddings-classification-ai_car-data_aug")

Input file ../glanos-data/clustering/input/baseline_data2_semi_replace_no_tags_to_ci_params.json
Supervised
Topic modelling method: BERTopic
Fitting the BERT model on 1278 snippets


Batches:   0%|          | 0/40 [00:00<?, ?it/s]

2023-10-26 23:36:28,364 - BERTopic - Transformed documents to Embeddings
2023-10-26 23:36:30,976 - BERTopic - Reduced dimensionality
2023-10-26 23:36:30,995 - BERTopic - Clustered reduced embeddings


BERT model labelling 3940 snippets
inference_snippets 3940


Batches:   0%|          | 0/124 [00:00<?, ?it/s]

2023-10-26 23:37:12,591 - BERTopic - Reduced dimensionality
2023-10-26 23:37:12,650 - BERTopic - Predicted clusters


Elapsed total time for topic modelling: 62.191986083984375 seconds
sbert results
Label count: 8
Loss: 0.5068527918781726
dataset_file_path ../glanos-data/clustering/big_consulting_export_replace_no_duplicates.tsv
Overlap count of length 1: 5
Overlap count of length 2: 5
Overlap count of length 3: 0
Overlap count of length 4 or more: 0
Overlap loss:  0.78
Diversity loss:  1.22
Number of clusters: 8
Avg cluster size: 437.77777777777777
Median cluster size: 209.0 , standard deviation: 580.5916097257613
Median top company occurence: 29.166666666666668
Cost (outliers): 50.69%

Input file ../glanos-data/clustering/input/baai_baseline_data2_semi_replace_no_tags_to_ci_params.json
Supervised
Topic modelling method: BERTopic
Fitting the BERT model on 1278 snippets


Batches:   0%|          | 0/40 [00:00<?, ?it/s]

2023-10-26 23:37:40,632 - BERTopic - Transformed documents to Embeddings
2023-10-26 23:37:43,587 - BERTopic - Reduced dimensionality
2023-10-26 23:37:43,605 - BERTopic - Clustered reduced embeddings


BERT model labelling 3940 snippets
inference_snippets 3940


Batches:   0%|          | 0/124 [00:00<?, ?it/s]

2023-10-26 23:38:41,839 - BERTopic - Reduced dimensionality
2023-10-26 23:38:41,896 - BERTopic - Predicted clusters


Elapsed total time for topic modelling: 88.19851088523865 seconds
sbert results
Label count: 7
Loss: 0.3238578680203046
dataset_file_path ../glanos-data/clustering/big_consulting_export_replace_no_duplicates.tsv
Overlap count of length 1: 3
Overlap count of length 2: 3
Overlap count of length 3: 1
Overlap count of length 4 or more: 0
Overlap loss:  1.11
Diversity loss:  1.14
Number of clusters: 7
Avg cluster size: 492.5
Median cluster size: 364.5 , standard deviation: 392.73623464101195
Median top company occurence: 28.362639872116922
Cost (outliers): 32.39%

Input file ../glanos-data/clustering/input/similarity_data2_semi_replace_no_tags_to_ci_params.json
Supervised
Topic modelling method: BERTopic
Fitting the BERT model on 1278 snippets


Batches:   0%|          | 0/40 [00:00<?, ?it/s]

2023-10-26 23:38:59,400 - BERTopic - Transformed documents to Embeddings
2023-10-26 23:39:01,971 - BERTopic - Reduced dimensionality
2023-10-26 23:39:01,986 - BERTopic - Clustered reduced embeddings


BERT model labelling 3940 snippets
inference_snippets 3940


Batches:   0%|          | 0/124 [00:00<?, ?it/s]

2023-10-26 23:39:40,159 - BERTopic - Reduced dimensionality
2023-10-26 23:39:40,214 - BERTopic - Predicted clusters


Elapsed total time for topic modelling: 57.53597807884216 seconds
sbert results
Label count: 9
Loss: 0.4588832487309645
dataset_file_path ../glanos-data/clustering/big_consulting_export_replace_no_duplicates.tsv
Overlap count of length 1: 9
Overlap count of length 2: 5
Overlap count of length 3: 0
Overlap count of length 4 or more: 0
Overlap loss:  0.81
Diversity loss:  1.19
Number of clusters: 9
Avg cluster size: 394.0
Median cluster size: 206.5 , standard deviation: 515.5294365989201
Median top company occurence: 28.04232804232804
Cost (outliers): 45.89%

Input file ../glanos-data/clustering/input/best_similarity_data2_semi_replace_no_tags_to_ci_params.json
Supervised
Topic modelling method: BERTopic
Fitting the BERT model on 1278 snippets


Batches:   0%|          | 0/40 [00:00<?, ?it/s]

2023-10-26 23:40:07,524 - BERTopic - Transformed documents to Embeddings
2023-10-26 23:40:10,311 - BERTopic - Reduced dimensionality
2023-10-26 23:40:10,324 - BERTopic - Clustered reduced embeddings


BERT model labelling 3940 snippets
inference_snippets 3940


Batches:   0%|          | 0/124 [00:00<?, ?it/s]

2023-10-26 23:41:05,001 - BERTopic - Reduced dimensionality
2023-10-26 23:41:05,055 - BERTopic - Predicted clusters


Elapsed total time for topic modelling: 83.78383493423462 seconds
sbert results
Label count: 9
Loss: 0.45253807106598987
dataset_file_path ../glanos-data/clustering/big_consulting_export_replace_no_duplicates.tsv
Overlap count of length 1: 7
Overlap count of length 2: 5
Overlap count of length 3: 1
Overlap count of length 4 or more: 1
Overlap loss:  2.08
Diversity loss:  1.27
Number of clusters: 9
Avg cluster size: 394.0
Median cluster size: 181.0 , standard deviation: 496.4046736282808
Median top company occurence: 31.432511356262168
Cost (outliers): 45.25%

Input file ../glanos-data/clustering/input/similarity_ai_car_data2_semi_replace_no_tags_to_ci_params.json
Supervised
Topic modelling method: BERTopic
Fitting the BERT model on 1278 snippets


Batches:   0%|          | 0/40 [00:00<?, ?it/s]

2023-10-26 23:41:22,304 - BERTopic - Transformed documents to Embeddings
2023-10-26 23:41:24,980 - BERTopic - Reduced dimensionality
2023-10-26 23:41:24,998 - BERTopic - Clustered reduced embeddings


BERT model labelling 3940 snippets
inference_snippets 3940


Batches:   0%|          | 0/124 [00:00<?, ?it/s]

2023-10-26 23:42:08,108 - BERTopic - Reduced dimensionality
2023-10-26 23:42:08,169 - BERTopic - Predicted clusters


Elapsed total time for topic modelling: 62.37531113624573 seconds
sbert results
Label count: 9
Loss: 0.4733502538071066
dataset_file_path ../glanos-data/clustering/big_consulting_export_replace_no_duplicates.tsv
Overlap count of length 1: 4
Overlap count of length 2: 5
Overlap count of length 3: 1
Overlap count of length 4 or more: 1
Overlap loss:  2.00
Diversity loss:  1.19
Number of clusters: 9
Avg cluster size: 394.0
Median cluster size: 177.0 , standard deviation: 515.6114816409736
Median top company occurence: 41.22463331324091
Cost (outliers): 47.34%

Input file ../glanos-data/clustering/input/similarity_distill_consistency_data2_semi_replace_no_tags_to_ci_params.json
Supervised
Topic modelling method: BERTopic
Fitting the BERT model on 1278 snippets


Batches:   0%|          | 0/40 [00:00<?, ?it/s]

2023-10-26 23:42:26,166 - BERTopic - Transformed documents to Embeddings
2023-10-26 23:42:28,694 - BERTopic - Reduced dimensionality
2023-10-26 23:42:28,708 - BERTopic - Clustered reduced embeddings


BERT model labelling 3940 snippets
inference_snippets 3940


Batches:   0%|          | 0/124 [00:00<?, ?it/s]

2023-10-26 23:43:06,362 - BERTopic - Reduced dimensionality
2023-10-26 23:43:06,418 - BERTopic - Predicted clusters


Elapsed total time for topic modelling: 57.54538702964783 seconds
sbert results
Label count: 9
Loss: 0.45355329949238576
dataset_file_path ../glanos-data/clustering/big_consulting_export_replace_no_duplicates.tsv
Overlap count of length 1: 7
Overlap count of length 2: 3
Overlap count of length 3: 1
Overlap count of length 4 or more: 2
Overlap loss:  2.75
Diversity loss:  1.23
Number of clusters: 9
Avg cluster size: 394.0
Median cluster size: 170.0 , standard deviation: 547.8561855085694
Median top company occurence: 34.31372549019608
Cost (outliers): 45.36%

Input file ../glanos-data/clustering/input/similarity_ai_car_distill_consistency_data2_semi_replace_no_tags_to_ci_params.json
Supervised
Topic modelling method: BERTopic
Fitting the BERT model on 1278 snippets


Batches:   0%|          | 0/40 [00:00<?, ?it/s]

2023-10-26 23:43:24,379 - BERTopic - Transformed documents to Embeddings
2023-10-26 23:43:26,808 - BERTopic - Reduced dimensionality
2023-10-26 23:43:26,825 - BERTopic - Clustered reduced embeddings


BERT model labelling 3940 snippets
inference_snippets 3940


Batches:   0%|          | 0/124 [00:00<?, ?it/s]

2023-10-26 23:44:10,183 - BERTopic - Reduced dimensionality
2023-10-26 23:44:10,243 - BERTopic - Predicted clusters


Elapsed total time for topic modelling: 63.08823823928833 seconds
sbert results
Label count: 10
Loss: 0.05862944162436548
dataset_file_path ../glanos-data/clustering/big_consulting_export_replace_no_duplicates.tsv
Overlap count of length 1: 11
Overlap count of length 2: 6
Overlap count of length 3: 5
Overlap count of length 4 or more: 1
Overlap loss:  3.67
Diversity loss:  1.33
Number of clusters: 10
Avg cluster size: 358.1818181818182
Median cluster size: 192.0 , standard deviation: 599.6433513016972
Median top company occurence: 30.927835051546392
Cost (outliers): 5.86%

Input file ../glanos-data/clustering/input/classification_data2_semi_replace_no_tags_to_ci_params.json
Supervised
Topic modelling method: BERTopic
Fitting the BERT model on 1278 snippets


Batches:   0%|          | 0/40 [00:00<?, ?it/s]

2023-10-26 23:44:27,362 - BERTopic - Transformed documents to Embeddings
2023-10-26 23:44:29,952 - BERTopic - Reduced dimensionality
2023-10-26 23:44:29,967 - BERTopic - Clustered reduced embeddings


BERT model labelling 3940 snippets
inference_snippets 3940


Batches:   0%|          | 0/124 [00:00<?, ?it/s]

2023-10-26 23:45:05,018 - BERTopic - Reduced dimensionality
2023-10-26 23:45:05,073 - BERTopic - Predicted clusters


Elapsed total time for topic modelling: 54.14522910118103 seconds
sbert results
Label count: 11
Loss: 0.30710659898477155
dataset_file_path ../glanos-data/clustering/big_consulting_export_replace_no_duplicates.tsv
Overlap count of length 1: 6
Overlap count of length 2: 2
Overlap count of length 3: 1
Overlap count of length 4 or more: 0
Overlap loss:  0.68
Diversity loss:  1.11
Number of clusters: 11
Avg cluster size: 328.3333333333333
Median cluster size: 188.5 , standard deviation: 342.6646400718282
Median top company occurence: 29.655990510083036
Cost (outliers): 30.71%

Input file ../glanos-data/clustering/input/classification_ai_car_sbert_data2_semi_replace_no_tags_to_ci_params.json
Supervised
Topic modelling method: BERTopic
Fitting the BERT model on 1278 snippets


Batches:   0%|          | 0/40 [00:00<?, ?it/s]

2023-10-26 23:45:22,130 - BERTopic - Transformed documents to Embeddings
2023-10-26 23:45:24,684 - BERTopic - Reduced dimensionality
2023-10-26 23:45:24,698 - BERTopic - Clustered reduced embeddings


BERT model labelling 3940 snippets
inference_snippets 3940


Batches:   0%|          | 0/124 [00:00<?, ?it/s]

2023-10-26 23:46:02,226 - BERTopic - Reduced dimensionality
2023-10-26 23:46:02,279 - BERTopic - Predicted clusters


Elapsed total time for topic modelling: 56.54367017745972 seconds
sbert results
Label count: 13
Loss: 0.15050761421319797
dataset_file_path ../glanos-data/clustering/big_consulting_export_replace_no_duplicates.tsv
Overlap count of length 1: 18
Overlap count of length 2: 2
Overlap count of length 3: 3
Overlap count of length 4 or more: 1
Overlap loss:  2.04
Diversity loss:  1.23
Number of clusters: 13
Avg cluster size: 281.42857142857144
Median cluster size: 168.5 , standard deviation: 261.69740276174866
Median top company occurence: 34.16478520397118
Cost (outliers): 15.05%

Input file ../glanos-data/clustering/input/combined_ai_car_class_consulting_sim_data2_semi_replace_no_tags_to_ci_params.json
Supervised
Topic modelling method: BERTopic
Fitting the BERT model on 1278 snippets


Batches:   0%|          | 0/40 [00:00<?, ?it/s]

2023-10-26 23:46:20,008 - BERTopic - Transformed documents to Embeddings
2023-10-26 23:46:22,471 - BERTopic - Reduced dimensionality
2023-10-26 23:46:22,484 - BERTopic - Clustered reduced embeddings


BERT model labelling 3940 snippets
inference_snippets 3940


Batches:   0%|          | 0/124 [00:00<?, ?it/s]

2023-10-26 23:46:53,742 - BERTopic - Reduced dimensionality
2023-10-26 23:46:53,791 - BERTopic - Predicted clusters


Elapsed total time for topic modelling: 50.88553214073181 seconds
sbert results
Label count: 10
Loss: 0.14720812182741116
dataset_file_path ../glanos-data/clustering/big_consulting_export_replace_no_duplicates.tsv
Overlap count of length 1: 6
Overlap count of length 2: 2
Overlap count of length 3: 2
Overlap count of length 4 or more: 0
Overlap loss:  1.15
Diversity loss:  1.13
Number of clusters: 10
Avg cluster size: 358.1818181818182
Median cluster size: 212.0 , standard deviation: 290.9194884505515
Median top company occurence: 29.523809523809526
Cost (outliers): 14.72%

Input file ../glanos-data/clustering/input/combined_consulting_sim_ai_car_class_data2_semi_replace_no_tags_to_ci_params.json
Supervised
Topic modelling method: BERTopic
Fitting the BERT model on 1278 snippets


Batches:   0%|          | 0/40 [00:00<?, ?it/s]

2023-10-26 23:47:10,712 - BERTopic - Transformed documents to Embeddings
2023-10-26 23:47:13,458 - BERTopic - Reduced dimensionality
2023-10-26 23:47:13,475 - BERTopic - Clustered reduced embeddings


BERT model labelling 3940 snippets
inference_snippets 3940


Batches:   0%|          | 0/124 [00:00<?, ?it/s]

2023-10-26 23:48:04,634 - BERTopic - Reduced dimensionality
2023-10-26 23:48:04,694 - BERTopic - Predicted clusters


Elapsed total time for topic modelling: 70.3375141620636 seconds
sbert results
Label count: 11
Loss: 0.0583756345177665
dataset_file_path ../glanos-data/clustering/big_consulting_export_replace_no_duplicates.tsv
Overlap count of length 1: 12
Overlap count of length 2: 1
Overlap count of length 3: 1
Overlap count of length 4 or more: 2
Overlap loss:  2.18
Diversity loss:  1.24
Number of clusters: 11
Avg cluster size: 328.3333333333333
Median cluster size: 119.5 , standard deviation: 401.49934274195545
Median top company occurence: 32.42285237698082
Cost (outliers): 5.84%

Input file ../glanos-data/clustering/input/combined_consulting_sim_and_ai_car_class_data2_semi_replace_no_tags_to_ci_params.json
Supervised
Topic modelling method: BERTopic
Fitting the BERT model on 1278 snippets


Batches:   0%|          | 0/40 [00:00<?, ?it/s]

2023-10-26 23:48:22,419 - BERTopic - Transformed documents to Embeddings
2023-10-26 23:48:26,170 - BERTopic - Reduced dimensionality
2023-10-26 23:48:26,187 - BERTopic - Clustered reduced embeddings


BERT model labelling 3940 snippets
inference_snippets 3940


Batches:   0%|          | 0/124 [00:00<?, ?it/s]

2023-10-26 23:49:08,682 - BERTopic - Reduced dimensionality
2023-10-26 23:49:08,740 - BERTopic - Predicted clusters


Elapsed total time for topic modelling: 63.36781597137451 seconds
sbert results
Label count: 12
Loss: 0.34949238578680203
dataset_file_path ../glanos-data/clustering/big_consulting_export_replace_no_duplicates.tsv
Overlap count of length 1: 16
Overlap count of length 2: 4
Overlap count of length 3: 3
Overlap count of length 4 or more: 0
Overlap loss:  1.67
Diversity loss:  1.19
Number of clusters: 12
Avg cluster size: 303.0769230769231
Median cluster size: 229.0 , standard deviation: 346.7005938488538
Median top company occurence: 35.55555555555556
Cost (outliers): 34.95%

Input file ../glanos-data/clustering/input/combined_ai_car_sim_and_class_data2_semi_replace_no_tags_to_ci_params.json
Supervised
Topic modelling method: BERTopic
Fitting the BERT model on 1278 snippets


Batches:   0%|          | 0/40 [00:00<?, ?it/s]

2023-10-26 23:49:26,628 - BERTopic - Transformed documents to Embeddings
2023-10-26 23:49:29,244 - BERTopic - Reduced dimensionality
2023-10-26 23:49:29,261 - BERTopic - Clustered reduced embeddings


BERT model labelling 3940 snippets
inference_snippets 3940


Batches:   0%|          | 0/124 [00:00<?, ?it/s]

2023-10-26 23:50:09,594 - BERTopic - Reduced dimensionality
2023-10-26 23:50:09,652 - BERTopic - Predicted clusters


Elapsed total time for topic modelling: 60.301599979400635 seconds
sbert results
Label count: 8
Loss: 0.3223350253807107
dataset_file_path ../glanos-data/clustering/big_consulting_export_replace_no_duplicates.tsv
Overlap count of length 1: 3
Overlap count of length 2: 2
Overlap count of length 3: 1
Overlap count of length 4 or more: 0
Overlap loss:  0.84
Diversity loss:  1.12
Number of clusters: 8
Avg cluster size: 437.77777777777777
Median cluster size: 306.0 , standard deviation: 336.6738539753529
Median top company occurence: 29.559748427672954
Cost (outliers): 32.23%

Input file ../glanos-data/clustering/input/combined_ai_car_sim_class_data2_semi_replace_no_tags_to_ci_params.json
Supervised
Topic modelling method: BERTopic
Fitting the BERT model on 1278 snippets


Batches:   0%|          | 0/40 [00:00<?, ?it/s]

2023-10-26 23:50:27,407 - BERTopic - Transformed documents to Embeddings
2023-10-26 23:50:30,135 - BERTopic - Reduced dimensionality
2023-10-26 23:50:30,151 - BERTopic - Clustered reduced embeddings


BERT model labelling 3940 snippets
inference_snippets 3940


Batches:   0%|          | 0/124 [00:00<?, ?it/s]

2023-10-26 23:51:17,497 - BERTopic - Reduced dimensionality
2023-10-26 23:51:17,550 - BERTopic - Predicted clusters


Elapsed total time for topic modelling: 67.21248579025269 seconds
sbert results
Label count: 6
Loss: 0.013705583756345178
dataset_file_path ../glanos-data/clustering/big_consulting_export_replace_no_duplicates.tsv
Overlap count of length 1: 2
Overlap count of length 2: 0
Overlap count of length 3: 1
Overlap count of length 4 or more: 0
Overlap loss:  0.75
Diversity loss:  1.08
Number of clusters: 6
Avg cluster size: 562.8571428571429
Median cluster size: 352.0 , standard deviation: 458.17227844413924
Median top company occurence: 28.693181818181817
Cost (outliers): 1.37%

Input file ../glanos-data/clustering/input/combined_ai_car_class_sim_data2_semi_replace_no_tags_to_ci_params.json
Supervised
Topic modelling method: BERTopic
Fitting the BERT model on 1278 snippets


Batches:   0%|          | 0/40 [00:00<?, ?it/s]

2023-10-26 23:51:34,880 - BERTopic - Transformed documents to Embeddings
2023-10-26 23:51:38,472 - BERTopic - Reduced dimensionality
2023-10-26 23:51:38,486 - BERTopic - Clustered reduced embeddings


BERT model labelling 3940 snippets
inference_snippets 3940


Batches:   0%|          | 0/124 [00:00<?, ?it/s]

2023-10-26 23:52:14,187 - BERTopic - Reduced dimensionality
2023-10-26 23:52:14,240 - BERTopic - Predicted clusters


Elapsed total time for topic modelling: 56.096415996551514 seconds
sbert results
Label count: 10
Loss: 0.3934010152284264
dataset_file_path ../glanos-data/clustering/big_consulting_export_replace_no_duplicates.tsv
Overlap count of length 1: 6
Overlap count of length 2: 5
Overlap count of length 3: 3
Overlap count of length 4 or more: 0
Overlap loss:  1.85
Diversity loss:  1.20
Number of clusters: 10
Avg cluster size: 358.1818181818182
Median cluster size: 172.0 , standard deviation: 451.97019575547193
Median top company occurence: 30.952380952380953
Cost (outliers): 39.34%

Input file ../glanos-data/clustering/input/combined_consulting_data2_semi_replace_no_tags_to_ci_params.json
Supervised
Topic modelling method: BERTopic
Fitting the BERT model on 1278 snippets


Batches:   0%|          | 0/40 [00:00<?, ?it/s]

2023-10-26 23:52:31,674 - BERTopic - Transformed documents to Embeddings
2023-10-26 23:52:34,258 - BERTopic - Reduced dimensionality
2023-10-26 23:52:34,272 - BERTopic - Clustered reduced embeddings


BERT model labelling 3940 snippets
inference_snippets 3940


Batches:   0%|          | 0/124 [00:00<?, ?it/s]

2023-10-26 23:53:06,930 - BERTopic - Reduced dimensionality
2023-10-26 23:53:06,981 - BERTopic - Predicted clusters


Elapsed total time for topic modelling: 52.1260929107666 seconds
sbert results
Label count: 10
Loss: 0.2652284263959391
dataset_file_path ../glanos-data/clustering/big_consulting_export_replace_no_duplicates.tsv
Overlap count of length 1: 10
Overlap count of length 2: 3
Overlap count of length 3: 5
Overlap count of length 4 or more: 0
Overlap loss:  2.55
Diversity loss:  1.24
Number of clusters: 10
Avg cluster size: 358.1818181818182
Median cluster size: 217.0 , standard deviation: 328.9080607049548
Median top company occurence: 40.21164021164021
Cost (outliers): 26.52%

Input file ../glanos-data/clustering/input/classification_ai_car_data_aug_data2_semi_replace_no_tags_to_ci_params.json
Supervised
Topic modelling method: BERTopic
Fitting the BERT model on 1278 snippets


Batches:   0%|          | 0/40 [00:00<?, ?it/s]

2023-10-26 23:53:24,368 - BERTopic - Transformed documents to Embeddings
2023-10-26 23:53:26,675 - BERTopic - Reduced dimensionality
2023-10-26 23:53:26,687 - BERTopic - Clustered reduced embeddings


BERT model labelling 3940 snippets
inference_snippets 3940


Batches:   0%|          | 0/124 [00:00<?, ?it/s]

2023-10-26 23:53:56,804 - BERTopic - Reduced dimensionality
2023-10-26 23:53:56,849 - BERTopic - Predicted clusters


Elapsed total time for topic modelling: 49.1462767124176 seconds
sbert results
Label count: 6
Loss: 0.0025380710659898475
dataset_file_path ../glanos-data/clustering/big_consulting_export_replace_no_duplicates.tsv
Overlap count of length 1: 0
Overlap count of length 2: 1
Overlap count of length 3: 0
Overlap count of length 4 or more: 0
Overlap loss:  0.17
Diversity loss:  1.03
Number of clusters: 6
Avg cluster size: 640.1666666666666
Median cluster size: 583.0 , standard deviation: 549.8100934767284
Median top company occurence: 31.967871485943775
Cost (outliers): 0.25%



In [None]:
print_results(input_file=data2_input_file)