# MyEmbeddings

In [None]:
import json
import logging
import os
import shutil
import stat
from collections import OrderedDict
from typing import List, Dict, Tuple, Iterable, Type, Union, Callable, Optional
import requests
import numpy as np
from numpy import ndarray
import transformers
from huggingface_hub import HfApi, HfFolder, Repository, hf_hub_url, cached_download
import torch
from torch import nn, Tensor, device
from torch.optim import Optimizer
from torch.utils.data import DataLoader
import torch.multiprocessing as mp
from tqdm.autonotebook import trange
import math
import queue
import tempfile
from distutils.dir_util import copy_tree
from sentence_transformers.evaluation import SentenceEvaluator
from sentence_transformers.model_card_templates import ModelCardTemplate
from sentence_transformers.util import import_from_string, batch_to_device, fullname, snapshot_download
from sentence_transformers.models import Transformer, Pooling, Dense

class MySentenceTransformer(SentenceTransformer):
    """
    Loads or create a SentenceTransformer model, that can be used to map sentences / text to embeddings.

    :param model_name_or_path: If it is a filepath on disc, it loads the model from that path. If it is not a path, it first tries to download a pre-trained SentenceTransformer model. If that fails, tries to construct a model from Huggingface models repository with that name.
    :param modules: This parameter can be used to create custom SentenceTransformer models from scratch.
    :param device: Device (like 'cuda' / 'cpu') that should be used for computation. If None, checks if a GPU can be used.
    :param cache_folder: Path to store models. Can be also set by SENTENCE_TRANSFORMERS_HOME enviroment variable.
    :param use_auth_token: HuggingFace authentication token to download private models.
    """
    def __init__(self, model_name_or_path: Optional[str] = None,
                 modules: Optional[Iterable[nn.Module]] = None,
                 device: Optional[str] = None,
                 cache_folder: Optional[str] = None,
                 use_auth_token: Union[bool, str, None] = None
                 ):
        self._model_card_vars = {}
        self._model_card_text = None
        self._model_config = {}

        if cache_folder is None:
            cache_folder = os.getenv('SENTENCE_TRANSFORMERS_HOME')
            if cache_folder is None:
                try:
                    from torch.hub import _get_torch_home

                    torch_cache_home = _get_torch_home()
                except ImportError:
                    torch_cache_home = os.path.expanduser(os.getenv('TORCH_HOME', os.path.join(os.getenv('XDG_CACHE_HOME', '~/.cache'), 'torch')))

                cache_folder = os.path.join(torch_cache_home, 'sentence_transformers')

        if model_name_or_path is not None and model_name_or_path != "":
            logger.info("Load pretrained SentenceTransformer: {}".format(model_name_or_path))

            #Old models that don't belong to any organization
            basic_transformer_models = ['albert-base-v1', 'albert-base-v2', 'albert-large-v1', 'albert-large-v2', 'albert-xlarge-v1', 'albert-xlarge-v2', 'albert-xxlarge-v1', 'albert-xxlarge-v2', 'bert-base-cased-finetuned-mrpc', 'bert-base-cased', 'bert-base-chinese', 'bert-base-german-cased', 'bert-base-german-dbmdz-cased', 'bert-base-german-dbmdz-uncased', 'bert-base-multilingual-cased', 'bert-base-multilingual-uncased', 'bert-base-uncased', 'bert-large-cased-whole-word-masking-finetuned-squad', 'bert-large-cased-whole-word-masking', 'bert-large-cased', 'bert-large-uncased-whole-word-masking-finetuned-squad', 'bert-large-uncased-whole-word-masking', 'bert-large-uncased', 'camembert-base', 'ctrl', 'distilbert-base-cased-distilled-squad', 'distilbert-base-cased', 'distilbert-base-german-cased', 'distilbert-base-multilingual-cased', 'distilbert-base-uncased-distilled-squad', 'distilbert-base-uncased-finetuned-sst-2-english', 'distilbert-base-uncased', 'distilgpt2', 'distilroberta-base', 'gpt2-large', 'gpt2-medium', 'gpt2-xl', 'gpt2', 'openai-gpt', 'roberta-base-openai-detector', 'roberta-base', 'roberta-large-mnli', 'roberta-large-openai-detector', 'roberta-large', 't5-11b', 't5-3b', 't5-base', 't5-large', 't5-small', 'transfo-xl-wt103', 'xlm-clm-ende-1024', 'xlm-clm-enfr-1024', 'xlm-mlm-100-1280', 'xlm-mlm-17-1280', 'xlm-mlm-en-2048', 'xlm-mlm-ende-1024', 'xlm-mlm-enfr-1024', 'xlm-mlm-enro-1024', 'xlm-mlm-tlm-xnli15-1024', 'xlm-mlm-xnli15-1024', 'xlm-roberta-base', 'xlm-roberta-large-finetuned-conll02-dutch', 'xlm-roberta-large-finetuned-conll02-spanish', 'xlm-roberta-large-finetuned-conll03-english', 'xlm-roberta-large-finetuned-conll03-german', 'xlm-roberta-large', 'xlnet-base-cased', 'xlnet-large-cased']

            if os.path.exists(model_name_or_path):
                #Load from path
                model_path = model_name_or_path
            else:
                #Not a path, load from hub
                if '\\' in model_name_or_path or model_name_or_path.count('/') > 1:
                    raise ValueError("Path {} not found".format(model_name_or_path))

                if '/' not in model_name_or_path and model_name_or_path.lower() not in basic_transformer_models:
                    # A model from sentence-transformers
                    model_name_or_path = __MODEL_HUB_ORGANIZATION__ + "/" + model_name_or_path

                model_path = os.path.join(cache_folder, model_name_or_path.replace("/", "_"))

                if not os.path.exists(os.path.join(model_path, 'modules.json')):
                    # Download from hub with caching
                    snapshot_download(model_name_or_path,
                                        cache_dir=cache_folder,
                                        library_name='sentence-transformers',
                                        library_version=__version__,
                                        ignore_files=['flax_model.msgpack', 'rust_model.ot', 'tf_model.h5'],
                                        use_auth_token=use_auth_token)

            if os.path.exists(os.path.join(model_path, 'modules.json')):    #Load as SentenceTransformer model
                modules = self._load_sbert_model(model_path)
            else:   #Load with AutoModel
                modules = self._load_auto_model(model_path)

        if modules is not None and not isinstance(modules, OrderedDict):
            modules = OrderedDict([(str(idx), module) for idx, module in enumerate(modules)])

        super().__init__(modules)
        if device is None:
            device = "cuda" if torch.cuda.is_available() else "cpu"
            logger.info("Use pytorch device: {}".format(device))

        self._target_device = torch.device(device)


    def encode(self, sentences: Union[str, List[str]],
               batch_size: int = 32,
               show_progress_bar: bool = None,
               output_value: str = 'sentence_embedding',
               convert_to_numpy: bool = True,
               convert_to_tensor: bool = False,
               device: str = None,
               normalize_embeddings: bool = False) -> Union[List[Tensor], ndarray, Tensor]:
        """
        Computes sentence embeddings

        :param sentences: the sentences to embed
        :param batch_size: the batch size used for the computation
        :param show_progress_bar: Output a progress bar when encode sentences
        :param output_value:  Default sentence_embedding, to get sentence embeddings. Can be set to token_embeddings to get wordpiece token embeddings. Set to None, to get all output values
        :param convert_to_numpy: If true, the output is a list of numpy vectors. Else, it is a list of pytorch tensors.
        :param convert_to_tensor: If true, you get one large tensor as return. Overwrites any setting from convert_to_numpy
        :param device: Which torch.device to use for the computation
        :param normalize_embeddings: If set to true, returned vectors will have length 1. In that case, the faster dot-product (util.dot_score) instead of cosine similarity can be used.

        :return:
           By default, a list of tensors is returned. If convert_to_tensor, a stacked tensor is returned. If convert_to_numpy, a numpy matrix is returned.
        """

        return super().encode(sentences)

    def fit(self,
            train_objectives: Iterable[Tuple[DataLoader, nn.Module]],
            evaluator: SentenceEvaluator = None,
            epochs: int = 1,
            steps_per_epoch = None,
            scheduler: str = 'WarmupLinear',
            warmup_steps: int = 10000,
            optimizer_class: Type[Optimizer] = torch.optim.AdamW,
            optimizer_params : Dict[str, object]= {'lr': 2e-5},
            weight_decay: float = 0.01,
            evaluation_steps: int = 0,
            output_path: str = None,
            save_best_model: bool = True,
            max_grad_norm: float = 1,
            use_amp: bool = False,
            callback: Callable[[float, int, int], None] = None,
            show_progress_bar: bool = True,
            checkpoint_path: str = None,
            checkpoint_save_steps: int = 500,
            checkpoint_save_total_limit: int = 0
            ):
        print("fitting 2")
        ##Add info to model card
        #info_loss_functions = "\n".join(["- {} with {} training examples".format(str(loss), len(dataloader)) for dataloader, loss in train_objectives])
        info_loss_functions =  []
        for dataloader, loss in train_objectives:
            info_loss_functions.extend(ModelCardTemplate.get_train_objective_info(dataloader, loss))
        info_loss_functions = "\n\n".join([text for text in info_loss_functions])

        info_fit_parameters = json.dumps({"evaluator": fullname(evaluator), "epochs": epochs, "steps_per_epoch": steps_per_epoch, "scheduler": scheduler, "warmup_steps": warmup_steps, "optimizer_class": str(optimizer_class),  "optimizer_params": optimizer_params, "weight_decay": weight_decay, "evaluation_steps": evaluation_steps, "max_grad_norm": max_grad_norm }, indent=4, sort_keys=True)
        self._model_card_text = None
        self._model_card_vars['{TRAINING_SECTION}'] = ModelCardTemplate.__TRAINING_SECTION__.replace("{LOSS_FUNCTIONS}", info_loss_functions).replace("{FIT_PARAMETERS}", info_fit_parameters)


        if use_amp:
            from torch.cuda.amp import autocast
            scaler = torch.cuda.amp.GradScaler()

        self.to(self._target_device)

        dataloaders = [dataloader for dataloader, _ in train_objectives]

        # Use smart batching
        for dataloader in dataloaders:
            dataloader.collate_fn = self.smart_batching_collate


        loss_models = [loss for _, loss in train_objectives]
        for loss_model in loss_models:
            print('loss_model type', type(loss_model))
            loss_model.to(self._target_device)

        self.best_score = -9999999

        if steps_per_epoch is None or steps_per_epoch == 0:
            steps_per_epoch = min([len(dataloader) for dataloader in dataloaders])

        num_train_steps = int(steps_per_epoch * epochs)

        # Prepare optimizers
        optimizers = []
        schedulers = []
        for loss_model in loss_models:
            param_optimizer = list(loss_model.named_parameters())

            no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
            optimizer_grouped_parameters = [
                {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': weight_decay},
                {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
            ]

            optimizer = optimizer_class(optimizer_grouped_parameters, **optimizer_params)
            scheduler_obj = self._get_scheduler(optimizer, scheduler=scheduler, warmup_steps=warmup_steps, t_total=num_train_steps)

            optimizers.append(optimizer)
            schedulers.append(scheduler_obj)


        global_step = 0
        data_iterators = [iter(dataloader) for dataloader in dataloaders]

        num_train_objectives = len(train_objectives)

        skip_scheduler = False
        for epoch in trange(epochs, desc="Epoch", disable=not show_progress_bar):
            training_steps = 0

            for loss_model in loss_models:
                loss_model.zero_grad()
                loss_model.train()

            for _ in trange(steps_per_epoch, desc="Iteration", smoothing=0.05, disable=not show_progress_bar):
                # iterate through training objectives?
                for train_idx in range(num_train_objectives):
                    loss_model = loss_models[train_idx]
                    optimizer = optimizers[train_idx]
                    scheduler = schedulers[train_idx]
                    data_iterator = data_iterators[train_idx]

                    try:
                        data = next(data_iterator)
                    except StopIteration:
                        data_iterator = iter(dataloaders[train_idx])
                        data_iterators[train_idx] = data_iterator
                        data = next(data_iterator)

                    features, labels = data
                    labels = labels.to(self._target_device)
                    features = list(map(lambda batch: batch_to_device(batch, self._target_device), features))

                    if use_amp:
                        with autocast():
                            loss_value = loss_model(features, labels)

                        scale_before_step = scaler.get_scale()
                        scaler.scale(loss_value).backward()
                        scaler.unscale_(optimizer)
                        torch.nn.utils.clip_grad_norm_(loss_model.parameters(), max_grad_norm)
                        scaler.step(optimizer)
                        scaler.update()

                        skip_scheduler = scaler.get_scale() != scale_before_step
                    else:
                        loss_value = loss_model(features, labels)
                        loss_value.backward()
                        torch.nn.utils.clip_grad_norm_(loss_model.parameters(), max_grad_norm)
                        optimizer.step()

                    optimizer.zero_grad()

                    if not skip_scheduler:
                        scheduler.step()

                training_steps += 1
                global_step += 1

                if evaluation_steps > 0 and training_steps % evaluation_steps == 0:
                    self._eval_during_training(evaluator, output_path, save_best_model, epoch, training_steps, callback)

                    for loss_model in loss_models:
                        loss_model.zero_grad()
                        loss_model.train()

                if checkpoint_path is not None and checkpoint_save_steps is not None and checkpoint_save_steps > 0 and global_step % checkpoint_save_steps == 0:
                    self._save_checkpoint(checkpoint_path, checkpoint_save_total_limit, global_step)


            self._eval_during_training(evaluator, output_path, save_best_model, epoch, -1, callback)

        if evaluator is None and output_path is not None:   #No evaluator, but output path: save final model version
            self.save(output_path)

        if checkpoint_path is not None:
            self._save_checkpoint(checkpoint_path, checkpoint_save_total_limit, global_step)



ModuleNotFoundError: ignored

# Main

In [1]:
from google.colab import drive
import shutil

drive.mount('/content/drive')

source_path = "/content/drive/MyDrive/thesis/."
destination_path = "/content/thesis"
shutil.copytree(source_path, destination_path)
!ls /content/thesis

Mounted at /content/drive
ai_car				  glanos-company-0.2  glanos-data	     sbert-company-0.5
chat-intents			  glanos-company-0.3  model_freeze.py	     sbert-company-0.6
classification_training_utils.py  glanos-company-0.4  prediction_helpers.py  sbert-company-0.7
clustering_utils.py		  glanos-company-0.5  sbert-all		     sbert-company-0.8
custom_evaluators.py		  glanos-company-0.6  sbert-company-0.0      sbert-company-0.9
custom_losses.py		  glanos-company-0.7  sbert-company-0.1      sbert-company-1.0
data_helpers.py			  glanos-company-0.8  sbert-company-0.2      training_utils.py
glanos-company-0.0		  glanos-company-0.9  sbert-company-0.3      utils.py
glanos-company-0.1		  glanos-company-1.0  sbert-company-0.4


In [4]:
%%capture
!pip install transformers
!pip install sentence-transformers
!pip install pycountry

In [5]:
import sys
sys.path.append('thesis/')
import os
import pandas as pd
import json
import torch

NameError: ignored

In [None]:
import importlib
import training_utils
importlib.reload(training_utils)

import custom_losses
importlib.reload(custom_losses)
import custom_evaluators
importlib.reload(custom_evaluators)
from sentence_transformers import SentenceTransformer, InputExample
import model_freeze as freeze
import prediction_helpers as ph
import ast
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from transformers import AutoModel
from training_utils import get_data, create_weighted_avg, normalize_column, train, infer, get_mse_scores, get_spearman_scores
from sentence_transformers import losses
from sentence_transformers import evaluation
from scipy.stats import spearmanr

use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

print(torch.cuda.device_count())
print("Device: ", device)

# Training

In [None]:
params = {}
params["UNFREEZE_LAYERS"] = 2
params["USE_WEIGHTED_SIMILARITY"] = True
params["USE_MANUAL_LABELS"] = True # whether to use val-test labels that I created manually or some similarity metric
params["LOSSES"] = ["cosine"] # ["consistency", "distill", "distill2", "cosine"]
params["EPOCHS"] = 15
params["FEATURES"] = ['input1', 'input2']
params["USE_DISTILL"] = any('distill' in loss for loss in params["LOSSES"])
if params["USE_DISTILL"]:
  params["FEATURES"] += ['company', 'country', 'classification', 'keywords'] # ['company', 'country', 'classification', 'keywords']
  # params["FEATURES"] += ['similarity']
params["N"] = len(params["FEATURES"]) - 2 # number of metrics
params["FEATURE_DIM"] = 16 if params["USE_DISTILL"] else 0
params["EVAL_STEPS"] = 1000

params["BATCH_SIZE"] = 64
params["WARMUP_STEPS"] = 100
params["LEARNING_RATE"] = 2e-5
params["SBERT_INIT"] = "all-MiniLM-L12-v2" # "intfloat/e5-small-v2" # "brjezierski/S3BERT"
params["SBERT_DIM"] = 384

# where to save model and logs
params["SBERT_SAVE_PATH"] = "s3bert_" + params["SBERT_INIT"] + "/"

In [None]:
model = SentenceTransformer(params["SBERT_INIT"], device="cuda")
# model = AutoModel.from_pretrained("intfloat/e5-small-v2").to(device)
freeze.freeze_except_last_layers(model, 2)

In [None]:
column_list = ['country', 'company', 'keywords', 'classification']
column_proportions = (10, 2, 1, 1, 1)

# column_list = ['company-inverse']
# column_proportions = (1, 1)
column_list = ['company']
column_proportions = (5, 1)
# column_list = []
# column_proportions = ()

train_df, val_df, test_df = get_data(column_list, column_proportions, params, current_prefix='thesis/sbert-all/') #='thesis/replace_no_tags/')
steps_per_epoch = len(train_df)/params["BATCH_SIZE"]
params["EVAL_STEPS"] = 100 if steps_per_epoch <= 200 else steps_per_epoch/4
train_df[:1]

Unnamed: 0,snippet1,snippet2,similarity,country_similarity,country1,country2,classification_similarity,classification1,classification2,keywords_similarity,keywords1,keywords2,company_similarity,company1,company2
0,at KPMG where he focused on complex financial ...,LatentView has been recognized as an industry ...,0.279512,0.536794,US,IN,0.347584,"['strategy', 'company info']",['leadership'],0.299588,['he'],"['leader', 'industry leader']",0.632063,KPMG A/S,Gartner Inc.


In [None]:
params["FEATURE_DIM"], params["N"]

(0, 0)

In [None]:
train(model, params, train_df, val_df)

In [None]:
!zip -r /content/s3bert_intfloat.zip /content/s3bert_intfloat
%cp -av "/content/s3bert_intfloat.zip" "/content/drive/MyDrive"

  adding: content/s3bert_intfloat/ (stored 0%)
  adding: content/s3bert_intfloat/e5-small-v2/ (stored 0%)
  adding: content/s3bert_intfloat/e5-small-v2/tokenizer.json (deflated 71%)
  adding: content/s3bert_intfloat/e5-small-v2/config_sentence_transformers.json (deflated 27%)
  adding: content/s3bert_intfloat/e5-small-v2/special_tokens_map.json (deflated 42%)
  adding: content/s3bert_intfloat/e5-small-v2/sentence_bert_config.json (deflated 4%)
  adding: content/s3bert_intfloat/e5-small-v2/modules.json (deflated 62%)
  adding: content/s3bert_intfloat/e5-small-v2/pytorch_model.bin (deflated 37%)
  adding: content/s3bert_intfloat/e5-small-v2/README.md (deflated 55%)
  adding: content/s3bert_intfloat/e5-small-v2/added_tokens.json (deflated 37%)
  adding: content/s3bert_intfloat/e5-small-v2/eval/ (stored 0%)
  adding: content/s3bert_intfloat/e5-small-v2/eval/similarity_evaluation_results.csv (deflated 62%)
  adding: content/s3bert_intfloat/e5-small-v2/vocab.txt (deflated 53%)
  adding: cont

In [None]:
eval_model_path = "./" + params['SBERT_SAVE_PATH'] + "/" # local
# eval_model_path = "brjezierski/sentence-embeddings" # classification model
# eval_model_path = "brjezierski/S3BERT"

test_df_wpred = infer(test_df, eval_model_path, params)


In [None]:
get_spearman_scores(test_df_wpred, params)
# get_mse_scores(test_df_wpred, params)

# We want to be better than 0.92

# (0.9141509653876244, 0.9184827961965057) - N64 w companies and countries e=5, only consistency loss?
# (0.9137460700377446, 0.9184827961965057) - N64 w companies e=5, only consistency loss?
# (0.9155227750058739, 0.9184827961965057) - N16 w companies e=5, only consistency loss?
# (0.9169006278383008, 0.9184827961965057) - N16 w companies and countries e=5, only consistency loss?
# (0.9163748682048749, 0.9184827961965057) - N1 w companies and countries e=5, only consistency loss?

# (0.9158672382139807, 0.9184827961965057) - for 0.4main, only consistency loss
# (0.9180730113886992, 0.9184827961965057) - N16 w companies and countries e=5, SBERT embeddings, only consistency loss
# (0.9171423564053932, 0.9184827961965057) - N16 w companies, countries, classification, keywords e=5, SBERT embeddings, only consistency loss
# (0.9182845238849051, 0.9184827961965057) - N16 w keywords e=5, SBERT embeddings, only consistency loss
# (0.9134922550422976, 0.9184827961965057) - N25 w companies, countries, classification e=5, SBERT embeddings, both losses

# (0.8299206461842668, 0.9285338151296241) - using weighted_similarity as similarity metric with e=7 /w contrastive loss (['country', 'company', 'keywords', 'classification'], (10, 2, 1, 1, 1) - same below)
# (0.873655387185465, 0.9285338151296241) - using weighted_similarity as similarity metric with e=2 and simple evaluator /w contrastive loss
# (0.8829075480909276, 0.9285338151296241) - using weighted_similarity as similarity metric with e=7 and simple evaluator /w contrastive loss
# (0.9197107224307504, 0.9184827961965057) - using weighted_similarity as similarity metric with e=7 and simple evaluator /w cosine similarity loss
# (0.9143624778838304, 0.9184827961965057) - using weighted_similarity as similarity metric with e=5 and simple evaluator /w cosine similarity loss and distill loss N16
# (0.9139334096772413, 0.9184827961965057) - using weighted_similarity as similarity metric with e=10 and simple evaluator /w cosine similarity loss and distill loss N16
# (0.9133532611162194, 0.9184827961965057) - using weighted_similarity as similarity metric with e=10 and simple evaluator /w cosine similarity loss and distill loss N24

# (0.915559034290938, 0.9184827961965057) - e=5 and simple evaluator /w cosine similarity loss and distill loss N24
# (0.9149244968023202, 0.9184827961965057) - e=5 and simple evaluator /w cosine similarity loss and distill loss N16
# (0.9159820592833496, 0.9184827961965057) - e=5 and simple evaluator /w cosine similarity loss
# (0.9201518770656941, 0.9184827961965057) - using weighted_similarity as similarity metric with e=15 and simple evaluator /w cosine similarity loss
# (0.9214813841847025, 0.9184827961965057) - using weighted_similarity as similarity metric with e=50 and simple evaluator /w cosine similarity loss
# (0.9181394867446495, 0.9184827961965057) - using weighted_similarity as similarity metric with e=15 and simple evaluator /w cosine similarity loss (S3BERT)

# (0.9147855028762422, 0.9184827961965057) - classification model, using weighted_similarity as similarity metric with e=7 and simple evaluator /w cosine similarity loss



Baseline similarity 0.9184827961965057
Global similarity 0.9217775016793907


# Experiment 1

In [None]:
params = {}
params["UNFREEZE_LAYERS"] = 2
params["USE_WEIGHTED_SIMILARITY"] = False
params["USE_MANUAL_LABELS"] = True # whether to use val-test labels that I created manually or some similarity metric
params["LOSSES"] = ["consistency", "distill2"] # ["consistency", "distill", "distill2", "cosine"]
params["EPOCHS"] = 10
params["FEATURES"] = ['input1', 'input2']
params["USE_DISTILL"] = any('distill' in loss for loss in params["LOSSES"])
if params["USE_DISTILL"]:
  params["FEATURES"] += ['company', 'country', 'classification', 'keywords'] # ['company', 'country', 'classification', 'keywords']
  # params["FEATURES"] += ['similarity']
params["N"] = len(params["FEATURES"]) - 2 # number of metrics
params["FEATURE_DIM"] = 16 if params["USE_DISTILL"] else 0
params["EVAL_STEPS"] = 1000

params["BATCH_SIZE"] = 64
params["WARMUP_STEPS"] = 100
params["LEARNING_RATE"] = 2e-5
params["SBERT_INIT"] = "all-MiniLM-L12-v2" # "intfloat/e5-small-v2" # "brjezierski/S3BERT"
params["SBERT_DIM"] = 384

# where to save model and logs
params["SBERT_SAVE_PATH"] = "distill_consistency/"


column_list = []
column_proportions = ()

train_df, val_df, test_df = get_data(column_list, column_proportions, params, current_prefix='thesis/sbert-all/') # replace_no_tags

In [None]:
# - S3BERT approach - graph for FEATURE_DIM, write down how many epochs (params["LOSSES"] = ["consistency", "distill2"] )
for feature_dim in [0, 2]: # 4, 8, 16, 32,
  print("Feature dimension", feature_dim)
  model = SentenceTransformer(params["SBERT_INIT"], device="cuda")
  params["FEATURE_DIM"] = feature_dim
  params["EPOCHS"] = 10
  train(model, params, train_df, val_df)
  eval_model_path = "./" + params['SBERT_SAVE_PATH'] + "/" # local
  test_df_wpred = infer(test_df, eval_model_path, params)
  print("Spearman")
  get_spearman_scores(test_df_wpred, params)
  print("MSE")
  get_mse_scores(test_df_wpred, params)

Feature dimension 0
['company_similarity', 'country_similarity', 'classification_similarity', 'keywords_similarity', 'similarity']
['company_similarity', 'country_similarity', 'classification_similarity', 'keywords_similarity', 'similarity']
['company_similarity', 'country_similarity', 'classification_similarity', 'keywords_similarity']
['company_similarity', 'country_similarity', 'classification_similarity', 'keywords_similarity']
['similarity']
['similarity']


Epoch:   0%|          | 0/10 [00:00<?, ?it/s]

Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

features 2


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


features 2
Score at epoch 0, step -1: -44.94047546386719


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

features 2
features 2
Score at epoch 1, step -1: -43.81487274169922


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

features 2
features 2
Score at epoch 2, step -1: -42.87238311767578


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

features 2
features 2
Score at epoch 3, step -1: -41.66858673095703


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

features 2
features 2
Score at epoch 4, step -1: -40.92528533935547


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

features 2
features 2
Score at epoch 5, step -1: -40.27954864501953


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

features 2
features 2
Score at epoch 6, step -1: -39.68590545654297


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

features 2
features 2
Score at epoch 7, step -1: -39.3345947265625


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

features 2
features 2
Score at epoch 8, step -1: -39.09546661376953


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

features 2
features 2
Score at epoch 9, step -1: -39.09654235839844


ValueError: ignored

# Experiment 2

In [None]:
# for column in ['country', 'company', 'keywords', 'classification']:
#   print(column)
#   for ratio in range(1, 51):
#     test_df_wpred = create_weighted_avg(test_df_wpred, [column], (100-ratio, ratio), 'weighted_avg')
#     weighted_sr = '%.4f'%(spearmanr(test_df_wpred["label"].tolist(), test_df_wpred["weighted_avg"].tolist())[0])
#     s3bert_sr = '%.3f'%(spearmanr(test_df_wpred["label"].tolist(), test_df_wpred["similarity"].tolist())[0])
#     print(f'{ratio}', weighted_sr)


# test_df_wpred = create_weighted_avg(test_df_wpred, ['country'], (6, 1), 'weighted_avg')
# test_df_wpred = create_weighted_avg(test_df_wpred, ['company'], (5, 1), 'weighted_avg')
# test_df_wpred = create_weighted_avg(test_df_wpred, ['country', 'company'], (12, 2, 1), 'weighted_avg')
# test_df_wpred = create_weighted_avg(test_df_wpred, ['classification'], (7, 1), 'weighted_avg')
# test_df_wpred = create_weighted_avg(test_df_wpred, ['keywords'], (7, 1), 'weighted_avg')
# test_df_wpred = create_weighted_avg(test_df_wpred, ['country', 'company', 'keywords', 'classification'], (10, 2, 1, 1, 1), 'weighted_avg') # used for training
test_df_wpred = create_weighted_avg(test_df_wpred, ['country', 'company', 'keywords', 'classification'], (59, 10, 20, 7, 5), 'weighted_avg') # used for training

# baseline - 0.918
# best result - for country:company:keywords:classification=10:2:1:1:1 - 0.929
# for country:company:keywords=12:2:1:1 - 0.927
# for country:company=12:2:1 - 0.925
# for country=6:1 - 0.922
# for company=5:1 - 0.921 (for Glanos emb. 0.919)
# for classification=7:1 - 0.920
# for keywords=7:1 - 0.921

# Experiment 3

In [None]:
from scipy.stats import spearmanr
for embedding_type in ['glanos', 'sbert']:
  for definition_weight in np.arange(0.0, 1.1, 0.1):
      suffix = str(definition_weight)[:3]
      train_df, val_df, test_df = get_data(column_list, column_proportions, params, current_prefix=f'thesis/{embedding_type}-company-{suffix}/') #='thesis/replace_no_tags/')
      test_df = create_weighted_avg(test_df, ['company'], (5, 1), 'weighted_avg')
      weighted_sr = '%.4f'%(spearmanr(test_df["label"].tolist(), test_df["weighted_avg"].tolist())[0])
      s3bert_sr = '%.3f'%(spearmanr(test_df["label"].tolist(), test_df["similarity"].tolist())[0])
      print(f"{suffix}: {weighted_sr}")

0.0: 0.9214
0.1: 0.9196
0.2: 0.9190
0.3: 0.9188
0.4: 0.9185
0.5: 0.9190
0.6: 0.9189
0.7: 0.9189
0.8: 0.9191
0.9: 0.9182
1.0: 0.9177


# Experiment 4

In [None]:
params = {}
params["UNFREEZE_LAYERS"] = 2
params["USE_WEIGHTED_SIMILARITY"] = True
params["USE_MANUAL_LABELS"] = True # whether to use val-test labels that I created manually or some similarity metric
params["LOSSES"] = ["cosine"] # ["consistency", "distill", "distill2", "cosine"]
params["EPOCHS"] = 15
params["FEATURES"] = ['input1', 'input2']
params["USE_DISTILL"] = any('distill' in loss for loss in params["LOSSES"])
if params["USE_DISTILL"]:
  params["FEATURES"] += ['company', 'country', 'classification', 'keywords'] # ['company', 'country', 'classification', 'keywords']
  # params["FEATURES"] += ['similarity']
params["N"] = len(params["FEATURES"]) - 2 # number of metrics
params["FEATURE_DIM"] = 16 if params["USE_DISTILL"] else 0
params["EVAL_STEPS"] = 1000

params["BATCH_SIZE"] = 64
params["WARMUP_STEPS"] = 100
params["LEARNING_RATE"] = 2e-5
params["SBERT_DIM"] = 384

column_list = ['country', 'company', 'keywords', 'classification']
column_proportions = (10, 2, 1, 1, 1)

train_df, val_df, test_df = get_data(column_list, column_proportions, params, current_prefix='thesis/sbert-all/')

In [None]:
for model_name in ["all-mpnet-base-v2", "BAAI/bge-small-en-v1.5", "thenlper/gte-small"]:
#  ["all-MiniLM-L12-v2", "brjezierski/S3BERT", "intfloat/e5-small-v2", "thenlper/gte-base", "BAAI/bge-base-en-v1.5", "intfloat/e5-base-v2"]:
  params["SBERT_INIT"] = model_name
  params["SBERT_SAVE_PATH"] = "s3bert_" + params["SBERT_INIT"] + "/"
  model = SentenceTransformer(params["SBERT_INIT"], device="cuda")
  freeze.freeze_except_last_layers(model, 2)
  train(model, params, train_df, val_df)

  eval_model_path = "./" + params['SBERT_SAVE_PATH'] + "/" # local
  print('eval_model_path', eval_model_path)
  test_df_wpred = infer(test_df, eval_model_path, params)
  print(f'{model_name}')
  get_spearman_scores(test_df_wpred, params)
  get_mse_scores(test_df_wpred, params)

# all-MiniLM-L12-v2
# Baseline similarity 0.9184827961965057
# Global similarity 0.9202123092074671
# Baseline similarity 0.019708063019580228
# Global similarity 0.03248083875776108

# brjezierski/S3BERT
# Baseline similarity 0.9184827961965057
# Global similarity 0.9125434704164598
# Baseline similarity 0.019708063019580228
# Global similarity 0.03359967325817208

# thenlper/gte-base
# Baseline similarity 0.9184827961965057
# Global similarity 0.9232822620095411
# Baseline similarity 0.019708063019580228
# Global similarity 0.03237488916070081

# BAAI/bge-base-en-v1.5
# Baseline similarity 0.9184827961965057
# Global similarity 0.9233970830789099
# Baseline similarity 0.019708063019580228
# Global similarity 0.040280322166003926

# intfloat/e5-base-v2
# Baseline similarity 0.9184827961965057
# Global similarity 0.9158672382139807
# Baseline similarity 0.019708063019580228
# Global similarity 0.03137516868461418

# BAAI/bge-small-en-v1.5
# Baseline similarity 0.9184827961965057
# Global similarity 0.9164896892742438
# Baseline similarity 0.019708063019580228
# Global similarity 0.07869910687513633

# intfloat/e5-small-v2
# Baseline similarity 0.9184827961965057
# Global similarity 0.9224543416672495
# Baseline similarity 0.019708063019580228
# Global similarity 0.21485611031595883

# thenlper/gte-small
# Baseline similarity 0.9184827961965057
# Global similarity 0.9192756110099839
# Baseline similarity 0.019708063019580228
# Global similarity 0.22145066397811117





# all-mpnet-base-v2
# Baseline similarity 0.9184827961965057
# Global similarity 0.9279536665686023
# Baseline similarity 0.019708063019580228
# Global similarity 0.032357109701102046

# BAAI/bge-small-en-v1.5
# Baseline similarity 0.9184827961965057
# Global similarity 0.9196865495740412
# Baseline similarity 0.019708063019580228
# Global similarity 0.043707179333344115

# thenlper/gte-small
# Baseline similarity 0.9184827961965057
# Global similarity 0.9143080889562345
# Baseline similarity 0.019708063019580228
# Global similarity 0.03434666141386442

Downloading (…)a8e1d/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)b20bca8e1d/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)0bca8e1d/config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)e1d/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading (…)a8e1d/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

Downloading (…)8e1d/train_script.py:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

Downloading (…)b20bca8e1d/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)bca8e1d/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

['similarity']
['similarity']


Epoch:   0%|          | 0/15 [00:00<?, ?it/s]

Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 0, step -1: 0.9371651933962454


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 1, step -1: 0.945167552732646


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 2, step -1: 0.9441937206440892


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 3, step -1: 0.9458450011420766


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 4, step -1: 0.9442179152301402


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 5, step -1: 0.9429718940485088


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 6, step -1: 0.9442360611696787


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 7, step -1: 0.9448651204070071


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 8, step -1: 0.9443570340999341


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 9, step -1: 0.9445445421418303


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 10, step -1: 0.9446655150720856


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 11, step -1: 0.9414597324203152


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 12, step -1: 0.9447562447697773


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 13, step -1: 0.9436674883974777


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 14, step -1: 0.9436674883974777
eval_model_path ./s3bert_all-mpnet-base-v2//
all-mpnet-base-v2
Baseline similarity 0.9184827961965057
Global similarity 0.9279536665686023
Baseline similarity 0.019708063019580228
Global similarity 0.032357109701102046


Downloading (…)8fc4c/.gitattributes:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)a6f2e8fc4c/README.md:   0%|          | 0.00/90.3k [00:00<?, ?B/s]

Downloading (…)f2e8fc4c/config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/134M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Downloading (…)8fc4c/tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/394 [00:00<?, ?B/s]

Downloading (…)a6f2e8fc4c/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)2e8fc4c/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

['similarity']
['similarity']


Epoch:   0%|          | 0/15 [00:00<?, ?it/s]

Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 0, step -1: 0.9261506080964831


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 1, step -1: 0.933717464883964


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 2, step -1: 0.9345400808097013


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 3, step -1: 0.9355441561308219


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 4, step -1: 0.9383386308197236


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 5, step -1: 0.9390705170477693


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 6, step -1: 0.9367204040007723


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 7, step -1: 0.9388406684802838


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 8, step -1: 0.9376430364707544


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 9, step -1: 0.9386410631453623


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 10, step -1: 0.9388527657733095


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 11, step -1: 0.9383386308197236


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 12, step -1: 0.9385926739732602


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 13, step -1: 0.9389555927640265


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 14, step -1: 0.938731792843054
eval_model_path ./s3bert_BAAI/bge-small-en-v1.5//
BAAI/bge-small-en-v1.5
Baseline similarity 0.9184827961965057
Global similarity 0.9196865495740412
Baseline similarity 0.019708063019580228
Global similarity 0.043707179333344115


Downloading (…)9cf89/.gitattributes:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)a1bfc9cf89/README.md:   0%|          | 0.00/68.1k [00:00<?, ?B/s]

Downloading (…)bfc9cf89/config.json:   0%|          | 0.00/583 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/66.7M [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/66.8M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/57.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Downloading (…)9cf89/tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/394 [00:00<?, ?B/s]

Downloading (…)a1bfc9cf89/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)fc9cf89/modules.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

['similarity']
['similarity']


Epoch:   0%|          | 0/15 [00:00<?, ?it/s]

Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 0, step -1: 0.9215899286258511


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 1, step -1: 0.9432682777276348


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 2, step -1: 0.9418831376762093


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 3, step -1: 0.9419315268483116


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 4, step -1: 0.9421129862436948


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 5, step -1: 0.9420948403041565


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 6, step -1: 0.9429053589368683


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 7, step -1: 0.9425666347321529


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 8, step -1: 0.9429900399880472


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 9, step -1: 0.9433771533648647


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 10, step -1: 0.9430747210392261


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 11, step -1: 0.9430505264531748


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 12, step -1: 0.9429718940485088


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 13, step -1: 0.9430444778066621


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 14, step -1: 0.9430686723927132
eval_model_path ./s3bert_thenlper/gte-small//
thenlper/gte-small
Baseline similarity 0.9184827961965057
Global similarity 0.9143080889562345
Baseline similarity 0.019708063019580228
Global similarity 0.03434666141386442


# Experiment 5 - AI CAR - replace_no_tags

In [None]:
# AI CAR - replace_no_tags

params = {}
params["UNFREEZE_LAYERS"] = 2
params["USE_WEIGHTED_SIMILARITY"] = False
params["USE_MANUAL_LABELS"] = False # whether to use val-test labels that I created manually or some similarity metric
params["LOSSES"] = ["cosine"] # ["consistency", "distill", "distill2", "cosine"]
params["EPOCHS"] = 5
params["FEATURES"] = ['input1', 'input2']
params["USE_DISTILL"] = any('distill' in loss for loss in params["LOSSES"])
if params["USE_DISTILL"]:
  params["FEATURES"] += ['company', 'country', 'classification', 'keywords'] # ['company', 'country', 'classification', 'keywords']
  # params["FEATURES"] += ['similarity']
params["N"] = len(params["FEATURES"]) - 2 # number of metrics
params["FEATURE_DIM"] = 16 if params["USE_DISTILL"] else 0
params["EVAL_STEPS"] = 1500
params["SBERT_INIT"] = "all-MiniLM-L12-v2"
params["SBERT_SAVE_PATH"] = "s3bert_" + params["SBERT_INIT"] + "/"


params["BATCH_SIZE"] = 64
params["WARMUP_STEPS"] = 100
params["LEARNING_RATE"] = 2e-5
params["SBERT_DIM"] = 384

column_list = []
column_proportions = ()

train_df, val_df, test_df = get_data(column_list, column_proportions, params, current_prefix='thesis/ai_car/sbert-company-0.0/') # replace_no_tags

  train_df = pd.read_csv(f'{current_prefix}train.tsv', sep='\t', index_col=0)


In [None]:
model = SentenceTransformer(params["SBERT_INIT"], device="cuda")
freeze.freeze_except_last_layers(model, 2)
train(model, params, train_df, val_df)

['similarity']
['similarity']


Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3435 [00:00<?, ?it/s]

Batches:   0%|          | 0/13 [00:00<?, ?it/s]

Batches:   0%|          | 0/13 [00:00<?, ?it/s]

Score at epoch 0, step 1500: 0.7236045901147529


Batches:   0%|          | 0/13 [00:00<?, ?it/s]

Batches:   0%|          | 0/13 [00:00<?, ?it/s]

Score at epoch 0, step 3000: 0.7436690917272932


Batches:   0%|          | 0/13 [00:00<?, ?it/s]

Batches:   0%|          | 0/13 [00:00<?, ?it/s]

Score at epoch 0, step -1: 0.7330083252081304


Iteration:   0%|          | 0/3435 [00:00<?, ?it/s]

Batches:   0%|          | 0/13 [00:00<?, ?it/s]

Batches:   0%|          | 0/13 [00:00<?, ?it/s]

Score at epoch 1, step 1500: 0.7392754818870473


Batches:   0%|          | 0/13 [00:00<?, ?it/s]

Batches:   0%|          | 0/13 [00:00<?, ?it/s]

Score at epoch 1, step 3000: 0.7395874896872422


Batches:   0%|          | 0/13 [00:00<?, ?it/s]

Batches:   0%|          | 0/13 [00:00<?, ?it/s]

Score at epoch 1, step -1: 0.7478091952298808


Iteration:   0%|          | 0/3435 [00:00<?, ?it/s]

Batches:   0%|          | 0/13 [00:00<?, ?it/s]

Batches:   0%|          | 0/13 [00:00<?, ?it/s]

Score at epoch 2, step 1500: 0.7495612390309758


Batches:   0%|          | 0/13 [00:00<?, ?it/s]

Batches:   0%|          | 0/13 [00:00<?, ?it/s]

Score at epoch 2, step 3000: 0.743603090077252


Batches:   0%|          | 0/13 [00:00<?, ?it/s]

Batches:   0%|          | 0/13 [00:00<?, ?it/s]

Score at epoch 2, step -1: 0.7437515937898449


Iteration:   0%|          | 0/3435 [00:00<?, ?it/s]

Batches:   0%|          | 0/13 [00:00<?, ?it/s]

Batches:   0%|          | 0/13 [00:00<?, ?it/s]

Score at epoch 3, step 1500: 0.7490122253056327


Batches:   0%|          | 0/13 [00:00<?, ?it/s]

Batches:   0%|          | 0/13 [00:00<?, ?it/s]

Score at epoch 3, step 3000: 0.7477941948548715


Batches:   0%|          | 0/13 [00:00<?, ?it/s]

Batches:   0%|          | 0/13 [00:00<?, ?it/s]

Score at epoch 3, step -1: 0.7486027150678767


Iteration:   0%|          | 0/3435 [00:00<?, ?it/s]

Batches:   0%|          | 0/13 [00:00<?, ?it/s]

Batches:   0%|          | 0/13 [00:00<?, ?it/s]

Score at epoch 4, step 1500: 0.7500097502437562


Batches:   0%|          | 0/13 [00:00<?, ?it/s]

Batches:   0%|          | 0/13 [00:00<?, ?it/s]

Score at epoch 4, step 3000: 0.7504807620190507


Batches:   0%|          | 0/13 [00:00<?, ?it/s]

Batches:   0%|          | 0/13 [00:00<?, ?it/s]

Score at epoch 4, step -1: 0.7499572489312234


NameError: ignored

In [None]:
eval_model_path = "./" + params['SBERT_SAVE_PATH'] + "/" # local
test_df_wpred = infer(test_df, eval_model_path, params)
get_spearman_scores(test_df_wpred, params)
get_mse_scores(test_df_wpred, params)


# Baseline similarity 1.0
# SBERT similarity 0.37452036300907526
# Global similarity 0.7156423910597766
# Baseline similarity 0.0
# SBERT similarity 0.018631956310600203
# Global similarity 0.008349266474003067


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Baseline similarity 1.0
Global similarity 0.7156423910597766
Baseline similarity 0.0
Global similarity 0.008349266474003067


# Experiment 6 - AI CAR - snippet

In [None]:
# AI CAR - snippet

params = {}
params["UNFREEZE_LAYERS"] = 2
params["USE_WEIGHTED_SIMILARITY"] = True
params["USE_MANUAL_LABELS"] = False # whether to use val-test labels that I created manually or some similarity metric
params["LOSSES"] = ["cosine"] # ["consistency", "distill", "distill2", "cosine"]
params["EPOCHS"] = 5
params["FEATURES"] = ['input1', 'input2']
params["USE_DISTILL"] = any('distill' in loss for loss in params["LOSSES"])
if params["USE_DISTILL"]:
  params["FEATURES"] += ['company', 'country', 'classification', 'keywords'] # ['company', 'country', 'classification', 'keywords']
  # params["FEATURES"] += ['similarity']
params["N"] = len(params["FEATURES"]) - 2 # number of metrics
params["FEATURE_DIM"] = 16 if params["USE_DISTILL"] else 0
params["EVAL_STEPS"] = 1500
params["SBERT_INIT"] = "all-MiniLM-L12-v2"
params["SBERT_SAVE_PATH"] = "s3bert_" + params["SBERT_INIT"] + "/"


params["BATCH_SIZE"] = 64
params["WARMUP_STEPS"] = 100
params["LEARNING_RATE"] = 2e-5
params["SBERT_DIM"] = 384

column_list = ['company', 'keyword', 'classification']
column_proportions = (10, 2, 1, 1)

train_df, val_df, test_df = get_data(column_list, column_proportions, params, current_prefix='thesis/ai_car/snippet/sbert-company-0.0/')

  train_df = pd.read_csv(f'{current_prefix}train.tsv', sep='\t', index_col=0)


In [None]:
model = SentenceTransformer(params["SBERT_INIT"], device="cuda")
freeze.freeze_except_last_layers(model, 2)
train(model, params, train_df, val_df)

In [None]:
eval_model_path = "./" + params['SBERT_SAVE_PATH'] + "/" # local
test_df_wpred = infer(test_df, eval_model_path, params)
get_spearman_scores(test_df_wpred, params)
get_mse_scores(test_df_wpred, params)

test_df2 = test_df.copy()
test_df2['label'] = test_df['weighted_similarity']
eval_model_path = params["SBERT_INIT"]
test_df_wpred = infer(test_df2, eval_model_path, params)
get_spearman_scores(test_df_wpred, params)
get_mse_scores(test_df_wpred, params)

# Baseline similarity 1.0
# Global similarity 0.9368694217355434
# SBERT similarity 0.8550888772219306

# Baseline similarity 0.0
# Global similarity 0.008752451263758298
# SBERT similarity 0.011448911798255001

# weighted similarity vs predicted

# Experiment 7

In [None]:
params = {}
params["UNFREEZE_LAYERS"] = 2
params["USE_WEIGHTED_SIMILARITY"] = False
params["USE_MANUAL_LABELS"] = True # whether to use val-test labels that I created manually or some similarity metric
params["LOSSES"] = ["consistency", "distill2"] # ["consistency", "distill", "distill2", "cosine"]
params["EPOCHS"] = 10
params["FEATURES"] = ['input1', 'input2']
params["USE_DISTILL"] = any('distill' in loss for loss in params["LOSSES"])
if params["USE_DISTILL"]:
  params["FEATURES"] += ['company', 'country', 'classification', 'keywords'] # ['company', 'country', 'classification', 'keywords']
  # params["FEATURES"] += ['similarity']
params["N"] = len(params["FEATURES"]) - 2 # number of metrics
params["FEATURE_DIM"] = 16 if params["USE_DISTILL"] else 0
params["EVAL_STEPS"] = 1000

params["BATCH_SIZE"] = 64
params["WARMUP_STEPS"] = 100
params["LEARNING_RATE"] = 2e-5
params["SBERT_INIT"] = "all-MiniLM-L12-v2" # "intfloat/e5-small-v2" # "brjezierski/S3BERT"
params["SBERT_DIM"] = 384

# where to save model and logs
params["SBERT_SAVE_PATH"] = "distill_consistency/"


column_list = []
column_proportions = ()

train_df, val_df, test_df = get_data(column_list, column_proportions, params, current_prefix='thesis/sbert-all/') # replace_no_tags

In [None]:
for feature_dim in [64]: # 4, 8, 16, 32,
  print("Feature dimension", feature_dim)
  model = SentenceTransformer(params["SBERT_INIT"], device="cuda")
  params["FEATURE_DIM"] = feature_dim
  train(model, params, train_df, val_df)
  eval_model_path = "./" + params['SBERT_SAVE_PATH'] + "/" # local
  test_df_wpred = infer(test_df, eval_model_path, params)
  print("Spearman")
  get_spearman_scores(test_df_wpred, params)
  print("MSE")
  get_mse_scores(test_df_wpred, params)

# Spearman
# Baseline similarity 0.9184827961965057
# Global similarity 0.9002999184932281
# Residual similarity 0.837353799622359
# company similarity 0.6519516716966302
# country similarity 0.452573412677717
# classification similarity 0.8622008949513379
# keywords similarity 0.5717251854806715
# MSE
# Baseline similarity 0.019708063019580228
# Global similarity 0.03482641690259627
# Residual similarity 0.03464998638057867
# company similarity 0.19201520618566992
# country similarity 0.12265177828733297
# classification similarity 0.05252195399472107
# keywords similarity 0.10650159641635333

Feature dimension 64
['company_similarity', 'country_similarity', 'classification_similarity', 'keywords_similarity', 'similarity']
['company_similarity', 'country_similarity', 'classification_similarity', 'keywords_similarity', 'similarity']
['company_similarity', 'country_similarity', 'classification_similarity', 'keywords_similarity']
['company_similarity', 'country_similarity', 'classification_similarity', 'keywords_similarity']
['similarity']
['similarity']


Epoch:   0%|          | 0/10 [00:00<?, ?it/s]

Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

features 2


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


features 2
Score at epoch 0, step -1: -10.587505340576172


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

features 2
features 2
Score at epoch 1, step -1: -9.727346420288086


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

features 2
features 2
Score at epoch 2, step -1: -9.594633102416992


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

features 2
features 2
Score at epoch 3, step -1: -9.176380157470703


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

features 2
features 2
Score at epoch 4, step -1: -8.796642303466797


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

features 2
features 2
Score at epoch 5, step -1: -8.733521461486816


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

features 2
features 2
Score at epoch 6, step -1: -8.473165512084961


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

features 2
features 2
Score at epoch 7, step -1: -8.436488151550293


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

features 2
features 2
Score at epoch 8, step -1: -8.39621353149414


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

features 2
features 2
Score at epoch 9, step -1: -8.383316993713379


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Spearman
Baseline similarity 0.9184827961965057
Global similarity 0.9002999184932281
Residual similarity 0.837353799622359
company similarity 0.6519516716966302
country similarity 0.452573412677717
classification similarity 0.8622008949513379
keywords similarity 0.5717251854806715
MSE
Baseline similarity 0.019708063019580228
Global similarity 0.03482641690259627
Residual similarity 0.03464998638057867
company similarity 0.19201520618566992
country similarity 0.12265177828733297
classification similarity 0.05252195399472107
keywords similarity 0.10650159641635333


In [None]:
!zip -r /content/distill_consistency.zip /content/distill_consistency
%cp -av "/content/distill_consistency.zip" "/content/drive/MyDrive"

  adding: content/distill_consistency/ (stored 0%)
  adding: content/distill_consistency/pytorch_model.bin (deflated 8%)
  adding: content/distill_consistency/config_sentence_transformers.json (deflated 27%)
  adding: content/distill_consistency/2_Normalize/ (stored 0%)
  adding: content/distill_consistency/eval/ (stored 0%)
  adding: content/distill_consistency/eval/accuracy_evaluation_results.csv (deflated 57%)
  adding: content/distill_consistency/config.json (deflated 48%)
  adding: content/distill_consistency/added_tokens.json (deflated 37%)
  adding: content/distill_consistency/modules.json (deflated 62%)
  adding: content/distill_consistency/sentence_bert_config.json (deflated 4%)
  adding: content/distill_consistency/vocab.txt (deflated 53%)
  adding: content/distill_consistency/README.md (deflated 58%)
  adding: content/distill_consistency/tokenizer.json (deflated 71%)
  adding: content/distill_consistency/tokenizer_config.json (deflated 73%)
  adding: content/distill_consiste

# Experiment 8 - no results

In [None]:
params = {}
params["UNFREEZE_LAYERS"] = 2
params["USE_WEIGHTED_SIMILARITY"] = False
params["USE_MANUAL_LABELS"] = False # whether to use val-test labels that I created manually or some similarity metric
params["LOSSES"] = ["consistency", "distill2"] # ["consistency", "distill", "distill2", "cosine"]
params["EPOCHS"] = 5
params["FEATURES"] = ['input1', 'input2']
params["USE_DISTILL"] = any('distill' in loss for loss in params["LOSSES"])
if params["USE_DISTILL"]:
  params["FEATURES"] += ['company', 'classification', 'keyword'] # ['company', 'country', 'classification', 'keywords']
  # params["FEATURES"] += ['similarity']
params["N"] = len(params["FEATURES"]) - 2 # number of metrics
params["EVAL_STEPS"] = 1500

params["BATCH_SIZE"] = 64
params["WARMUP_STEPS"] = 100
params["LEARNING_RATE"] = 2e-5
params["SBERT_INIT"] = "all-MiniLM-L12-v2" # "intfloat/e5-small-v2" # "brjezierski/S3BERT"
params["SBERT_DIM"] = 384

# where to save model and logs
params["SBERT_SAVE_PATH"] = "distill_consistency_ai_car/"


column_list = []
column_proportions = ()

train_df, val_df, test_df = get_data(column_list, column_proportions, params, current_prefix='thesis/ai_car/snippet/sbert-company-0.0/')

  train_df = pd.read_csv(f'{current_prefix}train.tsv', sep='\t', index_col=0)


In [None]:
for feature_dim in [64]: # 4, 8, 16, 32,
  print("Feature dimension", feature_dim)
  model = SentenceTransformer(params["SBERT_INIT"], device="cuda")
  params["FEATURE_DIM"] = feature_dim
  train(model, params, train_df, val_df)


Feature dimension 64
['company_similarity', 'classification_similarity', 'keyword_similarity', 'similarity']
['company_similarity', 'classification_similarity', 'keyword_similarity', 'similarity']
['company_similarity', 'classification_similarity', 'keyword_similarity']
['company_similarity', 'classification_similarity', 'keyword_similarity']
['similarity']
['similarity']


Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3435 [00:00<?, ?it/s]

features 2


  return F.mse_loss(input, target, reduction=self.reduction)


features 2
features 2
features 2
Score at epoch 0, step 1500: -3.4180026054382324


  return F.mse_loss(input, target, reduction=self.reduction)


In [None]:
eval_model_path = "./" + params['SBERT_SAVE_PATH'] + "/" # local
test_df_wpred = infer(test_df, eval_model_path, params)
get_spearman_scores(test_df_wpred, params)
get_mse_scores(test_df_wpred, params)

test_df2 = test_df.copy()
test_df2['label'] = test_df['weighted_similarity']
eval_model_path = params["SBERT_INIT"]
test_df_wpred = infer(test_df2, eval_model_path, params)
get_spearman_scores(test_df_wpred, params)
get_mse_scores(test_df_wpred, params)


In [None]:
!zip -r /content/distill_consistency_ai_car.zip /content/distill_consistency_ai_car
%cp -av "/content/distill_consistency_ai_car.zip" "/content/drive/MyDrive"

# Experiment 9

In [None]:
!unzip combined-ai_car-class-consulting-sim.zip

In [None]:
# - combined ai+car class->consulting sim
params = {}
params["UNFREEZE_LAYERS"] = 2
params["USE_WEIGHTED_SIMILARITY"] = True
params["USE_MANUAL_LABELS"] = True # whether to use val-test labels that I created manually or some similarity metric
params["LOSSES"] = ["cosine"] # ["consistency", "distill", "distill2", "cosine"]
params["EPOCHS"] = 25
params["FEATURES"] = ['input1', 'input2']
params["USE_DISTILL"] = any('distill' in loss for loss in params["LOSSES"])
if params["USE_DISTILL"]:
  params["FEATURES"] += ['company', 'country', 'classification', 'keywords'] # ['company', 'country', 'classification', 'keywords']
  # params["FEATURES"] += ['similarity']
params["N"] = len(params["FEATURES"]) - 2 # number of metrics
params["FEATURE_DIM"] = 16 if params["USE_DISTILL"] else 0
params["EVAL_STEPS"] = 1000

params["BATCH_SIZE"] = 64
params["WARMUP_STEPS"] = 100
params["LEARNING_RATE"] = 2e-5
params["SBERT_DIM"] = 384

column_list = ['country', 'company', 'keywords', 'classification']
column_proportions = (10, 2, 1, 1, 1)

train_df, val_df, test_df = get_data(column_list, column_proportions, params, current_prefix='thesis/sbert-all/')

In [None]:
for model_name in ["brjezierski/sentence-embeddings-classification-ai_car-sbert"]:
  params["SBERT_INIT"] = model_name
  params["SBERT_SAVE_PATH"] = "content/combined-ai_car-class-consulting-sim/"
  # model = SentenceTransformer(params["SBERT_INIT"], device="cuda")
  # freeze.freeze_except_last_layers(model, 2)
  # train(model, params, train_df, val_df)

  eval_model_path = "./" + params['SBERT_SAVE_PATH'] + "/" # local
  print('eval_model_path', eval_model_path)
  test_df_wpred = infer(test_df, eval_model_path, params)
  print(f'{model_name}')
  get_spearman_scores(test_df_wpred, params)
  get_mse_scores(test_df_wpred, params)

  test_df2 = test_df.copy()
  test_df2['label'] = test_df['weighted_similarity']
  eval_model_path = params["SBERT_INIT"]
  test_df_wpred = infer(test_df2, eval_model_path, params)
  get_spearman_scores(test_df_wpred, params)
  get_mse_scores(test_df_wpred, params)

# Baseline similarity 0.9184827961965057
# Global similarity 0.8834151780818218
# Baseline similarity 0.019708063019580228
# Global similarity 0.041798828459223246

eval_model_path ./content/combined-ai_car-class-consulting-sim//


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


brjezierski/sentence-embeddings-classification-ai_car-sbert
Baseline similarity 0.9184827961965057
Global similarity 0.8834151780818218
Baseline similarity 0.019708063019580228
Global similarity 0.041798828459223246


Downloading (…)e6d53/.gitattributes:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)aede7e6d53/README.md:   0%|          | 0.00/2.66k [00:00<?, ?B/s]

Downloading (…)de7e6d53/config.json:   0%|          | 0.00/739 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)n_Glanos_results.csv:   0%|          | 0.00/84.0 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/134M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Downloading (…)e6d53/tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

Downloading (…)aede7e6d53/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)e7e6d53/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

Baseline similarity 0.990288613046509
Global similarity 0.5237083708370835
Baseline similarity 0.006873229366766006
Global similarity 0.2962820969415752


In [None]:
!zip -r /content/combined-ai_car-class-consulting-sim.zip /content/combined-ai_car-class-consulting-sim
%cp -av "/content/combined-ai_car-class-consulting-sim.zip" "/content/drive/MyDrive"

  adding: content/combined-ai_car-class-consulting-sim/ (stored 0%)
  adding: content/combined-ai_car-class-consulting-sim/pytorch_model.bin (deflated 8%)
  adding: content/combined-ai_car-class-consulting-sim/config_sentence_transformers.json (deflated 27%)
  adding: content/combined-ai_car-class-consulting-sim/2_Normalize/ (stored 0%)
  adding: content/combined-ai_car-class-consulting-sim/eval/ (stored 0%)
  adding: content/combined-ai_car-class-consulting-sim/eval/similarity_evaluation_results.csv (deflated 62%)
  adding: content/combined-ai_car-class-consulting-sim/config.json (deflated 48%)
  adding: content/combined-ai_car-class-consulting-sim/added_tokens.json (deflated 37%)
  adding: content/combined-ai_car-class-consulting-sim/modules.json (deflated 62%)
  adding: content/combined-ai_car-class-consulting-sim/sentence_bert_config.json (deflated 4%)
  adding: content/combined-ai_car-class-consulting-sim/vocab.txt (deflated 53%)
  adding: content/combined-ai_car-class-consulting-

# Experiment 10 - AI CAR - replace

In [None]:
!unzip similarity_replace.zip
! ls

Archive:  similarity_replace.zip
   creating: content/similarity_replace/
   creating: content/similarity_replace/2_Normalize/
  inflating: content/similarity_replace/tokenizer_config.json  
   creating: content/similarity_replace/eval/
  inflating: content/similarity_replace/eval/similarity_evaluation_results.csv  
  inflating: content/similarity_replace/vocab.txt  
  inflating: content/similarity_replace/README.md  
  inflating: content/similarity_replace/modules.json  
   creating: content/similarity_replace/1_Pooling/
  inflating: content/similarity_replace/1_Pooling/config.json  
  inflating: content/similarity_replace/config.json  
  inflating: content/similarity_replace/config_sentence_transformers.json  
  inflating: content/similarity_replace/tokenizer.json  
  inflating: content/similarity_replace/sentence_bert_config.json  
  inflating: content/similarity_replace/pytorch_model.bin  
  inflating: content/similarity_replace/special_tokens_map.json  
content  drive	sample_data 

In [None]:
# AI CAR - replace

params = {}
params["UNFREEZE_LAYERS"] = 2
params["USE_WEIGHTED_SIMILARITY"] = False
params["USE_MANUAL_LABELS"] = False # whether to use val-test labels that I created manually or some similarity metric
params["LOSSES"] = ["cosine"] # ["consistency", "distill", "distill2", "cosine"]
params["EPOCHS"] = 5
params["FEATURES"] = ['input1', 'input2']
params["USE_DISTILL"] = any('distill' in loss for loss in params["LOSSES"])
if params["USE_DISTILL"]:
  params["FEATURES"] += ['company', 'country', 'classification', 'keywords'] # ['company', 'country', 'classification', 'keywords']
  # params["FEATURES"] += ['similarity']
params["N"] = len(params["FEATURES"]) - 2 # number of metrics
params["FEATURE_DIM"] = 16 if params["USE_DISTILL"] else 0
params["EVAL_STEPS"] = 1500
params["SBERT_INIT"] = "all-MiniLM-L12-v2"
params["SBERT_SAVE_PATH"] = "similarity_replace/"


params["BATCH_SIZE"] = 64
params["WARMUP_STEPS"] = 100
params["LEARNING_RATE"] = 2e-5
params["SBERT_DIM"] = 384

column_list = []
column_proportions = ()

train_df, val_df, test_df = get_data(column_list, column_proportions, params, current_prefix='thesis/ai_car/replace/') # replace_no_tags

  train_df = pd.read_csv(f'{current_prefix}train.tsv', sep='\t', index_col=0)


In [None]:
model = SentenceTransformer(params["SBERT_INIT"], device="cuda")
freeze.freeze_except_last_layers(model, 2)
train(model, params, train_df, val_df)

In [None]:
eval_model_path = "./content/" + params['SBERT_SAVE_PATH'] # local
test_df_wpred = infer(test_df, eval_model_path, params)
get_spearman_scores(test_df_wpred, params)
get_mse_scores(test_df_wpred, params)

print("SBERT")
eval_model_path = params["SBERT_INIT"]
test_df_wpred = infer(test_df, eval_model_path, params)
get_spearman_scores(test_df_wpred, params)
get_mse_scores(test_df_wpred, params)

# Global similarity 0.670953273831846
# SBERT similarity 0.37867996699917505
# Global similarity 0.008759913815233082
# SBERT similarity 0.04704728198143357


Baseline similarity 1.0
Global similarity 0.670953273831846
Baseline similarity 0.0
Global similarity 0.008759913815233082
Baseline similarity 1.0
Global similarity 0.37867996699917505
Baseline similarity 0.0
Global similarity 0.04704728198143357


# Experiment 11 - combined AI CAR class -> sim - snippet

In [None]:
# - combined ai+car class->sim

params = {}
params["UNFREEZE_LAYERS"] = 2
params["USE_WEIGHTED_SIMILARITY"] = True
params["USE_MANUAL_LABELS"] = False # whether to use val-test labels that I created manually or some similarity metric
params["LOSSES"] = ["cosine"] # ["consistency", "distill", "distill2", "cosine"]
params["EPOCHS"] = 5
params["FEATURES"] = ['input1', 'input2']
params["USE_DISTILL"] = any('distill' in loss for loss in params["LOSSES"])
if params["USE_DISTILL"]:
  params["FEATURES"] += ['company', 'country', 'classification', 'keywords'] # ['company', 'country', 'classification', 'keywords']
  # params["FEATURES"] += ['similarity']
params["N"] = len(params["FEATURES"]) - 2 # number of metrics
params["FEATURE_DIM"] = 16 if params["USE_DISTILL"] else 0
params["EVAL_STEPS"] = 1500
params["SBERT_INIT"] = "brjezierski/sentence-embeddings-classification-ai_car-sbert"
params["SBERT_SAVE_PATH"] = "combined-ai_car-class-sim/"


params["BATCH_SIZE"] = 64
params["WARMUP_STEPS"] = 100
params["LEARNING_RATE"] = 2e-5
params["SBERT_DIM"] = 384

column_list = ['company', 'keyword', 'classification']
column_proportions = (10, 2, 1, 1)

train_df, val_df, test_df = get_data(column_list, column_proportions, params, current_prefix='thesis/ai_car/snippet/sbert-company-0.0/')

  train_df = pd.read_csv(f'{current_prefix}train.tsv', sep='\t', index_col=0)


In [None]:
model = SentenceTransformer(params["SBERT_INIT"], device="cuda")
freeze.freeze_except_last_layers(model, 2)
train(model, params, train_df, val_df)

Downloading (…)e6d53/.gitattributes:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)aede7e6d53/README.md:   0%|          | 0.00/2.66k [00:00<?, ?B/s]

Downloading (…)de7e6d53/config.json:   0%|          | 0.00/739 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)n_Glanos_results.csv:   0%|          | 0.00/84.0 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/134M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Downloading (…)e6d53/tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

Downloading (…)aede7e6d53/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)e7e6d53/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

['similarity']
['similarity']


Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3435 [00:00<?, ?it/s]

Batches:   0%|          | 0/13 [00:00<?, ?it/s]

Batches:   0%|          | 0/13 [00:00<?, ?it/s]

Score at epoch 0, step 1500: 0.6418690467261682


Batches:   0%|          | 0/13 [00:00<?, ?it/s]

Batches:   0%|          | 0/13 [00:00<?, ?it/s]

Score at epoch 0, step 3000: 0.724009600240006


Batches:   0%|          | 0/13 [00:00<?, ?it/s]

Batches:   0%|          | 0/13 [00:00<?, ?it/s]

Score at epoch 0, step -1: 0.7358553963849097


Iteration:   0%|          | 0/3435 [00:00<?, ?it/s]

Batches:   0%|          | 0/13 [00:00<?, ?it/s]

Batches:   0%|          | 0/13 [00:00<?, ?it/s]

Score at epoch 1, step 1500: 0.765236630915773


Batches:   0%|          | 0/13 [00:00<?, ?it/s]

Batches:   0%|          | 0/13 [00:00<?, ?it/s]

Score at epoch 1, step 3000: 0.7723633090827271


Batches:   0%|          | 0/13 [00:00<?, ?it/s]

Batches:   0%|          | 0/13 [00:00<?, ?it/s]

Score at epoch 1, step -1: 0.782209555238881


Iteration:   0%|          | 0/3435 [00:00<?, ?it/s]

Batches:   0%|          | 0/13 [00:00<?, ?it/s]

Batches:   0%|          | 0/13 [00:00<?, ?it/s]

Score at epoch 2, step 1500: 0.7814355358883973


Batches:   0%|          | 0/13 [00:00<?, ?it/s]

Batches:   0%|          | 0/13 [00:00<?, ?it/s]

Score at epoch 2, step 3000: 0.7915652891322283


Batches:   0%|          | 0/13 [00:00<?, ?it/s]

Batches:   0%|          | 0/13 [00:00<?, ?it/s]

Score at epoch 2, step -1: 0.7970119252981325


Iteration:   0%|          | 0/3435 [00:00<?, ?it/s]

Batches:   0%|          | 0/13 [00:00<?, ?it/s]

Batches:   0%|          | 0/13 [00:00<?, ?it/s]

Score at epoch 3, step 1500: 0.7911827795694892


Batches:   0%|          | 0/13 [00:00<?, ?it/s]

Batches:   0%|          | 0/13 [00:00<?, ?it/s]

Score at epoch 3, step 3000: 0.7999549988749721


Batches:   0%|          | 0/13 [00:00<?, ?it/s]

Batches:   0%|          | 0/13 [00:00<?, ?it/s]

Score at epoch 3, step -1: 0.8038580964524115


Iteration:   0%|          | 0/3435 [00:00<?, ?it/s]

Batches:   0%|          | 0/13 [00:00<?, ?it/s]

Batches:   0%|          | 0/13 [00:00<?, ?it/s]

Score at epoch 4, step 1500: 0.8007380184504613


Batches:   0%|          | 0/13 [00:00<?, ?it/s]

Batches:   0%|          | 0/13 [00:00<?, ?it/s]

Score at epoch 4, step 3000: 0.8053296332408311


Batches:   0%|          | 0/13 [00:00<?, ?it/s]

Batches:   0%|          | 0/13 [00:00<?, ?it/s]

Score at epoch 4, step -1: 0.804969624240606


In [None]:
eval_model_path = "./" + params['SBERT_SAVE_PATH'] + "/" # local
test_df_wpred = infer(test_df, eval_model_path, params)
get_spearman_scores(test_df_wpred, params)
get_mse_scores(test_df_wpred, params)

test_df2 = test_df.copy()
test_df2['label'] = test_df['weighted_similarity']
eval_model_path = params["SBERT_INIT"]
test_df_wpred = infer(test_df2, eval_model_path, params)
get_spearman_scores(test_df_wpred, params)
get_mse_scores(test_df_wpred, params)


# weighted similarity vs predicted
# Baseline similarity 1.0
# Global similarity 0.814214355358884
# Baseline similarity 0.0
# Global similarity 0.012129632332215085

# Baseline similarity 0.8550888772219306
# Global similarity 0.13266931673291835
# Baseline similarity 0.01144891517221311
# Global similarity 0.4927191804805782


Baseline similarity 1.0
Global similarity 0.814214355358884
Baseline similarity 0.0
Global similarity 0.012129632332215085
Baseline similarity 0.8550888772219306
Global similarity 0.13266931673291835
Baseline similarity 0.01144891517221311
Global similarity 0.4927191804805782


In [None]:
!zip -r /content/combined-ai_car-class-sim.zip /content/combined-ai_car-class-sim
%cp -av "/content/combined-ai_car-class-sim.zip" "/content/drive/MyDrive"

  adding: content/combined-ai_car-class-sim/ (stored 0%)
  adding: content/combined-ai_car-class-sim/sentence_bert_config.json (deflated 4%)
  adding: content/combined-ai_car-class-sim/README.md (deflated 54%)
  adding: content/combined-ai_car-class-sim/config_sentence_transformers.json (deflated 27%)
  adding: content/combined-ai_car-class-sim/special_tokens_map.json (deflated 42%)
  adding: content/combined-ai_car-class-sim/tokenizer.json (deflated 71%)
  adding: content/combined-ai_car-class-sim/modules.json (deflated 62%)
  adding: content/combined-ai_car-class-sim/2_Normalize/ (stored 0%)
  adding: content/combined-ai_car-class-sim/1_Pooling/ (stored 0%)
  adding: content/combined-ai_car-class-sim/1_Pooling/config.json (deflated 47%)
  adding: content/combined-ai_car-class-sim/vocab.txt (deflated 53%)
  adding: content/combined-ai_car-class-sim/pytorch_model.bin (deflated 8%)
  adding: content/combined-ai_car-class-sim/tokenizer_config.json (deflated 74%)
  adding: content/combine

# Experiment 12 - unfreeze layers

In [None]:
params = {}
params["USE_WEIGHTED_SIMILARITY"] = True
params["USE_MANUAL_LABELS"] = True # whether to use val-test labels that I created manually or some similarity metric
params["LOSSES"] = ["cosine"] # ["consistency", "distill", "distill2", "cosine"]
params["EPOCHS"] = 15
params["FEATURES"] = ['input1', 'input2']
params["USE_DISTILL"] = any('distill' in loss for loss in params["LOSSES"])
if params["USE_DISTILL"]:
  params["FEATURES"] += ['company', 'country', 'classification', 'keywords'] # ['company', 'country', 'classification', 'keywords']
  # params["FEATURES"] += ['similarity']
params["N"] = len(params["FEATURES"]) - 2 # number of metrics
params["FEATURE_DIM"] = 16 if params["USE_DISTILL"] else 0
params["EVAL_STEPS"] = 1000

params["BATCH_SIZE"] = 64
params["WARMUP_STEPS"] = 100
params["LEARNING_RATE"] = 2e-5
params["SBERT_DIM"] = 384

column_list = ['country', 'company', 'keywords', 'classification']
column_proportions = (10, 2, 1, 1, 1)

train_df, val_df, test_df = get_data(column_list, column_proportions, params, current_prefix='thesis/sbert-all/')

In [None]:
for unfreeze_layers in [1, 2, 3, 4, 5, 6]:
  params["SBERT_INIT"] = "all-MiniLM-L12-v2"
  params["SBERT_SAVE_PATH"] = "s3bert_" + params["SBERT_INIT"] + "/"
  model = SentenceTransformer(params["SBERT_INIT"], device="cuda")
  freeze.freeze_except_last_layers(model, 2)
  train(model, params, train_df, val_df)

  eval_model_path = "./" + params['SBERT_SAVE_PATH'] + "/" # local
  print('unfreeze_layers', unfreeze_layers)
  test_df_wpred = infer(test_df, eval_model_path, params)
  get_spearman_scores(test_df_wpred, params)
  get_mse_scores(test_df_wpred, params)

# unfreeze_layers 0
# Baseline similarity 0.9184827961965057
# Global similarity 0.9187981970899765
# Baseline similarity 0.019708063019580228
# Global similarity 0.032768098755334193

# unfreeze_layers 1
# Baseline similarity 0.9184827961965057
# Global similarity 0.9198134570717648
# Baseline similarity 0.019708063019580228
# Global similarity 0.0325840529003993

# unfreeze_layers 2
# Baseline similarity 0.9184827961965057
# Global similarity 0.9195959013613814
# Baseline similarity 0.019708063019580228
# Global similarity 0.03273892683038085

# unfreeze_layers 3
# Baseline similarity 0.9184827961965057
# Global similarity 0.9186108574504798
# Baseline similarity 0.019708063019580228
# Global similarity 0.03305023338859068

# unfreeze_layers 4
# Baseline similarity 0.9184827961965057
# Global similarity 0.9190520120854235
# Baseline similarity 0.019708063019580228
# Global similarity 0.03346305023198492

# unfreeze_layers 5
# Baseline similarity 0.9184827961965057
# Global similarity 0.9184174745968058
# Baseline similarity 0.019708063019580228
# Global similarity 0.032910490699699285

# unfreeze_layers 6
# Baseline similarity 0.9184827961965057
# Global similarity 0.9202848277775949
# Baseline similarity 0.019708063019580228
# Global similarity 0.03356503266876098

['similarity']
['similarity']


Epoch:   0%|          | 0/15 [00:00<?, ?it/s]

Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 0, step -1: 0.9219044582445155


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 1, step -1: 0.9239428521193205


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 2, step -1: 0.9259751973476126


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 3, step -1: 0.9268280565059138


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 4, step -1: 0.9282857803154925


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 5, step -1: 0.9296225311948156


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 6, step -1: 0.9294350231529197


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 7, step -1: 0.9295136555575857


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 8, step -1: 0.9297737473576351


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 9, step -1: 0.9313766386835202


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 10, step -1: 0.9312617143997776


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 11, step -1: 0.9301971526135291


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 12, step -1: 0.9301608607344526


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 13, step -1: 0.9304088552414762


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 14, step -1: 0.9307233848601406
unfreeze_layers 1
Baseline similarity 0.9184827961965057
Global similarity 0.9198134570717648
Baseline similarity 0.019708063019580228
Global similarity 0.0325840529003993
['similarity']
['similarity']


Epoch:   0%|          | 0/15 [00:00<?, ?it/s]

Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 0, step -1: 0.9204225398488857


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 1, step -1: 0.9238702683611671


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 2, step -1: 0.9278502777665727


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 3, step -1: 0.9267978132733499


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 4, step -1: 0.9287333801574379


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 5, step -1: 0.9291265421807682


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 6, step -1: 0.9304149038879892


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 7, step -1: 0.9305903146368595


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 8, step -1: 0.9305903146368595


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 9, step -1: 0.930578217343834


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 10, step -1: 0.9297918932971733


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 11, step -1: 0.9296225311948156


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 12, step -1: 0.930390709301938


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 13, step -1: 0.930844357790396


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 14, step -1: 0.930844357790396
unfreeze_layers 2
Baseline similarity 0.9184827961965057
Global similarity 0.9195959013613814
Baseline similarity 0.019708063019580228
Global similarity 0.03273892683038085
['similarity']
['similarity']


Epoch:   0%|          | 0/15 [00:00<?, ?it/s]

Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 0, step -1: 0.9205858533047306


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 1, step -1: 0.9240819709891143


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 2, step -1: 0.9276083319060617


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 3, step -1: 0.9294168772133813


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 4, step -1: 0.9300398878041971


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 5, step -1: 0.9299249635204544


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 6, step -1: 0.9298221365297372


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 7, step -1: 0.9304330498275275


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 8, step -1: 0.9307536280927046


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 9, step -1: 0.931062109064856


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 10, step -1: 0.9305479741112701


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 11, step -1: 0.9310681577113686


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 12, step -1: 0.9312859089858285


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 13, step -1: 0.9309350874880877


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 14, step -1: 0.9309350874880877
unfreeze_layers 3
Baseline similarity 0.9184827961965057
Global similarity 0.9186108574504798
Baseline similarity 0.019708063019580228
Global similarity 0.03305023338859068
['similarity']
['similarity']


Epoch:   0%|          | 0/15 [00:00<?, ?it/s]

Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 0, step -1: 0.9198418697836593


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 1, step -1: 0.9258118838917677


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 2, step -1: 0.9266586944035562


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 3, step -1: 0.9276990616037534


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 4, step -1: 0.9286486991062589


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 5, step -1: 0.9268341051524266


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 6, step -1: 0.927227267175757


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 7, step -1: 0.9282555370829286


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 8, step -1: 0.928836207148155


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 9, step -1: 0.9296769690134306


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 10, step -1: 0.9307778226787555


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 11, step -1: 0.9303241741902974


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 12, step -1: 0.9303967579484507


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 13, step -1: 0.9304814389996297


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 14, step -1: 0.9303786120089124
unfreeze_layers 4
Baseline similarity 0.9184827961965057
Global similarity 0.9190520120854235
Baseline similarity 0.019708063019580228
Global similarity 0.03346305023198492
['similarity']
['similarity']


Epoch:   0%|          | 0/15 [00:00<?, ?it/s]

Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 0, step -1: 0.921275399007187


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 1, step -1: 0.9247412734590067


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 2, step -1: 0.9259691487010998


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 3, step -1: 0.9275599427339595


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 4, step -1: 0.9291144448877425


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 5, step -1: 0.9297918932971733


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 6, step -1: 0.9290055692505127


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 7, step -1: 0.92878176932954


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 8, step -1: 0.9303120768972719


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 9, step -1: 0.9310500117718304


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 10, step -1: 0.9322234491953085


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 11, step -1: 0.931836335818491


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 12, step -1: 0.9316427791300822


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 13, step -1: 0.9315339034928524


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 14, step -1: 0.9315883413114673
unfreeze_layers 5
Baseline similarity 0.9184827961965057
Global similarity 0.9184174745968058
Baseline similarity 0.019708063019580228
Global similarity 0.032910490699699285
['similarity']
['similarity']


Epoch:   0%|          | 0/15 [00:00<?, ?it/s]

Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 0, step -1: 0.9213661287048784


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 1, step -1: 0.9241847979798314


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 2, step -1: 0.9272333158222696


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 3, step -1: 0.9276446237851383


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 4, step -1: 0.9284732883573885


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 5, step -1: 0.9297132608925073


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 6, step -1: 0.9310560604183431


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 7, step -1: 0.9306749956880384


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 8, step -1: 0.9312193738741881


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 9, step -1: 0.9311104982369581


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 10, step -1: 0.9312798603393158


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 11, step -1: 0.9307052389206023


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 12, step -1: 0.9303483687763486


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 13, step -1: 0.930578217343834


Iteration:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Score at epoch 14, step -1: 0.930578217343834
unfreeze_layers 6
Baseline similarity 0.9184827961965057
Global similarity 0.9202848277775949
Baseline similarity 0.019708063019580228
Global similarity 0.03356503266876098


# Experiment 12 - SBERT /wo training

In [None]:
params = {}
params["USE_WEIGHTED_SIMILARITY"] = True
params["USE_MANUAL_LABELS"] = True # whether to use val-test labels that I created manually or some similarity metric
params["LOSSES"] = ["cosine"] # ["consistency", "distill", "distill2", "cosine"]
params["EPOCHS"] = 15
params["FEATURES"] = ['input1', 'input2']
params["USE_DISTILL"] = any('distill' in loss for loss in params["LOSSES"])
if params["USE_DISTILL"]:
  params["FEATURES"] += ['company', 'country', 'classification', 'keywords'] # ['company', 'country', 'classification', 'keywords']
  # params["FEATURES"] += ['similarity']
params["N"] = len(params["FEATURES"]) - 2 # number of metrics
params["FEATURE_DIM"] = 16 if params["USE_DISTILL"] else 0
params["EVAL_STEPS"] = 1000

params["BATCH_SIZE"] = 64
params["WARMUP_STEPS"] = 100
params["LEARNING_RATE"] = 2e-5
params["SBERT_DIM"] = 384

column_list = ['country', 'company', 'keywords', 'classification']
column_proportions = (10, 2, 1, 1, 1)

train_df, val_df, test_df = get_data(column_list, column_proportions, params, current_prefix='thesis/sbert-all/')

In [None]:
params["SBERT_INIT"] = "all-MiniLM-L12-v2"
params["SBERT_SAVE_PATH"] = "s3bert_" + params["SBERT_INIT"] + "/"
model = SentenceTransformer(params["SBERT_INIT"], device="cuda")
  # freeze.freeze_except_last_layers(model, 2)
  # train(model, params, train_df, val_df)

eval_model_path = params["SBERT_INIT"]
test_df_wpred = infer(test_df, eval_model_path, params)
get_spearman_scores(test_df_wpred, params)
get_mse_scores(test_df_wpred, params)


Baseline similarity 0.9184827961965057
Global similarity 0.9162540039213286
Baseline similarity 0.019708063019580228
Global similarity 0.019959834366673958


# Analysis

In [None]:
def create_diff_sort(df, col1, col2, new_col_name):
    # Create new column with absolute difference
    df[new_col_name] = abs(df[col1] - df[col2])

    # Sort dataframe by new column in descending order
    df = df.sort_values(by=new_col_name, ascending=False)

    return df

test_df_wpred_sorted = create_diff_sort(test_df_wpred, 'label', 'global_pred_similarity', 'abs_diff')

# sorted_df = test_df_wpred[["snippet1", "snippet1", "country_pred_similarity", "country1", "country2", "country_similarity"]]
# sorted_df = sorted_df.sort_values(by='country_pred_similarity')
# test_df_wpred = normalize_column(test_df_wpred, 'global_pred_similarity')
# test_df_wpred['global_pred_similarity'] = test_df_wpred['global_pred_similarity'].abs()

In [None]:
test_df_wpred_sorted[:50]
# Shift/absolute value the pred sim for class model from -1,1 to 0,1