In [None]:
# Extend the module search path with the project checkout — presumably so the
# project-local `utils` module imported below can be found.
# NOTE(review): absolute, machine-specific path; the notebook will not run
# unchanged on another machine.
import sys
sys.path.append("/Users/user/question-retrieval-KIPerWeb/")
# NOTE(review): wildcard import — which names `utils` adds to the notebook
# namespace cannot be determined from this file.
from utils import *
import pandas as pd
pd.set_option('max_colwidth', None) # show full width of showing cols
pd.set_option("expand_frame_repr", False) # print cols side by side as it's supposed to be
import re
import json
from ranx import Qrels, evaluate, Run
import swifter
# NOTE(review): `json` is already imported a few lines above; this repeat is redundant.
import json
import requests
from tqdm.notebook import tqdm
# Turn on tqdm's pandas integration for this kernel session.
tqdm.pandas()

In [None]:
# Load the original pool (with metadata) that the rest of the notebook annotates.
# NOTE(review): absolute, machine-specific path.
pool_w_info = pd.read_csv(
    "/Users/user/question-retrieval-KIPerWeb/testbeds/queries_experiments/trec_pools/pool_w_metadata.txt"
)

# Report how many rows the pool contains.
print("original pool", f"-> Pool size: {pool_w_info.shape[0]}", sep="\n")

# For a quick dry run, uncomment to keep only the first rows:
# pool_w_info = pool_w_info.head(10)

In [None]:
# How do we obtain relevance annotations?
#     1. Translate the query, category and content into a single file for manual validation.
#     2. Calculate the semantic similarity between each query and its category.
#     3. Keep only the highly relevant pairs.
#     4. Train an FSL method to predict relevance:
#         1. Using the available relevance scores, keep only the highest-scoring pairs.
#         2. Manually create pairs to classify: (content, query) -> 0/1/2.

In [None]:
from sentence_transformers import SentenceTransformer, util



# Function to encode all unique utterances for each model
def encode_all_utterances(df, models):
    """Pre-compute one embedding per unique query/category text for every model.

    The unique texts are handed to each model in a single ``encode`` call so
    the encoder can batch them, instead of invoking it once per text.

    Args:
        df: DataFrame providing the ``'query'`` and ``'category'`` columns.
        models: Iterable of hashable encoder objects exposing
            ``encode(texts, convert_to_tensor=True)`` that returns one
            embedding per input text, in input order.

    Returns:
        dict: ``{model: {text: embedding}}`` covering every unique value found
        in the two columns. Missing values are not filtered out; they are
        passed to the encoder like any other value.
    """
    unique_texts = pd.concat([df['query'], df['category']]).unique().tolist()
    if not unique_texts:
        # Nothing to embed: skip the encoder entirely rather than calling it
        # with an empty batch.
        return {model: {} for model in models}

    encodings = {}
    for model in models:
        # One batched call per model; iterating the result yields one
        # embedding per text, aligned with `unique_texts`.
        embeddings = model.encode(unique_texts, convert_to_tensor=True)
        encodings[model] = dict(zip(unique_texts, embeddings))
    return encodings

# Mean query/category cosine similarity over all models, for one row
def get_average_cosine_sim_for_row(row, encodings, models):
    """Average the cosine similarity between a row's query and category.

    Args:
        row: Mapping-like object with ``'query'`` and ``'category'`` entries.
        encodings: ``{model: {text: embedding}}`` lookup of pre-computed
            embeddings; must contain both texts of the row for every model.
        models: Sized iterable of the models to average over.

    Returns:
        The arithmetic mean of the per-model cosine similarities.

    Raises:
        ZeroDivisionError: If ``models`` is empty.
    """
    per_model_scores = (
        util.cos_sim(
            encodings[encoder][row['query']],
            encodings[encoder][row['category']],
        ).item()
        for encoder in models
    )
    return sum(per_model_scores) / len(models)

# tokenizer.pad_token = tokenizer.eos_token

# Llama-2 is loaded on its own because its tokenizer is patched right after
# loading: the EOS token is reused as the padding token (presumably because
# the tokenizer ships without one — confirm).
model_25 = SentenceTransformer("meta-llama/Llama-2-7b-chat-hf")
model_25.tokenizer.pad_token = model_25.tokenizer.eos_token

# German STS model; its tokenizer is left untouched.
model_2 = SentenceTransformer("aari1995/German_Semantic_STS_V2")

# Ensemble of pretrained encoders whose similarities are averaged later on.
# Both the loading order and the list order match the sequence written here.
models = [
    SentenceTransformer("sentence-transformers/paraphrase-multilingual-mpnet-base-v2"),
    model_2,
    *(
        SentenceTransformer(checkpoint)
        for checkpoint in (
            "sentence-transformers/LaBSE",
            "PM-AI/bi-encoder_msmarco_bert-base_german",
            "efederici/e5-base-multilingual-4096",
            "intfloat/multilingual-e5-base",
            "clips/mfaq",
            "PM-AI/sts_paraphrase_xlm-roberta-base_de-en",
            "deutsche-telekom/gbert-large-paraphrase-euclidean",
            "LLukas22/all-MiniLM-L12-v2-embedding-all",
            "LLukas22/paraphrase-multilingual-mpnet-base-v2-embedding-all",
            "sentence-transformers/distiluse-base-multilingual-cased-v1",
            "sentence-transformers/distiluse-base-multilingual-cased-v2",
            "deutsche-telekom/gbert-large-paraphrase-cosine",
            "shibing624/text2vec-base-multilingual",
            "Sahajtomar/German-semantic",
            "setu4993/LaBSE",
            "symanto/sn-xlm-roberta-base-snli-mnli-anli-xnli",
            "and-effect/musterdatenkatalog_clf",
            "nblokker/debatenet-2-cat",
            "setu4993/LEALLA-large",
            "dell-research-harvard/lt-wikidata-comp-de",
            "ef-zulla/e5-multi-sml-torch",
            "barisaydin/text2vec-base-multilingual",
        )
    ),
    model_25,
]

# Example usage
# utterance = "Your test utterance"
# q_type = "Your query type"
# average_similarity = get_average_cosine_sim(utterance, q_type, models)
# print(average_similarity)

# Embed every distinct query/category text once per model, up front.
all_encodings = encode_all_utterances(pool_w_info, models)

# Score each row with its mean cross-model query/category similarity; the
# extra positional arguments are forwarded to the row function by `apply`.
pool_w_info['average_similarity'] = pool_w_info.apply(
    get_average_cosine_sim_for_row,
    axis=1,
    args=(all_encodings, models),
)
pool_w_info

In [None]:
# Pull the averaged scores out of the frame as a plain Python list.
float_list = pool_w_info.loc[:, 'average_similarity'].tolist()

# (lowest, highest) average similarity observed in the pool.
interval = (min(float_list), max(float_list))
interval

In [None]:


def categorize_similarity(row, partial_threshold=0.60, relevant_threshold=0.70):
    """Map a row's average similarity to a graded relevance label.

    Cosine similarities are not confined to ``[0, 1]``: they can be negative,
    and floating-point error can push them marginally above 1.0. Such scores
    are therefore assigned to the lowest / highest grade instead of being
    left unlabelled.

    Args:
        row: Mapping-like object with an ``'average_similarity'`` entry.
        partial_threshold: Scores below this value are labelled 0.
        relevant_threshold: Scores at or above this value are labelled 2;
            scores in ``[partial_threshold, relevant_threshold)`` are
            labelled 1.

    Returns:
        0, 1 or 2, or ``None`` when the score is NaN.
    """
    score = row['average_similarity']
    if score < partial_threshold:
        return 0
    if score < relevant_threshold:
        return 1
    if score >= relevant_threshold:
        return 2
    # Only NaN gets here, since it fails every comparison above.
    return None

# Label every row with its relevance grade and store it in the new 'qrel' column.
# The nullable "Int64" dtype keeps the grades integral even when some rows
# could not be labelled: a plain `astype(int)` raises on missing values, and
# without any cast the whole column silently becomes float (1.0, 2.0, ...).
pool_w_info['qrel'] = pool_w_info.apply(categorize_similarity, axis=1).astype("Int64")
pool_w_info

In [None]:
# How many rows received each relevance label.
pool_w_info.loc[:, 'qrel'].value_counts()

In [None]:
# Persist the annotated pool as the testbed CSV.
# NOTE(review): `index_label=False` only omits the header field for the index;
# the index values themselves are still written. If the index should be left
# out altogether, `index=False` is the option for that — confirm the intent.
pool_w_info.to_csv(
    path_or_buf="/Users/user/question-retrieval-KIPerWeb/testbeds/queries_experiments/trec_pools/testbed.csv",
    index_label=False,
)