# Import necessary libraries

In [1]:
import pandas as pd
import numpy as np
import scipy
from sklearn.cluster import KMeans
from sentence_transformers import SentenceTransformer

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


# Load model

In [2]:
# Instantiate the sentence-embedding model from the 'bert-base-nli-mean-tokens'
# checkpoint name; its `encode` method is what the embedding cell below calls
# to turn tag strings into vectors.
model = SentenceTransformer('bert-base-nli-mean-tokens')

# Load tag data

In [3]:
# Which text variant of each tag gets embedded.
column = "space" # Set to "nospace" or "space"
# Fail here, with a clear message, rather than with a KeyError in a later cell.
if column not in ("space", "nospace"):
    raise ValueError('column must be "space" or "nospace", got {!r}'.format(column))

path = "../data/tags.xlsx"
# Read both sheets in a single call so the workbook is opened and parsed once.
# With a list-valued sheet_name, read_excel returns a dict keyed by sheet name.
# The converters run every cell of the two tag columns through str.
tag_sheets = pd.read_excel(
    path,
    converters={'space': str, 'nospace': str},
    sheet_name=["ZooniverseTags", "ExpertTags"],
)
transcriber_df = tag_sheets["ZooniverseTags"]
expert_df = tag_sheets["ExpertTags"]
# Expert labels are always taken from the "nospace" column, regardless of
# which column is selected for embedding.
expert_tags = expert_df["nospace"].tolist()

# Get tag embeddings

In [4]:
# Embed the expert tags first, then the transcriber tags, using the text
# variant selected by `column`. Each column is converted to a plain list
# before being handed to the model.
expert_texts = expert_df[column].tolist()
expert_embed = model.encode(expert_texts)

transcriber_texts = transcriber_df[column].tolist()
transcriber_embed = model.encode(transcriber_texts)

# Compare embeddings

In [15]:
def compare_all(tag, embed, top_k=3, reference_embed=None, reference_tags=None):
    """Rank reference (expert) tags by cosine similarity to one tag embedding.

    Parameters
    ----------
    tag
        Label stored under ``"transcriber_tag"`` in the result.
    embed
        1-D embedding vector for ``tag``.
    top_k : int, default 3
        Number of best matches to report. Must be at least 1.
    reference_embed : 2-D array-like, optional
        One embedding per reference tag. Defaults to the notebook-level
        ``expert_embed``.
    reference_tags : sequence, optional
        Labels aligned row-for-row with ``reference_embed``. Defaults to the
        notebook-level ``expert_tags``.

    Returns
    -------
    dict
        ``"transcriber_tag"`` followed by ``"expert_tag_<i>"`` / ``"score_<i>"``
        pairs for ``i = 1..top_k``, best match first. Each score is
        ``1 - cosine distance``. When fewer than ``top_k`` reference tags
        exist, the surplus ranks hold ``None`` / ``NaN`` so every result has
        the same keys.

    Raises
    ------
    ValueError
        If ``top_k`` is smaller than 1.
    """
    # A bare `import scipy` does not guarantee that the `scipy.spatial`
    # submodule is loaded, so import the function explicitly.
    from scipy.spatial.distance import cdist

    if top_k < 1:
        raise ValueError("top_k must be at least 1, got {!r}".format(top_k))

    # Fall back to the notebook-level expert data when no override is given.
    if reference_embed is None:
        reference_embed = expert_embed
    if reference_tags is None:
        reference_tags = expert_tags

    distances = cdist([embed], reference_embed, "cosine")[0]
    # A stable sort keeps tied distances in their original order, and argsort
    # places NaN distances last instead of ordering them unpredictably.
    order = np.argsort(distances, kind="stable")[:top_k]

    result = {"transcriber_tag": tag}
    for rank in range(1, top_k + 1):
        if rank <= len(order):
            idx = int(order[rank - 1])
            matched_tag = reference_tags[idx]
            score = 1 - distances[idx]
        else:
            # Not enough reference tags to fill this rank.
            matched_tag = None
            score = float("nan")
        result["expert_tag_{}".format(rank)] = matched_tag
        result["score_{}".format(rank)] = score
    return result
    
# Pair each transcriber tag (its "nospace" form) with its embedding and
# collect one comparison record per tag.
transcriber_tags = transcriber_df['nospace'].tolist()
results = list(map(compare_all, transcriber_tags, transcriber_embed))

In [21]:
# Build the results table and order it so rows with the highest top-ranked
# similarity ('score_1') come first.
df = pd.DataFrame(results).sort_values('score_1', ascending=False)
# Write the table to CSV; the file name includes the value of `column`.
output_path = "../data/tag_approx_" + column + ".csv"
df.to_csv(output_path)