# Import necessary libraries

In [1]:
import pandas as pd
import numpy as np
import scipy
from sklearn.cluster import KMeans
from sentence_transformers import SentenceTransformer

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


# Load model

In [2]:
# Name of the pretrained sentence-embedding checkpoint used for every tag.
MODEL_NAME = 'bert-base-nli-mean-tokens'
model = SentenceTransformer(MODEL_NAME)

# Load tag data

In [3]:
# Both tag lists live in the same workbook, so parse it once and pick the two
# sheets out of the returned {sheet name: DataFrame} mapping instead of
# re-opening the file for each sheet.
path = "../data/tags.xlsx"
tag_sheets = pd.read_excel(
    path,
    # Run both tag columns through str so their cells are kept as text.
    converters={'space': str, 'nospace': str},
    sheet_name=["ZooniverseTags", "ExpertTags"],
)
transcriber_df = tag_sheets["ZooniverseTags"]
expert_df = tag_sheets["ExpertTags"]
# Expert tags in sheet order; compare_all looks tags up here by position.
expert_tags = expert_df["nospace"].tolist()

# Get tag embeddings

In [4]:
# Embed the 'space' column of each sheet (presumably the spaced-out spelling
# of each tag -- confirm against the workbook). Row order is kept, so row n of
# an embedding array belongs to row n of the corresponding DataFrame.
expert_phrases = expert_df['space'].tolist()
transcriber_phrases = transcriber_df['space'].tolist()
expert_embed = model.encode(expert_phrases)
transcriber_embed = model.encode(transcriber_phrases)

# Compare embeddings

In [5]:
def compare_all(i, tag, embed):
    """Return the three expert tags closest to one transcriber tag.

    Reads the module-level ``expert_embed`` (expert embeddings) and
    ``expert_tags`` (expert tag names in the same order).

    Args:
        i: Identifier stored under ``"index"`` in the result.
        tag: Transcriber tag text stored under ``"transcriber_tag"``.
        embed: Embedding vector of the transcriber tag.

    Returns:
        A dict with ``index``, ``transcriber_tag`` and, for the three nearest
        expert tags, ``expert_tag_N`` / ``score_N`` (N = 1..3), where the
        score is one minus the cosine distance.

    Raises:
        IndexError: If fewer than three expert embeddings are available.
    """
    # cdist works on 2-D inputs: wrap the single vector and take the only row.
    cosine_dists = scipy.spatial.distance.cdist([embed], expert_embed, "cosine")[0]
    # (expert position, distance) pairs, smallest distance first.
    nearest_first = sorted(enumerate(cosine_dists), key=lambda pair: pair[1])
    first = nearest_first[0]
    second = nearest_first[1]
    third = nearest_first[2]

    match = {"index": i, "transcriber_tag": tag}
    match["expert_tag_1"] = expert_tags[first[0]]
    match["score_1"] = 1 - first[1]
    match["expert_tag_2"] = expert_tags[second[0]]
    match["score_2"] = 1 - second[1]
    match["expert_tag_3"] = expert_tags[third[0]]
    match["score_3"] = 1 - third[1]
    return match
    
# Score every transcriber tag against the expert tags. The running position is
# passed as the result's "index", which later cells use to look the embedding
# up again in transcriber_embed.
transcriber_tags = transcriber_df['nospace'].tolist()
results = [
    compare_all(position, tag, embedding)
    for position, (tag, embedding) in enumerate(zip(transcriber_tags, transcriber_embed))
]

# Save as .csv

In [7]:
# Tabulate the match records, highest top-1 cosine similarity first.
df = pd.DataFrame(results).sort_values('score_1', ascending=False)
# index=False leaves the DataFrame's row labels out of the file; the "index"
# field of each record is still written as an ordinary column.
df.to_csv("../data/tag_match.csv", index=False)

# Cluster tags lacking close match

In [8]:
k = 30  # Adjust number of clusters
SIMILARITY_THRESHOLD = 0.65  # Adjust similarity threshold
RANDOM_SEED = 0  # Fixed seed so cluster assignments are the same on every run

# Keep only the transcriber tags whose best expert match is below the threshold.
cluster_results = [result for result in results
                   if result['score_1'] < SIMILARITY_THRESHOLD]

# Each result's "index" is the row of that tag in transcriber_embed.
cluster_embeds = [transcriber_embed[result['index']]
                  for result in cluster_results]

# KMeans initialisation is random; without random_state the clusters printed
# below would change between runs.
kmeans = KMeans(n_clusters=k, random_state=RANDOM_SEED).fit(cluster_embeds)

# kmeans.labels_ is aligned with cluster_results, so pair them up directly.
for i in range(k):
    print("Cluster", i)
    print([result['transcriber_tag']
           for result, label in zip(cluster_results, kmeans.labels_)
           if label == i])

Cluster 0
['everyminutecounts', 'living_day_by_day', 'longtimevolunteer', 'onceasoldieralwaysasoldier', 'plansnotchanged', 'prewarsalary', 'standinghispost']
Cluster 1
['australia', 'australian', 'baseball', 'brothel', 'china', 'communism', 'economist', 'fascism', 'musictheory', 'night', 'northvssouth', 'playboys', 'rats', 'russia', 'stalin', 'visitrussia']
Cluster 2
['biologicalresearch', 'biology', 'bloodtest', 'chemist', 'chemistry', 'civilengineering', 'constructionengineering', 'dieselwork', 'electrician', 'engineschool', 'firefighter', 'hairstylist', 'lithographer', 'metallurgy', 'oilindustry', 'physics', 'safteyengineering']
Cluster 3
['agriculture', 'boy_scout_tactics', 'boyscout', 'farmboy', 'farmboys', 'farmer', 'farmers', 'farming', 'farmlife', 'lumberjack', 'ranches', 'unclesam']
Cluster 4
['diabetes', 'dysentery', 'food_poisoning', 'hayfever', 'malaria-fever', 'malaria-zone', 'malaria', 'mosquitoes', 'tropical-fevers', 'tropical-sickness', 'tropicalfevers']
Cluster 5
['15d