# Import necessary libraries

In [1]:
import pandas as pd
import numpy as np
import scipy
from sklearn.cluster import KMeans
from sentence_transformers import SentenceTransformer
# Load the pretrained 'bert-base-nli-mean-tokens' sentence-embedding model once;
# later cells call model.encode() on the tag text to obtain the embeddings.
model = SentenceTransformer('bert-base-nli-mean-tokens')

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


# Load tag data

In [2]:
# Both tag lists are stored as separate sheets of one workbook.
path = "../data/tags.xlsx"

# Force the two tag columns to be read as strings in either sheet.
read_options = dict(converters={'space': str, 'nospace': str})

transcriber_df = pd.read_excel(path, sheet_name="ZooniverseTags", **read_options)
expert_df = pd.read_excel(path, sheet_name="ExpertTags", **read_options)

# The 'nospace' column is the form used when reporting matches below.
transcriber_tags = transcriber_df['nospace'].tolist()
expert_tags = expert_df['nospace'].tolist()

# Get tag embeddings

In [3]:
# Embeddings are computed from the 'space' column of each sheet,
# while the 'nospace' column is only used for labelling results.
transcriber_phrases = transcriber_df['space'].tolist()
expert_phrases = expert_df['space'].tolist()

transcriber_embed = model.encode(transcriber_phrases)
expert_embed = model.encode(expert_phrases)

# Compare transcriber tag embeddings to expert tag embeddings

In [8]:
def compare_all(i, tag, embed, expert_embeddings=None, expert_labels=None, top_n=3):
    """Rank the expert tags by cosine similarity to one transcriber tag.

    Parameters
    ----------
    i : int
        Position of the transcriber tag; copied into the result as "index".
    tag : str
        Transcriber tag label; copied into the result as "transcriber_tag".
    embed : array-like, 1-D
        Embedding vector of the transcriber tag.
    expert_embeddings : array-like, 2-D, optional
        Embeddings to compare against, one row per expert tag. Defaults to the
        notebook-level ``expert_embed`` (looked up at call time).
    expert_labels : sequence, optional
        Labels aligned with ``expert_embeddings``. Defaults to the
        notebook-level ``expert_tags`` (looked up at call time).
    top_n : int, optional
        Number of best matches to report (default 3, the original behaviour).

    Returns
    -------
    dict
        Keys "index", "transcriber_tag", then "expert_tag_k" / "score_k" for
        k = 1..top_n, where score_k is the cosine similarity (1 - cosine
        distance) of the k-th closest expert tag. When fewer than ``top_n``
        expert tags are available, the missing slots hold ``None`` / ``nan``
        instead of raising ``IndexError``, so every row keeps the same keys.
    """
    if expert_embeddings is None:
        expert_embeddings = expert_embed
    if expert_labels is None:
        expert_labels = expert_tags

    distances = scipy.spatial.distance.cdist([embed], expert_embeddings, "cosine")[0]

    # A stable argsort keeps the lowest index first among equal distances,
    # the same tie order the previous sorted(zip(...)) produced.
    nearest = np.argsort(distances, kind="stable")[:top_n]

    row = {"index": i, "transcriber_tag": tag}
    for rank in range(1, top_n + 1):
        if rank <= len(nearest):
            pos = int(nearest[rank - 1])
            row[f"expert_tag_{rank}"] = expert_labels[pos]
            row[f"score_{rank}"] = 1 - distances[pos]
        else:
            # Not enough expert tags to fill this rank.
            row[f"expert_tag_{rank}"] = None
            row[f"score_{rank}"] = float("nan")
    return row

# One result row per transcriber tag, numbered by its position in the sheet.
results = [
    compare_all(row_number, tag_text, tag_vector)
    for row_number, (tag_text, tag_vector)
    in enumerate(zip(transcriber_tags, transcriber_embed))
]

match_df = pd.DataFrame(results)

Time: 10.677935100000013


# Display 50 least-matched expert tags

In [10]:
# value_counts() only lists expert tags that were the top match at least once,
# so tags that were never matched (the least-matched of all) would be missing.
# Reindexing over every expert tag gives those a count of 0.
(
    match_df['expert_tag_1']
    .value_counts()
    .reindex(pd.Index(expert_tags).unique(), fill_value=0)
    .sort_values()
    .head(50)
)

opinionofallies           1
heavybombardmentgroups    1
attitudetowardgermany     1
confidenceinship          1
germanradio               1
civilians                 2
germanclothing            2
moraleprogram             2
sexcontact                2
newsmaps                  2
sex                       2
italy                     2
prideinoutfit             2
bbc                       2
mess                      2
japan                     2
usoclubs                  3
pacifictoughness          3
ratings                   3
postwaropportunities      3
aficourses                3
redcross                  3
germans                   3
magazines                 3
localcivilians            3
workingwithcivilians      3
krations                  3
entalhealth               4
basenewspapers            4
hospitalfacilities        4
pxs                       4
strategicbombing          4
honolulu                  4
japanese                  4
attitudetowardengland     4
treatmentofpows     

# Top n transcriber tags to add to expert tags

In [None]:
# Find the single transcriber tag which, if added to the expert tags, gives the
# highest total top-1 cosine similarity over all *other* transcriber tags.
#
# Adding candidate i to the expert set can only change tag j's best score to
# max(current best score of j, similarity(j, i)). So the current best scores
# are computed once, and each candidate needs only one similarity column.
# This yields the same sums as re-running compare_all against a temporarily
# extended expert set, without mutating expert_tags / expert_embed / results.
transcriber_matrix = np.asarray(transcriber_embed)

# Best similarity each transcriber tag already reaches against the expert tags.
baseline_scores = 1 - scipy.spatial.distance.cdist(
    transcriber_matrix, np.asarray(expert_embed), "cosine").min(axis=1)

# score_1 is a similarity, so the best candidate MAXIMISES the sum; the search
# therefore starts from -inf and keeps the larger value.
best_sum = float('-inf')
best_tag = None

for i in range(len(transcriber_df)):
    # Similarity of every transcriber tag to candidate i.
    candidate_scores = 1 - scipy.spatial.distance.cdist(
        transcriber_matrix, transcriber_matrix[i:i + 1], "cosine")[:, 0]

    boosted_scores = np.maximum(baseline_scores, candidate_scores)

    # The candidate itself is excluded from the total (the original `j != i`).
    sum_cs = boosted_scores.sum() - boosted_scores[i]

    if sum_cs > best_sum:
        best_sum = sum_cs
        best_tag = transcriber_tags[i]

print(best_tag)