# Import necessary libraries

In [7]:
import pandas as pd
import numpy as np
import time
import scipy
from sklearn.cluster import KMeans
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('bert-base-nli-mean-tokens')

# Load tag data

In [2]:
path = "../data/tags.xlsx"
transcriber_df = pd.read_excel(path, converters={'space':str, 'nospace':str}, sheet_name="ZooniverseTags")
expert_df = pd.read_excel(path, converters={'space':str, 'nospace':str}, sheet_name="ExpertTags")

expert_tags = expert_df['nospace'].tolist()
transcriber_tags = transcriber_df['nospace'].tolist()

# Get tag embeddings

In [3]:
transcriber_embed = model.encode(transcriber_df['space'].tolist())
expert_embed = model.encode(expert_df['space'].tolist())

# Compare transcriber tag embeddings to expert tag embeddings

In [4]:
def compare_all():
    distances = scipy.spatial.distance.cdist(transcriber_embed, expert_embed, "cosine")
    ranking = [sorted(zip(range(len(distances[i])), distances[i]), key=lambda x: x[1]) for i in range(len(transcriber_df))]
    return [{
        "index": i,
        "transcriber_tag": transcriber_tags[i],
        "expert_tag_1": expert_tags[ranking[i][0][0]],
        "score_1": 1-ranking[i][0][1],
        "expert_tag_2": expert_tags[ranking[i][1][0]],
        "score_2": 1-ranking[i][1][1],
        "expert_tag_3": expert_tags[ranking[i][2][0]],
        "score_3": 1-ranking[i][2][1],}
        for i in range(len(transcriber_df))]

match_df = pd.DataFrame(compare_all())

# Display 50 least-matched expert tags

In [5]:
match_df['expert_tag_1'].value_counts().sort_values().head(50)

confidenceinship          1
heavybombardmentgroups    1
attitudetowardgermany     1
opinionofallies           1
germanradio               1
prideinoutfit             2
sexcontact                2
civilians                 2
newsmaps                  2
germanclothing            2
mess                      2
italy                     2
sex                       2
moraleprogram             2
bbc                       2
japan                     2
aficourses                3
germans                   3
magazines                 3
redcross                  3
krations                  3
pacifictoughness          3
usoclubs                  3
localcivilians            3
postwaropportunities      3
ratings                   3
workingwithcivilians      3
strategicbombing          4
attitudetowardengland     4
treatmentofpows           4
pxs                       4
artillery                 4
honolulu                  4
basenewspapers            4
politicalviews            4
chemicalwarfare     

# Top transcriber tag to add to expert tags

In [15]:
print(transcriber_tags[2365])

noorder


In [18]:
n = 5 # number of top tags to find

best_indices = [-1]*n

for i in range(n):
    best_sum = 0
    best_index = -1
    
    for j in range(len(transcriber_df)):
        if j in best_indices:
            continue
            
        expert_tags = np.append(expert_tags, transcriber_tags[j])
        expert_embed = np.vstack((expert_embed, transcriber_embed[j]))
        
        results = compare_all()
        
        sum_cs = sum([results[k]['score_1'] for k in range(len(results)) if k!=j and k not in best_indices])
        
        if sum_cs > best_sum:
            best_sum = sum_cs
            best_index = j
        
        expert_tags = np.delete(expert_tags, len(expert_tags)-1,0)
        expert_embed = np.delete(expert_embed, len(expert_embed)-1,0)
        
        if j%100==0:
            print("Progress:",i,j)
            
    print("TAG FOUND:",transcriber_tags[best_index])
    best_indices[i] = best_index
    expert_tags = np.append(expert_tags, transcriber_tags[best_index])
    expert_embed = np.vstack((expert_embed, transcriber_embed[best_index]))

Progress: 0 0
Progress: 0 100
Progress: 0 200
Progress: 0 300
Progress: 0 400
Progress: 0 500
Progress: 0 600
Progress: 0 700
Progress: 0 800
Progress: 0 900
Progress: 0 1000
Progress: 0 1100
Progress: 0 1200
Progress: 0 1300
Progress: 0 1400
Progress: 0 1500
Progress: 0 1600
Progress: 0 1700
Progress: 0 1800
Progress: 0 1900
Progress: 0 2000
Progress: 0 2100
Progress: 0 2200
Progress: 0 2300
Progress: 0 2400
Progress: 0 2500
Progress: 0 2600
Progress: 0 2700
Progress: 0 2800
Progress: 0 2900
Progress: 0 3000
Progress: 0 3100
Progress: 0 3200
Progress: 0 3300
Progress: 0 3400
Progress: 0 3500
Progress: 0 3600
Progress: 0 3700
Progress: 0 3800
Progress: 0 3900
TAG FOUND: competence
Progress: 1 0
Progress: 1 100
Progress: 1 200
Progress: 1 300
Progress: 1 400
Progress: 1 500
Progress: 1 600
Progress: 1 700
Progress: 1 800
Progress: 1 900
Progress: 1 1000
Progress: 1 1100
Progress: 1 1200
Progress: 1 1300
Progress: 1 1400
Progress: 1 1500
Progress: 1 1600
Progress: 1 1700
Progress: 1 1800