# Import necessary libraries

In [1]:
import pandas as pd
import numpy as np
from bert_serving.client import BertClient
# Shared BERT client; every embedding request below goes through `bc`.
# NOTE(review): presumably this needs an already-running bert-serving server
# reachable with the client's default settings — confirm before running.
bc = BertClient()

# Load tag data

In [2]:
# Load tag data
# Both tag lists live in the same workbook, so parse it once and pull out the
# two sheets instead of opening the file separately for each sheet.
# The 'name' column is forced to str in both sheets via the converter.
tag_sheets = pd.read_excel('../data/Unique_Zooniverse-tags (June2020).xlsx',
                           converters={'name': str},
                           sheet_name=["ZooniverseTags", "ExpertTags"])

# Expert tags are used exactly as loaded (no de-duplication)
expert_tags = tag_sheets["ExpertTags"]

# Get only the unique tags
transcriber_tags = tag_sheets["ZooniverseTags"].drop_duplicates('name')

# Determine best approximations for transcriber tags

Notes:
- Consider adding spaces to multi-word tags
- Word2Vec does not recognize some tags, so I'm just using BERT for now.

## Get embeddings

In [None]:
# Per-tag wrapper around the client encoder (one request per tag), intended to
# allow progress tracking while the embeddings are computed
def encode(tag):
    """Return the embedding of a single tag.

    The tag is wrapped in a one-element list for ``bc.encode`` and the first
    (only) entry of the returned batch is handed back.
    """
    batch = bc.encode([tag])
    return batch[0]

# Get embeddings: one vector per tag, in the same order as each 'name' column
expert_embed = list(map(encode, expert_tags['name']))
transcriber_embed = list(map(encode, transcriber_tags['name']))

## Compare embeddings

In [None]:
# Compare transcriber tag at index i to all expert tags
def compare_all(i):
    """Find the expert tag whose embedding is closest to transcriber tag i.

    Closeness is the cosine similarity between ``transcriber_embed[i]`` and
    each entry of ``expert_embed``.

    Args:
        i: Positional index into ``transcriber_tags['name']`` and
            ``transcriber_embed``.

    Returns:
        A dict with the keys ``transcriber_tag``, ``best_expert_tag`` and
        ``best_score``. If no expert tag scores above -1 (for example when
        there are no expert tags), ``best_expert_tag`` stays ``None`` and
        ``best_score`` stays -1.
    """
    # Track best results
    result = {
        "transcriber_tag": transcriber_tags['name'].iloc[i],
        "best_expert_tag": None,
        "best_score": -1
    }

    # The transcriber vector and its norm do not change across comparisons,
    # so look them up / compute them once instead of once per expert tag.
    transcriber_vec = transcriber_embed[i]
    transcriber_norm = np.linalg.norm(transcriber_vec)

    # Make comparisons; expert names and embeddings are walked in lockstep
    for expert_name, expert_vec in zip(expert_tags['name'], expert_embed):
        score = np.dot(transcriber_vec, expert_vec) / \
            (transcriber_norm * np.linalg.norm(expert_vec))
        # Strict '>' keeps the earliest expert tag when scores tie
        if score > result['best_score']:
            result['best_expert_tag'] = expert_name
            result['best_score'] = score

    return result
    
# Best expert match for every transcriber tag, in row order
results = list(map(compare_all, range(len(transcriber_tags))))

# Write results to Excel

In [None]:
# Build a dataframe from the per-tag results, ordered from the highest
# cosine similarity to the lowest
df = pd.DataFrame(results).sort_values('best_score', ascending=False)
# Save the sorted matches as an Excel workbook
df.to_excel("../data/tag_approx_bert_nospace.xlsx")

# Cluster words that lack a close-enough approximation

In [None]:
# TODO: Cluster transcriber words that don't have good expert matches
# Candidate methods: k-means, spectral clustering