# Import necessary libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from bert_serving.client import BertClient
# Client whose encode() is used later in the notebook to embed tags.
# NOTE(review): presumably needs a bert-serving server to already be running — confirm.
bc = BertClient()
import gensim.downloader as api
# Pretrained Word2Vec vectors; in this notebook they are only used to check
# which expert tags are missing from the vocabulary.
# NOTE(review): presumably a large download on first use — confirm.
wv = api.load('word2vec-google-news-300')

# Use tags with spaces?

In [None]:
# Name of the spreadsheet column holding the tag text that gets embedded
# and that is used in the output file name.
column = "space" # Set to "space" or "nospace"

# Load tag data

In [None]:
# Both tag sheets live in the same workbook. Read the transcriber sheet first,
# then the expert sheet, forcing the selected tag column to be parsed as str.
transcriber_tags, expert_tags = (
    pd.read_excel("../data/tags.xlsx", converters={column: str}, sheet_name=sheet)
    for sheet in ("ZooniverseTags", "ExpertTags")
)

# Print (some) tags that Word2Vec does not understand.

Many of these tags are short phrases that are not in Word2Vec's base vocabulary, so looking them up fails. I do not currently have the resources to fine-tune Word2Vec to add these phrases. For this reason, the current version of this notebook only compares BERT tag embeddings.

In [None]:
# Look every expert tag up in the Word2Vec vectors and print the error
# message for each lookup that fails.
for expert_tag in expert_tags[column]:
    try:
        # Only the success or failure of the lookup matters; the vector is discarded.
        wv[expert_tag]
    except Exception as lookup_error:
        print(lookup_error)

# Determine best approximations for transcriber tags

## Get embeddings

In [None]:
# Wrapper around the BERT client that reports progress while encoding
count = 0  # number of tags passed to encode() so far


def encode(tag):
    """Return the BERT embedding of a single tag.

    Prints the running call count before every 500th call (0, 500, 1000, ...)
    so that long encoding runs show progress, then increments the
    module-level ``count``.

    Parameters
    ----------
    tag : the tag text to embed; it is sent to the client as a one-item list.

    Returns
    -------
    The first (and only) element of the client's ``encode`` result.
    """
    global count
    calls_so_far = count
    count = calls_so_far + 1
    if not calls_so_far % 500:
        print(calls_so_far)
    embeddings = bc.encode([tag])
    return embeddings[0]

# Embed every expert tag, then every transcriber tag, one tag per encode() call
expert_embed = list(map(encode, expert_tags[column]))
transcriber_embed = list(map(encode, transcriber_tags[column]))

## Compare embeddings

In [None]:
# Compare transcriber tag at index i to all expert tags
def compare_all(i):
    """Find the three expert tags most similar to transcriber tag ``i``.

    Similarity is the cosine similarity between ``transcriber_embed[i]`` and
    each entry of ``expert_embed``.

    Parameters
    ----------
    i : int
        Positional index of the transcriber tag in ``transcriber_tags`` and
        ``transcriber_embed``.

    Returns
    -------
    dict
        Keys ``index``, ``transcriber_tag``, and ``expert_tag_N`` / ``score_N``
        for N = 1..3, ordered from most to least similar. A rank that could
        not be filled keeps ``None`` as its tag and ``-1`` as its score; this
        happens with fewer than three expert tags, or when a score is NaN or
        not greater than -1.
    """
    query = transcriber_embed[i]
    # Loop-invariant: the transcriber vector's norm is the same for every expert tag
    query_norm = np.linalg.norm(query)
    expert_column = expert_tags[column]

    # (score, expert_tag) pairs, best first; always exactly three entries.
    # -1 is the "empty slot" sentinel, matching the lower bound of cosine similarity.
    top = [(-1, None)] * 3

    # Make comparisons
    for j in range(len(expert_tags)):
        candidate = expert_embed[j]
        score = np.dot(query, candidate) / (query_norm * np.linalg.norm(candidate))
        for rank in range(3):
            # Strict '>' keeps the earlier tag on ties and skips NaN scores
            if score > top[rank][0]:
                # Insert at this rank so the entries it beats shift down one
                # place instead of being overwritten; drop the old third place.
                top.insert(rank, (score, expert_column.iloc[j]))
                top.pop()
                break

    # Track best results
    result = {
        "index": i,
        # NOTE(review): always read from the 'nospace' column, regardless of
        # `column` — confirm this is intended.
        "transcriber_tag": transcriber_tags['nospace'].iloc[i],
    }
    for rank, (score, tag) in enumerate(top, start=1):
        result["expert_tag_%d" % rank] = tag
        result["score_%d" % rank] = score

    return result

# One result dict per transcriber tag, in the same order as transcriber_tags
results = list(map(compare_all, range(len(transcriber_tags))))

# Write results to Excel

In [None]:
# Turn the list of result dicts into a table, with the highest top cosine
# similarity ('score_1') first
df = pd.DataFrame(results).sort_values('score_1', ascending=False)
# Save as an Excel file named after the tag column that was used
df.to_excel("../data/tag_approx_"+column+".xlsx")

# Cluster words that lack a close-enough approximation

In [None]:
# Perform KMeans on embeddings with low similarity scores
k = 30 # Adjust number of clusters

# Keep only the transcriber tags whose best expert match falls below the
# threshold. The best cosine similarity is stored under 'score_1' in each
# result dict.
cluster_results = [result for result in results
                   if result['score_1'] < 0.75 ] # Adjust similarity threshold

# Embeddings of the selected tags, in the same order as cluster_results
cluster_embeds = [transcriber_embed[result['index']]
                  for result in cluster_results]

# Fixed seed so the cluster assignments are reproducible across runs
kmeans = KMeans(n_clusters=k, random_state=0).fit(cluster_embeds)

# kmeans.labels_ is aligned with cluster_embeds, and therefore with cluster_results
for i in range(k):
    print("Cluster", i)
    print([result['transcriber_tag']
           for result, label in zip(cluster_results, kmeans.labels_)
           if label == i])