# Import necessary libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from bert_serving.client import BertClient
# Shared client handle; the embedding cell calls bc.encode(...) on it to get BERT vectors.
# NOTE(review): presumably this needs a bert-serving server that is already running and
# reachable with the client's default connection settings -- confirm before Run All.
bc = BertClient()

# Choose which tag column to use (with or without spaces)

In [2]:
# Name of the spreadsheet column the tags are read from.
# Set to "space" or "nospace".
column = "space"

# Load tag data

In [20]:
# Read both tag sheets from the same workbook, forcing the selected tag
# column to be parsed as text in each.
tags_workbook = "../data/tags.xlsx"
tag_converters = {column: str}

transcriber_tags = pd.read_excel(
    tags_workbook,
    sheet_name="ZooniverseTags",
    converters=tag_converters,
)
expert_tags = pd.read_excel(
    tags_workbook,
    sheet_name="ExpertTags",
    converters=tag_converters,
)

# Determine best approximations for transcriber tags

Notes:
- Consider adding spaces to multi-word tags
- Word2Vec does not recognize some tags, so only BERT embeddings are used for now; the `*_w2v` fields in the results are left as unfilled placeholders.

## Get embeddings

In [None]:
# Number of tags sent to the BERT server per request; also the progress-print interval.
ENCODE_BATCH_SIZE = 500


def encode(tag):
    """Return the BERT embedding for a single tag string.

    Kept for one-off lookups; use ``encode_all`` for whole columns, which
    avoids one client request per tag.
    """
    return bc.encode([tag])[0]


def encode_all(tags, batch_size=ENCODE_BATCH_SIZE):
    """Embed every tag in ``tags`` and return a list with one vector per tag.

    ``bc.encode`` takes a list of strings and returns one vector per string,
    so the tags are sent in chunks of ``batch_size`` instead of one at a time.
    The offset of each chunk is printed so progress stays visible without a
    global counter.

    Parameters
    ----------
    tags : iterable of str
        Tags to embed, e.g. a DataFrame column.
    batch_size : int
        Maximum number of tags per request.

    Returns
    -------
    list
        Embedding vectors in the same order as ``tags``; empty if ``tags``
        is empty (no request is made in that case).
    """
    tag_list = list(tags)
    embeddings = []
    for start in range(0, len(tag_list), batch_size):
        print(start)
        # extend() iterates over the first axis, i.e. appends one vector per tag.
        embeddings.extend(bc.encode(tag_list[start:start + batch_size]))
    return embeddings


# Get embeddings
expert_embed = encode_all(expert_tags[column])
transcriber_embed = encode_all(transcriber_tags[column])

0
500
1000
1500
2000


## Compare embeddings

In [None]:
# Compare transcriber tag at index i to all expert tags
def compare_all(i):
    """Find the expert tag most similar to the transcriber tag at position ``i``.

    Similarity is the cosine similarity between the transcriber embedding and
    each expert embedding. All expert scores are computed in one vectorised
    NumPy expression rather than a Python loop.

    Reads the notebook-level names ``transcriber_tags``, ``expert_tags``,
    ``transcriber_embed``, ``expert_embed`` and ``column``.

    Parameters
    ----------
    i : int
        Position of the tag in ``transcriber_tags[column]`` and of its
        embedding in ``transcriber_embed``.

    Returns
    -------
    dict
        ``index``, ``transcriber_tag``, ``best_expert_tag_bert`` and
        ``best_score_bert``, plus the ``*_w2v`` fields, which are never
        filled in here and stay at ``None`` / ``-1``. If no expert tag
        scores above ``-1`` (e.g. no expert tags, or a zero-length
        embedding), the BERT fields also keep those defaults.
    """
    # Track best results
    result = {
        "index": i,
        "transcriber_tag": transcriber_tags[column].iloc[i],
        "best_expert_tag_bert": None,
        "best_score_bert": -1,
        "best_expert_tag_w2v": None,
        "best_score_w2v": -1
    }

    # Nothing to compare against: keep the defaults.
    if len(expert_embed) == 0:
        return result

    query = np.asarray(transcriber_embed[i])
    # Assumes all expert embeddings have the same length, so they stack
    # into a 2-D matrix with one row per expert tag.
    experts = np.asarray(expert_embed)

    # Cosine similarity of the query against every expert row at once.
    norms = np.linalg.norm(experts, axis=1) * np.linalg.norm(query)
    with np.errstate(divide='ignore', invalid='ignore'):
        scores = experts.dot(query) / norms

    # A zero-norm vector gives 0/0 = NaN; such pairs must never win.
    scores = np.where(np.isnan(scores), -np.inf, scores)

    # argmax returns the first maximum, matching a strict ">" scan.
    best = int(np.argmax(scores))
    if scores[best] > result['best_score_bert']:
        result['best_expert_tag_bert'] = expert_tags[column].iloc[best]
        result['best_score_bert'] = scores[best]

    return result

# One result record per transcriber tag, in the tags' original order.
results = list(map(compare_all, range(len(transcriber_tags))))

# Write results to Excel

In [None]:
# Build a table from the per-tag result records, ordered from the highest
# cosine similarity to the lowest.
df = pd.DataFrame(results).sort_values('best_score_bert', ascending=False)

# Save the table; the output file name includes the chosen tag column.
approx_path = "../data/tag_approx_" + column + ".xlsx"
df.to_excel(approx_path)

# Cluster words that lack a close-enough approximation

In [19]:
# Perform KMeans on embeddings with low similarity scores
k = 30

# Tags whose best expert match scores below this are treated as unmatched.
SIMILARITY_THRESHOLD = 0.75
# Fixed seed so the cluster assignments are reproducible across runs.
KMEANS_RANDOM_STATE = 0

# Result records of the transcriber tags without a close-enough expert tag.
cluster_results = [r for r in results
                   if r['best_score_bert'] < SIMILARITY_THRESHOLD]

# Embeddings of those tags, in the same order as cluster_results.
cluster_embeds = [transcriber_embed[r['index']] for r in cluster_results]

kmeans = KMeans(n_clusters=k, random_state=KMEANS_RANDOM_STATE).fit(cluster_embeds)

# labels_[j] is the cluster assigned to cluster_results[j].
for i in range(k):
    print("Cluster", i)
    print([r['transcriber_tag']
           for r, label in zip(cluster_results, kmeans.labels_)
           if label == i])

Cluster 0
['disatisfied', 'discrepancies', 'displeased', 'disrespect', 'disrespected', 'inefficiency', 'inefficient', 'miscommunications', 'underappreciated', 'unexperienced']
Cluster 1
['back to school', 'bring me home', 'cant wait to get home', 'get ready for korea', 'getting back to work', 'glad I am back', 'go back and live', 'going home', 'good to be home', 'I am back', 'I want to travel more often', 'keep me home', 'keep the politics in dc', 'late to the party', 'lets go home', 'might stay in', 'ready for home', 'ready to get out', 'returning home', 'returning to the states', 'return to us', 'saying goodbye', 'see where it takes me', 'send me home', 'soon to be married', 'wants to leave', 'want to go back', 'when will it end', 'when will the war be over', 'write home']
Cluster 2
['american way', 'america the great', 'bootcamp', 'bootlicker', 'boo whittaker', 'close by', 'countdown to retirement', 'disable', 'espirit de corps', 'frank sinatra', 'fulough', 'grudge', 'grunt', 'hillb