# Import necessary libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from bert_serving.client import BertClient
# Client used by encode() further down to request BERT embeddings (bc.encode).
# NOTE(review): constructed with no arguments, so it presumably relies on the
# library's default server address -- confirm a bert-serving server is running.
bc = BertClient()

# Use tags with spaces?

In [2]:
# Name of the spreadsheet column holding the tag text. The same column is read
# from both sheets of tags.xlsx and is also embedded in the output file name.
# Presumably "space" keeps spaces in multi-word tags and "nospace" removes
# them -- confirm against the spreadsheet.
column = "space" # Set to "space" or "nospace"

# Load tag data

In [3]:
# Read both sheets of the tag workbook, forcing the selected tag column to str.
# Sheet order matters for the unpacking: Zooniverse (transcriber) tags first,
# expert tags second.
transcriber_tags, expert_tags = (
    pd.read_excel("../data/tags.xlsx", sheet_name=sheet, converters={column: str})
    for sheet in ("ZooniverseTags", "ExpertTags")
)

# Determine best approximations for transcriber tags

Notes:
- Consider adding spaces to multi-word tags
- Word2Vec does not recognize some tags, so only BERT is used for now; the `*_w2v` fields in the results below are unfilled placeholders.

## Get embeddings

In [4]:
# Running total of encode() calls, used only for progress reporting
count = 0


def encode(tag):
    """Return the BERT embedding for a single tag, reporting progress.

    Prints the running call count before every 500th request (0, 500, ...),
    then asks the BertClient for the embedding of this one tag.
    """
    global count
    if not count % 500:
        print(count)
    count += 1
    embeddings = bc.encode([tag])
    return embeddings[0]

# Embed every tag, expert sheet first and then the transcriber sheet
expert_embed = list(map(encode, expert_tags[column]))
transcriber_embed = list(map(encode, transcriber_tags[column]))

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100


## Compare embeddings

In [20]:
# Compare transcriber tag at index i to all expert tags
def compare_all(i):
    """Find the expert tag whose BERT embedding is closest to one transcriber tag.

    Closeness is the cosine similarity between ``transcriber_embed[i]`` and
    each entry of ``expert_embed``.

    Parameters
    ----------
    i : int
        Positional index of the transcriber tag, used for both
        ``transcriber_tags[column]`` and ``transcriber_embed``.

    Returns
    -------
    dict
        ``index`` and ``transcriber_tag`` identify the tag that was compared.
        ``best_expert_tag_bert`` / ``best_score_bert`` hold the best match and
        its cosine similarity; they keep their placeholders (None, -1) if no
        score exceeds -1, e.g. when there are no expert tags.
        ``best_expert_tag_w2v`` / ``best_score_w2v`` are never filled in by
        this function and always keep their placeholders (None, -1).
    """
    # Track best results
    result = {
        "index": i,
        "transcriber_tag": transcriber_tags[column].iloc[i],
        "best_expert_tag_bert": None,
        "best_score_bert": -1,
        "best_expert_tag_w2v": None,
        "best_score_w2v": -1
    }

    # The transcriber embedding and its norm do not change inside the loop,
    # so look them up and compute them once instead of once per expert tag.
    query = transcriber_embed[i]
    query_norm = np.linalg.norm(query)

    # Make comparisons; expert tags and their embeddings share the same order
    for expert_tag, expert_vec in zip(expert_tags[column], expert_embed):
        score = np.dot(query, expert_vec) / \
            (query_norm * np.linalg.norm(expert_vec))
        # Strict ">" keeps the first expert tag on ties, and a NaN score
        # (zero-length embedding) never replaces the current best.
        if score > result['best_score_bert']:
            result['best_expert_tag_bert'] = expert_tag
            result['best_score_bert'] = score

    return result

# One result dict per transcriber tag, in the same order as the tag sheet
results = list(map(compare_all, range(len(transcriber_tags))))

# Write results to Excel

In [21]:
# Build the results table, ordered from highest to lowest cosine similarity
df = pd.DataFrame(results).sort_values('best_score_bert', ascending=False)
# Save the table; the chosen tag column is part of the file name
df.to_excel("../data/tag_approx_" + column + ".xlsx")

# Cluster words that lack a close-enough approximation

In [31]:
# Perform KMeans on embeddings with low similarity scores
k = 6

# Keep only transcriber tags whose best expert match scored below 0.6
cluster_results = [res for res in results if res['best_score_bert'] < 0.6]

# Look up each kept tag's embedding through the index stored with its result
cluster_embeds = [transcriber_embed[res['index']] for res in cluster_results]

# Fix the seed so cluster assignments are the same on every run, and pin
# n_init so the number of restarts does not depend on the installed
# scikit-learn version's default.
kmeans = KMeans(n_clusters=k, random_state=0, n_init=10).fit(cluster_embeds)

# List the tags assigned to each cluster
for i in range(k):
    print("Cluster", i)
    print([res['transcriber_tag']
           for res, label in zip(cluster_results, kmeans.labels_)
           if label == i])

Cluster 0
['fayetteville', 'guadalcanal', 'mojave', 'negros', 'ranches', 'rommel']
Cluster 1
['get it over with', 'get them out of here', 'give what you get', 'hell of a deal', 'just do your job', 'make the best of it']
Cluster 2
['bff', 'ww1', 'ww2', 'x1186']
Cluster 3
['bayonet', 'gi bill of rights', 'great depression', 'great depression has ended', 'length of service', 'port of embarkation', 'post world war']
Cluster 4
['amen', 'close-by', 'commonsense', 'cpn', 'europeantheater', 'illegible', 'marginalia', 'on par', 'unassigned']
Cluster 5
['hooah', 'whatsnext']
