# Import necessary libraries

In [1]:
import numpy as np
import pandas as pd
import scipy
import scipy.spatial.distance
from sklearn.cluster import KMeans

# Load tag data

In [2]:
path = "../data/tags.xlsx"

# Both sheets are read with the 'space' and 'nospace' columns converted to str.
tag_converters = {'space': str, 'nospace': str}

# Sheet "ZooniverseTags" -> transcriber tags
df_transcriber = pd.read_excel(path, sheet_name="ZooniverseTags", converters=tag_converters)

# Sheet "ExpertTags" -> expert tags
df_expert = pd.read_excel(path, sheet_name="ExpertTags", converters=tag_converters)

# Plain lists of the 'nospace' column of each sheet
tags_expert = df_expert["nospace"].tolist()
tags_transcriber = df_transcriber["nospace"].tolist()

# Word2Vec

In [3]:
# Load the 'word2vec-google-news-300' vectors through gensim's downloader API.
import gensim.downloader as api
wv = api.load('word2vec-google-news-300')

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


## Print some tags Word2Vec does not understand

These are a few words and short phrases that do not appear in Word2Vec's base vocabulary. For the short phrases, it would be possible to combine the embeddings of the individual words in the phrase to get an embedding for the full phrase. However, for the one-word tags, I do not have the resources to fine-tune Word2Vec to add them. Another limitation is that Word2Vec does not give contextualized embeddings. For these reasons, we will not be using Word2Vec to find tag approximations.

In [4]:
# Look up every expert tag (its 'space' spelling) in the Word2Vec vectors and
# print the lookup error for each tag that has no vector.
for e in df_expert["space"]:
    try:
        wv[e]
    except KeyError as err:
        # Only the missing-vocabulary lookup error is expected here; any other
        # exception is a real problem and is allowed to surface.
        print(err)

"word '1 in 10 rations' not in vocabulary"
"word 'afi courses' not in vocabulary"
"word 'army films' not in vocabulary"
"word 'army insurance' not in vocabulary"
"word 'army living' not in vocabulary"
"word 'army scorecard plan' not in vocabulary"
"word 'army talks' not in vocabulary"
"word 'attitude toward england' not in vocabulary"
"word 'attitude toward france' not in vocabulary"
"word 'attitude towards germany' not in vocabulary"
"word 'attitude toward italy' not in vocabulary"
"word 'base newspapers' not in vocabulary"
"word 'base units' not in vocabulary"
"word 'branch of service' not in vocabulary"
"word 'british papers' not in vocabulary"
"word 'business ownership' not in vocabulary"
"word 'cbi roundup' not in vocabulary"
"word 'chemical warfare' not in vocabulary"
"word 'civilian skills' not in vocabulary"
"word 'combat flying' not in vocabulary"
"word 'combat stress' not in vocabulary"
"word 'confidence in ship' not in vocabulary"
"word 'confidence in troops' not in vocabula

# bert-as-service

Before running the cells below, start the bert-as-service server with:
- `bert-serving-start -model_dir [PATH TO MODEL] -num_worker=4 -max_seq_len=50`

I used the base uncased model. Adjust `num_worker` as needed.

In [5]:
# Create the bert-as-service client that is used to encode the tags below.
from bert_serving.client import BertClient
bc = BertClient()

## Embed tags

In [6]:
# Encode the 'space' spelling of every tag with the bert-as-service client.
expert_phrases = df_expert['space'].tolist()
transcriber_phrases = df_transcriber['space'].tolist()

embed_expert = bc.encode(expert_phrases)
embed_transcriber = bc.encode(transcriber_phrases)

## Compare transcriber tag embeddings to expert tag embeddings

In [7]:
def compare_all(transcriber_embeddings=None, expert_embeddings=None,
                transcriber_tags=None, expert_tags=None, top_k=3):
    """Rank the closest expert tags for every transcriber tag by cosine similarity.

    Called with no arguments, the function reads the notebook-level
    ``embed_transcriber``, ``embed_expert``, ``tags_transcriber`` and
    ``tags_expert`` at call time. Each argument can instead be passed
    explicitly, which avoids depending on whichever embedding cell ran last.

    Parameters
    ----------
    transcriber_embeddings : array-like, optional
        One embedding row per transcriber tag.
    expert_embeddings : array-like, optional
        One embedding row per expert tag.
    transcriber_tags : sequence, optional
        Labels aligned with the rows of ``transcriber_embeddings``.
    expert_tags : sequence, optional
        Labels aligned with the rows of ``expert_embeddings``.
    top_k : int, default 3
        Number of best expert matches to report per transcriber tag. It is
        capped at the number of expert tags available.

    Returns
    -------
    list of dict
        One record per transcriber tag with the keys ``index`` and
        ``transcriber_tag`` and, for each rank r from 1 to ``top_k``,
        ``expert_tag_r`` and ``score_r``, where the score is
        1 - cosine distance.
    """
    if transcriber_embeddings is None:
        transcriber_embeddings = embed_transcriber
    if expert_embeddings is None:
        expert_embeddings = embed_expert
    if transcriber_tags is None:
        transcriber_tags = tags_transcriber
    if expert_tags is None:
        expert_tags = tags_expert

    distances = scipy.spatial.distance.cdist(
        transcriber_embeddings, expert_embeddings, "cosine"
    )

    # Never ask for more ranks than there are expert tags.
    n_ranks = max(0, min(top_k, distances.shape[1]))

    # A stable sort keeps the lower expert index first when distances tie.
    nearest = np.argsort(distances, axis=1, kind="stable")[:, :n_ranks]

    records = []
    for row, expert_ids in enumerate(nearest):
        record = {"index": row, "transcriber_tag": transcriber_tags[row]}
        for rank, expert_id in enumerate(expert_ids, start=1):
            record["expert_tag_%d" % rank] = expert_tags[expert_id]
            record["score_%d" % rank] = 1 - distances[row, expert_id]
        records.append(record)

    return records

In [8]:
# Build a table from the records returned by compare_all().
df_match = pd.DataFrame(compare_all())

## Write to Excel

In [9]:
bas_output_path = "../data/tag_match_BAS.xlsx"

# Order rows from the highest to the lowest best-match score (score_1)
df_match = df_match.sort_values(by='score_1', ascending=False)

# Save the sorted table as a spreadsheet
df_match.to_excel(bas_output_path)

## Cluster tags lacking "close enough" approximation

In [10]:
def show_clusters(k, thresh, random_state=0):
    """Cluster poorly matched transcriber tags with K-means and print each cluster.

    Reads the notebook-level ``df_match`` and ``embed_transcriber`` at call time.

    Parameters
    ----------
    k : int
        Requested number of clusters. It is capped at the number of selected
        tags, because K-means cannot fit more clusters than samples.
    thresh : float
        Rows of ``df_match`` whose ``score_1`` is strictly below this value
        are clustered.
    random_state : int or None, default 0
        Seed passed to ``KMeans`` so that repeated runs print the same
        clusters. Pass ``None`` for an unseeded run.

    Returns
    -------
    None
        The clusters are printed, not returned.
    """
    # Keep only the tags whose best match scores below the threshold.
    df_match_low = df_match[df_match['score_1'] < thresh]

    if df_match_low.empty:
        print("No tags with score_1 below", thresh)
        return

    # The 'index' column is used as the row position in embed_transcriber.
    rows = df_match_low['index'].values
    low_tags = df_match_low['transcriber_tag'].tolist()
    embed_transcriber_low = np.asarray(embed_transcriber)[rows]

    n_clusters = min(k, len(low_tags))
    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state).fit(embed_transcriber_low)

    # Print results
    labels = kmeans.labels_
    for cluster in range(n_clusters):
        print("Cluster", cluster + 1)
        print([tag for tag, label in zip(low_tags, labels) if label == cluster])

In [11]:
# 30 clusters over the tags whose best-match score (score_1) is below 0.75
show_clusters(30, 0.75)

Cluster 1
['onceasoldieralwaysasoldier', 'lovedones', 'livefortheday', 'stretchingthesamesentencetoreachwordlimi', 'realworld', 'friendsandfamily', 'homeofthebrave', 'newyearnewme', 'reach_for_the_stars', 'peaceandquiet', 'laterhours', 'shortandtothepoint', 'leftinthedark', 'tasteofhome', 'lovedone', 'sunnydaystocome', 'best-friends', 'seewhereittakesme', 'hellwithtomorrow']
Cluster 2
['selectiveservice', 'foreignservices', 'privatesector', 'foreignaffairs', 'med', 'chapel', 'flyingstatus', 'navy', 'traveltime', 'stigma', 'courage', 'physicalexam', 'travelconditions', 'engineers', 'disabled', 'foreigntrade', 'mapping', 'rotc', 'transportationservice', 'hierarchy', 'flyinghours', 'fcc', 'potential', 'publicfunds', 'prescitation', 'rotten', 'rural', 'foreignlands', 'marginalia', 'stipend', 'postoffice', 'cpn', 'timeserved', 'ranches', 'homesteaders']
Cluster 3
['workharder', 'fightforwhatyoubelievein', 'dosomething', 'makemeanofficer', 'takelifeeasy', 'learntoshoot', 'winthewar', 'office

# Sentence-BERT

In [12]:
# Load the 'bert-base-nli-mean-tokens' model with sentence-transformers.
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('bert-base-nli-mean-tokens')

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


## Embed tags

In [13]:
# Encode the 'space' spelling of every tag with the Sentence-BERT model.
# NOTE: this rebinds embed_expert / embed_transcriber, replacing the
# bert-as-service embeddings computed earlier in the notebook.
expert_phrases = df_expert['space'].tolist()
transcriber_phrases = df_transcriber['space'].tolist()

embed_expert = model.encode(expert_phrases)
embed_transcriber = model.encode(transcriber_phrases)

## Compare transcriber tag embeddings to expert tag embeddings

In [14]:
# Rebuild the match table; compare_all() now sees the Sentence-BERT embeddings.
df_match = pd.DataFrame(compare_all())

## Write to Excel

In [15]:
sb_output_path = "../data/tag_match_SB.xlsx"

# Order rows from the highest to the lowest best-match score (score_1)
df_match = df_match.sort_values(by='score_1', ascending=False)

# Save the sorted table as a spreadsheet
df_match.to_excel(sb_output_path)

## Cluster tags lacking "close enough" approximation

In [16]:
# 30 clusters over the tags whose best-match score (score_1) is below 0.75
show_clusters(30, 0.75)

Cluster 1
['engineer', 'engineers', 'chemicalengineer', 'experiments', 'ophthalmology', 'optomistic', 'technicalschool', 'philosophy', 'engineering', 'masteringacraft', 'neurosis', 'railroad', 'techschool', 'psychology', 'mechanicschool', 'bombardier', 'chemist', 'engineschool', 'safteyengineering', 'chemistry', 'lithographer', 'powertrip', 'constructionengineering', 'electrician', 'metallurgy', 'physics', 'biology', 'oilindustry', 'civilengineering', 'dieselwork', 'biologicalresearch', 'amphibianengineers', 'musictheory']
Cluster 2
['islands', 'southernofficers', 'drifting', 'island', 'southerners', 'south', 'atlanticcity', 'southerner', 'slaves', 'tropics', 'northandsouth', 'tropical-ulcers', 'tropical-diseases', 'southerncamps', 'mississippi', 'tropical-sickness', 'sicily', 'southernstates', 'guadalcanal', 'beachhead', 'northvssouth', 'purpleheart', 'theamericansouth', 'nightmare', 'sun-tans', 'deepsouth', 'southseas', 'southamerica', 'bluefalcon', 'southcarolina', 'southerncaliforn

## Display 50 least-matched expert tags

In [17]:
# Count how often each expert tag is the top match (expert_tag_1).
# value_counts() only lists tags that occur at least once, so expert tags that
# are never the top match would be missing. Reindex against every expert tag so
# that they are reported with a count of 0.
top_match_counts = df_match['expert_tag_1'].value_counts()
all_expert_tags = df_expert['nospace'].unique()
top_match_counts.reindex(all_expert_tags, fill_value=0).sort_values().head(50)

germanradio               1
confidenceinship          1
heavybombardmentgroups    1
attitudetowardgermany     1
opinionofallies           1
bbc                       2
sex                       2
moraleprogram             2
japan                     2
civilians                 2
germanclothing            2
sexcontact                2
italy                     2
mess                      2
prideinoutfit             2
newsmaps                  2
localcivilians            3
magazines                 3
aficourses                3
workingwithcivilians      3
usoclubs                  3
ratings                   3
postwaropportunities      3
krations                  3
redcross                  3
germans                   3
pacifictoughness          3
hospitalfacilities        4
espiritdcorps             4
basenewspapers            4
politicalviews            4
pxs                       4
treatmentofpows           4
japanese                  4
honolulu                  4
entalhealth         

## Top transcriber tags to add to expert tags

In [None]:
n = 5 # number of top tags to find

best_indices = [-1]*n

# Greedy search: in each round, pick the transcriber tag that, if added to the
# expert tags, gives the largest total best-match similarity (score_1) over the
# other transcriber tags not yet picked.
#
# Adding one tag j to the expert set can only change a transcriber tag's best
# similarity to max(current best, similarity to j). So the best similarities are
# kept as a running maximum instead of recomputing the whole
# transcriber-by-expert distance matrix for every candidate.
transcriber_matrix = np.asarray(embed_transcriber)

# Best similarity of every transcriber tag to the current expert tags
best_score = 1 - scipy.spatial.distance.cdist(
    transcriber_matrix, embed_expert, "cosine"
).min(axis=1)

# True for transcriber tags that still count towards the total
remaining = np.ones(len(transcriber_matrix), dtype=bool)

for i in range(n):
    best_sum = 0
    best_index = -1

    for j in range(len(df_transcriber)):
        if j in best_indices:
            continue

        # Similarity of every transcriber tag to candidate j
        sim_to_j = 1 - scipy.spatial.distance.cdist(
            transcriber_matrix, transcriber_matrix[j:j+1], "cosine"
        )[:, 0]

        # Leave the candidate itself out of its own total
        remaining[j] = False
        sum_cs = np.maximum(best_score, sim_to_j)[remaining].sum()
        remaining[j] = True

        if sum_cs > best_sum:
            best_sum = sum_cs
            best_index = j

        if j%100==0:
            print("Progress:",i,j)

    if best_index == -1:
        # No candidate had a positive total; do not fall through to index -1,
        # which would silently select the last transcriber tag.
        print("No candidate with a positive total similarity; stopping.")
        break

    print("TAG FOUND:",tags_transcriber[best_index])
    best_indices[i] = best_index

    # Promote the winner: it stops counting towards the total and becomes one
    # of the expert tags that later rounds are compared against.
    remaining[best_index] = False
    sim_to_best = 1 - scipy.spatial.distance.cdist(
        transcriber_matrix, transcriber_matrix[best_index:best_index+1], "cosine"
    )[:, 0]
    best_score = np.maximum(best_score, sim_to_best)

    tags_expert = np.append(tags_expert, tags_transcriber[best_index])
    embed_expert = np.vstack((embed_expert, embed_transcriber[best_index]))

Progress: 0 0
Progress: 0 100
Progress: 0 200
Progress: 0 300
Progress: 0 400
Progress: 0 500
Progress: 0 600
Progress: 0 700
Progress: 0 800
Progress: 0 900
Progress: 0 1000
Progress: 0 1100
Progress: 0 1200
Progress: 0 1300
Progress: 0 1400
Progress: 0 1500
Progress: 0 1600
Progress: 0 1700
Progress: 0 1800
Progress: 0 1900
Progress: 0 2000
Progress: 0 2100
Progress: 0 2200
Progress: 0 2300
Progress: 0 2400
Progress: 0 2500
Progress: 0 2600
Progress: 0 2700
Progress: 0 2800
Progress: 0 2900
Progress: 0 3000
Progress: 0 3100
