I first need to find words that often appear together, and words that don't. Then, I can look at how similar their embeddings are. Or rather the other way around. I look at the embeddings of three words, and then see how similar they are. For instance, 'machine' and 'learning' appear together 355 times in our corpus. 'polymer' and 'brush' appear together 33 times. 'magnetic' and 'resonance' appear together 477 times. These pairs of words belong to the fields of computer science, chemistry and medicine, respectively.

In [1]:
import json
import numpy as np
from skipgram.load_data import Dataset

In [2]:
vocab_file = 'data/vocab/vocab_freqs.json'
data_file = 'data/txt/data_lemmas.txt'
vocab = json.load(open(vocab_file))
embeddings = json.load(open('skipgram/embeddings/1642602756/embeddings.json'))

In [3]:
'monomer' in vocab, 'polymer' in vocab, 'machine' in vocab, 'learning' in vocab, 'magnetic' in vocab, 'resonance' in vocab, 'recent' in vocab

(True, True, True, True, True, True, True)

In [4]:
monomer = np.array(embeddings['monomer'])
polymer = np.array(embeddings['polymer'])
machine = np.array(embeddings['machine'])
learning = np.array(embeddings['learning'])
magnetic = np.array(embeddings['magnetic'])
resonance = np.array(embeddings['resonance'])

In [5]:
vectors = {'monomer': monomer, 'polymer': polymer, 'machine': machine, 'learning': learning, 'magnetic': magnetic, 'resonance': resonance}

In [6]:
for v1 in vectors:
  for v2 in vectors:
    print(f'dist({v1},{v2}) = {np.linalg.norm(vectors[v1]-vectors[v2])}')

dist(monomer,monomer) = 0.0
dist(monomer,polymer) = 4.06971362552118
dist(monomer,machine) = 5.708905822013942
dist(monomer,learning) = 5.46126839394766
dist(monomer,magnetic) = 5.280893004212832
dist(monomer,resonance) = 5.607216785495744
dist(polymer,monomer) = 4.06971362552118
dist(polymer,polymer) = 0.0
dist(polymer,machine) = 4.471068949765773
dist(polymer,learning) = 4.469279410336604
dist(polymer,magnetic) = 4.103815695926397
dist(polymer,resonance) = 4.449006480149132
dist(machine,monomer) = 5.708905822013942
dist(machine,polymer) = 4.471068949765773
dist(machine,machine) = 0.0
dist(machine,learning) = 2.9185849057092055
dist(machine,magnetic) = 4.798450081056845
dist(machine,resonance) = 4.8719130221688935
dist(learning,monomer) = 5.46126839394766
dist(learning,polymer) = 4.469279410336604
dist(learning,machine) = 2.9185849057092055
dist(learning,learning) = 0.0
dist(learning,magnetic) = 4.361478147251813
dist(learning,resonance) = 4.449841621366759
dist(magnetic,monomer) = 5.

How often do the pairs of words appear together in the data? Not really together, but among the window the model uses (i.e. 3)

In [7]:
with open('test_embeddings.txt', 'w', encoding='utf-8') as f1:
  with open(data_file, encoding='utf-8') as f2:
    for line in f2.readlines():
      for word in line.split(' '):
        if word in ['monomer', 'polymer', 'machine', 'learning', 'magnetic', 'resonance']:
          f1.write(line)
          break

In [8]:
chem, ml, med = 0, 0, 0
chem_set = set(['monomer', 'polymer'])
ml_set = set(['machine', 'learning'])
med_set = set(['magnetic', 'resonance'])
dataset = Dataset(vocab_file, 'test_embeddings.txt', k=0, w=3)
for idx1, idx2, neg_samples in dataset:
  w1, w2 = dataset.vocab.get_word(idx1), dataset.vocab.get_word(idx2)
  this_set = set([w1, w2])
  if this_set == chem_set:
    chem += 1
  elif this_set == ml_set:
    ml += 1
  elif this_set == med_set:
    med += 1
chem, ml, med

(26, 606, 652)

In [9]:
program = np.array(embeddings['program'])
for v2 in vectors:
  print(f'dist(program, {v2}) = {np.linalg.norm(program-vectors[v2])}')

dist(program, monomer) = 5.35146646249422
dist(program, polymer) = 4.1012284025493715
dist(program, machine) = 3.7070025726093436
dist(program, learning) = 3.077450748003879
dist(program, magnetic) = 4.185310773621567
dist(program, resonance) = 4.369236428017561


In [10]:
'program' in vocab

True

In [11]:
with open('program_embeddings.txt', 'w', encoding='utf-8') as f1:
  with open('test_embeddings.txt', encoding='utf-8') as f2:
    for line in f2.readlines():
      if 'program' in line.split(' '):
        f1.write(line)

In [12]:
chem, ml, med = 0, 0, 0
chem_set = set(['monomer', 'polymer', 'program'])
ml_set = set(['machine', 'learning', 'program'])
med_set = set(['magnetic', 'resonance', 'program'])
dataset = Dataset(vocab_file, 'program_embeddings.txt', k=0, w=3)
for idx1, idx2, neg_samples in dataset:
  this_set = set([dataset.vocab.get_word(idx1), dataset.vocab.get_word(idx2)])
  if 'program' in this_set:
    if this_set.issubset(chem_set):
      chem += 1
    elif this_set.issubset(ml_set):
      ml += 1
    elif this_set.issubset(med_set):
      med += 1
chem, ml, med

(8, 21, 2)