In [1]:
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.3.2-py3-none-any.whl (362 kB)
[K     |████████████████████████████████| 362 kB 30.0 MB/s 
Collecting huggingface-hub<1.0.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 14.5 MB/s 
[?25hCollecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting xxhash
  Downloading xxhash-3.0.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[K     |████████████████████████████████| 212 kB 69.2 MB/s 
Collecting fsspec[http]>=2021.05.0
  Downloading fsspec-2022.5.0-py3-none-any.whl (140 kB)
[K     |████████████████████████████████| 140 kB 64.4 MB/s 
[?25hCollecting aiohttp
  Downloading aiohttp-3.8.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.1 MB)
[K     |████████████████████

In [2]:
from datasets import load_dataset

In [3]:
import io
import glob
import numpy as np
import os
import regex as re
import pandas as pd
import string
import tqdm
import tensorflow as tf
from tensorflow.keras import layers
from collections import Counter

AUTOTUNE = tf.data.AUTOTUNESEED = 42
num_negative = 10


In [4]:
ds = load_dataset("zj88zj/PubMed_200k_RCT")

Using custom data configuration zj88zj--PubMed_200k_RCT-bf8d11840df6705f


Downloading and preparing dataset text/zj88zj--PubMed_200k_RCT to /root/.cache/huggingface/datasets/zj88zj___text/zj88zj--PubMed_200k_RCT-bf8d11840df6705f/0.0.0/acc32f2f2ef863c93c2f30c52f7df6cc9053a1c2230b8d7da0d210404683ca08...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/358M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/4.74M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/4.74M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

0 tables [00:00, ? tables/s]

0 tables [00:00, ? tables/s]

Dataset text downloaded and prepared to /root/.cache/huggingface/datasets/zj88zj___text/zj88zj--PubMed_200k_RCT-bf8d11840df6705f/0.0.0/acc32f2f2ef863c93c2f30c52f7df6cc9053a1c2230b8d7da0d210404683ca08. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [5]:
ds

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 2593169
    })
    test: Dataset({
        features: ['text'],
        num_rows: 34492
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 33941
    })
})

In [6]:
len(ds["test"])

34492

In [7]:
def vocab_counter(data):
  vocab = Counter()
  for row in data:
    for word in str(row).split():
      vocab[word] += 1
  return vocab

In [8]:
vocab = vocab_counter(ds["test"])

In [9]:
vocab_len = len(vocab)
vocab_len

42883

In [10]:
df = pd.DataFrame(ds["test"])
df.describe()

Unnamed: 0,text
count,34492.0
unique,31914.0
top,
freq,2499.0


In [17]:
df["text_len"] = df["text"].apply(lambda x: len(str(x)))
int(df["text_len"].mean()), int(df["text_len"].max())

(136, 930)

In [18]:
seq_len = int(df["text_len"].mean())

In [19]:
df = df.drop("text_len", 1)
df.dropna(inplace=True)

  """Entry point for launching an IPython kernel.


In [20]:
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [21]:
file_name = "/content/gdrive/My Drive/data/pubmed.txt"
with open(file_name, 'w', encoding = 'utf-8') as f:
    for rec_index, rec in df.iterrows():
        f.write(rec['text'] + '\n')

In [22]:
train_data = tf.data.TextLineDataset(file_name).filter(lambda x: tf.cast(tf.strings.length(x), bool))

In [42]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [23]:
def preprocessing(data):
  data = tf.strings.lower(data)
  return tf.strings.regex_replace(data, '[%s]'% re.escape(string.punctuation), "")


In [24]:
# vectorize sentences
Vectorizer = layers.TextVectorization(
    standardize=preprocessing,
    max_tokens=vocab_len,
    output_mode="int",
    output_sequence_length=seq_len
)

In [25]:
# create vocabulary
Vectorizer.adapt(train_data.batch(1024))

# save vocabulary
vocab_vectorized = Vectorizer.get_vocabulary()

In [26]:
train_data_vectorized = train_data.batch(1024).prefetch(AUTOTUNE).map(Vectorizer).unbatch()

In [27]:
seqs = list(train_data_vectorized.as_numpy_iterator())

In [28]:
for seq in seqs[:5]:
  print(f"{seq} :: {[vocab_vectorized[i] for i in seq]}")

[28116     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0] :: ['24562799', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',

In [29]:
def data_generator_sg(seqs, window, num_negative, vocab_size, seed):
  targets, contexts, labels = [], [], []
  sampling = tf.keras.preprocessing.sequence.make_sampling_table(vocab_size)

  for seq in tqdm.tqdm(seqs):
    positive_sg, _ = tf.keras.preprocessing.sequence.skipgrams(
        seq,
        vocabulary_size=vocab_size,
        sampling_table=sampling,
        window_size=window,
        negative_samples=0
    )

    for target_word, context_word in positive_sg:
      context_class = tf.expand_dims(
          tf.constant([context_word], dtype="int64"), 1
      )
      neg_sampling_candidates, _, _ = tf.random.log_uniform_candidate_sampler(
          true_classes=context_class,
          num_true=1,
          num_sampled=num_negative,
          unique=True,
          range_max=vocab_size,
          seed=seed,
          name="neg_sampling"
      )

      neg_sampling_candidates = tf.expand_dims(
          neg_sampling_candidates, 1
      )

      context = tf.concat([context_class, neg_sampling_candidates], 0)
      label = tf.constant([1] + [0]*num_negative, dtype="int64")

      targets.append(target_word)
      contexts.append(context)
      labels.append(label)

  return targets, contexts, labels

In [30]:
targets, contexts, labels = data_generator_sg(
    seqs=seqs,
    window=5,
    num_negative=num_negative,
    vocab_size=vocab_len,
    seed=42
)

100%|██████████| 31993/31993 [14:56<00:00, 35.67it/s]


In [31]:
targets, contexts, labels = np.array(targets), np.array(contexts)[:,:,0], np.array(labels)

In [33]:
BATCH_SIZE = 1024
BUFFER_SIZE = 10000
dataset = tf.data.Dataset.from_tensor_slices(((targets, contexts), labels))
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)
dataset = dataset.cache().prefetch(buffer_size=AUTOTUNE)
print(dataset)

<PrefetchDataset element_spec=((TensorSpec(shape=(1024,), dtype=tf.int64, name=None), TensorSpec(shape=(1024, 11), dtype=tf.int64, name=None)), TensorSpec(shape=(1024, 11), dtype=tf.int64, name=None))>


In [34]:
class Word2Vec(tf.keras.Model):

  def __init__(self, vocab_size, emb_dim):
    super(Word2Vec, self).__init__()
    self.target_embedding = layers.Embedding(vocab_size, 
                                             emb_dim, 
                                             input_length=1,
                                             name="w2v_embedding")
    self.context_embedding = layers.Embedding(vocab_size,
                                              emb_dim,
                                              input_length=num_negative+1)
    
  def call(self, pair):
    target, context = pair
    if len(target.shape) == 2:
      target = tf.sqeeze(target, axis=1)
    
    word_emb = self.target_embedding(target)
    context_emb = self.context_embedding(context)
    dots = tf.einsum('be,bce->bc', word_emb, context_emb)
    return dots

In [35]:
# initialize and compile model
embedding_dim = 128
word2vec = Word2Vec(vocab_len, embedding_dim)
word2vec.compile(optimizer='adam',
                 loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
                 metrics=['accuracy']
                 )


In [36]:
# log training statistics
tensorbd_callback = tf.keras.callbacks.TensorBoard(log_dir="logs")


In [37]:
# train model
word2vec.fit(dataset, epochs=20, callbacks=[tensorbd_callback])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f2805083ed0>

In [40]:
# obtain weights from the model
weights = word2vec.get_layer("w2v_embedding").get_weights()[0]
vocab = Vectorizer.get_vocabulary()

In [41]:
path = "/content/gdrive/My Drive/data/"
out_v = io.open("pubmed_vectors.tsv", "w", encoding="utf-8")
out_meta = io.open("pubmed_metadata.tsv", "w", encoding="utf-8")

for i, word in enumerate(vocab):
  if i == 0:
    continue 
  vec = weights[i]
  out_v.write("\t".join([str(x) for x in vec]) + "\n")
  out_meta.write(word+"\n")
out_v.close()
out_meta.close()


In [45]:
try:
  from google.colab import files
  files.download('pubmed_vectors.tsv')
  files.download('pubmed_metadata.tsv')
except Exception:
  pass

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
https://projector.tensorflow.org/