# Extracting vocabulary from cleaned data (unique words)

In [3]:
from tqdm.notebook import tqdm
import mmap


def get_num_lines(file_path):
    fp = open(file_path, "r+")
    buf = mmap.mmap(fp.fileno(), 0)
    lines = 0
    while buf.readline():
        lines += 1
    return lines


def get_vocabulary(paths):
    """
    """
    words = []
    for filepath in paths:
        print("Extracting words of {}...".format(filepath))
        with open(filepath) as file:
            for line in tqdm(file, total=get_num_lines(filepath)):
                words.extend(line.split())
    print("Extracting unique words...")
    vocab = list(set(words))
    return vocab
    
        
# Get paths of the files        
directory = '/raid/antoloui/Master-thesis/Data/Cleaned/'
files = ['dev.raw', 'test.raw', 'train.raw']
paths = [directory+f for f in files]

# Get the vocabulary and save it
vocab = get_vocabulary(paths)
print("Saving vocabulary of size: {}...".format(len(vocab)))
with open('data/cisco_voc.txt', 'w') as file:
    for word in vocab:
        file.write(word+'\n')
print("Done!")

Extracting words of /raid/antoloui/Master-thesis/Data/Cleaned/dev.raw...


HBox(children=(FloatProgress(value=0.0, max=8791712.0), HTML(value='')))


Extracting words of /raid/antoloui/Master-thesis/Data/Cleaned/test.raw...


HBox(children=(FloatProgress(value=0.0, max=8432729.0), HTML(value='')))


Extracting words of /raid/antoloui/Master-thesis/Data/Cleaned/train.raw...


HBox(children=(FloatProgress(value=0.0, max=153475762.0), HTML(value='')))


Extracting unique words...
Saving vocabulary of size: 4751484...
Done!


# Encode each word of the vocabulary

Make sure to start bert-as-a-service server by running

```
export ZEROMQ_SOCK_TMP_DIR=/tmp/
bert-serving-start -num_worker=1 -max_seq_len=25 -model_dir ./models/netbert/tensorflow-checkpoint -pooling_strategy NONE
```
NB1: Note that the tensorflow checkpoint needs to have a bert_model.ckpt file containing the pre-trained weights (which is actually 3 files), a vocab file (vocab.txt) to map WordPiece to word id, and a config file (bert_config.json) which specifies the hyperparameters of the model.

NB2: Refer to the following table for the pooling strategy:

|Strategy|Description|
|---|---|
| `NONE` | no pooling at all, useful when you want to use word embedding instead of sentence embedding. This will results in a `[max_seq_len, 768]` encode matrix for a sequence.|
| `REDUCE_MEAN` | take the average of the hidden state of encoding layer on the time axis |
| `REDUCE_MAX` | take the maximum of the hidden state of encoding layer on the time axis |
| `REDUCE_MEAN_MAX` | do `REDUCE_MEAN` and `REDUCE_MAX` separately and then concat them together on the last axis, resulting in 1536-dim sentence encodes |
| `CLS_TOKEN` or `FIRST_TOKEN` | get the hidden state corresponding to `[CLS]`, i.e. the first token |
| `SEP_TOKEN` or `LAST_TOKEN` | get the hidden state corresponding to `[SEP]`, i.e. the last token |

In [2]:
import numpy as np
import pandas as pd
import string
from bert_serving.client import BertClient


def extract_words(filepath):
    """
    Extract the words in the given file.
    """
    words = []
    with open(filepath) as infile:
        for line in infile:
            tokens = line.lower().split() # Lowercase sentence and split it into words
            tokens = [tok.translate(str.maketrans('', '', string.punctuation)) for tok in tokens]  # Remove punctuation from each word
            tokens = list(filter(None, tokens))  # Remove empty strings
            words.extend(tokens)
    return list(set(words))


# Extract strings from file
infile = 'data/cisco_voc.txt'
outfile = 'data/cisco_voc_encodings.csv'
strings = extract_words(infile)

# Encode strings via bert-as-service
with BertClient() as bc:
    encodings = bc.encode(strings)

# Create dataframe
cols = ['feat'+str(i) for i in range(encodings.shape[1])]
df = pd.DataFrame(data=encodings[:,:], columns=cols)
df['text'] = strings

# Save encodings
df.to_csv(outfile, index=False, sep=',', encoding='utf-8', float_format='%.10f', decimal='.')
print("Encodings saved !")



KeyboardInterrupt: 

# Visualize encodings with Tensorboard Projector

In [21]:
import torch
try:
    from torch.utils.tensorboard import SummaryWriter
except ImportError:
    from tensorboardX import SummaryWriter

writer = SummaryWriter()  # writer will output to ./runs/ directory by default


# Load csv dataframe with encodings and associated words
filepath = 'data/cisco_voc_encodings.csv'
df = pd.read_csv(filepath, header=0)
print(df.shape)

# Get word encodings
encodings = df.loc[:, df.columns != 'text'].values

# Get the associated words
words = df.loc[:, df.columns == 'text'].values

# Write to tensorboard
writer.add_embedding(encodings, metadata=words)

# Close writer
writer.close()

In [14]:
from bert_serving.client import BertClient


# Encode strings via bert-as-service
with BertClient() as bc:
        encodings = bc.encode(["dog"],["cat"],["man"])
print(encodings.shape)

TypeError: "dog" must be <class 'list'>, but received <class 'str'>