# *Text2Emoji ([More info](/../../README.md))*

# Export Huggingface models to ONNX for faster inference

## Install packages

In [3]:
# Show which Python executable the notebook's shell resolves to.
!which python

/home/andylo/Projects/Text2Emoji/.venv/bin/python


In [4]:
# %pip (rather than !pip) installs into the environment of the running kernel,
# even when a different pip comes first on the shell PATH.
%pip install onnxruntime-tools onnxruntime-gpu onnx



In [6]:
# Load the autoreload extension in mode 2 so that edits to imported modules
# (e.g. config.py) are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

## Export

In [7]:
!rm -rf models/
from pathlib import Path
from transformers.convert_graph_to_onnx import convert
from config import PrepConfig

# Export the two Hugging Face checkpoints named in PrepConfig (the DistilBERT
# model and the sentence-transformer model) to ONNX with opset 13, using the
# PyTorch ("pt") framework. Each graph is written to its PrepConfig output path.
convert(framework="pt", model=PrepConfig.DISTILBERT_NAME, output=Path(PrepConfig.DISTILBERT_ONNX_PATH), opset=13)
convert(framework="pt", model=PrepConfig.SENTENCE_TRANSFORMER_NAME, output=Path(PrepConfig.SENTENCE_ONNX_PATH), opset=13)

ONNX opset version set to: 13
Loading pipeline (model: distilbert-base-uncased, tokenizer: distilbert-base-uncased)
Creating folder models/onnx/distilbert
Using framework PyTorch: 1.8.1+cu102
Found input input_ids with shape: {0: 'batch', 1: 'sequence'}
Found input attention_mask with shape: {0: 'batch', 1: 'sequence'}
Found output output_0 with shape: {0: 'batch', 1: 'sequence'}
Ensuring inputs are in correct order
head_mask is not present in the generated input list.
Generated inputs order: ['input_ids', 'attention_mask']
ONNX opset version set to: 13
Loading pipeline (model: sentence-transformers/distilbert-base-nli-stsb-mean-tokens, tokenizer: sentence-transformers/distilbert-base-nli-stsb-mean-tokens)
Creating folder models/onnx/sentence-transformer
Using framework PyTorch: 1.8.1+cu102
Found input input_ids with shape: {0: 'batch', 1: 'sequence'}
Found input attention_mask with shape: {0: 'batch', 1: 'sequence'}
Found output output_0 with shape: {0: 'batch', 1: 'sequence'}
Ensurin

## Testing

In [2]:
from os import environ
from psutil import cpu_count
from config import PrepConfig
from contextlib import contextmanager
from dataclasses import dataclass
from time import time
from tqdm import trange

# OpenMP tuning variables for onnxruntime: thread count is set to the number
# of logical CPU cores and the wait policy to 'ACTIVE'.
# They need to be exported before onnxruntime is imported (done right below).
environ["OMP_NUM_THREADS"] = str(cpu_count(logical=True))
environ["OMP_WAIT_POLICY"] = 'ACTIVE'

from onnxruntime import GraphOptimizationLevel, InferenceSession, SessionOptions, get_all_providers

def create_model_for_provider(model_path: str, provider: str) -> InferenceSession:
    """Open an ONNX Runtime inference session bound to a single execution provider.

    Args:
        model_path: location of the exported ONNX graph.
        provider: execution provider name, e.g. "CUDAExecutionProvider"; it
            must be one of the names returned by `get_all_providers()`.

    Returns:
        An `InferenceSession` restricted to `provider`, with fallback to
        other providers disabled.

    Raises:
        AssertionError: if `provider` is not a known provider name.
    """
    assert provider in get_all_providers(), f"provider {provider} not found, {get_all_providers()}"

    # Session tuning: enable every graph optimization and keep each operator
    # on a single intra-op thread.
    session_options = SessionOptions()
    session_options.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL
    session_options.intra_op_num_threads = 1

    # Load the graph on the requested provider only, and forbid fallback.
    ort_session = InferenceSession(model_path, session_options, providers=[provider])
    ort_session.disable_fallback()
    return ort_session
  

@contextmanager
def track_infer_time(buffer: list):
    """Time the body of a `with` block and append the duration to `buffer`.

    Args:
        buffer: mutable list that receives one elapsed-time value (float,
            in seconds) per `with` block that completes normally.

    The duration is recorded only when the body finishes without raising, so
    failed runs never end up in the collected timings.
    """
    # Imported locally because the surrounding cell only imports `time`.
    # perf_counter is monotonic and high resolution, whereas the wall clock
    # can jump (e.g. clock adjustments) and yield negative durations.
    from time import perf_counter

    start = perf_counter()
    yield
    buffer.append(perf_counter() - start)


@dataclass
class OnnxInferenceResult:
    """Result record for one ONNX benchmark run."""

    # Per-call inference durations (floats), e.g. as collected by
    # `track_infer_time`. The original annotation `[int]` was a list literal,
    # not a type, and durations are floats rather than ints.
    model_inference_time: list
    # Path of the optimized ONNX model the timings refer to.
    optimized_model_path: str


In [19]:

from transformers import DistilBertTokenizerFast

tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")
test_text = PrepConfig.SAMPLE_TEXT
# range(1, 2) yields only i=1, so the batch currently holds a single copy of
# the sample text; widening the range would add longer repetitions of it.
model_inputs = tokenizer([" ".join([test_text]*i) for i in range(1, 2)], 
    return_tensors="np",
    padding=True,
    truncation=True)
# Plain dict of numpy arrays: the feed passed to the ONNX sessions' run() later.
inputs_onnx = dict(model_inputs)

DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): huggingface.co:443
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /distilbert-base-uncased/resolve/main/vocab.txt HTTP/1.1" 200 0
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): huggingface.co:443
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /distilbert-base-uncased/resolve/main/tokenizer.json HTTP/1.1" 200 0
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): huggingface.co:443
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /distilbert-base-uncased/resolve/main/added_tokens.json HTTP/1.1" 404 0
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): huggingface.co:443
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /distilbert-base-uncased/resolve/main/special_tokens_map.json HTTP/1.1" 404 0
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): huggingface.co:443
DEBUG:urllib3.connectionpool:https://huggingface.co:44

In [20]:
import onnxruntime

# Print the device reported by onnxruntime, then open one inference session
# per exported model on the CUDA execution provider.
print(f"Device: {onnxruntime.get_device()}")
distilbert_onnx = create_model_for_provider(PrepConfig.DISTILBERT_ONNX_PATH, "CUDAExecutionProvider")
sentence_onnx = create_model_for_provider(PrepConfig.SENTENCE_ONNX_PATH, "CUDAExecutionProvider")
print("ONNX models loaded")
# print(distilbert_onnx.run(None, inputs_onnx)[0])
# print(mean_pooling(sentence_onnx.run(None, inputs_onnx), inputs_onnx['attention_mask']))

Device: GPU
ONNX models loaded


In [22]:
from pathlib import Path
import onnx
from onnxruntime.quantization import QuantizationMode
from transformers.convert_graph_to_onnx import quantize
import numpy as np
import torch

from config import PrepConfig

#mean pooling - take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    '''
    Average token embeddings over the sequence axis, ignoring masked positions.

    args:
        model_output: sequence whose first element is the token-embedding
            numpy array of shape (BATCH_SIZE, SEQ_LEN, HIDDEN_DIM_SIZE),
            e.g. the list returned by an ONNX session's run()
        attention_mask: (BATCH_SIZE, SEQ_LEN) numpy array or torch tensor,
            non-zero for positions to include and 0 for positions to ignore

    returns:
        numpy array of shape (BATCH_SIZE, HIDDEN_DIM_SIZE); a row whose mask
        is all zeros yields zeros (the divisor is clipped to 1e-9)
    '''
    token_embeddings = model_output[0]
    assert isinstance(token_embeddings, np.ndarray)
    if not isinstance(attention_mask, np.ndarray):
        # `torch.tensor` is a factory function and cannot be used with
        # isinstance; the class is `torch.Tensor`. Convert to numpy so the
        # arithmetic below stays in a single array library.
        assert isinstance(attention_mask, torch.Tensor)
        attention_mask = attention_mask.detach().cpu().numpy()
    # (BATCH_SIZE, SEQ_LEN, 1): broadcasts over the hidden dimension.
    mask = np.expand_dims(attention_mask, axis=-1).astype('float32')
    sum_embeddings = np.sum(token_embeddings * mask, axis=1)
    sum_mask = np.clip(np.sum(mask, axis=1), a_min=1e-9, a_max=None)
    return sum_embeddings / sum_mask

# Keep track of the inference time
# (both loops below append to this single list, one entry per iteration)
time_buffer = []

# Label used only in the progress-bar descriptions.
provider = "CUDA"

# Warm up the model
# The warm-up outputs double as the reference values for the equality
# assertions inside the timed loops.
word_embeddings = distilbert_onnx.run(None, inputs_onnx)
sentence_embeddings = mean_pooling(
    sentence_onnx.run(None, inputs_onnx), 
    attention_mask=inputs_onnx['attention_mask'])

# Compute 
# Each iteration re-runs the model on the same inputs and asserts the output
# equals the warm-up result; the comparison runs inside the timed block, so
# it is included in the recorded duration.
for _ in trange(100, desc=f"Tracking distilbert inference time on {provider}"):
    with track_infer_time(time_buffer):
        assert np.array_equal(word_embeddings, distilbert_onnx.run(None, inputs_onnx))

# Same measurement for the sentence transformer, including mean pooling.
for _ in trange(100, desc=f"Tracking sentence transformer inference time on {provider}"):
    with track_infer_time(time_buffer):
        assert np.array_equal(sentence_embeddings, mean_pooling(
            sentence_onnx.run(None, inputs_onnx), 
            attention_mask=inputs_onnx['attention_mask']
            )
        )


Tracking distilbert inference time on CUDA: 100%|██████████| 100/100 [00:00<00:00, 318.31it/s]
Tracking sentence transformer inference time on CUDA: 100%|██████████| 100/100 [00:00<00:00, 323.01it/s]


# Clean up dataset

## Prepare emoji vocab dictionary

In [11]:
import re
import os

import pandas as pd

from config import PrepConfig

# Load only the emoji glyph and its name from the raw emoji table.
df = pd.read_csv('data/emojis/emoji_df.csv', usecols=['emoji', 'name'])
original_len = len(df)
# Set of raw (uncleaned) names.
# NOTE(review): not read again in this cell — confirm whether it is still needed.
emoji_vocab = set(df['name'].to_list())

def clean_names(row):
    """Store a simplified emoji name under `row['cleaned_name']` and return `row`.

    The name is reduced by the first rule that matches `row['name']`:
      1. "flag: X" becomes "X" (flag prefix removed);
      2. "X: Y" becomes "X" (trailing detail such as a skin tone removed);
      3. otherwise the name is kept unchanged.

    `row` is modified in place and also returned, so the function can be used
    with `DataFrame.apply(..., axis=1)`.
    """
    raw_name = row['name']
    # Patterns are tried in order; capture group 1 is the part that is kept.
    for pattern in (r"flag: (.*)", r"(.*): (.*)"):
        found = re.match(pattern, raw_name)
        if found:
            cleaned = found[1]
            break
    else:
        cleaned = raw_name
    row['cleaned_name'] = cleaned
    return row

# Add `cleaned_name` to every row, then drop rows whose cleaned name duplicates
# an earlier row, renumbering the index from 0.
df = df.apply(clean_names, axis=1)\
    .drop_duplicates(subset=['cleaned_name'], ignore_index=True)

print(f"Original no. of emojis: {original_len}")
print(f"cleaned no. of emojis: {len(df)}")
# Position of the longest cleaned name (by character count), used for display.
longest_arg = df['cleaned_name'].str.len().argmax()
print(f"Longest emoji vocab: {df['emoji'].iloc[longest_arg]} {df['cleaned_name'].iloc[longest_arg]}")

# Persist the cleaned vocabulary for later cells.
df.to_csv(PrepConfig.EMOJI_VOCAB_PATH)


Original no. of emojis: 4581
cleaned no. of emojis: 1752
Longest emoji vocab: 🇬🇸 South Georgia & South Sandwich Islands


## Prepare training data

### Install packages

In [22]:
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install tables
%pip install h5py

Collecting h5py
  Downloading h5py-3.2.1-cp38-cp38-manylinux1_x86_64.whl (4.5 MB)
[K     |████████████████████████████████| 4.5 MB 8.8 MB/s 
Installing collected packages: h5py
Successfully installed h5py-3.2.1


### Process dataset

In [62]:
import math

import torch
import numpy as np
import pandas as pd
import sqlite3
from tqdm import tqdm
from config import PrepConfig
import h5py

# Turn off pandas' chained-assignment check: the processing loop in this cell
# assigns new columns to slices produced by split_dataframe.
pd.options.mode.chained_assignment = None

def body_to_embeddings(sub_df, pbar):
    """Attach word and sentence embeddings to one batch of tokenized rows.

    Args:
        sub_df: DataFrame batch with `input_ids` and `attention_mask` columns
            holding one array per row.
        pbar: progress bar, advanced by the number of rows in `sub_df`.

    Returns:
        `sub_df` itself, with `word_embeddings` and `sentence_embeddings`
        columns added (one array per row).
    """
    # Stack the per-row arrays into batch arrays for the ONNX sessions.
    token_ids = np.stack(sub_df['input_ids'].to_numpy())
    attention = np.stack(sub_df['attention_mask'].to_numpy())
    onnx_feed = {'input_ids': token_ids, 'attention_mask': attention}

    word_vectors = distilbert_onnx.run(None, onnx_feed)[0]
    pooled_vectors = mean_pooling(sentence_onnx.run(None, onnx_feed), attention)

    # list(...) splits the batch arrays back into one entry per row.
    sub_df['word_embeddings'] = list(word_vectors)
    sub_df['sentence_embeddings'] = list(pooled_vectors)
    pbar.update(len(sub_df))
    return sub_df


def split_dataframe(df, chunk_size = 1000):
    """Lazily yield consecutive slices of `df`, each holding at most `chunk_size` items.

    The last slice may be shorter; an empty `df` yields nothing.
    """
    total_chunks = math.ceil(len(df) / chunk_size)
    chunk_starts = (index * chunk_size for index in range(total_chunks))
    for start in chunk_starts:
        yield df[start:start + chunk_size]


# Read a sample of comment bodies from the Reddit dump. The connection is
# closed as soon as the query has been read, even if the query fails.
sql_conn = sqlite3.connect('data/reddit-comments/database.sqlite')
try:
    comments = pd.read_sql_query(
        "SELECT body FROM May2015 LIMIT(12000)",
        con=sql_conn)
finally:
    sql_conn.close()
# Full dataset has about 54,000,000 entries

comments = comments.drop_duplicates(subset=['body'], ignore_index=True)
# Discard non-English entries: keep a comment only when its share of ASCII
# characters reaches PrepConfig.ENGLISH_THRESHOLD.
ascii_char_count = comments['body'].str.count(r'[\u0000-\u007F]')
min_ascii_chars = comments['body'].str.len() * PrepConfig.ENGLISH_THRESHOLD
comments = comments[ascii_char_count >= min_ascii_chars]
comments = comments.reset_index(drop=True)
!rm {PrepConfig.H5DSET_PATH}  # rm existing
# Tokenize the comments chunk by chunk, embed them with the ONNX sessions and
# append the results to the 'training' or 'testing' group of the HDF5 file.
with tqdm(total=len(comments)) as pbar:
    with h5py.File(PrepConfig.H5DSET_PATH, "w") as f:
        training_group = f.create_group('training')
        training_group.attrs['max_len'] = PrepConfig.MAX_LEN
        testing_group = f.create_group('testing')
        testing_group.attrs['max_len'] = PrepConfig.MAX_LEN
        fields = ['body', 'mask', 'word', 'sentence']
        # Per split: the HDF5 group plus one dataset handle per field. The
        # handles stay None until the first chunk for that split is written.
        h5dset = {
            'train': {
                'group': training_group,
                **{field: None for field in fields}
            },
            'test': {
                'group': testing_group,
                **{field: None for field in fields}
            },
        }
        for chunk in split_dataframe(comments, PrepConfig.PD_CHUNK_SIZE):
            # Pad/truncate every comment to exactly MAX_LEN tokens.
            encodings = tokenizer(
                chunk['body'].to_list(), 
                max_length=PrepConfig.MAX_LEN,
                padding='max_length', 
                truncation=True, 
                return_tensors='np')
            chunk['input_ids'] = list(encodings.input_ids)
            chunk['attention_mask'] = list(encodings.attention_mask)
            # Group rows into consecutive batches of BATCH_SIZE and embed each
            # batch; body_to_embeddings also advances the progress bar.
            chunk = chunk.groupby(np.arange(len(chunk)) // PrepConfig.BATCH_SIZE)\
                .apply(lambda x: body_to_embeddings(x, pbar)).reset_index(drop=True)
            data = {
                'body': chunk['body'].to_numpy(),
                'mask': np.stack(chunk['attention_mask'].to_numpy()),
                'word': np.stack(chunk['word_embeddings'].to_numpy()),
                'sentence': np.stack(chunk['sentence_embeddings'].to_numpy())
            }
            # Every field must contribute the same number of rows.
            lengths = [len(data[key]) for key in data]
            assert all([x == lengths[0] for x in lengths])
            # pbar.n already includes the current chunk's rows, so whole chunks
            # are routed to one split or the other.
            # NOTE(review): if PD_CHUNK_SIZE >= len(comments) every row lands in
            # 'test' and 'training' stays empty — confirm this cannot happen.
            if pbar.n < len(comments) * (1-PrepConfig.TESTING_SPLIT):
                output_target = h5dset['train']
            else:
                output_target = h5dset['test']
            if output_target['word'] is None:
                # First chunk for this split: create resizable datasets
                # (unbounded along axis 0) seeded with this chunk's data.
                assert all([output_target[field] is None for field in fields])
                for field in fields:
                    output_target[field] = output_target['group']\
                        .create_dataset(
                            field,
                            data=data[field],
                            maxshape=(None,)+data[field].shape[1:]
                            )
            else:
                # Later chunks: grow each dataset along axis 0 and write the
                # new rows into the freshly added tail.
                assert all([output_target[field] is not None for field in fields])
                for field in fields:
                    output_target[field].resize(
                        output_target[field].shape[0] + data[field].shape[0],
                        axis=0
                    )
                    output_target[field][-data[field].shape[0]:] = data[field]


100%|██████████| 11/11 [00:00<00:00, 65.95it/s]


### View processed dataset

In [63]:
import random

# Print the datasets of both groups, three random 'training' bodies and one
# sentence embedding from the HDF5 file.
# NOTE(review): this opens H5DSET_TEST_PATH, while the processing cell writes
# to H5DSET_PATH — confirm which file is meant to be inspected here.
with h5py.File(PrepConfig.H5DSET_TEST_PATH, 'r') as f:
    print("Training data:")
    for k in f['training']:
        print(repr(f['training'][k]))
    print("\nTesting data:")
    for k in f['testing']:
        print(repr(f['testing'][k]))
    print("\nText samples:")
    # random.sample raises ValueError if 'training' holds fewer than 3 rows.
    for i in random.sample(range(f['training']['body'].shape[0]), 3):
        print(f"{i}. {f['training']['body'][i]}\n")
    # Abbreviate the printed array once it exceeds 100 elements.
    with np.printoptions(threshold=100):
        print(repr(f['training']['sentence'][1]))


Training data:
<HDF5 dataset "body": shape (10,), type "|O">
<HDF5 dataset "mask": shape (10, 64), type "<i8">
<HDF5 dataset "sentence": shape (10, 768), type "<f4">
<HDF5 dataset "word": shape (10, 64, 768), type "<f4">

Testing data:
<HDF5 dataset "body": shape (1,), type "|O">
<HDF5 dataset "mask": shape (1, 64), type "<i8">
<HDF5 dataset "sentence": shape (1, 768), type "<f4">
<HDF5 dataset "word": shape (1, 64, 768), type "<f4">

Text samples:
8. b'eagle'

5. b'flying disc'

4. b'dvd'

array([-0.5917937 ,  0.12636021,  0.74226815, ..., -0.16075838,
        0.36404175,  1.2876865 ], dtype=float32)


# Training

## Define model class

In [4]:
from typing import List
import gc
import logging

from transformers import DistilBertConfig, DistilBertTokenizerFast
import torch
import numpy as np

from config import TrainConfig

class Text2EmojiModel(torch.nn.Module):
    '''
    Transformer head that turns DistilBERT word embeddings into per-position
    logits over an emoji vocabulary, plus helpers that sample emoji sequences
    from those logits and embed the sampled emoji names with the ONNX
    sentence transformer.
    '''
    EOS_TOKEN = '[EOS]'
    # Shared by all instances and created once, when the class body runs.
    # NOTE(review): `PrepConfig` and `create_model_for_provider` are not
    # imported in this cell; they must already be defined by earlier cells.
    sentence_onnx = create_model_for_provider(PrepConfig.SENTENCE_ONNX_PATH, "CUDAExecutionProvider")
    tokenizer = DistilBertTokenizerFast.from_pretrained(TrainConfig.DISTILBERT_NAME)


    def __init__(self, emoji_vocab:list, device:torch.device):
        '''
        args:
            emoji_vocab: emoji names; EOS_TOKEN is appended as the last class
            device: device on which sentence embeddings are returned by
                embeddings_to_emojis_to_sentence_embeddings
        '''
        super().__init__()
        self.emoji_vocab = list(emoji_vocab) + [self.EOS_TOKEN]
        # A default DistilBertConfig supplies the hidden size and head count.
        self.config = DistilBertConfig()
        self.encoder_head = torch.nn.TransformerEncoder(
            encoder_layer=torch.nn.TransformerEncoderLayer(
                d_model=self.config.dim,
                nhead=self.config.n_heads,
            ),
            num_layers=1,
        )
        self.classification = torch.nn.Linear(self.config.dim, len(self.emoji_vocab))
        self.device = device


    def forward(self, x, mask):
        '''
        args:
            x: distilbert embeddings of shape (SEQ_LEN, BATCH_SIZE, HIDDEN_DIM)
            mask: Huggingface attention mask of shape (BATCH_SIZE, SEQ_LEN),
                1 for real tokens and 0 for padding

        returns:
            token classification logits of shape
            (SEQ_LEN + PAD_LEN, BATCH_SIZE, EMOJI_VOCAB_SIZE), where PAD_LEN is
            MAX_EMOJI_SENTENCE_LEN - SEQ_LEN limited to the range [0, SEQ_LEN]
        '''
        # Huggingface masks mark the tokens to attend to with 1, whereas
        # `src_key_padding_mask` marks the positions to IGNORE with True,
        # hence the inversion.
        padding_mask = ~mask.bool()
        _x = self.encoder_head(
            src=x,
            src_key_padding_mask=padding_mask,
        )
        # _x: (SEQ_LEN, BATCH_SIZE, HIDDEN_DIM)
        seq_len = _x.shape[0]
        # Clamp at 0: a sequence longer than MAX_EMOJI_SENTENCE_LEN would
        # otherwise request a tensor with a negative dimension and crash.
        pad_len = max(0, min(TrainConfig.MAX_EMOJI_SENTENCE_LEN - seq_len, seq_len))
        # Constant padding on the same device/dtype as the activations; it is
        # not a parameter, so it does not need to track gradients.
        zeros = _x.new_zeros((pad_len, _x.shape[1], _x.shape[2]))
        _x = torch.cat((_x, zeros), dim=0)
        # (SEQ_LEN + PAD_LEN, BATCH_SIZE, EMOJI_VOCAB_SIZE)
        return self.classification(_x)


    def embedding_to_emojis(self, x, debug=False) -> tuple:
        '''
        Sample one emoji sequence from the logits of a single batch element.

        args:
            x: one of self.forward outputs of shape (OUT_LEN, EMOJI_VOCAB_SIZE)
            debug: when True, log the sample and every decoded token

        returns:
            (tokens, log_prob)
            tokens: list[str] of the emoji names sampled before the first
                EOS_TOKEN (length <= OUT_LEN)
            log_prob: tensor of shape (1,), the summed log-probability of the
                sampled ids up to and including the first EOS_TOKEN, or of all
                positions when no EOS_TOKEN was sampled
        '''
        output = []
        m = torch.distributions.categorical.Categorical(logits=x, validate_args=True)
        s = m.sample()
        log_prob = m.log_prob(s)
        # One transfer to host instead of one per indexed element.
        token_ids = s.tolist()
        last = len(token_ids)
        if debug:
            logging.info("embedding_to_emojis: ")
            logging.info(f"sample: {s}")
            # NOTE(review): reseeds the global torch RNG after sampling, so
            # debug mode changes all later sampling — confirm this is intended.
            torch.manual_seed(0)
        for i, token_id in enumerate(token_ids):
            token = self.emoji_vocab[token_id]
            if debug:
                logging.info(f"token: {token}, token_id: {token_id}")
            if token == self.EOS_TOKEN:
                last = i+1
                break
            output.append(token)
        if debug:
            logging.info(f"last: {last}")
        return (output, torch.unsqueeze(torch.sum(log_prob[:last]),-1))


    def embeddings_to_emojis(self, x, debug=False) -> tuple:
        '''
        Sample one emoji sequence per batch element.

        args:
            x: self.forward outputs of shape (OUT_LEN, BATCH_SIZE, EMOJI_VOCAB_SIZE)

        returns:
            (emojis, log_probs)
            emojis: list[list[str]] of shape (BATCH_SIZE, <=OUT_LEN)
            log_probs: tensor of shape (BATCH_SIZE,), or None for an empty batch
        '''
        emojis = []
        log_probs = []
        if debug:
            logging.info("embeddings_to_emojis: ")
            logging.info(f"x shape: {x.shape}")
        for i in range(x.shape[1]):
            emoji, log_prob = self.embedding_to_emojis(x[:, i, :], debug=debug)
            if debug:
                logging.info(f"log_prob shape: {log_prob.shape}")
            emojis.append(emoji)
            log_probs.append(log_prob)
        # Concatenate once instead of growing a tensor inside the loop.
        return emojis, (torch.cat(log_probs) if log_probs else None)


    @classmethod
    def emojis_to_sentence_embedding(cls, x, debug=False):
        '''
        Embed sampled emoji names with the ONNX sentence transformer.

        args:
            x: embeddings_to_emojis output, list[list[str]] of emoji names of
                shape (BATCH_SIZE, <=OUT_LEN)

        returns:
            numpy array of shape (BATCH_SIZE, SENTENCE_EMBEDDING_DIM)
        '''
        # Each sample becomes one space-separated "sentence" of emoji names.
        sentences = [" ".join(y) for y in x]
        inputs = cls.tokenizer(
            sentences,
            padding=True,
            truncation=True,
            return_tensors='np'
            )
        if debug:
            logging.info("emojis_to_sentence_embeddings: ")
            # The tokenizer output is a mapping of arrays and has no `.shape`
            # of its own, so report the shape of the token ids.
            logging.info(f"inputs shape: {inputs['input_ids'].shape}")
        _x = cls.sentence_onnx.run(None, dict(inputs))
        return cls.mean_pooling(_x, inputs['attention_mask'])


    def embeddings_to_emojis_to_sentence_embeddings(self, x, debug=False):
        '''
        Sample emojis from self.forward outputs and embed them as sentences.

        returns:
            (sentence_embeddings, log_prob, emojis)
            sentence_embeddings: float tensor on self.device of shape
                (BATCH_SIZE, SENTENCE_EMBEDDING_DIM)
            log_prob, emojis: as returned by embeddings_to_emojis
        '''
        emojis, log_prob = self.embeddings_to_emojis(x, debug=debug)
        _x = self.emojis_to_sentence_embedding(emojis, debug=debug)
        return torch.from_numpy(_x).float().to(self.device), log_prob, emojis


    @staticmethod
    def mean_pooling(x, attention_mask):
        '''
        args:
            x: sentence transformer outputs; x[0] is the last-layer numpy array
                of shape (BATCH_SIZE, SEQ_LEN, HIDDEN_DIM_SIZE)
            attention_mask: (BATCH_SIZE, SEQ_LEN) numpy array or torch tensor

        returns:
            numpy array of shape (BATCH_SIZE, HIDDEN_DIM_SIZE): mean of the
            last layer outputs across SEQ_LEN, taking attention mask into account
        '''
        token_embeddings = x[0]
        assert isinstance(token_embeddings, np.ndarray)
        if not isinstance(attention_mask, np.ndarray):
            # `torch.tensor` is a factory function and cannot be used with
            # isinstance; the class is `torch.Tensor`. Convert to numpy so
            # the arithmetic below stays in a single array library.
            assert isinstance(attention_mask, torch.Tensor)
            attention_mask = attention_mask.detach().cpu().numpy()
        # (BATCH_SIZE, SEQ_LEN, 1): broadcasts over the hidden dimension.
        input_mask_expanded = np.expand_dims(attention_mask, axis=-1).astype('float32')
        sum_embeddings = np.sum(token_embeddings * input_mask_expanded, axis=1)
        # Clip the divisor so an all-zero mask row yields zeros, not NaN.
        sum_mask = np.clip(np.sum(input_mask_expanded, axis=1), a_max=None, a_min=1e-9)
        return sum_embeddings / sum_mask
    

# Disabled smoke test: flip the condition to True to push random inputs
# through the model and print the resulting shapes.
if False:
    dummy_vocab = ['a', 'b', 'c']
    dummy_input = torch.rand(64, 3, 768)
    # First half of every sequence is attended to, second half is padding.
    dummy_mask = torch.cat((torch.ones(3, 32), torch.zeros(3, 32)), dim=1)
    model = Text2EmojiModel(dummy_vocab, 'cpu')
    param = ''
    for name, _ in model.named_parameters():
        if name != param:
            param = name
            print(name)
    output = model(dummy_input, dummy_mask)
    print(output.shape)
    # The method returns three values (embeddings, log-probs, emojis).
    embed, log_prob, emojis = model.embeddings_to_emojis_to_sentence_embeddings(output)
    print(embed.shape)
    print(log_prob.shape)
    del dummy_input, dummy_vocab, dummy_mask, model, param, output, embed, log_prob, emojis

gc.collect()

31

In [6]:
import torch

# Scratch experiment: sample from a batch of categorical distributions and
# build a length mask for the per-position log-probabilities.
# `dist` has shape (4, 3, 2): 4 x 3 distributions over 2 classes.
# NOTE(review): the rows look like probabilities but are passed as `logits=`
# below — confirm that is intended.
dist = torch.tensor([
    [
        [0.5, 0.5],
        [0.0, 1.0],
        [0.3, 0.7],
    ],
    [
        [0.5, 0.5],
        [1.0, 0.0],
        [0.3, 0.7],
    ],
    [
        [0.5, 0.5],
        [0.0, 1.0],
        [0.3, 0.7],
    ],
    [
        [0.5, 0.5],
        [1.0, 0.0],
        [0.3, 0.7],
    ]
])

print(dist.size())
m = torch.distributions.Categorical(logits=dist)
# Two independent draws: `s` has shape (2, 4, 3).
s = m.sample((2,))
logprob = m.log_prob(s)
print(s.shape)
# Lengths, one per (draw, row) pair; positions at or past the length are
# masked out below.
lens = torch.tensor([
    [[1,1,1,1],
    [2,2,2,2]]
])
# Position indices 0..s.shape[-1]-1 repeated to the shape of `s`.
print(torch.arange(s.shape[-1]).view(1, s.shape[-1])\
    .repeat(s.shape[0], s.shape[1], 1).shape)
# True where the position index is smaller than the corresponding length.
mask = torch.arange(s.shape[-1]).view(1, s.shape[-1]).repeat(s.shape[0], s.shape[1], 1) < lens.view(s.shape[0], s.shape[1], 1)
# print((logprob*mask).sum(-1))
# print(f"Sample: {s}")
# # print(f"Sample size: {s.size()}")
# print(f"Mask: {mask}")
# print(f"Log prob: {logprob}")
# print(f"Masked log prob: {mask*logprob}")
# Demonstrates repeat + transpose: each row of the (2, 5) range is shown 3 times.
print(torch.arange(10).view(2,5).repeat(3, 1, 1).transpose(0, 1))

torch.Size([4, 3, 2])
torch.Size([2, 4, 3])
torch.Size([2, 4, 3])
tensor([[[0, 1, 2, 3, 4],
         [0, 1, 2, 3, 4],
         [0, 1, 2, 3, 4]],

        [[5, 6, 7, 8, 9],
         [5, 6, 7, 8, 9],
         [5, 6, 7, 8, 9]]])


## Dataset for training

In [7]:
import torch
import h5py

class _H5EmbeddingDataSet(torch.utils.data.Dataset):
    '''
    Shared implementation of the HDF5-backed datasets below: reads the
    'body', 'mask', 'word' and 'sentence' datasets of one group of a file.
    '''

    def __init__(self, path, group, max_samples=None):
        '''
        args:
            path: HDF5 file to open read-only
            group: name of the group inside the file to read from
            max_samples: upper bound on the reported length; None exposes
                every stored row
        '''
        # Keep the file handle so it can be released with close().
        self._file = h5py.File(path, 'r')
        self.h5dest = self._file[group]
        self.max_samples = max_samples

    def close(self):
        '''Close the underlying HDF5 file; the dataset is unusable afterwards.'''
        self._file.close()

    def __len__(self):
        available = self.h5dest['word'].shape[0]
        if self.max_samples is None:
            return available
        # Never report more rows than the file holds, so indices drawn from
        # range(len(self)) are always valid.
        return min(self.max_samples, available)

    def __getitem__(self, i):
        '''
        returns:
            (body, mask, word_embedding, sentence_embedding)
        '''
        # Samples are inputs/targets, not parameters, so they are not
        # flagged as requiring gradients.
        return (
            self.h5dest['body'][i],
            torch.tensor(self.h5dest['mask'][i], dtype=torch.float32),
            torch.tensor(self.h5dest['word'][i], dtype=torch.float32),
            torch.tensor(self.h5dest['sentence'][i], dtype=torch.float32)
        )


class Text2EmojiDataSet(_H5EmbeddingDataSet):
    '''
    'training' group of PrepConfig.H5DSET_PATH.

    The length stays capped at 100 samples by default (the value that was
    previously hard-coded); pass max_samples=None to use every stored row.
    '''

    def __init__(self, max_samples=100):
        super().__init__(PrepConfig.H5DSET_PATH, 'training', max_samples)


class TestDataSet(_H5EmbeddingDataSet):
    '''
    'training' group of PrepConfig.H5DSET_TEST_PATH, uncapped by default.

    NOTE(review): this reads the 'training' group of the test file, not a
    'testing' group — confirm that is the intended group.
    '''

    def __init__(self, max_samples=None):
        super().__init__(PrepConfig.H5DSET_TEST_PATH, 'training', max_samples)

## Training loop

In [None]:
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install tensorboard

In [11]:
import traceback
import math

import torch
from torch.autograd import Variable
from torch.utils.tensorboard import SummaryWriter
import pandas as pd
from tqdm import tqdm

from config import TrainConfig

!rm -rf runs/*

logging.root.setLevel(logging.NOTSET)
torch.backends.cuda.matmul.allow_tf32 = True
torch.set_printoptions(precision=None, threshold=100, edgeitems=None, linewidth=None, profile=None, sci_mode=None)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
writer = SummaryWriter()

# Set up dataset
dset = Text2EmojiDataSet()
# Derive the train size from the validation size so the two always add up to
# len(dset), even when float rounding makes ceil/floor disagree.
val_size = math.floor(len(dset) * TrainConfig.VAL_SPLIT)
train_size = len(dset) - val_size
train_set, val_set = torch.utils.data.random_split(
    dset,
    lengths=[train_size, val_size]
    )
train_loader = torch.utils.data.DataLoader(
    train_set,
    batch_size=TrainConfig.BATCH_SIZE,
    shuffle=True
)
val_loader = torch.utils.data.DataLoader(
    val_set,
    batch_size=TrainConfig.BATCH_SIZE,
    shuffle=True
)

print(f"Train dataset size: {len(train_loader.dataset)}")
print(f"Validation dataset size: {len(val_loader.dataset)}")

# Setup model
emoji_vocab = pd.read_csv(TrainConfig.EMOJI_VOCAB_PATH, usecols=['cleaned_name'])['cleaned_name'].to_list()
model = Text2EmojiModel(emoji_vocab, device).to(device)

optimizer = torch.optim.SGD(
    model.parameters(),
    lr=TrainConfig.LEARNING_RATE,
    momentum=0.95)

scheduler = torch.optim.lr_scheduler.StepLR(
    optimizer,
    step_size=TrainConfig.SCHEDULER_STEP_SIZE,
    gamma=TrainConfig.SCHEDULER_GAMMA)

# Policy-gradient training: the model's logits are sampled into emoji
# sequences, the sequences are rewarded for matching the target sentence
# embedding and for being short, and the log-probability of each sample is
# weighted by (reward - running average reward).
try:
    print('Training...', flush=True)
    print('Please view on tensorboard...', flush=True)
    # Recent mean batch rewards; their average is the baseline.
    rewards_history = []
    for epoch in tqdm(range(TrainConfig.EPOCHS)):

        writer.add_scalar('learning rate',
                        scheduler.get_last_lr()[0],
                        epoch * len(train_loader))

        running_loss = .0
        running_sentence_reward = .0
        running_length_reward = .0
        emoji_length = .0
        for i, data in enumerate(train_loader, 0):
            global_step = epoch * len(train_loader) + i
            body, mask, x, y = data
            mask, x, y = mask.to(device), x.to(device), y.to(device)
            # The model expects (SEQ_LEN, BATCH_SIZE, HIDDEN_DIM).
            x = torch.transpose(x, 0, 1)

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = model(x, mask)
            y_pred, log_prob, emojis = model.embeddings_to_emojis_to_sentence_embeddings(outputs)
            sentence_reward = -torch.mean((y_pred - y) ** 2, -1)
            length_reward = -(torch.tensor(list(map(len, emojis)), device=device) ** 2) * TrainConfig.SHORT_EMOJI_REWARD
            reward = sentence_reward + length_reward
            expected_reward = np.average(rewards_history) if len(rewards_history) > 0 else 0
            # The reward is a constant weight on the log-probabilities, so it
            # is detached from the graph.
            advantage = (reward - expected_reward).detach()

            loss = torch.mean(-log_prob * advantage)
            loss.backward()
            optimizer.step()

            rewards_history.append(torch.mean(reward).item())
            rewards_history = rewards_history[-TrainConfig.REWARD_HISTORY_LEN:]
            running_loss += loss.item()
            running_sentence_reward += torch.mean(sentence_reward.float()).item()
            running_length_reward += torch.mean(length_reward.float()).item()
            emoji_length += sum(map(len, emojis))/len(emojis)
            if i % TrainConfig.BATCH_PER_LOG == TrainConfig.BATCH_PER_LOG - 1:
                # Log the running averages over the last BATCH_PER_LOG batches
                mean_loss = running_loss / TrainConfig.BATCH_PER_LOG
                writer.add_scalar('Train/loss',
                                mean_loss,
                                global_step)
                writer.add_scalar('Train/emoji length',
                                emoji_length / TrainConfig.BATCH_PER_LOG,
                                global_step)
                writer.add_scalar('Train/expected reward',
                                np.average(rewards_history),
                                global_step)
                writer.add_scalar('Train/sentence reward',
                                running_sentence_reward / TrainConfig.BATCH_PER_LOG,
                                global_step)
                writer.add_scalar('Train/length reward',
                                running_length_reward / TrainConfig.BATCH_PER_LOG,
                                global_step)
                if mean_loss < 500:
                    writer.add_text('low loss examples',
                                    " ".join(emojis[0]),
                                    global_step)

                # Validation pass. eval() switches dropout off so the metrics
                # are not perturbed by training-time noise; train() restores
                # the training mode afterwards.
                model.eval()
                with torch.no_grad():
                    val_loss = .0
                    val_emoji_total = 0
                    val_sample_total = 0
                    for body_val, mask_val, x_val, y_val in val_loader:
                        mask_val = mask_val.to(device)
                        x_val = x_val.to(device)
                        y_val = y_val.to(device)
                        x_val = torch.transpose(x_val, 0, 1)
                        outputs_val = model(x_val, mask_val)
                        y_pred_val, log_prob_val, emojis_val = model.embeddings_to_emojis_to_sentence_embeddings(outputs_val, debug=False)
                        reward_val = -torch.mean((y_pred_val - y_val) ** 2, -1)
                        # NOTE(review): unlike the training loss this has no
                        # leading minus, no baseline and no length reward —
                        # confirm the validation metric is meant to differ.
                        loss_val = torch.mean(log_prob_val * (reward_val))
                        val_loss += loss_val.item()
                        # Accumulate over every validation batch, not just the last.
                        val_emoji_total += sum(map(len, emojis_val))
                        val_sample_total += len(emojis_val)
                model.train()
                # Skip validation logging when the validation split is empty
                # instead of dividing by zero.
                if len(val_loader) > 0 and val_sample_total > 0:
                    writer.add_scalar(
                        'Validation/loss',
                        val_loss / len(val_loader),
                        global_step)
                    # Mean number of emojis per validation sample.
                    writer.add_scalar(
                        'Validation/emoji length',
                        val_emoji_total / val_sample_total,
                        global_step)
                running_loss = .0
                running_sentence_reward = .0
                running_length_reward = .0
                emoji_length = .0

        scheduler.step()
    print("Done!")
except Exception as e:
    # Report the failure but keep the notebook running.
    print(e)
    traceback.print_exc()
finally:
    # Make sure buffered TensorBoard events reach disk, then free memory.
    writer.flush()
    gc.collect()


Train dataset size: 90
Validation dataset size: 10
Training...
Please view on tensorboard...


100%|██████████| 100/100 [02:14<00:00,  1.35s/it]

Done!



