# MobileNetV2

In [1]:
"""
Creates a MobileNetV2 model as defined in the paper: M. Sandler, 
A. Howard, M. Zhu, A. Zhmoginov, L.-C. Chen. "MobileNetV2: Inverted 
Residuals and Linear Bottlenecks", arXiv:1801.04381, 2018.

Code reference: https://github.com/tonylins/pytorch-mobilenet-v2
ImageNet pretrained weights: https://drive.google.com/file/d/1jlto6HRVD3ipNkAl1lNhDbkBp7HylaqR
"""
import math
import torch
import torch.nn as nn



def conv_bn(inp, oup, stride):
    """
    Build a 3x3 convolution -> batch norm -> ReLU6 stack.

    Parameters
    ----------
    inp: int, number of input channels.
    oup: int, number of output channels.
    stride: int, stride of the 3x3 convolution (padding is fixed to 1).

    Returns
    -------
    nn.Sequential holding the three layers in that order.
    """
    conv = nn.Conv2d(inp, oup, kernel_size=3, stride=stride, padding=1, bias=False)
    norm = nn.BatchNorm2d(oup)
    activation = nn.ReLU6(inplace=True)
    return nn.Sequential(conv, norm, activation)


def conv_1x1_bn(inp, oup):
    """
    Build a 1x1 (pointwise) convolution -> batch norm -> ReLU6 stack.

    Parameters
    ----------
    inp: int, number of input channels.
    oup: int, number of output channels.

    Returns
    -------
    nn.Sequential holding the three layers in that order.
    """
    pointwise = nn.Conv2d(inp, oup, kernel_size=1, stride=1, padding=0, bias=False)
    norm = nn.BatchNorm2d(oup)
    activation = nn.ReLU6(inplace=True)
    return nn.Sequential(pointwise, norm, activation)


class InvertedResidual(nn.Module):
    """
    Inverted residual block: optional 1x1 expansion, 3x3 depthwise
    convolution, then a linear (no activation) 1x1 projection.

    A skip connection is added around the block only when the stride is 1
    and the input and output channel counts match.
    """
    def __init__(self, inp, oup, stride, expand_ratio):
        """
        Parameters
        ----------
        inp: int, number of input channels.
        oup: int, number of output channels.
        stride: int, stride of the depthwise convolution (1 or 2).
        expand_ratio: number, channel expansion factor for the hidden layer.
        """
        super(InvertedResidual, self).__init__()
        self.stride = stride
        assert stride in [1, 2]

        hidden_dim = round(inp * expand_ratio)
        self.use_res_connect = self.stride == 1 and inp == oup

        layers = []
        if expand_ratio != 1:
            # pw: expand to hidden_dim (skipped when there is no expansion)
            layers += [
                nn.Conv2d(inp, hidden_dim, 1, 1, 0, bias=False),
                nn.BatchNorm2d(hidden_dim),
                nn.ReLU6(inplace=True),
            ]
        layers += [
            # dw: one filter per channel (groups == channels)
            nn.Conv2d(hidden_dim, hidden_dim, 3, stride, 1, groups=hidden_dim, bias=False),
            nn.BatchNorm2d(hidden_dim),
            nn.ReLU6(inplace=True),
            # pw-linear: project to oup without a non-linearity
            nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False),
            nn.BatchNorm2d(oup),
        ]
        # Layer order fixes the "conv.<index>" names used in state dicts
        self.conv = nn.Sequential(*layers)

    def forward(self, x):
        """
        Apply the block; adds the input back when use_res_connect is set.
        """
        out = self.conv(x)
        return x + out if self.use_res_connect else out


class MobileNetV2(nn.Module):
    """
    MobileNetV2 classifier: a stem convolution, a stack of inverted
    residual blocks, a final 1x1 convolution, global average pooling and
    a dropout + linear classifier.
    """
    def __init__(self, n_class=1000, input_size=224, width_mult=1.):
        """
        Parameters
        ----------
        n_class: int, number of output classes.
        input_size: int, expected input resolution; must be a multiple of 32.
        width_mult: float, multiplier applied to the channel counts.
        """
        super(MobileNetV2, self).__init__()
        # Each row: t (expand ratio), c (output channels),
        # n (number of repeats), s (stride of the first repeat)
        stage_settings = [
            [1, 16, 1, 1],
            [6, 24, 2, 2],
            [6, 32, 3, 2],
            [6, 64, 4, 2],
            [6, 96, 3, 1],
            [6, 160, 3, 2],
            [6, 320, 1, 1],
        ]

        assert input_size % 32 == 0
        in_channels = int(32 * width_mult)
        # The final width is only scaled up, never down
        if width_mult > 1.0:
            self.last_channel = int(1280 * width_mult)
        else:
            self.last_channel = 1280

        # Stem convolution
        layers = [conv_bn(3, in_channels, 2)]
        # Inverted residual stages: only the first repeat uses stride s
        for t, c, n, s in stage_settings:
            out_channels = int(c * width_mult)
            for repeat in range(n):
                block_stride = s if repeat == 0 else 1
                layers.append(InvertedResidual(in_channels, out_channels,
                                               block_stride, expand_ratio=t))
                in_channels = out_channels
        # Final 1x1 convolution
        layers.append(conv_1x1_bn(in_channels, self.last_channel))
        self.features = nn.Sequential(*layers)

        # Classifier head
        self.classifier = nn.Sequential(
            nn.Dropout(0.2),
            nn.Linear(self.last_channel, n_class),
        )

        self._initialize_weights()

    def forward(self, x):
        """
        Run the feature extractor, average over both spatial axes and
        return the classifier scores.
        """
        feature_map = self.features(x)
        pooled = feature_map.mean(3).mean(2)
        return self.classifier(pooled)

    def _initialize_weights(self):
        """
        Initialise convolutions with a normal of std sqrt(2 / (k*k*out)),
        batch norms with weight 1 / bias 0, and linear layers with a
        normal of std 0.01 and zero bias.
        """
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                fan = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2. / fan))
                if m.bias is not None:
                    m.bias.data.zero_()
            elif isinstance(m, nn.BatchNorm2d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()
            elif isinstance(m, nn.Linear):
                m.weight.data.normal_(0, 0.01)
                m.bias.data.zero_()
                
def MobileNet(pretrained=True, **kwargs):
    """
    Constructs a MobileNet V2 model.

    Parameters
    ----------
    pretrained: bool, use ImageNet pretrained model or not.
    n_class: int, 1000 classes in ImageNet data.
    weight_file: str, path to pretrained weights (required when
        pretrained is True).

    Any other keyword argument is forwarded to MobileNetV2.

    Returns
    -------
    MobileNetV2 instance, with weights loaded from weight_file when
    pretrained is True.

    Raises
    ------
    FileNotFoundError: if pretrained is True and no weight_file is given
        (or the file does not exist).
    """
    weight_file = kwargs.pop('weight_file', '')
    if pretrained and not weight_file:
        # Fail early with a clear message instead of letting torch.load('') fail
        raise FileNotFoundError(
            "pretrained=True requires 'weight_file' to point to the pretrained weights")

    model = MobileNetV2(**kwargs)
    if pretrained:
        # The model is still on the CPU here; map_location lets checkpoints
        # saved from a GPU load on machines without that device.
        state_dict = torch.load(weight_file, map_location='cpu')
        model.load_state_dict(state_dict)
    return model

# Encoder CNN

In [2]:
class EncoderCNN(nn.Module):
    """
    Image encoder: a MobileNetV2 with its classification layer removed,
    followed by adaptive average pooling to a fixed spatial size.
    """
    def __init__(self, weight_file, feature_size=14, tune_layer=None, finetune=False):
        """
        Parameters
        ----------
        weight_file: str, path to MobileNetV2 pretrained weights.
        feature_size: int, encoded feature map size to be used.
        tune_layer: int, tune layers from this layer onwards. For
            MobileNetV2 select integer from 0 (early) to 18 (final)
        finetune: bool, fine tune layers
        """
        super(EncoderCNN, self).__init__()
        self.weight_file = weight_file
        self.feature_size = feature_size
        self.tune_layer = tune_layer
        self.finetune = finetune
        self.pretrained = True  # weights are always loaded from weight_file

        # Build the backbone and keep every child except the last one
        # (the classifier)
        backbone = MobileNet(pretrained=self.pretrained, weight_file=self.weight_file)
        kept_children = list(backbone.children())[:-1]
        self.cnn = nn.Sequential(*kept_children)

        # Pool to a fixed grid so input images may vary in size
        pooled_size = (self.feature_size, self.feature_size)
        self.adaptive_pool = nn.AdaptiveAvgPool2d(pooled_size)

        # Freeze / unfreeze backbone parameters
        self.fine_tune()

    def forward(self, images):
        """
        Parameters
        ----------
        images: PyTorch tensor, size: [M, 3, H, W]

        Returns
        -------
        PyTorch tensor, size: [M, fs, fs, C] (channels moved last), where
        fs is feature_size and C is the backbone's output channel count.
        """
        pooled = self.adaptive_pool(self.cnn(images))  # size: [M, C, fs, fs]
        return pooled.permute(0, 2, 3, 1)  # size: [M, fs, fs, C]

    def fine_tune(self):
        """
        Freeze every backbone parameter, then set requires_grad to
        self.finetune for the layers from tune_layer onwards.
        """
        for param in self.cnn.parameters():
            param.requires_grad = False

        # First child of self.cnn is the backbone's feature stack; a
        # tune_layer of None slices the whole stack
        feature_stack = list(self.cnn.children())[0]
        for layer in feature_stack[self.tune_layer:]:
            for param in layer.parameters():
                param.requires_grad = self.finetune

# Attention Mechanism

In [3]:
class AttentionMechanism(nn.Module):
    """
    Additive (MLP) attention over encoder feature-map locations,
    conditioned on the decoder's hidden state.
    """
    def __init__(self, encoder_size, decoder_size, attention_size):
        """
        Parameters
        ----------
        encoder_size: int, number of channels in encoder CNN output feature
            map (for MobileNetV2 it is 1280)
        decoder_size: int, number of features in the hidden state, i.e. LSTM
            output size
        attention_size: int, size of MLP used to compute attention scores
        """
        super(AttentionMechanism, self).__init__()
        self.encoder_size = encoder_size
        self.decoder_size = decoder_size
        self.attention_size = attention_size

        # Projects encoded features to the attention size
        self.encoder_attn = nn.Linear(in_features=self.encoder_size,
                                      out_features=self.attention_size)

        # Projects the decoder hidden state to the attention size
        self.decoder_attn = nn.Linear(in_features=self.decoder_size,
                                      out_features=self.attention_size)

        # Non-linearity applied to the summed projections
        self.relu = nn.ReLU()

        # Maps each location's attention features to a single score
        self.fc_attn = nn.Linear(in_features=self.attention_size, out_features=1)

        # Normalises scores over the location axis
        self.softmax = nn.Softmax(dim=1)

    def forward(self, encoder_out, decoder_out):
        """
        Parameters
        ----------
        encoder_out: PyTorch tensor, size: [M, L, D] where, L is feature
            map locations, and D is channels of encoded CNN feature map.
        decoder_out: PyTorch tensor, size: [M, h], where h is hidden
            dimension of the previous step output from decoder

        NOTE: M is batch size. k is attention size (see comments)

        Returns
        -------
        attn_weighted_encoding: PyTorch tensor, size: [M, D], attention weighted
            annotation vector
        alpha: PyTorch tensor, size: [M, L], attention weights
        """
        projected_enc = self.encoder_attn(encoder_out)  # size: [M, L, k]
        projected_dec = self.decoder_attn(decoder_out)  # size: [M, k]

        # Broadcast the decoder projection over all L locations
        combined = self.relu(projected_enc + projected_dec[:, None, :])  # size: [M, L, k]

        # One score per location (Paper eq: 4)
        attn_scores = self.fc_attn(combined)  # size: [M, L, 1]

        # Probability of focusing on each location (Paper eq: 5)
        alpha = self.softmax(attn_scores.squeeze(2))  # size: [M, L]

        # Attention weighted annotation vector (Paper eq: 6)
        weighted = encoder_out * alpha.unsqueeze(2)  # size: [M, L, D]
        attn_weighted_encoding = weighted.sum(dim=1)  # size: [M, D]

        return attn_weighted_encoding, alpha

# Decoder RNN w/ Attention

In [4]:
class DecoderAttentionRNN(nn.Module):
    """
    RNN (LSTM) decoder to decode encoded images and generate sequences.

    At every time step the decoder attends over the encoder locations,
    gates the resulting context vector, concatenates it with the current
    word embedding and feeds it to an LSTM cell.
    """
    def __init__(self, encoder_size, decoder_size, attention_size, embedding_size, vocab_size, dropout=0.5):
        """
        Parameters
        ----------
        encoder_size: int, number of channels in encoder CNN output feature
            map (for MobileNetV2 it is 1280)
        decoder_size: int, number of features in the hidden state, i.e. LSTM
            output size
        attention_size: int, size of MLP used to compute attention scores
        embedding_size: int, size of embedding
        vocab_size: int, vocabulary size
        dropout: float, dropout probability
        """
        super(DecoderAttentionRNN, self).__init__()
        self.encoder_size = encoder_size
        self.decoder_size = decoder_size
        self.attention_size = attention_size
        self.embedding_size = embedding_size
        self.vocab_size = vocab_size
        self.drop_prob = dropout

        # Create attention mechanism
        self.attention = AttentionMechanism(self.encoder_size, self.decoder_size, self.attention_size)

        # Create embedding layer
        self.embedding = nn.Embedding(self.vocab_size, self.embedding_size)  # size: [V, E]

        # Create dropout module
        self.dropout = nn.Dropout(p=self.drop_prob)

        # Create LSTM cell (stepped manually in forward) for decoding
        self.rnn = nn.LSTMCell(input_size=self.embedding_size + self.encoder_size,
                               hidden_size=self.decoder_size, bias=True)

        # MLPs for LSTM cell's initial states
        self.init_h = nn.Linear(self.encoder_size, self.decoder_size)
        self.init_c = nn.Linear(self.encoder_size, self.decoder_size)

        # MLP to compute beta (gating scalar, paper section 4.2.1)
        self.f_beta = nn.Linear(self.decoder_size, 1)  # scalar

        # Sigmoid to compute beta
        self.sigmoid = nn.Sigmoid()

        # FC layer to compute scores over vocabulary
        self.fc = nn.Linear(self.decoder_size, self.vocab_size)

    def init_lstm_states(self, encoder_out):
        """
        Initialize LSTM's initial hidden and cell memory states based on encoded
        feature representation. NOTE: Encoded feature map locations mean is used.

        Parameters
        ----------
        encoder_out: PyTorch tensor, size: [M, L, D]

        Returns
        -------
        h0, c0: PyTorch tensors, size: [M, h] each
        """
        # Compute mean of encoder output locations
        mean_encoder_out = torch.mean(encoder_out, dim=1)  # size: [M, L, D] -> [M, D]

        # Initialize LSTMs hidden state
        h0 = self.init_h(mean_encoder_out)  # size: [M, h]

        # Initialize LSTMs cell memory state
        c0 = self.init_c(mean_encoder_out)  # size: [M, h]

        return h0, c0

    def forward(self, encoder_out, encoded_captions, caption_lengths):
        """
        Parameters
        ----------
        encoder_out: PyTorch tensor, size: [M, fs, fs, D] where, fs is feature
            map size, and D is channels of encoded CNN feature map.
        encoded_captions: PyTorch long tensor, size: [M, T_max]
        caption_lengths: PyTorch tensor, size: [M, 1]

        Returns
        -------
        pred_scores: PyTorch tensor, size: [M, T, V], vocabulary scores per
            step (rows stay zero past each caption's decode length). Lives
            on the same device as encoder_out.
        encoded_captions: the input captions re-ordered by descending length
        decode_lengths: list of int, caption length - 1 for each sorted caption
        alphas: PyTorch tensor, size: [M, T, L], attention weights per step,
            on the same device as encoder_out
        sorted_idx: PyTorch tensor, the permutation applied to the batch
        """
        batch_size = encoder_out.size(0)
        device = encoder_out.device

        # Flatten encoded feature maps from size [M, fs, fs, D] to [M, L, D].
        # reshape returns a view whenever view would succeed and otherwise
        # copies, so non-contiguous inputs are handled as well.
        encoder_out = encoder_out.reshape(batch_size, -1, self.encoder_size)
        num_locations = encoder_out.size(1)

        # Sort caption lengths in descending order
        caption_lengths, sorted_idx = torch.sort(caption_lengths.squeeze(1), dim=0,
                                                 descending=True)

        # Compute decode lengths to decode. Sequence generation ends when <END> token
        # is generated. A typical caption is [<START>, ..., <END>, <PAD>, <PAD>], caption
        # lengths only considers [<START>, ..., <END>], so when <END> is generated there
        # is no need to decode further. Decode lengths = caption lengths - 1
        decode_lengths = (caption_lengths - 1).tolist()

        # Sort encoded feature maps and captions as per caption lengths. REASON: Since a
        # batch contains different caption lengths (and decode lengths). At each time step
        # up to max decode length T in a batch we need to apply attention mechanism to only
        # those images in batch whose decode length is greater than current time step
        encoder_out = encoder_out[sorted_idx]
        encoded_captions = encoded_captions[sorted_idx]

        # Get embeddings for encoded captions
        embeddings = self.embedding(encoded_captions)  # size: [M, T, E]

        # Initialize LSTM's states
        h, c = self.init_lstm_states(encoder_out)  # sizes: [M, h], [M, h]

        # Compute max decode length
        T = int(max(decode_lengths))

        # Create placeholders to store predicted scores and alphas (alphas for doubly
        # stochastic attention). Allocate them on the model's device so each step
        # writes in place instead of copying results back to the CPU.
        pred_scores = torch.zeros(batch_size, T, self.vocab_size, device=device)  # size: [M, T, V]
        alphas = torch.zeros(batch_size, T, num_locations, device=device)  # size: [M, T, L]

        # Decoding step: (1) Compute attention weighted encoding and attention weights
        # using encoder output, and previous hidden state; (2) Generate a new encoded word
        for t in range(T):
            # Compute batch size at step t (At step t how many decoding lengths are greater than t)
            batch_size_t = sum(dl > t for dl in decode_lengths)

            # Encoder output and encoded captions are already sorted by caption lengths
            # in descending order, so the still-active samples are the first batch_size_t rows
            attn_weighted_encoding, alpha = self.attention(encoder_out[:batch_size_t],
                                                           h[:batch_size_t])

            # Compute gating scalar beta (paper section: 4.2.1)
            beta_t = self.sigmoid(self.f_beta(h[:batch_size_t]))  # size: [M_t, 1]

            # Multiply gating scalar beta to attention weighted encoding
            context_vector = beta_t * attn_weighted_encoding  # size: [M_t, D]

            # Concatenate embeddings and context vector, size: [M_t, E] and [M_t, D] -> [M_t, E+D]
            concat_input = torch.cat([embeddings[:batch_size_t, t, :], context_vector], dim=1)

            # LSTM input states from time step t-1
            previous_states = (h[:batch_size_t], c[:batch_size_t])

            # Generate decoded word; h and c shrink to the active batch size
            h, c = self.rnn(concat_input, previous_states)

            # Compute scores over vocabulary
            scores = self.fc(self.dropout(h))  # size: [M_t, V]

            # Populate placeholders for predicted scores and alphas
            pred_scores[:batch_size_t, t, :] = scores
            alphas[:batch_size_t, t, :] = alpha  # alpha size: [M_t, L]

        return pred_scores, encoded_captions, decode_lengths, alphas, sorted_idx

# COCO Dataloader

In [5]:
import os
import json
import h5py
import numpy as np

import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

class COCODataset(Dataset):
    """
    COCO Dataset to be used in DataLoader for creating batches
    during training.

    Each item pairs one image read from an HDF5 file with one randomly
    sampled caption (and its length) read from a JSON file.
    """
    def __init__(self, config, split='TRAIN', transform=None):
        """
        Parameters
        ----------
        config: object exposing train_hdf5, train_captions, val_hdf5 and
            val_captions file paths.
        split: str, 'TRAIN' selects the training files; any other value
            selects the validation files.
        transform: callable or None, applied to the image tensor.
        """
        self.config = config
        self.split = split
        self.transform = transform

        # Open files where images are stored in HDF5 data format, captions & their lengths
        if self.split == 'TRAIN':
            self.hdf5 = h5py.File(name=self.config.train_hdf5, mode='r')
            self.captions = self.read_json(self.config.train_captions)
        else:
            self.hdf5 = h5py.File(name=self.config.val_hdf5, mode='r')
            self.captions = self.read_json(self.config.val_captions)

        # Get image data
        self.images = self.hdf5['images']

    def read_json(self, json_path):
        """Load and return the parsed content of a JSON file."""
        with open(json_path, 'r') as j:
            json_data = json.load(j)
        return json_data

    def __len__(self):
        """Number of caption entries (one entry per image)."""
        return len(self.captions)

    def __getitem__(self, idx):
        """
        Returns
        -------
        TRAIN split: (img, caption, length)
        other splits: (img, caption, length, captions), where captions
            holds every caption stored for the image.
        """
        # NOTE(review): the image is converted to float without rescaling;
        # confirm the stored pixel range matches what the transform expects.
        img = torch.FloatTensor(self.images[idx])
        if self.transform is not None:
            img = self.transform(img)

        # Entry layout: [list of encoded captions, list of caption lengths]
        all_captions = self.captions[idx][0]
        all_lengths = self.captions[idx][1]

        # Randomly sample 1 caption among those actually stored for this
        # image (instead of assuming there are always exactly 5)
        cap_idx = np.random.randint(0, high=len(all_captions))
        caption = torch.LongTensor(all_captions[cap_idx])
        length = torch.LongTensor([all_lengths[cap_idx]])

        if self.split == 'TRAIN':
            return img, caption, length
        else:
            captions = torch.LongTensor(all_captions)
            return img, caption, length, captions

# Helper

In [6]:
class AverageMeter(object):
    """
    Tracks the latest value of a metric together with its running
    weighted sum, sample count and average.

    Reference: https://github.com/pytorch/examples/blob/master/imagenet/main.py
    """
    def __init__(self):
        self.reset()

    def reset(self):
        """Clear the latest value, average, sum and count back to zero."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """
        Record a new value.

        Parameters
        ----------
        val: latest value of the metric.
        n: int, weight of this value (e.g. number of samples it covers).
        """
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count
        
class DataConfig(object):
    """
    Locations of the training and validation image (HDF5) and caption
    (JSON) files.
    """
    def __init__(self, data_dir='/home/ankoor/caption/data'):
        """
        Parameters
        ----------
        data_dir: str, directory holding the data files. The default keeps
            the original machine-specific location; pass another directory
            to run elsewhere.
        """
        self.data_dir = data_dir
        # Drop trailing slashes so the joined paths never contain '//'
        root = data_dir.rstrip('/')

        # Training data
        self.train_hdf5 = root + '/TRAIN_IMAGES_COCO_5_WordCountThresh.hdf5'
        self.train_captions = root + '/TRAIN_CAPTIONS_COCO_5_WordCountThresh.json'

        # Validation data
        self.val_hdf5 = root + '/VAL_IMAGES_COCO_5_WordCountThresh.hdf5'
        self.val_captions = root + '/VAL_CAPTIONS_COCO_5_WordCountThresh.json'
        
def read_json(file):
    """
    Load a JSON file.

    Parameters
    ----------
    file: str, path of the JSON file to read.

    Returns
    -------
    The parsed JSON content.
    """
    with open(file, 'r') as handle:
        return json.load(handle)

# Optimizer
import torch.optim as optim

def get_optimizer(net, opt_dict):
    """
    Build an optimizer over the trainable parameters of a network.

    Parameters
    ----------
    net: nn.Module whose parameters with requires_grad=True are optimized.
    opt_dict: dict with keys 'lr', 'weight_decay' and 'optimizer'
        ('adam' selects Adam, 'nesterov' selects SGD with Nesterov
        momentum, any other value selects plain momentum SGD).

    Returns
    -------
    optim.Adam or optim.SGD with one parameter group per trainable
    parameter, each carrying the configured learning rate.
    """
    # One group per parameter; frozen parameters are left out entirely
    params = [{'params': [param], 'lr': opt_dict['lr']}
              for _, param in net.named_parameters()
              if param.requires_grad]

    kind = opt_dict['optimizer']
    if kind == 'adam':
        return optim.Adam(params=params, weight_decay=opt_dict['weight_decay'])

    # Every non-'adam' choice is SGD with momentum 0.9
    return optim.SGD(params=params, momentum=0.9,
                     weight_decay=opt_dict['weight_decay'],
                     nesterov=(kind == 'nesterov'))

# GPU Check 

`RuntimeError: cublas runtime error : resource allocation failed at /pytorch/aten/src/THC/THCGeneral.cpp:333`

In [7]:
import torchvision.transforms as transforms
from torch.nn.utils.rnn import pack_padded_sequence

In [8]:
# Train: smoke-test the encoder/decoder pipeline on the first 11 batches.
# Model sizes: encoder channels, LSTM hidden size, attention MLP size, embedding size
encoder_size = 1280
decoder_size = 1024
attention_size = 512
embedding_size = 256

# Word2Index: the vocabulary size is the number of entries in the mapping
word2idx_file = './WORD2IDX_COCO.json'
word2idx = read_json(word2idx_file)
vocab_size = len(word2idx)

# Cuda Device
# NOTE(review): hard-coded to the second GPU ('cuda:1'); presumably fails on
# hosts with fewer than two GPUs — confirm before running elsewhere.
cuda_device = 'cuda:' + str(1)

# Dataset and data loader (mean/std below are used to normalize each image)
imgs_mean = [0.485, 0.456, 0.406]
imgs_std = [0.229, 0.224, 0.225]
normalize = transforms.Normalize(mean=imgs_mean, std=imgs_std)
config = DataConfig()
coco = COCODataset(config, transform=transforms.Compose([normalize]))
loader = DataLoader(coco, batch_size=32, shuffle=True)

# Encoder and decoder
# NOTE(review): the encoder is built with its default finetune=False (all
# backbone parameters frozen) but encoder.eval() is never called, so its
# BatchNorm layers presumably keep updating running statistics — confirm
# whether that is intended.
encoder = EncoderCNN(weight_file='./mobilenet_v2.pth.tar')
decoder = DecoderAttentionRNN(encoder_size, decoder_size, attention_size, embedding_size, vocab_size)

# Move encoder and decoder to GPU
encoder = encoder.to(cuda_device)
decoder = decoder.to(cuda_device)

# Optimizer (only the decoder's parameters are optimized)
# NOTE(review): weight_decay=0.5 looks very large for Adam, and the losses
# recorded for this cell stay near 10.05 across all batches — TODO confirm
# this value is intended.
optimizer = get_optimizer(decoder, opt_dict={'lr': 0.001, 'weight_decay': 0.5, 'optimizer': 'adam'})

# Loss
criterion = nn.CrossEntropyLoss()

# Train loop
for i, (imgs, caps, lengths) in enumerate(loader):
    imgs = imgs.to(cuda_device)
    caps = caps.to(cuda_device)
    lengths = lengths.to(cuda_device)
    encoder_out = encoder(imgs)
    # The decoder re-orders the batch by caption length, so the returned
    # captions (not `caps`) must be used as targets
    pred_scores, sorted_captions, decode_lengths, alphas, sorted_idx = decoder(encoder_out, caps, lengths)
    
    # Select all words after <START> till <END>
    target_caps = sorted_captions[:, 1:]
            
    # Pack padded sequences. Before computing Cross Entropy Loss (Log Softmax and Negative Log
    # Likelihood Loss) we do not want to take into account padded items in the predicted scores
    scores, _ = pack_padded_sequence(pred_scores, decode_lengths, batch_first=True)
    targets, _ = pack_padded_sequence(target_caps, decode_lengths, batch_first=True)
    
    # Make sure both tensors are on the training device before the loss
    scores = scores.data.to(cuda_device)
    targets = targets.data.to(cuda_device)
    
    loss = criterion(scores, targets)
    
    # Doubly stochastic attention regularization with weight 1.0: penalize
    # attention weights whose sum over time steps (dim=1) differs from 1
    loss += (1.0 * ((1.0 - alphas.sum(dim=1))**2).mean()).to(cuda_device)
    
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    print('loss: {} at batch: {}'.format(loss, i))
    print('-----' * 10)
    
    # Stop after batch index 10 (11 batches in total)
    if i == 10:
        break

loss: 10.058655738830566 at batch: 0
--------------------------------------------------
loss: 10.050213813781738 at batch: 1
--------------------------------------------------
loss: 10.057500839233398 at batch: 2
--------------------------------------------------
loss: 10.05722427368164 at batch: 3
--------------------------------------------------
loss: 10.058683395385742 at batch: 4
--------------------------------------------------
loss: 10.060348510742188 at batch: 5
--------------------------------------------------
loss: 10.053070068359375 at batch: 6
--------------------------------------------------
loss: 10.047382354736328 at batch: 7
--------------------------------------------------
loss: 10.060755729675293 at batch: 8
--------------------------------------------------
loss: 10.053255081176758 at batch: 9
--------------------------------------------------
loss: 10.048922538757324 at batch: 10
--------------------------------------------------


# Error due to word2idx length

In [9]:
# Print the ids stored for two sample tokens: 'checkered' and the padding token
for token in ('checkered', '<PAD>'):
    print(word2idx[token])

6652
0


In [10]:
# First and last words
idx2word = {v: k for k, v in word2idx.items()}
print(idx2word[0])

# Looking up an id equal to the vocabulary size is the error being
# demonstrated here. Catch and display it so the notebook still runs
# top to bottom.
try:
    print(idx2word[9490])  # ERROR!
except KeyError as err:
    print('KeyError: {}'.format(err))

<PAD>


KeyError: 9490

In [11]:
# Show the decoder's embedding layer (rows x embedding dimension)
embedding_summary = str(decoder.embedding)
print('decoder embedding: ', embedding_summary)

decoder embedding:  Embedding(9490, 256)


In [12]:
# Correct use of embeddings: an embedding table with V rows accepts ids 0..V-1
word_to_ix = dict(hello=0, world=1)  # NOTE: index must start from 0
V = len(word_to_ix)
embeds = nn.Embedding(num_embeddings=V, embedding_dim=5)  # 2 words in vocab, 5 dimensional embeddings
hello_ix = word_to_ix["hello"]
lookup_tensor = torch.tensor([hello_ix], dtype=torch.long)
hello_embed = embeds(lookup_tensor)
for shown in (hello_embed, embeds):
    print(shown)

tensor([[-1.3827, -0.6061, -0.9977, -0.8474, -0.3694]], grad_fn=<EmbeddingBackward>)
Embedding(2, 5)


In [13]:
# Character vocabulary with ids starting at 0
chars = [ch for ch in 'abcdefghijklmnopqrstuvwxyz']
char2idx = dict(zip(chars, range(len(chars))))
idx2char = dict((i, w) for w, i in char2idx.items())
print(len(chars), len(char2idx), len(idx2char))

# Embedding table with 4 more rows than there are characters
CHAR_EMBEDDING = nn.Embedding(num_embeddings=len(char2idx) + 4, embedding_dim=5)

26 26 26


In [14]:
# Look up and print the embedding stored for every character id
for i in range(len(char2idx)):
    e = CHAR_EMBEDDING(torch.tensor([i], dtype=torch.long))
    row_template = 'Char: "{}" at ID: {} >>> Embedding: {}'
    print(row_template.format(chars[i], i, e.data))

Char: "a" at ID: 0 >>> Embedding: tensor([[-2.4745, -0.0988, -0.5518, -0.1357,  0.4571]])
Char: "b" at ID: 1 >>> Embedding: tensor([[-1.4542,  0.5362,  1.7906,  2.0797, -0.0155]])
Char: "c" at ID: 2 >>> Embedding: tensor([[-0.5314, -0.8129, -1.2844, -0.6234,  0.7542]])
Char: "d" at ID: 3 >>> Embedding: tensor([[ 1.7102, -0.5863, -0.6822,  0.6130, -1.2955]])
Char: "e" at ID: 4 >>> Embedding: tensor([[ 0.2786,  0.2219,  0.6588,  1.0720, -0.2213]])
Char: "f" at ID: 5 >>> Embedding: tensor([[ 0.4035,  0.6903, -0.5266,  0.2307,  1.9849]])
Char: "g" at ID: 6 >>> Embedding: tensor([[-0.4465,  0.8677,  0.1557,  0.7353,  0.3335]])
Char: "h" at ID: 7 >>> Embedding: tensor([[ 0.1987, -2.9699, -1.3931, -0.9935,  0.8088]])
Char: "i" at ID: 8 >>> Embedding: tensor([[ 1.0788, -1.0492, -0.9509,  0.1452, -0.0709]])
Char: "j" at ID: 9 >>> Embedding: tensor([[ 0.7014,  0.0016, -1.4311, -0.3827, -0.6886]])
Char: "k" at ID: 10 >>> Embedding: tensor([[ 0.5310,  1.0626,  0.8286, -0.5336, -0.1391]])
Char: "l"

In [15]:
# Word2Index and its inverse (Index2word)
word2idx_file = './WORD2IDX_COCO.json'
word2idx = read_json(word2idx_file)
idx2word = dict((i, w) for w, i in word2idx.items())

# Both mappings should report the same number of entries
vocab_size = len(word2idx)
print(vocab_size)
print(len(idx2word))

9490
9490


In [17]:
# Word2Index (<PAD> at 0th position) and its inverse (Index2word)
word2idx_file = './WORD2IDX_COCO.json'
word2idx = read_json(word2idx_file)
idx2word = dict(zip(word2idx.values(), word2idx.keys()))

# Both mappings should report the same number of entries
vocab_size = len(word2idx)
print(vocab_size)
print(len(idx2word))

9490
9490


In [18]:
# Tokens stored at id 0 and at id len(idx2word) - 1
last_idx = len(idx2word) - 1
(idx2word[0], idx2word[last_idx])

('<PAD>', '<END>')

In [19]:
# Character vocabulary with '#' at id 0 and the letters at ids 1..26
chars = [ch for ch in 'abcdefghijklmnopqrstuvwxyz']
char2idx = dict((w, i) for i, w in enumerate(chars, start=1))
char2idx['#'] = 0
idx2char = dict(zip(char2idx.values(), char2idx.keys()))
print(len(chars), len(char2idx), len(idx2char))

# Lowest and highest ids
idx2char[0], idx2char[len(chars)]

26 27 27


('#', 'z')

In [21]:
# Character vocabulary with the letters at ids 0..25 and '#' placed at
# len(char2idx) + 1, i.e. id 27, which leaves id 26 unassigned
chars = [ch for ch in 'abcdefghijklmnopqrstuvwxyz']
char2idx = dict(zip(chars, range(len(chars))))
char2idx['#'] = len(char2idx) + 1
idx2char = dict((i, w) for w, i in char2idx.items())
print(len(chars), len(char2idx), len(idx2char))

idx2char[0]

26 27 27


'a'