In [1]:
import torch
import numpy as np

### Dropout layer

In [9]:
# Draw a random tensor of shape (4, 2, 2) from a standard normal distribution;
# it is the input passed through the dropout layer in the cells below.
tensor = torch.randn(4, 2, 2)
print(tensor)
tensor.shape

tensor([[[ 0.9995,  0.1025],
         [-0.4816,  1.2860]],

        [[-0.1652,  0.4024],
         [ 0.1378, -1.2458]],

        [[ 1.8742, -0.9989],
         [-0.8966, -0.8257]],

        [[ 0.2927,  0.7121],
         [ 1.3994, -1.1206]]])


torch.Size([4, 2, 2])

In [10]:
# Dropout with p=0.6: in training mode each element is zeroed with probability 0.6
# and the survivors are scaled by 1 / (1 - 0.6) = 2.5 so the expected value is unchanged.
dropout = torch.nn.Dropout(0.6)

In [11]:
# A freshly constructed module is in training mode, so dropout is active here:
# dropped entries become 0, kept entries are multiplied by 2.5, and the shape is unchanged.
op = dropout(tensor)
print(op)
op.shape

tensor([[[ 0.0000,  0.0000],
         [-0.0000,  0.0000]],

        [[-0.4129,  0.0000],
         [ 0.3444, -0.0000]],

        [[ 4.6855, -2.4972],
         [-0.0000, -0.0000]],

        [[ 0.0000,  0.0000],
         [ 0.0000, -2.8015]]])


torch.Size([4, 2, 2])

### `view` and `transpose` operations

In [13]:
# Reinterpret the (1, 4, 4) tensor as (1, 4, 2, 2). `view` keeps the element order,
# so each original row of 4 values becomes one 2x2 block.
tensor = torch.randn(1, 4, 4)
print(tensor)
tensor = tensor.view(1, 4, 2, 2)
tensor

tensor([[[-1.1589,  0.5802,  0.4998, -1.3935],
         [ 1.1295, -0.1280,  0.4573,  1.2726],
         [ 0.4439, -0.8118,  1.8655, -0.1175],
         [ 0.8269,  1.5722, -0.0410,  0.5759]]])


tensor([[[[-1.1589,  0.5802],
          [ 0.4998, -1.3935]],

         [[ 1.1295, -0.1280],
          [ 0.4573,  1.2726]],

         [[ 0.4439, -0.8118],
          [ 1.8655, -0.1175]],

         [[ 0.8269,  1.5722],
          [-0.0410,  0.5759]]]])

In [16]:
# Swap dimensions 1 and 2: (1, 4, 2, 2) -> (1, 2, 4, 2).
# For a strided tensor the result shares storage with `tensor`; it is not a copy.
tensor_tr = tensor.transpose(1, 2)
print(tensor_tr)
tensor_tr.shape

tensor([[[[-1.1589,  0.5802],
          [ 1.1295, -0.1280],
          [ 0.4439, -0.8118],
          [ 0.8269,  1.5722]],

         [[ 0.4998, -1.3935],
          [ 0.4573,  1.2726],
          [ 1.8655, -0.1175],
          [-0.0410,  0.5759]]]])


torch.Size([1, 2, 4, 2])

In [15]:
torch.transpose?

[0;31mDocstring:[0m
transpose(input, dim0, dim1) -> Tensor

Returns a tensor that is a transposed version of :attr:`input`.
The given dimensions :attr:`dim0` and :attr:`dim1` are swapped.

If :attr:`input` is a strided tensor then the resulting :attr:`out`
tensor shares its underlying storage with the :attr:`input` tensor, so
changing the content of one would change the content of the other.

If :attr:`input` is a :ref:`sparse tensor <sparse-docs>` then the
resulting :attr:`out` tensor *does not* share the underlying storage
with the :attr:`input` tensor.

If :attr:`input` is a :ref:`sparse tensor <sparse-docs>` with compressed
layout (SparseCSR, SparseBSR, SparseCSC or SparseBSC) the arguments
:attr:`dim0` and :attr:`dim1` must be both batch dimensions, or must
both be sparse dimensions. The batch dimensions of a sparse tensor are the
dimensions preceding the sparse dimensions.

.. note::
    Transpositions which interchange the sparse dimensions of a `SparseCSR`
    or `SparseCSC` 

In [18]:
def transpose_any_dims(tensor, dim0, dim1):
    """Swap two dimensions of a rectangular nested-list "tensor".

    Pure-Python counterpart of ``torch.transpose`` for nested lists/tuples: the
    element found at index ``(..., i, ..., j, ...)`` of the result is the element
    at ``(..., j, ..., i, ...)`` of the input, where ``i`` and ``j`` sit at
    positions ``dim0`` and ``dim1``.

    Args:
        tensor: Nested lists (or tuples) of uniform shape. Every level of
            list/tuple nesting counts as one dimension.
        dim0: First dimension to swap. Negative values count from the last
            dimension.
        dim1: Second dimension to swap. Negative values count from the last
            dimension.

    Returns:
        A new nested list with the two dimensions exchanged. The input is not
        modified. When ``dim0 == dim1`` the result is a copy of the input
        structure.

    Raises:
        IndexError: If ``dim0`` or ``dim1`` is outside ``[-ndim, ndim - 1]``, or
            if the input is ragged (not rectangular).
    """
    # The number of dimensions is the nesting depth, NOT len(tensor) (which is
    # only the size of dimension 0). Walk down the first elements to get the shape.
    shape = []
    node = tensor
    while isinstance(node, (list, tuple)):
        shape.append(len(node))
        if not node:
            break
        node = node[0]
    ndim = len(shape)

    def normalize(dim):
        # Map a possibly negative dimension onto [0, ndim) or reject it.
        if not -ndim <= dim < ndim:
            raise IndexError(
                f"Dimension out of range (expected to be in range of "
                f"[{-ndim}, {ndim - 1}], but got {dim})"
            )
        return dim % ndim

    dim0, dim1 = normalize(dim0), normalize(dim1)

    # The output has the same sizes as the input with the two dimensions exchanged.
    out_shape = list(shape)
    out_shape[dim0], out_shape[dim1] = out_shape[dim1], out_shape[dim0]

    def build(index):
        # `index` is the partial output index accumulated so far.
        axis = len(index)
        if axis == ndim:
            # Full index reached: read the input at the index with both
            # positions swapped back.
            source = list(index)
            source[dim0], source[dim1] = source[dim1], source[dim0]
            value = tensor
            for position in source:
                value = value[position]
            return value
        return [build(index + (i,)) for i in range(out_shape[axis])]

    return build(())

# Demo: exchange the first two axes of a 2x2x2 nested list
tensor_ex = [
    [[1, 2], [3, 4]],
    [[5, 6], [7, 8]],
]
print(np.shape(tensor_ex))
print(transpose_any_dims(tensor_ex, dim0=0, dim1=1))

(2, 2, 2)
[[[1, 2], [5, 6]], [[3, 4], [7, 8]]]


### `nn.Parameter`

In [22]:
# nn.Parameter wraps a tensor and is itself a Tensor, so the isinstance check is True
a = torch.nn.Parameter(torch.randn(1, 1))
isinstance(a, torch.Tensor)

True

In [23]:
torch.Tensor.contiguous?

[0;31mDocstring:[0m
contiguous(memory_format=torch.contiguous_format) -> Tensor

Returns a contiguous in memory tensor containing the same data as :attr:`self` tensor. If
:attr:`self` tensor is already in the specified memory format, this function returns the
:attr:`self` tensor.

Args:
    memory_format (:class:`torch.memory_format`, optional): the desired memory format of
        returned Tensor. Default: ``torch.contiguous_format``.
[0;31mType:[0m      method_descriptor

In [1]:
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
data = load_dataset('cfilt/iitb-english-hindi', split = 'train')

In [3]:
data

Dataset({
    features: ['translation'],
    num_rows: 1659083
})

In [4]:
32_000 * 0.9

28800.0

In [5]:
type(data)

datasets.arrow_dataset.Dataset

In [6]:
# Take 100 samples from the dataset
data = data.select(range(100))

In [7]:
len(data)

100

In [8]:
# Print one sample from the train dataset
sample = data[0]
print(type(sample))
print(sample)

<class 'dict'>
{'translation': {'en': 'Give your application an accessibility workout', 'hi': 'अपने अनुप्रयोग को पहुंचनीयता व्यायाम का लाभ दें'}}


In [9]:
from torch.utils.data import  random_split

In [10]:
# Randomly partition the 100 samples into two subsets of 90 and 10 items.
# A seeded generator (arbitrary fixed seed) makes the shuffle reproducible, so a
# fresh "Restart & Run All" prints the same indices every time.
splits = random_split(data, [90, 10], generator=torch.Generator().manual_seed(42))
for split in splits:
    print(split.indices)

[43, 74, 61, 95, 72, 11, 88, 47, 82, 63, 35, 81, 6, 44, 42, 48, 16, 73, 26, 14, 18, 69, 83, 70, 89, 45, 10, 25, 1, 19, 68, 52, 38, 12, 80, 59, 21, 56, 64, 76, 97, 53, 57, 55, 49, 96, 5, 90, 13, 4, 65, 37, 71, 46, 51, 87, 75, 66, 39, 22, 3, 79, 30, 2, 86, 85, 17, 33, 62, 34, 50, 7, 67, 94, 9, 40, 24, 41, 99, 91, 36, 58, 27, 98, 77, 28, 60, 84, 31, 29]
[54, 92, 0, 23, 32, 15, 93, 8, 78, 20]


## `torch.triu`

In [2]:
a = torch.randn(3, 3)
a

tensor([[ 0.1309, -0.1964, -1.0162],
        [ 0.7027, -0.0395, -1.3087],
        [-0.6452, -1.2540,  0.3653]])

In [3]:
# Upper triangle: keep the main diagonal and everything above it, zero the rest
torch.triu(a)

tensor([[ 0.1309, -0.1964, -1.0162],
        [ 0.0000, -0.0395, -1.3087],
        [ 0.0000,  0.0000,  0.3653]])

In [4]:
# diagonal=1 moves the cut one step up, so the main diagonal is zeroed as well
torch.triu(a, diagonal=1)

tensor([[ 0.0000, -0.1964, -1.0162],
        [ 0.0000,  0.0000, -1.3087],
        [ 0.0000,  0.0000,  0.0000]])

## Hugging Face `WordLevel` tokenizer

In [5]:
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.trainers import WordLevelTrainer
from tokenizers.pre_tokenizers import Whitespace

In [6]:
# Word-level model: each whole word is one token, and any word missing from the
# trained vocabulary is encoded as '[UNK]'.
tokenizer = Tokenizer(WordLevel(unk_token='[UNK]'))
tokenizer

<tokenizers.Tokenizer at 0x7f3ed64fe230>

In [7]:
# Split raw text into word candidates before the model looks them up
# (the `Whitespace` pre-tokenizer also separates punctuation from words).
tokenizer.pre_tokenizer = Whitespace()

In [8]:
# Special tokens get the lowest ids; words seen fewer than twice are left out of the vocabulary
trainer = WordLevelTrainer(
    special_tokens=["[UNK]", "[PAD]", "[SOS]", "[EOS]"],
    min_frequency=2,
)

In [9]:
# Toy corpus: "another" occurs once, every other word at least twice
ds = [
    "This is a test",
    "This is another test",
    "This is a test",
    "Hello world",
    "Hello world",
    "Hello world",
    "Hello world",
]

In [18]:
# Count the distinct space-separated words in the toy corpus
len({word for sentence in ds for word in sentence.split(' ')})

7

In [13]:
def get_all_sentences(ds):
    """Lazily yield every item of ``ds``, one at a time, in iteration order."""
    yield from ds

In [14]:
tokenizer.train_from_iterator(get_all_sentences(ds), trainer = trainer)

In [19]:
# Tokenize a sentence using the tokenizer
# "another" should not be in the vocabulary, because it appears only once
s = "This is another test"
print(s)
output = tokenizer.encode(s)
print(output.ids)
print(output.tokens)

This is another test
[6, 7, 0, 8]
['This', 'is', '[UNK]', 'test']


In [17]:
tokenizer.get_vocab_size()

10

In [22]:
tokenizer.token_to_id("[UNK]")

0

In [27]:
torch.randn(3, 3).shape

torch.Size([3, 3])

In [32]:
torch.squeeze?

[0;31mDocstring:[0m
squeeze(input, dim=None) -> Tensor

Returns a tensor with all specified dimensions of :attr:`input` of size `1` removed.

For example, if `input` is of shape:
:math:`(A \times 1 \times B \times C \times 1 \times D)` then the `input.squeeze()`
will be of shape: :math:`(A \times B \times C \times D)`.

When :attr:`dim` is given, a squeeze operation is done only in the given
dimension(s). If `input` is of shape: :math:`(A \times 1 \times B)`,
``squeeze(input, 0)`` leaves the tensor unchanged, but ``squeeze(input, 1)``
will squeeze the tensor to the shape :math:`(A \times B)`.

.. note:: The returned tensor shares the storage with the input tensor,
          so changing the contents of one will change the contents of the other.

.. warning:: If the tensor has a batch dimension of size 1, then `squeeze(input)`
          will also remove the batch dimension, which can lead to unexpected
          errors. Consider specifying only the dims you wish to be squeezed.

Args:
    input (Tensor): the input tensor.
    dim (int or tuple of ints, optional): if

In [43]:
# Build the encoder input as one 1-D int64 tensor: [SOS] + token ids + [EOS] + padding
NUM_PAD_TOKENS = 10  # number of [PAD] ids appended after [EOS]

op_ids = tokenizer.encode(s).ids
pad_token = torch.tensor([tokenizer.token_to_id("[PAD]")], dtype=torch.int64)
sos_token = torch.tensor([tokenizer.token_to_id("[SOS]")], dtype=torch.int64)
eos_token = torch.tensor([tokenizer.token_to_id("[EOS]")], dtype=torch.int64)

enc_input = torch.cat([
    sos_token,
    torch.tensor(op_ids, dtype=torch.int64),
    eos_token,
    # repeat() keeps the padding 1-D for any count. Building a tensor from a list
    # of tensors and calling squeeze() collapses to 0-D when the count is 1,
    # which torch.cat rejects.
    pad_token.repeat(NUM_PAD_TOKENS),
], dim=0)

enc_input

tensor([2, 6, 7, 0, 8, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [47]:
# Padding mask: 1 where the token is real, 0 where it is [PAD].
# The two unsqueeze(0) calls prepend two size-1 dimensions: (16,) -> (1, 1, 16).
(enc_input != pad_token).unsqueeze(0).unsqueeze(0).int()

tensor([[[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]], dtype=torch.int32)

In [46]:
torch.unsqueeze?

[0;31mDocstring:[0m
unsqueeze(input, dim) -> Tensor

Returns a new tensor with a dimension of size one inserted at the
specified position.

The returned tensor shares the same underlying data with this tensor.

A :attr:`dim` value within the range ``[-input.dim() - 1, input.dim() + 1)``
can be used. Negative :attr:`dim` will correspond to :meth:`unsqueeze`
applied at :attr:`dim` = ``dim + input.dim() + 1``.

Args:
    input (Tensor): the input tensor.
    dim (int): the index at which to insert the singleton dimension

Example::

    >>> x = torch.tensor([1, 2, 3, 4])
    >>> torch.unsqueeze(x, 0)
    tensor([[ 1,  2,  3,  4]])
    >>> torch.unsqueeze(x, 1)
    tensor([[ 1],
            [ 2],
            [ 3],
            [ 4]])
[0;31mType:[0m      builtin_function_or_method

In [48]:
from pathlib import Path

In [49]:
str(Path('.') / "hello" / "1.pt")

'hello/1.pt'

In [6]:
# `02d` zero-pads to a MINIMUM width of 2, so these 3-digit values print unchanged
for i in range(100, 105):
    print(f'Processing {i:02d}')

Processing 100
Processing 101
Processing 102
Processing 103
Processing 104


In [7]:
# `6.3f`: fixed-point with 3 decimals, right-aligned in a field at least 6 characters wide
sm = 0.1
f'I am {sm:6.3f}'

'I am  0.100'