## 1. Tokenization

#### 1.1 Load text file

In [1]:
# Read the whole text file into one string.
with open("the-verdict.txt", "r", encoding = "utf-8") as text_file:
    raw_text = text_file.read()

# Report the size and preview the first 99 characters.
n_chars = len(raw_text)
print(f"Total number of characters: {n_chars}")
print(raw_text[:99])

Total number of characters: 20479
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 


#### 1.2 RE tokenizer

In [2]:
import re

sample = "Hey, what's good?"
result = re.split(r'([,.:;?!"()\'/]|--|\s)', sample)

print(result)

['Hey', ',', '', ' ', 'what', "'", 's', ' ', 'good', '?', '']


In [3]:
# "".strip() and " ".strip() are both empty (falsy), so empty and whitespace-only pieces are dropped
kept = []
for piece in result:
    if piece.strip():
        kept.append(piece)
result = kept
print(result)

['Hey', ',', 'what', "'", 's', 'good', '?']


Now, apply RE tokenizer to main text.

In [4]:
# Split on punctuation, double dashes and whitespace; the capture group keeps the delimiters as tokens.
raw_pieces = re.split(r'([.,:;?!"()\'/]|--|\s)', raw_text)
# Drop empty / whitespace-only pieces (same filter as in the sample cell above).
preprocessed = [piece for piece in raw_pieces if piece.strip()]

print(f"Number of tokens: {len(preprocessed)}")
print(preprocessed[:30])

Number of tokens: 4654
['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that', ',', 'in']


#### 1.3 Token ID creation

In [5]:
all_words = sorted(set(preprocessed))
print(f"Length of vocabulary: {len(all_words)}")

Length of vocabulary: 1139


In [6]:
# Token -> integer ID, where the ID is the token's index in the sorted vocabulary list.
vocab = {token: idx for idx, token in enumerate(all_words)}

# Preview the first 21 entries (IDs 0 through 20).
for token, idx in list(vocab.items())[:21]:
    print(token, idx)

! 0
" 1
' 2
( 3
) 4
, 5
-- 6
. 7
: 8
; 9
? 10
A 11
Ah 12
Among 13
And 14
Are 15
Arrt 16
As 17
At 18
Be 19
Begin 20


#### 1.4 Tokenizer class

In [7]:
class TokenizerV1:
    """Word-level tokenizer that maps text to vocabulary IDs and back.

    Args:
        vocab: dict mapping each token string to its integer ID.
    """

    # Kept identical to the pattern used to split the corpus when the vocabulary
    # was built (no "_" in the character class). If encode() also split on "_",
    # a word stored whole in the vocabulary (e.g. "_not_") would be looked up
    # as "_", "not", "_" and raise KeyError.
    SPLIT_PATTERN = r'([.,:;?!"()\'/]|--|\s)'

    def __init__(self, vocab):
        self.str_to_int = vocab
        self.int_to_str = {i:s for s, i in vocab.items()}

    def encode(self, text):
        """Split ``text`` into tokens and return the list of their IDs.

        Raises:
            KeyError: if a token is not present in the vocabulary.
        """
        pieces = re.split(self.SPLIT_PATTERN, text)
        # remove empty and whitespace-only pieces
        tokens = [piece.strip() for piece in pieces if piece.strip()]
        return [self.str_to_int[token] for token in tokens]

    def decode(self, ids):
        """Convert a sequence of IDs back into a single string.

        Raises:
            KeyError: if an ID is not present in the vocabulary.
        """
        # look up each token and join them with single spaces
        text = " ".join(self.int_to_str[i] for i in ids)
        # remove the space that the join put in front of punctuation
        text = re.sub(r'\s+([,.?!"()\'])', r'\1', text)
        return text

In [8]:
tokenizer = TokenizerV1(vocab)
s2i = tokenizer.encode("I HAD always thought.")
tokenizer.decode(s2i)

'I HAD always thought.'

#### 1.5 Special Context Tokens

In [9]:
# The two special tokens are appended after sorting, so they receive the two highest IDs.
all_tokens = sorted(set(preprocessed))
all_tokens += ['<|endoftext|>', '<|unk|>']

vocab = dict(zip(all_tokens, range(len(all_tokens))))
print(f"Length of vocabulary with special context tokens: {len(vocab)}")

Length of vocabulary with special context tokens: 1141


In [10]:
# Iterating a dict yields its keys in insertion order, which is token-ID order here.
keys = list(vocab)
print(keys[-5:])

['younger', 'your', 'yourself', '<|endoftext|>', '<|unk|>']


In [11]:
class TokenizerV2:
    """Word-level tokenizer with unknown-word and end-of-text handling.

    Args:
        vocab: dict mapping each token string to its integer ID. It must
            contain the special tokens "<|unk|>" and "<|endoftext|>".
    """

    # Kept identical to the pattern used to split the corpus when the vocabulary
    # was built (no "_" in the character class). If encode() also split on "_",
    # a word stored whole in the vocabulary (e.g. "_not_") would be broken into
    # "_", "not", "_" and silently mapped to "<|unk|>".
    SPLIT_PATTERN = r'([.,:;?!"()\'/]|--|\s)'

    def __init__(self, vocab):
        self.str_to_int = vocab
        self.int_to_str = {i:s for s, i in vocab.items()}

    def encode(self, text):
        """Split ``text`` into tokens and return the list of their IDs.

        Tokens missing from the vocabulary are replaced by "<|unk|>", and
        "<|endoftext|>" is always appended as the final token.

        Raises:
            KeyError: if the vocabulary lacks one of the special tokens that is needed.
        """
        pieces = re.split(self.SPLIT_PATTERN, text)
        # remove empty and whitespace-only pieces
        tokens = [piece.strip() for piece in pieces if piece.strip()]
        # if a token is not in the vocab, replace it with the <|unk|> token
        tokens = [
            token if token in self.str_to_int else "<|unk|>"
            for token in tokens
        ]
        tokens.append("<|endoftext|>")
        return [self.str_to_int[token] for token in tokens]

    def decode(self, ids):
        """Convert a sequence of IDs back into a single string.

        Raises:
            KeyError: if an ID is not present in the vocabulary.
        """
        # look up each token and join them with single spaces
        text = " ".join(self.int_to_str[i] for i in ids)
        # remove the space that the join put in front of punctuation
        text = re.sub(r'\s+([,.?!"()\'])', r'\1', text)
        return text

In [12]:
tokenizer = TokenizerV2(vocab)
s2i = tokenizer.encode("Hello, how are you doing?")
tokenizer.decode(s2i)

'<|unk|>, how are you doing? <|endoftext|>'

#### 1.6 Byte Pair Encoding

In [13]:
import importlib
import tiktoken

In [14]:
tokenizer = tiktoken.get_encoding('gpt2')

In [15]:
text = "Hello, Ilham. <|endoftext|> Would you like a cupoftea?"
integers = tokenizer.encode(text, allowed_special = {'<|endoftext|>'})
print(integers)

[15496, 11, 13778, 2763, 13, 220, 50256, 10928, 345, 588, 257, 6508, 1659, 660, 64, 30]


In [16]:
strings = tokenizer.decode(integers)
print(strings)

Hello, Ilham. <|endoftext|> Would you like a cupoftea?


---

## 2. Input-Target Pairs

In [17]:
with open("the-verdict.txt", "r", encoding = "utf-8") as f:
    raw_text = f.read()

enc_text = tokenizer.encode(raw_text)
print("Total number of tokens from byte pair encoding:", len(enc_text))

Total number of tokens from byte pair encoding: 5145


In [18]:
context_size = 5 # input will have 5 tokens
x = enc_text[:context_size]
y = enc_text[1:context_size + 1]
print(f"X: {x}")
print(f"y:     {y}")

X: [40, 367, 2885, 1464, 1807]
y:     [367, 2885, 1464, 1807, 3619]


#### 2.1 Using Dataloader

In [19]:
import torch
from torch.utils.data import Dataset, DataLoader

class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-ID sequences.

    Each target window is the corresponding input window shifted right by one token.

    Args:
        txt: text to tokenize.
        tokenizer: object whose ``encode(txt, allowed_special=...)`` returns a list of token IDs.
        max_len: context size, i.e. number of tokens in every window.
        stride: distance between the starts of two consecutive windows.
    """
    def __init__(self, txt, tokenizer, max_len, stride):
        # tokenize the text
        token_ids = tokenizer.encode(txt, allowed_special = {"<|endoftext|>"})

        # a window may start at `s` only if the shifted target window (ending at s + max_len) still fits
        window_starts = range(0, len(token_ids) - max_len, stride)
        self.input_ids = [token_ids[s:s + max_len] for s in window_starts]
        self.target_ids = [token_ids[s + 1:s + max_len + 1] for s in window_starts]

    # __len__ and __getitem__ are what DataLoader needs from a map-style dataset
    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        # the idx-th input window is paired with the idx-th target window
        inputs = torch.tensor(self.input_ids[idx], dtype=torch.long)
        targets = torch.tensor(self.target_ids[idx], dtype=torch.long)
        return inputs, targets

The idea is to form something like as follows:<br><br>
[[1, 2, 3, 4],<br>
[5, 6, 7, 8],<br>
[9, 10, 11, 12]]<br><br>
[[2, 3, 4, 5],<br>
[6, 7, 8, 9],<br>
[10, 11, 12, 13]]<br><br>
...where the first matrix is X and the second matrix is y. Note that in the above example, the stride as well as the max length is 4. If the stride was 2:<br><br>
[[1, 2, 3, 4],<br>
[3, 4, 5, 6],<br>
[5, 6, 7, 8]]<br><br>
[[2, 3, 4, 5],<br>
[4, 5, 6, 7],<br>
[6, 7, 8, 9]]

In [20]:
def create_dataloader_v1(txt, batch_size = 4, max_len = 256, stride = 128, shuffle = True, drop_last = True, num_workers = 0):
    """Build a DataLoader of (input, target) token-ID windows over ``txt``.

    Args:
        txt: raw text; it is tokenized with tiktoken's 'gpt2' encoding.
        batch_size: number of input-target pairs per batch.
        max_len: context size, i.e. number of tokens in each window.
        stride: distance between the starts of two consecutive windows.
        shuffle: forwarded to DataLoader.
        drop_last: forwarded to DataLoader; when True, the final batch is
            dropped if it holds fewer than ``batch_size`` pairs.
        num_workers: forwarded to DataLoader.

    Returns:
        A DataLoader over a GPTDatasetV1 built from ``txt``.
    """
    loader_options = dict(
        batch_size = batch_size,
        shuffle = shuffle,
        drop_last = drop_last,
        num_workers = num_workers,
    )
    gpt2_tokenizer = tiktoken.get_encoding('gpt2')
    windows = GPTDatasetV1(txt, gpt2_tokenizer, max_len, stride)
    return DataLoader(windows, **loader_options)

In [21]:
with open("the-verdict.txt", "r", encoding = "utf-8") as f:
    raw_text = f.read()

In [22]:
import torch
dataloader = create_dataloader_v1(raw_text, batch_size = 1, max_len = 4, stride = 1, shuffle = False) # looking into how the function will work
data_iter = iter(dataloader)
first_batch = next(data_iter)
print(first_batch)

[tensor([[  40,  367, 2885, 1464]]), tensor([[ 367, 2885, 1464, 1807]])]


Using a batch size of 1 is not preferred as this leads to noisy updates, even though good for memory.<br>
Note that a higher overlap (lower stride) can lead to overfitting.

---

## 3. Vector Embeddings

In [23]:
# sample

input_ids = torch.tensor([2, 3, 5, 1])
vocab_size = 6
output_dim = 3 # embedding dimension

# seed before creating the layer so the randomly initialized weights are reproducible
torch.manual_seed(123)
embedding_layer = torch.nn.Embedding(vocab_size, output_dim) # initialize the embedding matrix randomly (one row per token ID)
print(embedding_layer.weight)

Parameter containing:
tensor([[ 0.3374, -0.1778, -0.1690],
        [ 0.9178,  1.5810,  1.3010],
        [ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-1.1589,  0.3255, -0.6315],
        [-2.8400, -0.7849, -1.4096]], requires_grad=True)


In [24]:
print(embedding_layer(torch.tensor([3])))

tensor([[-0.4015,  0.9666, -1.1481]], grad_fn=<EmbeddingBackward0>)


The embedding weight matrix is basically used for lookup operation.

In [25]:
print(embedding_layer(input_ids)) # looking up vector embeddings for the sample input

tensor([[ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-2.8400, -0.7849, -1.4096],
        [ 0.9178,  1.5810,  1.3010]], grad_fn=<EmbeddingBackward0>)


Note that this lookup is essentially equivalent to passing a one-hot encoded representation of the input IDs through a linear layer (without bias) whose randomly initialized weights form the embedding matrix. But we don't use the one-hot approach because it's not efficient due to the sparsity of the one-hot encoded input matrix.

In [26]:
# sample two

vocab_size = 50257
output_dim = 256

token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)

In [27]:
max_len = 4
dataloader = create_dataloader_v1(
    raw_text, batch_size = 8, max_len = max_len,
    stride = max_len, shuffle = False
)
data_iter = iter(dataloader)
inputs, targets = next(data_iter)

In [28]:
print("Token IDs:\n", inputs)
print("\nInputs shape:\n", inputs.shape)

Token IDs:
 tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])

Inputs shape:
 torch.Size([8, 4])


In [29]:
token_embeddings = token_embedding_layer(inputs)
print(token_embeddings.shape)

torch.Size([8, 4, 256])


This is basically a batch of 8 sequences with 4 tokens each, and each token is converted to a vector of dimension 256.

---

## 4. Positional Embedding

Now we create positional embedding the same way as we did for the token embeddings.

In [30]:
context_len = max_len
pos_embedding_layer = torch.nn.Embedding(context_len, output_dim)
pos_embeddings = pos_embedding_layer(torch.arange(max_len)) # arange creates ids from 0 to max_len - 1 and pos_embedding_layer converts them to embedding matrix where each row corresponds to the positional embedding for that position id
print(pos_embeddings.shape)

torch.Size([4, 256])


It must also be noted that the positional embeddings depend only on the position, not on the token or on which sequence of the batch it belongs to. In other words, the same PE values are reused for every sequence in the batch. So the positional embedding matrix is only 4x256 and not 8x4x256. We only care about the position in this case.

Even though the dimensions don't match exactly (8x4x256 vs. 4x256), we can directly add the token and position embeddings thanks to broadcasting.

In [31]:
input_embeddings = token_embeddings + pos_embeddings
print(input_embeddings.shape)

torch.Size([8, 4, 256])


---

## 5. Simplified Attention Mechanism

In [32]:
inputs = torch.tensor(
    [[0.43, 0.15, 0.89], # Your
     [0.55, 0.87, 0.66], # journey
     [0.57, 0.85, 0.64], # starts
     [0.22, 0.58, 0.33], # with
     [0.77, 0.25, 0.10], # one 
     [0.05, 0.80, 0.55]] # step
)

We know that attention scores are calculated by taking the dot product between the query token and all the other input tokens.

In [33]:
query = inputs[1] # let the query token be journey

attention_scores_x_2 = torch.empty(inputs.shape[0])
for i, x_i in enumerate(inputs):
    attention_scores_x_2[i] = torch.dot(x_i, query)

print(attention_scores_x_2)

tensor([0.9544, 1.4950, 1.4754, 0.8434, 0.7070, 1.0865])


In [34]:
# now normalize the scores

attention_weights_x_2 = attention_scores_x_2 / attention_scores_x_2.sum()
print(attention_weights_x_2)
print(attention_weights_x_2.sum())

tensor([0.1455, 0.2278, 0.2249, 0.1285, 0.1077, 0.1656])
tensor(1.0000)


Note that attention __scores__ are not normalized, but attention __weights__ are, and they sum up to one.

In [35]:
# softmax normalization

def softmax_naive(x):
    """Softmax along dim 0, computed directly as exp(x) / sum(exp(x))."""
    exp_x = torch.exp(x)
    return exp_x / exp_x.sum(dim = 0)

attention_weights_x_2_naive_sm = softmax_naive(attention_scores_x_2) # sm: softmax
print(attention_weights_x_2_naive_sm)
print(attention_weights_x_2_naive_sm.sum())

tensor([0.1385, 0.2379, 0.2333, 0.1240, 0.1082, 0.1581])
tensor(1.)


The PyTorch implementation of softmax is preferred because it is numerically more stable.

In [36]:
# pytorch softmax operation

attention_weights_x_2_pt_sm = torch.softmax(attention_scores_x_2, dim = 0) # pt: pytorch
print(attention_weights_x_2_pt_sm)
print(attention_weights_x_2_pt_sm.sum())

tensor([0.1385, 0.2379, 0.2333, 0.1240, 0.1082, 0.1581])
tensor(1.)


#### 5.1 Context vector calculation for 'journey'.

In [37]:
query = inputs[1]

context_vector_x2 = torch.zeros(query.shape)
for i, x_i in enumerate(inputs):
    context_vector_x2 += attention_weights_x_2_pt_sm[i] * x_i

print(context_vector_x2)

tensor([0.4419, 0.6515, 0.5683])


#### 5.2 Calculate attention matrix

In [38]:
attention_scores = inputs @ inputs.T
print(attention_scores)

tensor([[0.9995, 0.9544, 0.9422, 0.4753, 0.4576, 0.6310],
        [0.9544, 1.4950, 1.4754, 0.8434, 0.7070, 1.0865],
        [0.9422, 1.4754, 1.4570, 0.8296, 0.7154, 1.0605],
        [0.4753, 0.8434, 0.8296, 0.4937, 0.3474, 0.6565],
        [0.4576, 0.7070, 0.7154, 0.3474, 0.6654, 0.2935],
        [0.6310, 1.0865, 1.0605, 0.6565, 0.2935, 0.9450]])


This can be done using 2 for loops but that's computationally very expensive. Rather, we can compute all the dot products at once with the above matrix multiplication of the inputs with their transpose.

In [39]:
attention_weights = torch.softmax(attention_scores, dim = -1) 
print(attention_weights)

tensor([[0.2098, 0.2006, 0.1981, 0.1242, 0.1220, 0.1452],
        [0.1385, 0.2379, 0.2333, 0.1240, 0.1082, 0.1581],
        [0.1390, 0.2369, 0.2326, 0.1242, 0.1108, 0.1565],
        [0.1435, 0.2074, 0.2046, 0.1462, 0.1263, 0.1720],
        [0.1526, 0.1958, 0.1975, 0.1367, 0.1879, 0.1295],
        [0.1385, 0.2184, 0.2128, 0.1420, 0.0988, 0.1896]])


Setting the dimension to -1 means softmax normalizes along the last dimension (across the columns), so the values in each row sum up to 1. This is because the matrix dimension is n_row x n_col.

In [40]:
# context vectors calculation (z_i)

context_vectors = attention_weights @ inputs
print(context_vectors)

tensor([[0.4421, 0.5931, 0.5790],
        [0.4419, 0.6515, 0.5683],
        [0.4431, 0.6496, 0.5671],
        [0.4304, 0.6298, 0.5510],
        [0.4671, 0.5910, 0.5266],
        [0.4177, 0.6503, 0.5645]])


---

## 6. Self Attention

In [41]:
inputs = torch.tensor(
    [[0.43, 0.15, 0.89], # Your
     [0.55, 0.87, 0.66], # journey
     [0.57, 0.85, 0.64], # starts
     [0.22, 0.58, 0.33], # with
     [0.77, 0.25, 0.10], # one 
     [0.05, 0.80, 0.55]] # step
)

Now we randomly initialize W_q, W_k & W_v. Each of them will have dimensions where the number of rows is equal to the input vector dimension (column count of the input matrix).

In [42]:
# 'journey' (second row of the inputs) is the sample word again

x_2 = inputs[1]
d_in = inputs.size(1) # input embedding dimension = number of columns of the input matrix
d_out = 2 # number of columns in the query, key and value matrices

In [43]:
torch.manual_seed(123)

# set requires_grad to True later for model training
W_q = torch.nn.Parameter(torch.rand(d_in, d_out), requires_grad = False)
W_k = torch.nn.Parameter(torch.rand(d_in, d_out), requires_grad = False)
W_v = torch.nn.Parameter(torch.rand(d_in, d_out), requires_grad = False)

print(W_q)
print(W_k)
print(W_v)

Parameter containing:
tensor([[0.2961, 0.5166],
        [0.2517, 0.6886],
        [0.0740, 0.8665]])
Parameter containing:
tensor([[0.1366, 0.1025],
        [0.1841, 0.7264],
        [0.3153, 0.6871]])
Parameter containing:
tensor([[0.0756, 0.1966],
        [0.3164, 0.4017],
        [0.1186, 0.8274]])


In [44]:
# now we calculate the query, key and value for the sample input word 'journey'

q_2 = x_2 @ W_q
k_2 = x_2 @ W_k
v_2 = x_2 @ W_v

print(q_2)
print(k_2)
print(v_2)

tensor([0.4306, 1.4551])
tensor([0.4433, 1.1419])
tensor([0.3951, 1.0037])


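Note that conventionally, just like how things were implemented in section 5, the output of these projections has the same dimension as the input vector (d_out = d_in). Here we use d_out = 2 with d_in = 3, which is why the query, key and value vectors above are 2-dimensional.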

In [45]:
# get the overall query, key and value

query = inputs @ W_q
key = inputs @ W_k
value = inputs @ W_v

print(query)
print(key)
print(value)

tensor([[0.2309, 1.0966],
        [0.4306, 1.4551],
        [0.4300, 1.4343],
        [0.2355, 0.7990],
        [0.2983, 0.6565],
        [0.2568, 1.0533]])
tensor([[0.3669, 0.7646],
        [0.4433, 1.1419],
        [0.4361, 1.1156],
        [0.2408, 0.6706],
        [0.1827, 0.3292],
        [0.3275, 0.9642]])
tensor([[0.1855, 0.8812],
        [0.3951, 1.0037],
        [0.3879, 0.9831],
        [0.2393, 0.5493],
        [0.1492, 0.3346],
        [0.3221, 0.7863]])


Now we compute the attention scores. In self attention, this is essentially the dot product between the query and the key vectors.

In [46]:
# for 'journey':

query_2 = query[1]
key_2 = key[1]

attention_scores_2 = query_2 @ key.T
print(attention_scores_2)

tensor([1.2705, 1.8524, 1.8111, 1.0795, 0.5577, 1.5440])


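This is basically saying how much the word __journey__ attends to each word of the sentence (including itself). Here the score happens to be highest for the second word (itself), but with the projection matrices W_q and W_k this is not guaranteed in general.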

In [47]:
# overall attention

attention_scores = query @ key.T
print(attention_scores)

tensor([[0.9231, 1.3545, 1.3241, 0.7910, 0.4032, 1.1330],
        [1.2705, 1.8524, 1.8111, 1.0795, 0.5577, 1.5440],
        [1.2544, 1.8284, 1.7877, 1.0654, 0.5508, 1.5238],
        [0.6973, 1.0167, 0.9941, 0.5925, 0.3061, 0.8475],
        [0.6114, 0.8819, 0.8626, 0.5121, 0.2707, 0.7307],
        [0.8995, 1.3165, 1.2871, 0.7682, 0.3937, 1.0996]])


For now, these don't mean anything because they are not trained. Next, we normalize these scores. We normalize by first dividing the scores by the square root of d_out, i.e. the embedding dimension of each word in the key matrix (its number of columns). Next, we apply softmax over it.

In [48]:
# normalize to get attention weights (this is just for 'journey')

d_k = key.shape[1]
attention_weights_2 = torch.softmax(attention_scores_2 / d_k ** 0.5, dim = -1)
print(attention_weights_2)

tensor([0.1500, 0.2264, 0.2199, 0.1311, 0.0906, 0.1820])


Why take the square root? Each attention score is a dot product of a query and a key, i.e. a sum of d_k products, so its variance grows with d_k (roughly proportionally, if the components are independent with unit variance). Dividing by the square root of d_k brings the variance back to about 1. Another reason is to bring stability to the softmax outputs and to keep the distribution more even. If not, the scores can get overly confident for a single input word. (Refer lecture 15, 46th minute for more detail).<br><br>
This is why self attention is also called __scaled dot product attention__.

In [49]:
# get attention weights for the entire sentence

scaled_scores = attention_scores / d_k ** 0.5 # divide by sqrt(d_k) before the softmax
attention_weights = torch.softmax(scaled_scores, dim = -1)
print(attention_weights)

tensor([[0.1551, 0.2104, 0.2059, 0.1413, 0.1074, 0.1799],
        [0.1500, 0.2264, 0.2199, 0.1311, 0.0906, 0.1820],
        [0.1503, 0.2256, 0.2192, 0.1315, 0.0914, 0.1819],
        [0.1591, 0.1994, 0.1962, 0.1477, 0.1206, 0.1769],
        [0.1610, 0.1949, 0.1923, 0.1501, 0.1265, 0.1752],
        [0.1557, 0.2092, 0.2048, 0.1419, 0.1089, 0.1794]])


These attentions weights are now multiplied with the _value_ matrix to get the __context vectors__.

In [50]:
context = attention_weights @ value
print(context)

tensor([[0.2996, 0.8053],
        [0.3061, 0.8210],
        [0.3058, 0.8203],
        [0.2948, 0.7939],
        [0.2927, 0.7891],
        [0.2990, 0.8040]])


Now we make a self attention class for further usage.

In [57]:
import torch.nn as nn

class SelfAttention_v1(nn.Module):
    """Single-head scaled dot-product self-attention with explicit weight matrices.

    Args:
        d_in: size of each input vector (last dimension of ``x``).
        d_out: size of the query/key/value vectors and of each context vector.
    """
    def __init__(self, d_in, d_out):
        super().__init__()
        # Creation order (W_q, W_k, W_v) fixes which random numbers each matrix gets for a given seed.
        self.W_q = nn.Parameter(torch.rand(d_in, d_out))
        self.W_k = nn.Parameter(torch.rand(d_in, d_out))
        self.W_v = nn.Parameter(torch.rand(d_in, d_out))

    def forward(self, x):
        """Compute the context vectors for ``x``.

        Args:
            x: tensor of shape (num_tokens, d_in) or, batched, (..., num_tokens, d_in).

        Returns:
            Tensor of shape (num_tokens, d_out) or (..., num_tokens, d_out).
        """
        keys = x @ self.W_k
        queries = x @ self.W_q
        values = x @ self.W_v

        # transpose(-2, -1) swaps only the last two dimensions: identical to `.T` for a
        # single 2-D sequence, but also correct for batched input, where `.T` is not valid
        attention_scores = queries @ keys.transpose(-2, -1)
        # scale by sqrt(d_k) before normalizing each row with softmax
        attention_weights = torch.softmax(attention_scores / keys.shape[-1] ** 0.5, dim = -1)

        return attention_weights @ values

In [59]:
# test class

torch.manual_seed(123)
sa_v1 = SelfAttention_v1(d_in, d_out)
print(sa_v1(inputs))

tensor([[0.2996, 0.8053],
        [0.3061, 0.8210],
        [0.3058, 0.8203],
        [0.2948, 0.7939],
        [0.2927, 0.7891],
        [0.2990, 0.8040]], grad_fn=<MmBackward0>)


In [None]:
# version 2, which is more optimized due to the Linear class from PyTorch

class SelfAttention_v2(nn.Module):
    """Single-head scaled dot-product self-attention built on ``nn.Linear`` projections.

    Args:
        d_in: size of each input vector (last dimension of ``x``).
        d_out: size of the query/key/value vectors and of each context vector.
        qkv_bias: whether the three projection layers use a bias term.
    """
    def __init__(self, d_in, d_out, qkv_bias = False):
        super().__init__()
        self.W_q = nn.Linear(d_in, d_out, bias = qkv_bias)
        self.W_k = nn.Linear(d_in, d_out, bias = qkv_bias)
        self.W_v = nn.Linear(d_in, d_out, bias = qkv_bias)

    # change here compared to v1: the projections are layer calls instead of explicit matmuls
    def forward(self, x):
        """Compute the context vectors for ``x``.

        Args:
            x: tensor of shape (num_tokens, d_in) or, batched, (..., num_tokens, d_in).

        Returns:
            Tensor of shape (num_tokens, d_out) or (..., num_tokens, d_out).
        """
        keys = self.W_k(x)
        queries = self.W_q(x)
        values = self.W_v(x)

        # transpose(-2, -1) swaps only the last two dimensions: identical to `.T` for a
        # single 2-D sequence, but also correct for batched input, where `.T` is not valid
        attention_scores = queries @ keys.transpose(-2, -1)
        # scale by sqrt(d_k) before normalizing each row with softmax
        attention_weights = torch.softmax(attention_scores / keys.shape[-1] ** 0.5, dim = -1)

        return attention_weights @ values

In [65]:
# test class (v2)

torch.manual_seed(789)
sa_v2 = SelfAttention_v2(d_in, d_out)
sa_v1 = sa_v2 # this cell used to bind the v2 instance to the name `sa_v1`; alias kept in case other cells still use that name
print(sa_v2(inputs))

tensor([[-0.0739,  0.0713],
        [-0.0748,  0.0703],
        [-0.0749,  0.0702],
        [-0.0760,  0.0685],
        [-0.0763,  0.0679],
        [-0.0754,  0.0693]], grad_fn=<MmBackward0>)


---