### GPT Implementation 
This implementation references the terminology in https://nlp.seas.harvard.edu/annotated-transformer/ and minGPT/nanoGPT. 

In [2]:
import torch
import torch.nn as nn 
import math
from torch.nn.functional import log_softmax, pad

In [3]:
# Implementation in tutorial
class EncoderDecoder(nn.Module):
    """
    Basic encoder/decoder architecture.

    Holds five caller-supplied components and wires them together:
    the source is embedded and encoded, then the target is embedded and
    decoded against the encoder output. ``generator`` is stored on the
    instance but is not invoked by ``forward``; callers apply it to the
    decoder output themselves.
    """

    def __init__(self, encoder, decoder, src_embed, tgt_embed, generator):
        # Zero-argument super() is required: the one-argument form
        # ``super(EncoderDecoder)`` creates an unbound proxy, so
        # nn.Module.__init__ never runs and assigning a sub-module below
        # raises "cannot assign module before Module.__init__() call".
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.src_embed = src_embed
        self.tgt_embed = tgt_embed
        self.generator = generator

    def encode(self, src, src_mask):
        """Embed ``src`` and run the encoder on it with ``src_mask``."""
        return self.encoder(self.src_embed(src), src_mask)

    def decode(self, memory, src_mask, tgt, tgt_mask):
        """Embed ``tgt`` and run the decoder against ``memory``.

        The decoder is called as
        ``decoder(embedded_tgt, memory, src_mask, tgt_mask)``.
        """
        return self.decoder(self.tgt_embed(tgt), memory, src_mask, tgt_mask)

    def forward(self, src, tgt, src_mask, tgt_mask):
        """Encode ``src`` and decode ``tgt`` against it; return the decoder output."""
        memory = self.encode(src, src_mask)
        return self.decode(memory=memory, src_mask=src_mask,
                           tgt=tgt, tgt_mask=tgt_mask)
        
class Generator(nn.Module):
    """Define standard linear + softmax generation step.

    Projects the last dimension from ``d_model`` to ``vocab_size`` with a
    linear layer and returns log-probabilities over that dimension.
    """

    def __init__(self, d_model: int, vocab_size: int):
        # Zero-argument super() so nn.Module.__init__ actually runs;
        # ``super(Generator).__init__()`` only initialised an unbound
        # super proxy, making the nn.Linear assignment below fail.
        super().__init__()
        self.proj = nn.Linear(d_model, vocab_size)

    def forward(self, x):
        """Return ``log_softmax`` of the projected ``x`` along the last dim."""
        return log_softmax(self.proj(x), dim=-1)
    
# My Implementation
class MyEncoderDecoder(nn.Module):
    """Encoder/decoder wrapper built from caller-supplied blocks.

    Stores the encoder, decoder, input/output embedding blocks and the
    generator block. ``forward`` embeds and encodes the input, then embeds
    the output and decodes it against the encoded input. The generator is
    stored but not applied by ``forward``.
    """

    def __init__(self, encoder_block, decoder_block, input_embedding_block,
                 output_embedding_block, generator_block):
        # Zero-argument super(): the previous ``super(EncoderDecoder)``
        # named a different class and was unbound, so nn.Module.__init__
        # never ran and sub-module assignment raised AttributeError.
        super().__init__()
        self.encoder = encoder_block
        self.decoder = decoder_block
        self.input_embedder = input_embedding_block
        self.output_embedder = output_embedding_block
        self.generator = generator_block

    def forward(self, input, input_mask, output, output_mask):
        """Encode ``input`` and decode ``output`` against it."""
        encoded_input = self.encode(input, input_mask)
        # The input mask is handed to decode so the decoder receives it.
        return self.decode(encoded_input, output, output_mask,
                           input_mask=input_mask)

    def encode(self, input, input_mask):
        """Embed ``input`` and run the encoder on it with ``input_mask``."""
        input_embedding = self.input_embedder(input)
        return self.encoder(input_embedding, input_mask)

    def decode(self, encoded_input, output, output_mask, input_mask=None):
        """Embed ``output`` and run the decoder against ``encoded_input``.

        The decoder is called as
        ``decoder(output_embedding, encoded_input, input_mask, output_mask)``.
        Previously the raw ``output`` tokens were passed in the third
        position and the input mask was dropped.

        ``input_mask`` defaults to ``None`` so existing three-argument
        calls keep working.
        NOTE(review): assumes the decoder block accepts ``None`` as
        "no input mask" - confirm against the decoder implementation.
        """
        output_embedding = self.output_embedder(output)
        return self.decoder(output_embedding, encoded_input,
                            input_mask, output_mask)
    
class MyGenerator(nn.Module):
    """Linear projection followed by log-softmax.

    Maps the last dimension from ``d_model`` to ``vocab_size`` and returns
    log-probabilities over that dimension.
    """

    def __init__(self, d_model: int, vocab_size: int):
        # Zero-argument super(): ``super(Generator)`` referenced a different
        # class and was unbound, so nn.Module.__init__ never ran and the
        # nn.Linear assignment below raised AttributeError.
        super().__init__()
        self.proj = nn.Linear(d_model, vocab_size)

    def forward(self, x):
        """Return ``log_softmax`` of the projected ``x`` along the last dim."""
        return log_softmax(self.proj(x), dim=-1)

In [None]:
# define the repeatable layers
import copy
def clones(module: nn.Module, N: int):
    """Return an ``nn.ModuleList`` holding ``N`` deep copies of ``module``."""
    replicas = nn.ModuleList()
    for _ in range(N):
        # deepcopy so each entry is an independent copy of the module
        replicas.append(copy.deepcopy(module))
    return replicas

# define the encoder


# define the LayerNorm