In [54]:
import numpy as np
import torch
import torch.nn as nn
# Per-head width of the query and key projections.
d_k = 64
# Per-head width of the value projections.
d_v = 64
class ScaledDotProductAttention(nn.Module):
    """Scaled dot-product attention: ``softmax(Q K^T / sqrt(d)) V``.

    The scaling factor ``d`` is read from the last dimension of ``Q`` so the
    module stays correct for any head width.
    """

    def __init__(self):
        super(ScaledDotProductAttention, self).__init__()

    def forward(self, Q, K, V, attn_mask):
        """Compute attention-weighted values.

        Args:
            Q: queries, ``(..., len_q, d)``.
            K: keys, ``(..., len_k, d)``.
            V: values, ``(..., len_k, d_v)``.
            attn_mask: tensor broadcastable to ``(..., len_q, len_k)`` whose
                non-zero / ``True`` entries mark positions that must NOT be
                attended to, or ``None`` to attend everywhere.

        Returns:
            Tuple ``(context, weights)`` with shapes ``(..., len_q, d_v)`` and
            ``(..., len_q, len_k)``.
        """
        scores = torch.matmul(Q, K.transpose(-1, -2)) / (Q.size(-1) ** 0.5)
        if attn_mask is not None:
            # masked_fill requires a boolean mask; uint8 masks are deprecated
            # and rejected by newer PyTorch releases, so normalise here.
            blocked = attn_mask.to(device=scores.device, dtype=torch.bool)
            # A large negative score becomes ~0 probability after softmax.
            scores = scores.masked_fill(blocked, -1e9)
        weights = torch.softmax(scores, dim=-1)
        context = torch.matmul(weights, V)
        return context, weights

In [55]:
# Width of the embeddings and of every sub-layer's input/output features.
d_embedding = 512
# Number of parallel attention heads.
n_heads = 8
# Mini-batch size used when building the DataLoaders.
batch_size = 3
class MultiHeadAttention(nn.Module):
    """Multi-head attention followed by an output projection, a residual
    connection to the query input and layer normalisation.

    Projection sizes come from the module-level ``d_embedding``, ``d_k``,
    ``d_v`` and ``n_heads`` settings.
    """

    def __init__(self):
        super(MultiHeadAttention, self).__init__()
        self.W_Q = nn.Linear(d_embedding, d_k * n_heads)
        self.W_K = nn.Linear(d_embedding, d_k * n_heads)
        self.W_V = nn.Linear(d_embedding, d_v * n_heads)
        self.linear = nn.Linear(n_heads * d_v, d_embedding)
        self.layer_norm = nn.LayerNorm(d_embedding)

    def forward(self, Q, K, V, attn_mask):
        """Attend from ``Q`` over ``K``/``V``.

        Args:
            Q: ``(batch, len_q, d_embedding)`` query input; also the residual.
            K: ``(batch, len_k, d_embedding)`` key input.
            V: ``(batch, len_k, d_embedding)`` value input.
            attn_mask: ``(batch, len_q, len_k)`` mask whose non-zero / ``True``
                entries are blocked, or ``None`` for unrestricted attention.

        Returns:
            Tuple ``(output, weights)`` shaped ``(batch, len_q, d_embedding)``
            and ``(batch, n_heads, len_q, len_k)``.
        """
        residual, batch_size = Q, Q.size(0)
        # Project, then split the feature axis into heads: (batch, heads, len, d).
        q_s = self.W_Q(Q).view(batch_size, -1, n_heads, d_k).transpose(1, 2)
        k_s = self.W_K(K).view(batch_size, -1, n_heads, d_k).transpose(1, 2)
        v_s = self.W_V(V).view(batch_size, -1, n_heads, d_v).transpose(1, 2)

        if attn_mask is None:
            # No mask supplied: block nothing.
            attn_mask = torch.zeros(
                batch_size, q_s.size(2), k_s.size(2), dtype=torch.bool, device=Q.device
            )
        else:
            # masked_fill expects a boolean mask on the same device as the scores.
            attn_mask = attn_mask.to(device=Q.device, dtype=torch.bool)
        # Share one mask across heads as a broadcast view instead of copying it.
        attn_mask = attn_mask.unsqueeze(1).expand(-1, n_heads, -1, -1)

        context, weights = ScaledDotProductAttention()(q_s, k_s, v_s, attn_mask)
        # Merge heads back: (batch, len_q, n_heads * d_v).
        context = context.transpose(1, 2).contiguous().view(batch_size, -1, n_heads * d_v)
        output = self.linear(context)
        output = self.layer_norm(output + residual)
        return output, weights

In [56]:
class PoswiseFeedForwardNet(nn.Module):
    """Position-wise feed-forward block with residual connection and LayerNorm.

    Two kernel-size-1 convolutions expand each position from ``d_embedding``
    to 2048 channels and back again.
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv1d(d_embedding, 2048, kernel_size=1)
        self.conv2 = nn.Conv1d(2048, d_embedding, kernel_size=1)
        self.layer_norm = nn.LayerNorm(d_embedding)

    def forward(self, inputs):
        """Transform ``inputs`` of shape ``(batch, seq_len, d_embedding)``.

        Returns a tensor of the same shape.
        """
        # Conv1d works on (batch, channels, length), so swap the last two axes.
        channels_first = inputs.transpose(1, 2)
        hidden = torch.relu(self.conv1(channels_first))
        projected = self.conv2(hidden).transpose(1, 2)
        return self.layer_norm(projected + inputs)

In [57]:
import numpy as np
def get_sin_enc_table(n_position, embedding_dim):
    """Build a fixed sinusoidal position-encoding table.

    Args:
        n_position: number of positions (rows).
        embedding_dim: encoding width (columns).

    Returns:
        ``torch.FloatTensor`` of shape ``(n_position, embedding_dim)`` with
        sine values in the even columns and cosine values in the odd columns.
    """
    # Columns 2k and 2k+1 share the divisor 10000 ** (2k / embedding_dim).
    divisors = [
        np.power(10000, 2 * (col // 2) / embedding_dim)
        for col in range(embedding_dim)
    ]
    table = np.zeros((n_position, embedding_dim))
    for pos in range(n_position):
        for col, divisor in enumerate(divisors):
            table[pos, col] = pos / divisor
    table[:, 0::2] = np.sin(table[:, 0::2])
    table[:, 1::2] = np.cos(table[:, 1::2])
    return torch.FloatTensor(table)

In [58]:
def get_attn_pad_mask(seq_q, seq_k):
    """Build a mask that flags key positions whose token id is 0.

    Args:
        seq_q: ``(batch, len_q)`` tensor of query token ids (only its length
            is used).
        seq_k: ``(batch, len_k)`` tensor of key token ids.

    Returns:
        Boolean tensor of shape ``(batch, len_q, len_k)``; ``True`` wherever
        the key token id equals 0.
    """
    _, len_q = seq_q.size()
    batch_size, len_k = seq_k.size()
    key_is_pad = (seq_k.data == 0)
    # Insert a query axis and broadcast the same key mask to every query row.
    return key_is_pad.unsqueeze(1).expand(batch_size, len_q, len_k)

In [59]:
def get_attn_subsequent_mask(seq):
    """Build a causal ("no peeking ahead") attention mask.

    Args:
        seq: tensor whose first two dimensions are ``(batch, seq_len)``;
            any further dimensions are ignored.

    Returns:
        Boolean tensor of shape ``(batch, seq_len, seq_len)`` on ``seq``'s
        device. Entry ``[b, i, j]`` is ``True`` when ``j > i``, i.e. when
        position ``j`` lies in the future of position ``i`` and must be masked.
    """
    batch_size, seq_len = seq.size(0), seq.size(1)
    positions = torch.arange(seq_len, device=seq.device)
    # Row index i on axis 0, column index j on axis 1: True strictly above the diagonal.
    future = positions.unsqueeze(0) > positions.unsqueeze(1)
    # A boolean mask is what masked_fill expects; materialise one copy per sample.
    return future.unsqueeze(0).repeat(batch_size, 1, 1)

In [60]:
class DecoderLayer(nn.Module):
    """One decoder block: masked self-attention, then a feed-forward network,
    each followed by an additional residual add and LayerNorm.

    NOTE(review): in this notebook ``MultiHeadAttention`` and
    ``PoswiseFeedForwardNet`` already add their input and layer-normalise, so
    the residual + norm is effectively applied twice per sub-layer. Confirm
    this is intended before changing it, since ``norm1``/``norm2`` are part of
    any saved state dict.
    """

    def __init__(self):
        super().__init__()
        self.self_attn = MultiHeadAttention()
        self.feed_forward = PoswiseFeedForwardNet()
        self.norm1 = nn.LayerNorm(d_embedding)
        self.norm2 = nn.LayerNorm(d_embedding)

    def forward(self, dec_inputs, attn_mask=None):
        """Run the block on ``dec_inputs`` using ``attn_mask`` for self-attention.

        Returns a tensor with the same shape as ``dec_inputs``.
        """
        # Self-attention: queries, keys and values all come from the same input.
        attended = self.self_attn(dec_inputs, dec_inputs, dec_inputs, attn_mask)[0]
        after_attention = self.norm1(dec_inputs + attended)
        return self.norm2(after_attention + self.feed_forward(after_attention))

In [61]:
# Number of DecoderLayer blocks stacked inside the Decoder.
n_layers = 6
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
class Decoder(nn.Module):
    """Stack of ``n_layers`` decoder blocks on top of token + learned position
    embeddings.

    Args:
        vocab_size: number of rows in the token embedding table.
        max_seq_len: number of rows in the position embedding table; input
            sequences must not be longer than this.
    """

    def __init__(self, vocab_size, max_seq_len):
        super(Decoder, self).__init__()
        self.src_emb = nn.Embedding(vocab_size, d_embedding)
        self.pos_emb = nn.Embedding(max_seq_len, d_embedding)
        self.layers = nn.ModuleList([DecoderLayer() for _ in range(n_layers)])

    def forward(self, dec_inputs):
        """Encode a batch of token ids.

        Args:
            dec_inputs: ``(batch, seq_len)`` tensor of token ids.

        Returns:
            ``(batch, seq_len, d_embedding)`` tensor of hidden states.
        """
        # Positions run along the sequence axis (dim 1), not the batch axis;
        # shape (1, seq_len) broadcasts the same positions over every sample.
        seq_len = dec_inputs.size(1)
        positions = torch.arange(seq_len, device=dec_inputs.device).unsqueeze(0)
        dec_outputs = self.src_emb(dec_inputs) + self.pos_emb(positions)

        # Keep the causal mask on the same device as the inputs rather than on
        # a module-level default, so the model works wherever it was moved.
        attn_mask = get_attn_subsequent_mask(dec_inputs).to(dec_inputs.device)

        # Feed each layer the previous layer's output so the stack is actually deep.
        for layer in self.layers:
            dec_outputs = layer(dec_outputs, attn_mask)
        return dec_outputs

In [62]:
class GPT(nn.Module):
    """Decoder-only language model: a ``Decoder`` plus a linear head that maps
    hidden states to vocabulary logits.

    Args:
        vocab_size: size of the vocabulary (also the number of output logits).
        max_seq_len: forwarded to the ``Decoder``.
    """

    def __init__(self, vocab_size, max_seq_len):
        super().__init__()
        self.decoder = Decoder(vocab_size, max_seq_len)
        self.projection = nn.Linear(d_embedding, vocab_size)

    def forward(self, dec_inputs):
        """Return the projection of the decoder output for ``dec_inputs``."""
        return self.projection(self.decoder(dec_inputs))

##  Building Dataset and DataLoader with WikiText2
pip install torchtext==0.14.1

In [None]:
#step 1 : Download the corpus and build the vocabulary
from torchtext.datasets import WikiText2
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader, Dataset

# Tokenizer used both to build the vocabulary and to encode the datasets.
tokenizer = get_tokenizer("basic_english")

# Raw-text iterators over the training and validation splits.
train_iter, valid_iter = WikiText2(split="train"), WikiText2(split="valid")

def yield_tokens(data_iter):
    """Lazily yield the tokenizer output for every item of ``data_iter``."""
    yield from (tokenizer(item) for item in data_iter)

# Build the vocabulary from the training split, registering the three special tokens.
vocab = build_vocab_from_iterator(
    yield_tokens(train_iter),
    specials=["<pad>", "<sos>", "<eos>"],
)
# Words missing from the vocabulary resolve to the <pad> index.
vocab.set_default_index(vocab["<pad>"])

print("Vocabulary size:", len(vocab))
print(
    "Vocabulary example (word to index):",
    dict((word, vocab[word]) for word in ("<pad>", "<sos>", "<eos>", "the", "apple")),
)

In [None]:
#step 2 : Construct the pytorch dataset
from torch.utils.data import Dataset
# Upper bound on the number of token ids stored per sequence (including
# <sos>/<eos>); also passed to the model as the position-table size.
max_seq_len = 256

class WikiDataset(Dataset):
    """Next-token-prediction dataset over an iterable of raw text items.

    Every item is tokenized with the module-level ``tokenizer``, truncated to
    ``max_len - 2`` tokens, mapped to ids with ``vocab`` and wrapped in
    ``<sos>`` / ``<eos>`` ids.

    Args:
        data_iter: iterable of raw text strings.
        vocab: callable vocabulary mapping a list of tokens to a list of ids
            and supporting ``vocab[token]`` lookup.
        max_len: maximum stored length, special tokens included.
    """

    def __init__(self, data_iter, vocab, max_len=max_seq_len):
        sos_id, eos_id = vocab["<sos>"], vocab["<eos>"]
        body_limit = max_len - 2  # leave room for <sos> and <eos>
        self.data = [
            [sos_id, *vocab(tokenizer(sentence)[:body_limit]), eos_id]
            for sentence in data_iter
        ]

    def __len__(self):
        """Number of stored sequences."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(source, target)`` tensors; target is source shifted left by one."""
        token_ids = self.data[idx]
        return torch.tensor(token_ids[:-1]), torch.tensor(token_ids[1:])

# Encode both splits with the shared vocabulary.
train_dataset, valid_dataset = WikiDataset(train_iter, vocab), WikiDataset(valid_iter, vocab)
print(f"Dataset entries: {len(train_dataset)}")

# Inspect one training pair, first as id tensors and then decoded back to text.
sample_source, sample_target = train_dataset[100]
print(
    f"Input sequence tensor example: {sample_source}",
    f"Target sequence tensor example: {sample_target}",
    sep="\n",
)
decoded_source, decoded_target = (
    ' '.join(vocab.lookup_tokens(ids.tolist()))
    for ids in (sample_source, sample_target)
)
print(
    f"Input sequence example text: {decoded_source}",
    f"Target sequence example text: {decoded_target}",
    sep="\n",
)

In [65]:
#step 3 : Build the dataloader class.
from torch.utils.data import DataLoader

def pad_sequence(sequences, padding_value=0, length=None):
    """Stack variable-length id sequences into one right-padded tensor.

    Args:
        sequences: iterable of 1-D tensors (or lists) of token ids.
        padding_value: id written into the unused tail of each row.
        length: target row width. When ``None`` the longest sequence decides.
            Sequences longer than ``length`` are truncated to fit.

    Returns:
        ``torch.long`` tensor of shape ``(len(sequences), width)``. An empty
        batch yields a tensor with zero rows.
    """
    sequences = list(sequences)  # allow generators and repeated traversal
    if length is None:
        # default=0 keeps an empty batch from raising in max().
        max_length = max((len(seq) for seq in sequences), default=0)
    else:
        max_length = length
    result = torch.full((len(sequences), max_length), padding_value, dtype=torch.long)

    for i, seq in enumerate(sequences):
        # Never write past the row: clip sequences longer than the target width.
        end = min(len(seq), max_length)
        if end:
            result[i, :end] = torch.as_tensor(seq[:end], dtype=torch.long)
    return result

def collate_fn(batch):
    """Collate ``(source, target)`` pairs into two equally wide padded tensors.

    Both tensors are padded with the ``<pad>`` id up to the longest sequence
    found among all sources and targets of the batch.
    """
    sources, targets = zip(*batch)
    longest = max(len(seq) for seq in (*sources, *targets))
    pad_id = vocab["<pad>"]
    return (
        pad_sequence(sources, padding_value=pad_id, length=longest),
        pad_sequence(targets, padding_value=pad_id, length=longest),
    )

# Training batches are reshuffled on every pass over the data.
train_dataloader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    collate_fn=collate_fn,
    shuffle=True,
)

# Validation batches keep the dataset order.
valid_dataloader = DataLoader(
    valid_dataset,
    batch_size=batch_size,
    collate_fn=collate_fn,
    shuffle=False,
)

In [66]:
import torch.optim as optim
# Training setup: pick the device, build the model and move it there.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = GPT(len(vocab), max_seq_len).to(device)
# Target positions holding the <pad> id are excluded from the loss.
criterion = nn.CrossEntropyLoss(ignore_index=vocab["<pad>"])
optimizer = optim.Adam(model.parameters(), lr=0.0001)
epochs = 2

# NOTE(review): only train_dataloader is consumed here; no validation pass is
# run in this cell.
for epoch in range(epochs):
    epoch_loss = 0
    for batch_idx, (source, target) in enumerate(train_dataloader):
        inputs, targets = source.to(device), target.to(device)
        # Clear gradients accumulated by the previous step.
        optimizer.zero_grad()
        outputs = model(inputs)
        # Flatten so each row of logits (width len(vocab)) lines up with one target id.
        loss = criterion(outputs.view(-1, len(vocab)), targets.view(-1))
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()        
        # Progress report every 500 batches.
        if (batch_idx + 1) % 500 == 0:
            print(f"Batch {batch_idx + 1}/{len(train_dataloader)}, Loss: {loss.item()}")   
    # Mean of the per-batch losses for this epoch.
    epoch_loss /= len(train_dataloader)
    print(f"Epoch {epoch + 1}/{epochs}, Average Loss: {epoch_loss}")


#Evaluating the training process using the evaluation dataset


In [67]:
# import time
# from datetime import datetime

# # Save the trained model
# timestamp = datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d_%H-%M-%S')
# model_file_name = f"trained_model_{timestamp}.pt"
# torch.save(model.state_dict(), model_file_name)
# print(f"Model saved as {model_file_name}")

Greedy search and beam search:


In [None]:
def generate_text_beam_search(model, input_str, max_len=50, beam_width=5):
    """Continue ``input_str`` with beam search and return the best hypothesis.

    Relies on the module-level ``vocab`` and ``tokenizer``.

    Args:
        model: language model mapping ``(batch, seq_len)`` token ids to
            ``(batch, seq_len, vocab)`` logits.
        input_str: prompt text; may be empty.
        max_len: maximum number of tokens to generate.
        beam_width: number of hypotheses kept alive after each step.

    Returns:
        The highest-scoring sequence (prompt included) as a space-joined
        string with ``<pad>``, ``<sos>`` and ``<eos>`` removed.
    """
    model.eval()
    # Run on whatever device the model lives on.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device("cpu")

    pad_id, sos_id, eos_id = vocab["<pad>"], vocab["<sos>"], vocab["<eos>"]

    # Encode the prompt exactly like the training data: same tokenizer, and a
    # leading <sos>. This also guarantees a non-empty input for an empty prompt.
    prompt_ids = vocab(tokenizer(input_str))
    if not prompt_ids or prompt_ids[0] != sos_id:
        prompt_ids = [sos_id] + prompt_ids

    beams = [(prompt_ids, 0.0)]  # (token ids, cumulative log-probability)
    finished = []                # hypotheses that already produced <eos>

    with torch.no_grad():
        for _ in range(max_len):
            expansions = []
            for token_ids, log_prob in beams:
                inputs = torch.tensor([token_ids], dtype=torch.long, device=run_device)
                # Log-probabilities are additive across steps; raw logits are not.
                step_log_probs = torch.log_softmax(model(inputs)[0, -1, :], dim=-1)
                k = max(0, min(beam_width, step_log_probs.size(-1)))
                top_log_probs, top_ids = torch.topk(step_log_probs, k)
                for step_lp, token_id in zip(top_log_probs.tolist(), top_ids.tolist()):
                    hypothesis = (token_ids + [token_id], log_prob + step_lp)
                    if token_id == eos_id:
                        finished.append(hypothesis)
                    else:
                        expansions.append(hypothesis)

            beams = sorted(expansions, key=lambda hyp: hyp[1], reverse=True)[:beam_width]
            if not beams:
                # Every live hypothesis ended with <eos>; nothing left to extend.
                break

    # Choose among completed and still-open hypotheses; fall back to the prompt
    # if nothing could be generated (e.g. beam_width <= 0).
    pool = finished + beams or [(prompt_ids, 0.0)]
    best_ids, _ = max(pool, key=lambda hyp: hyp[1])

    special_ids = {pad_id, sos_id, eos_id}
    kept_ids = [token_id for token_id in best_ids if token_id not in special_ids]
    return " ".join(vocab.lookup_tokens(kept_ids))

# NOTE: 'trained_model_2024-...pt' is a placeholder; substitute the file name of
# a saved checkpoint. map_location lets a checkpoint saved on a GPU be loaded
# on whichever device is available now.
model.load_state_dict(torch.load('trained_model_2024-...pt', map_location=device))
# Prompt text to continue.
input_str = ""
generated_text = generate_text_beam_search(model, input_str)
print("Generated text:", generated_text)

HW:
1. Use greedy search to complete the task.
2. Use the HuggingFace Transformers library to download new models for inference and compare their performance.
3. Within LangChain, use both the HuggingFaceEndpoint and HuggingFacePipeline interfaces to call currently popular LLMs (e.g. meta-llama/Meta-Llama-3-8B, google/flan-t5-base).