In [2]:
import torch
import torch.nn as nn
import math
from einops import reduce
import numpy as np
# 示例 2: 从向量创建对角矩阵 (返回2D tensor)
a = torch.rand(5, 5)
torch.tril(a)

tensor([[0.0457, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5393, 0.7695, 0.0000, 0.0000, 0.0000],
        [0.8812, 0.0073, 0.8768, 0.0000, 0.0000],
        [0.3503, 0.3577, 0.0549, 0.8055, 0.0000],
        [0.3590, 0.1970, 0.9858, 0.1102, 0.1255]])

In [46]:
import torch
import torch.nn as nn
import math
from einops import rearrange
from einops import einsum
import sys
sys.path.append("/Users/bitkira/Documents/GitHub/Stanford-CS336/assignment1-basics-main/")
from cs336_basics.RMSnorm import RMSnorm
from cs336_basics.SwiGLU import SwiGLU
from cs336_basics.MHA import MultiheadSelfAttention
from cs336_basics.Linear import linear
from cs336_basics.Softmax import softmax
from cs336_basics.RoPE import rope

class rope(nn.Module):
    def __init__(self, theta: float, d_k: int, max_seq_len: int, device=None):
        super().__init__()
        THETA = torch.tensor([math.pow(theta, (2*k)/d_k) for k in range(int(d_k/2))])

        R_list = []
        for j in range(max_seq_len):
            R = np.zeros((d_k, d_k), dtype=np.float32)
            R[0::2, 0::2] = np.diag(torch.cos(j/THETA))  
            R[0::2, 1::2] = np.diag(-torch.sin(j/THETA))   
            R[1::2, 0::2] = np.diag(torch.sin(j/THETA)) 
            R[1::2, 1::2] = np.diag(torch.cos(j/THETA))
            R_list.append(R)
        self.register_buffer("RoPE" ,torch.tensor(R_list, dtype=torch.float32),persistent=False)
    def forward(self, x: torch.Tensor, token_positions: torch.Tensor) -> torch.Tensor:
        self.RoPE = self.RoPE[0:x.shape[-2], 0:x.shape[-2]]
        print("self.RoPE",x.shape,self.RoPE[token_positions].shape)
        return einsum(x, self.RoPE[token_positions], "... sequence_length d_k, ... sequence_length dk d_k-> ... sequence_length dk")

def scaled_dot_product_attention(q, k, v, mask=None):
    k = rearrange(k, "... seq_len d_k -> ... d_k seq_len")
    out = torch.matmul(q, k) / torch.sqrt(torch.tensor(q.shape[-1], dtype=q.dtype))
    if mask is not None:
        out.masked_fill_(~mask,float('-inf'))
    return torch.matmul(softmax(out, i=-1), v)

class MultiheadSelfAttention(nn.Module):
    def __init__(self, d_model, num_heads, max_seq_len=None, theta=None, token_positions=None):
        super().__init__()
        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"

        self.num_heads = num_heads
        self.d_model = d_model
        self.Q = linear(d_model, d_model)
        self.K = linear(d_model, d_model)
        self.V = linear(d_model, d_model)
        self.O = linear(d_model, d_model)
        if theta is not None:
            self.rope = rope(theta, d_model//num_heads, max_seq_len)
            self.token_pos = token_positions
        else:
            self.rope = None
    def forward(self, x):
        mask = torch.tril(torch.ones(x.shape[1], x.shape[1], dtype=int)).bool()
        Q = rearrange(self.Q(x), "batch_size seq_len (h dk) -> batch_size h seq_len dk", h=self.num_heads, dk=self.d_model//self.num_heads)
        K = rearrange(self.K(x), "batch_size seq_len (h dk) -> batch_size h seq_len dk", h=self.num_heads, dk=self.d_model//self.num_heads)
        if self.rope is not None:
            Q = self.rope(Q, self.token_pos)
            K = self.rope(K, self.token_pos)
        V = rearrange(self.V(x), "batch_size seq_len (h dv) -> batch_size h seq_len dv", h=self.num_heads, dv=self.d_model//self.num_heads)

        attention_score = scaled_dot_product_attention(Q, K, V, mask=mask)
        attention_score = rearrange(attention_score, "batch_size h seq_len dv -> batch_size seq_len (h dv)")
        return self.O(attention_score)


class TransformerBlock(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, max_seq_len, theta, token_pos):
        super().__init__()
        pos = torch.tensor([i for i in range(token_pos)], dtype=int)
        print(type(max_seq_len))
        self.norm1 = RMSnorm(d_model)
        self.norm2 = RMSnorm(d_model)
        self.MHA_layer = MultiheadSelfAttention(d_model, num_heads, max_seq_len, theta, pos)
        self.SwiGLU = SwiGLU(d_model, d_ff)
    
    def forward(self, x):
        sub_result1 = x + self.MHA_layer(self.norm1(x))
        sub_result2 = sub_result1 + self.SwiGLU(self.norm2(sub_result1))
        return sub_result2



In [47]:
a = TransformerBlock(64,4,128,16,1000,12)

<class 'int'>


In [49]:
a(torch.rand(2, 12, 64))

self.RoPE torch.Size([2, 4, 12, 16]) torch.Size([12, 12, 16])
self.RoPE torch.Size([2, 4, 12, 16]) torch.Size([12, 12, 16])


tensor([[[0.0043, 0.0998, 0.4859,  ..., 0.0446, 0.2757, 0.5248],
         [0.1893, 0.4030, 0.5727,  ..., 0.1572, 0.1872, 0.3205],
         [0.0764, 0.7939, 0.0839,  ..., 0.6664, 0.5288, 0.3886],
         ...,
         [0.2704, 0.8901, 0.3727,  ..., 0.2958, 0.8465, 0.1121],
         [0.8617, 0.4179, 0.1906,  ..., 0.9061, 0.0991, 0.2995],
         [0.6133, 0.4797, 0.2635,  ..., 0.3329, 0.6796, 0.1284]],

        [[0.6101, 0.9964, 0.6968,  ..., 0.1056, 0.2436, 0.7811],
         [0.0854, 0.1881, 0.2927,  ..., 0.2243, 0.5200, 0.0094],
         [0.5552, 0.5403, 0.2444,  ..., 0.9025, 0.9709, 0.0321],
         ...,
         [0.0281, 0.3953, 0.8058,  ..., 0.9588, 0.2706, 0.5672],
         [0.7009, 0.3475, 0.2917,  ..., 0.0375, 0.6552, 0.9332],
         [0.6638, 0.9952, 0.8414,  ..., 0.8560, 0.3369, 0.4599]]],
       grad_fn=<AddBackward0>)

In [26]:
from einops import rearrange, reduce
import torch
import sys
import torch.nn.functional as F

sys.path.append("/Users/bitkira/Documents/GitHub/Stanford-CS336/assignment1-basics-main/")
from cs336_basics.Softmax import softmax

def CrossEntropy(logits ,targts):
    logits = softmax(logits, i=-1)
    print(logits.shape)
    num_classes = logits.shape[-1]
    print(num_classes)
    targts = F.one_hot(targts, num_classes).bool()
    logits.masked_fill_(~targts, value=0)
    logits = torch.log(logits)
    return reduce(logits, "batchsizeseqlen vocabsize -> 1", "mean")





In [30]:

b = torch.randint(100, (4,))
a = torch.rand(4,1000)
CrossEntropy(a, b)

torch.Size([4, 1000])
1000


tensor([-inf])

In [22]:
torch.randint(100, (4,))

tensor([ 7, 22,  9, 77])