In [1]:
import torch.nn as nn
import torch

In [4]:
# Small linear layer (2 input features -> 4 output features) used below to
# try out weight initialisation.
linear = nn.Linear(2,4)

In [10]:
# Display the (4, 2) weight matrix. The execution counter (In [10]) is higher
# than that of the zeros_ cell (In [9]), so the all-zero values shown here
# were produced after the weights had already been zeroed.
linear.weight

Parameter containing:
tensor([[0., 0.],
        [0., 0.],
        [0., 0.],
        [0., 0.]], requires_grad=True)

In [9]:
# Fill the weight with zeros; the call's return value (the parameter) is
# what the cell displays.
nn.init.zeros_(linear.weight)

Parameter containing:
tensor([[0., 0.],
        [0., 0.],
        [0., 0.],
        [0., 0.]], requires_grad=True)

In [16]:
class MOELoraLayer(nn.Module):
    """Mixture-of-experts LoRA adapter that adds a routed low-rank update to its input.

    A bias-free linear router produces one softmax weight per expert for every
    position. Each expert is a low-rank pair ``lora_B[i](lora_A[i](x))``; in
    ``hydra`` mode a single ``lora_A`` is shared by all experts and only the
    ``lora_B`` projections are per-expert. The output is::

        x + sum_i route_weight[..., i] * lora_B[i](lora_A(x))

    Every ``lora_B`` weight is initialised to zero, so a freshly constructed
    layer returns its input unchanged.

    Args:
        dim: Size of the last dimension of the input (and of the output).
        r: Rank of the low-rank bottleneck.
        expert_num: Number of experts (number of ``lora_B`` projections).
        hydra: If True, share one ``lora_A`` across all experts.
    """

    def __init__(self, dim, r, expert_num, hydra=False):
        super().__init__()
        self.expert_num = expert_num
        self.hydra = hydra  # hydra lora: shared A, per-expert B

        self.router = nn.Linear(dim, expert_num, bias=False)

        # Creation order (router, lora_A, lora_B) is kept so that state_dict
        # key order and RNG consumption match the previous implementation.
        if hydra:
            self.lora_A = nn.Linear(dim, r, bias=False)
        else:
            self.lora_A = nn.ModuleList(
                [nn.Linear(dim, r, bias=False) for _ in range(expert_num)]
            )

        self.lora_B = nn.ModuleList(
            [nn.Linear(r, dim, bias=False) for _ in range(expert_num)]
        )

        # initial lora B to zeros
        for linear in self.lora_B:
            nn.init.zeros_(linear.weight)

    def forward(self, x: torch.Tensor):
        """Return ``x`` plus the router-weighted sum of all expert updates.

        Args:
            x: Tensor whose last dimension has size ``dim``; any number of
                leading dimensions is accepted.

        Returns:
            Tensor with the same shape as ``x``.
        """
        # Softmax is evaluated in float32 and cast back to the input dtype.
        route_weight = nn.functional.softmax(
            self.router(x), dim=-1, dtype=torch.float32
        ).to(x.dtype)
        # try lora_alpha

        # In hydra mode the down-projection is identical for every expert, so
        # compute it once instead of once per expert.
        shared_down = self.lora_A(x) if self.hydra else None

        # Every expert must read the ORIGINAL input. Accumulate into a separate
        # tensor so that one expert's update never feeds into the next expert.
        out = x
        for i in range(self.expert_num):
            down = shared_down if self.hydra else self.lora_A[i](x)
            # `[..., i]` selects expert i's weight for any input rank.
            gate = route_weight[..., i].unsqueeze(-1)
            out = out + gate * self.lora_B[i](down)
        return out

In [17]:
# dim=10, r=2, expert_num=5; hydra keeps its default (False), so every expert
# gets its own lora_A.
moelora = MOELoraLayer(10,2,5)

In [29]:
# List the parameter names registered by the layer (router, per-expert A and B).
moelora.state_dict().keys()

odict_keys(['router.weight', 'lora_A.0.weight', 'lora_A.1.weight', 'lora_A.2.weight', 'lora_A.3.weight', 'lora_A.4.weight', 'lora_B.0.weight', 'lora_B.1.weight', 'lora_B.2.weight', 'lora_B.3.weight', 'lora_B.4.weight'])

In [24]:
# Toy tensors for checking how routed LoRA experts can be combined.
# No seed is set, so the values differ on every run.
x = torch.randn(3,4)   # input: 3 rows, 4 features
a = torch.randn(4,2)   # shared down-projection (4 -> 2)
a1= torch.randn(4,2)   # per-expert down-projections
a2= torch.randn(4,2)

b1 = torch.randn(2,4)  # per-expert up-projections (2 -> 4)
b2= torch.randn(2,4)

relu = nn.ReLU()

In [20]:
# Two experts with separate A and B: scale each expert's OUTPUT by a fixed
# weight (0.4 / 0.6 stand in for routing weights) and add.
(relu(x @ a1) @ b1)*0.4 + (relu(x @ a2) @ b2)*0.6

tensor([[ 0.0131,  0.8114, -0.1674,  0.1921],
        [ 0.3173,  1.5902,  1.2141,  0.6184],
        [ 0.0000,  0.0000,  0.0000,  0.0000]])

In [21]:
# Same mixture with the weights folded into the B matrices instead; the
# result matches the previous cell because the scalar commutes with the matmul.
relu(x @ a1) @ (b1*0.4) + relu(x @ a2) @ (b2*0.6)

tensor([[ 0.0131,  0.8114, -0.1674,  0.1921],
        [ 0.3173,  1.5902,  1.2141,  0.6184],
        [ 0.0000,  0.0000,  0.0000,  0.0000]])

In [25]:
# Shared down-projection `a` for both experts, with separately weighted B matrices.
relu(x @ a) @ ( b1*0.4) + relu(x @ a) @ (b2*0.6)

tensor([[-0.5762, -0.2077, -0.6032, -0.1846],
        [-0.3817, -0.1376, -0.3996, -0.1223],
        [ 0.0645, -0.0288,  0.0293, -0.0455]])

In [26]:
# With a shared `a`, the weighted B matrices can be summed first and applied
# once; the result matches the previous cell.
relu(x @ a) @ ( b1*0.4 + b2*0.6)

tensor([[-0.5762, -0.2077, -0.6032, -0.1846],
        [-0.3817, -0.1376, -0.3996, -0.1223],
        [ 0.0645, -0.0288,  0.0293, -0.0455]])

In [2]:
# Build a 1-D float32 tensor from a Python list. torch.tensor with an explicit
# dtype replaces the legacy torch.FloatTensor(list) constructor.
a = [1,2,3,4]
ta = torch.tensor(a, dtype=torch.float32)

In [8]:
# Check the element type of the tensor built above.
ta.dtype

torch.float32

In [6]:
# Element-wise sigmoid of the four values.
nn.functional.sigmoid(ta)

tensor([0.7311, 0.8808, 0.9526, 0.9820])

In [12]:
# Softmax over the only axis of `ta`, rescaled so the entries sum to 4.
# `dim` is passed explicitly: relying on the implicit dimension is deprecated
# and triggers a warning.
nn.functional.softmax(ta, dim=-1)*4


  nn.functional.softmax(ta)*4


tensor([0.1282, 0.3486, 0.9475, 2.5757])

In [14]:
# Prepend two singleton axes: (4,) -> (1, 1, 4).
ta.unsqueeze(0).unsqueeze(0).shape

torch.Size([1, 1, 4])

In [2]:
# Random (3, 4, 2) tensor for the broadcasting check below.
a = torch.randn(3,4,2)

In [3]:
# Random (3, 4, 2, 5) tensor; its first three axes match `a`.
b = torch.randn(3,4,2,5)

In [9]:
# a: (3, 4, 2) -> (3, 4, 2, 1), broadcast against b: (3, 4, 2, 5), then sum
# over axis 2 while keeping it as a singleton -> (3, 4, 1, 5).
torch.sum(a.unsqueeze(-1) * b, 2, keepdim=True).shape

torch.Size([3, 4, 1, 5])

In [4]:
# Identity test against None; `is` avoids dispatching to the object's own
# `__eq__` implementation.
a is None

False

# math length

In [4]:
# Load the project's Llama-3 tokenizer wrapper from a local checkpoint directory.
from LLaMA3_lora_bias.llama import Tokenizer
model_path = '/home2/caojie/pretrain_models/Meta-Llama-3-8B/'
# NOTE(review): model_path already ends with '/', so this f-string yields
# '...Meta-Llama-3-8B//tokenizer.model' (double slash).
tokenizer = Tokenizer(model_path= f"{model_path}/tokenizer.model")

  from .autonotebook import tqdm as notebook_tqdm


------ flash attention2 enable -----


In [13]:
import json

# Load the evaluation/training samples as parsed JSON.
# Alternative inputs that were used with this cell:
# with open('/home2/caojie/projects/LLM-Adapters/ft-training_set/math_14k.json', 'r') as f:
# with open('/home2/caojie/projects/LLM-Adapters/ft-training_set/commonsense_15k.json', 'r') as f:
#
# json.load parses straight from the file handle, so `data` is only ever bound
# to the parsed object (never to the raw text). The encoding is stated
# explicitly instead of depending on the platform default.
with open('/home2/caojie/datasets/math_commonsense/hellaswag/test.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

In [14]:
# Peek at one record to see which fields are available.
data[1]

{'instruction': 'Please choose the correct ending to complete the given sentence: Clean and jerk: A lady walks to a barbell. She bends down and grabs the pole. the lady\n\nEnding1: swings and lands in her arms. Ending2: pulls the barbell forward. Ending3: pulls a rope attached to the barbell. Ending4: stands and lifts the weight over her head.\n\nAnswer format: ending1/ending2/ending3/ending4',
 'input': '',
 'output': 'the correct answer is ending4',
 'answer': 'ending4'}

In [15]:
# Token counts per sample, without BOS/EOS markers.
#   input_lens  -> instruction + output concatenated (full sequence length)
#   output_lens -> output text only
# The 'input' field of each record is not included in either count.
# Comprehensions keep the loop variable local, so the notebook-level `x`
# defined in an earlier cell is no longer overwritten by this cell.
input_lens = [
    len(tokenizer.encode(sample['instruction']+sample['output'], bos=False, eos=False))
    for sample in data
]
output_lens = [
    len(tokenizer.encode(sample['output'], bos=False, eos=False))
    for sample in data
]
# First line: mean full-sequence length; second line: mean output length.
print(f'average tokens:{sum(input_lens)/len(input_lens)}')
print(f'average tokens:{sum(output_lens)/len(output_lens)}')

average tokens:223.45678151762598
average tokens:6.0


In [16]:
# Longest instruction+output sequence, in tokens.
max(input_lens)


391

In [17]:
# Number of samples that were measured.
len(input_lens)

10042

In [1]:
# Toy dict for the mean-of-values check below (rebinds `a`).
a = {'a':1, 'b':2}

In [7]:
# Arithmetic mean of the dict's values.
sum(a.values())/len(a.values())

1.5

In [1]:
# Materialise the integers 0..31.
list(range(0,32))

[0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31]

In [2]:
# Random (2, 4) tensor for the unsqueeze check below (rebinds `a`).
a = torch.randn(2,4)

In [4]:
# Insert a singleton axis at position 1: (2, 4) -> (2, 1, 4).
a.unsqueeze(1).shape

torch.Size([2, 1, 4])

In [2]:
# One-element parameter initialised to zero (rebinds `a`).
a = torch.nn.Parameter(torch.zeros(1))

In [3]:
# Random (3, 4) tensor to multiply by the parameter.
b = torch.randn(3,4)

In [4]:
# The one-element parameter broadcasts across all of `b`. The product is
# all zeros (signed, following the sign of each entry of `b`) and carries a
# grad_fn.
a * b

tensor([[-0., -0., 0., -0.],
        [0., 0., -0., -0.],
        [0., 0., -0., -0.]], grad_fn=<MulBackward0>)