In [1]:
from torch import nn

In [2]:
class FeedForward(nn.Module):
    
    def __init__(self,input_dim,hidden_dim,output_dim):
        super().__init__()
        self.relu = nn.ReLU()
        self.linear1 = nn.Linear(input_dim,hidden_dim)
        self.linear2 = nn.Linear(hidden_dim,output_dim)
        
    
    def forward(self,x):
        x = self.relu(self.linear1(x))
        x = self.linear2(x)
        return x

In [3]:
import torch

In [4]:
x = torch.randn((4,5))

In [5]:
fd = FeedForward(5,7,6)

In [6]:
fd(x).shape

torch.Size([4, 6])

In [7]:
import torch.nn.functional as F

In [8]:
class LayerNorm(nn.Module):
    
    def __init__(self,input_dim):
        super().__init__()
        self.ln = nn.LayerNorm(input_dim)

    
    def forward(self,x):
        return self.ln(x)

In [9]:
x = torch.randn(5,4)
ln = LayerNorm(4)
ln(x)

tensor([[-0.3281,  0.9805, -1.4920,  0.8396],
        [ 1.4041, -1.4004,  0.1815, -0.1853],
        [ 1.0092,  0.7120, -0.1560, -1.5653],
        [-1.3875,  0.3816,  1.3465, -0.3405],
        [ 0.4233,  0.6256,  0.6754, -1.7243]],
       grad_fn=<NativeLayerNormBackward0>)

In [10]:
class Attention(nn.Module):
    
    def __init__(self,input_dim,hidden_dim):
        super().__init__()
        self.qw = nn.Linear(input_dim,hidden_dim)
        self.kw = nn.Linear(input_dim,hidden_dim)
        self.vw = nn.Linear(input_dim,hidden_dim)
    
    def forward(self,x):
        ## B,T,C
        B,T,C = x.shape
        q = self.qw(x)
        k = self.kw(x)
        v = self.vw(x)
        print(q.shape,k.shape,k.T.shape)
        att = q @ k.permute(0,2,1)
#         att = att.masked_fill(mask, value)
        att = F.softmax(att,dim=-1)
        v = att @ v
        return v

In [11]:
x = torch.randn(5,3,4)

In [12]:
att = Attention(4,6)

In [13]:
att(x).shape

torch.Size([5, 3, 6]) torch.Size([5, 3, 6]) torch.Size([6, 3, 5])


  print(q.shape,k.shape,k.T.shape)


torch.Size([5, 3, 6])

In [14]:
class MultiHeadAttention(nn.Module):
    
    def __init__(self,input_dim,head_size,hidden_dim):
        super().__init__()        
        self.head_size = head_size
        self.hidden_size = hidden_dim
        self.qw = nn.Linear(input_dim,head_size * hidden_dim)
        self.kw = nn.Linear(input_dim,head_size * hidden_dim)
        self.vw = nn.Linear(input_dim,head_size * hidden_dim)
        
    def forward(self,q,k,v,masked=False):
        #### q ==> B,T,C
        q = self.qw(q)
        k = self.kw(k)
        v = self.vw(v)
        #### q ===> B,head_size,T,hidden_size
        B,T,C = q.shape
        q = q.reshape(B,T,self.head_size,self.hidden_size).permute(0,2,1,3)
        B,T,C = k.shape
        k = k.reshape(B,T,self.head_size,self.hidden_size).permute(0,2,1,3)
        B,T,C = v.shape
        v = v.reshape(B,T,self.head_size,self.hidden_size).permute(0,2,1,3)
        B,head_size,T,hidden_size = q.shape
        att = q @ k.permute(0,1,3,2) # B,head_size,T,T
        if masked:
            _,_,m,n = att.shape
            mask = torch.ones(m,n)
            mask = torch.tril(mask)
            att = att.masked_fill(mask==0,float('-inf'))
        att = F.softmax(att,dim=-1)
        v = att @ v  # B,head_size,T,hidden_size
        v = v.permute(0,2,1,3) # B,T,head_size,hidden_size
        v = v.reshape(B,T,self.head_size * self.hidden_size)
        return v       
        

In [15]:
x = torch.randn(5,4,3)

In [16]:
att1 = MultiHeadAttention(3,2,3)

In [17]:
att = torch.randn(1,1,2,4)

In [18]:
att

tensor([[[[-0.1281, -0.5120, -0.9899, -1.0107],
          [-0.7422,  1.2215, -0.4566, -0.5804]]]])

In [19]:
 _,_,m,n = att.shape
mask = torch.ones(m,n)

In [20]:
mask = torch.tril(mask)

In [21]:
mask == 0

tensor([[False,  True,  True,  True],
        [False, False,  True,  True]])

In [22]:
att = att.masked_fill(mask==0,float('-inf'))

In [23]:
torch.softmax(att,dim=-1)

tensor([[[[1.0000, 0.0000, 0.0000, 0.0000],
          [0.1231, 0.8769, 0.0000, 0.0000]]]])

In [24]:
class EncoderBlock(nn.Module):
    
    def __init__(self,input_dim,head_size,hidden_dim):
        super().__init__()
        self.mha = MultiHeadAttention(input_dim,head_size,hidden_dim)
        self.ln1 = LayerNorm(input_dim)
        self.fd = FeedForward(input_dim,hidden_dim,input_dim)
        self.ln2 = LayerNorm(input_dim)
    
    def forward(self,q,k,v):
        x = q + self.mha(q,k,v)
        x = self.ln1(x)
        x = x + self.fd(x)
        x = self.ln2(x)
        return x

In [25]:
block = EncoderBlock(4,2,2)

In [26]:
x = torch.randn(5,3,4)

In [27]:
class DecoderBlock(nn.Module):
    
    def __init__(self,input_dim,head_size,hidden_dim):
        super().__init__()
        self.mha = MultiHeadAttention(input_dim,head_size,hidden_dim)
        self.ln1 = LayerNorm(input_dim)
        self.fd = FeedForward(input_dim,hidden_dim,input_dim)
        self.ln2 = LayerNorm(input_dim)
        self.mha2 = MultiHeadAttention(input_dim,head_size,hidden_dim)
        self.fd2 = FeedForward(input_dim,hidden_dim,input_dim)
        self.ln3 = LayerNorm(input_dim)
        
    def forward(self,x,k,v):
#         x,k0,v0 = self.mha.qkv(x) ### 需要masked
        x = x + self.mha(x,k,v)
        x = self.ln1(x)
        x = x + self.mha2(x,k,v) ### cross attention
        x = self.ln2(x)
        x = x + self.fd2(x)
        x = self.ln3(x)
        return x

In [28]:
class Tokenizer:
    
    def __init__(self,sentences):
        self.vocab_set = set()
        for sentence in sentences:
            self.vocab_set.update(sentence)
        self.vocab_set = list(self.vocab_set)
        self.vocab_set = ['<pad>','<bos>','<eos>'] + self.vocab_set
        self.token2id = {c:i for i,c in enumerate(self.vocab_set)}
        self.id2token = {i:c for c,i in self.token2id.items()}
    
    def convert_token_to_id(self,tokens):
        return [self.token2id.get(t,'') for t in tokens]
    
    def convert_id_to_token(self,ids):
        return [self.id2token.get(i,-1) for i in ids]

In [29]:
from torch import nn

In [30]:
from torch.optim import AdamW

In [31]:
##### GPT

In [32]:
class GPT(nn.Module):
    
    def __init__(self,n,input_dim,head_size,hidden_dim,output_vocab_size):
        super().__init__()
        self.decoder_blocks = nn.ModuleList(
           [DecoderBlock(input_dim,head_size,hidden_dim)   for _ in range(n)]
        )
        self.output_embeddings = nn.Embedding(output_vocab_size,input_dim)
        self.output_linear = nn.Linear(head_size * hidden_dim,output_vocab_size)
        self.output_pos_embedding = nn.Embedding(1024,input_dim)
        

    def forward(self,y):
        B,T = y.shape
        y = self.output_embeddings(y) ### B,T,C
        y_pos = self.output_pos_embedding(torch.arange(T).to(y.device))
        y = y + y_pos
        for block in self.decoder_blocks:
            y = block(y,y,y) ### B,head_size,T,hidden_size
        logits = self.output_linear(y) # B,T,output_vocab_size
        return logits

In [33]:
# sentences = ['你好，世界',
#              '好奇怪']

In [34]:
# tokenizer = Tokenizer(sentences)

In [35]:
# tokenizer.id2token

In [36]:
# n = 10
# input_dim = 64
# head_size = 4
# hidden_dim = input_dim // head_size
# output_vocab_size = len(tokenizer.id2token)

In [37]:
# gpt = GPT(n,input_dim,head_size,hidden_dim,output_vocab_size)

In [38]:
# optim = AdamW(gpt.parameters(),lr=1e-3)

In [39]:
# criterion = nn.CrossEntropyLoss()

In [40]:
# def process(sentences,tokenizer,max_length):
#     res = []
#     length = []
#     for sentence in sentences:
#         arr = tokenizer.convert_token_to_id(sentence) + tokenizer.convert_token_to_id(['<eos>'])
#         length.append(len(arr))
#         if len(arr) > max_length:
#             arr = arr[:max_length]
#         else:
#             arr = arr + tokenizer.convert_token_to_id(['<pad>']) * (max_length - len(arr))
#         res.append(arr)
#     return res,length

In [41]:
# y,lengths = process(sentences,tokenizer,6)

In [42]:
# y = torch.LongTensor(y)

In [43]:
# y.shape

In [44]:
# batch_length = torch.LongTensor(lengths)

In [45]:
# batch_length

In [46]:
# mask

In [47]:
# for _ in range(1000):
#     y_inputs = y[:,:-1]
#     y_targets = y[:,1:]
#     logits = gpt(y_inputs)
#     B,T = y_targets.shape
#     # 计算损失
# #     loss = criterion(logits.reshape(B*T,-1), y_targets.reshape(B*T))
    
    
#     # 创建mask来标记有效位置
#     mask = torch.arange(T, device=y_targets.device)[None,:] < (batch_length-1)[:,None]  # shape: (B,T)
#     mask = mask.reshape(-1)  # shape: (B*T)

#     # 只计算有效位置的loss
#     logits_flat = logits.reshape(-1, logits.size(-1))  # shape: (B*T,vocab_size) 
#     targets_flat = y_targets.reshape(-1)  # shape: (B*T)

#     # 方法1: 使用mask选择有效位置
#     valid_logits = logits_flat[mask]  # shape: (num_valid,vocab_size)
#     valid_targets = targets_flat[mask]  # shape: (num_valid)
#     loss = criterion(valid_logits, valid_targets)
#     print(loss)
    
#     optim.zero_grad()
#     loss.backward()
#     optim.step()

In [48]:
# def predict(model,inputs,tokenizer):
#     ids = tokenizer.convert_token_to_id(inputs)
#     print(ids)
#     y = torch.LongTensor([ids])
#     print('yyyy shape',y.shape)
#     for _ in range(100):
#         logits = model(y)
#         ### logits B,T,vocab_size
#         logits = logits[:,-1,:]
#         ### logits B,T,vocab_size
#         predicts = logits.argmax(dim=-1,keepdim=True) # B,1
#         y = torch.cat((y,predicts),dim=-1)
#     print(y.shape)
#     for b in range(y.shape[0]):
#         for i in y[b]:
#             print(tokenizer.convert_id_to_token([int(i)]))

In [49]:
# predict(gpt,'你',tokenizer)

In [50]:
##### 解析本地抓取数据

In [51]:
from transformers import GPT2Tokenizer

# 初始化 tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# 编码文本
text = "你好，世界！"
tokens = tokenizer.encode(text)

# 解码 token
decoded_text = tokenizer.decode(tokens)



In [52]:
# 输出结果
print(f"Tokens: {tokens}",type(tokens))
print(f"Decoded Text: {decoded_text}")

Tokens: [19526, 254, 25001, 121, 171, 120, 234, 10310, 244, 45911, 234, 171, 120, 223] <class 'list'>
Decoded Text: 你好，世界！


In [53]:
import json

In [54]:
with open('final_data.json','r') as f:
    law_data = json.load(f)

In [55]:
len(law_data)

4501

In [56]:
def get_all_raw_content(data):
    ret = []
    for k,v in data.items():
        if 'content' in v:
            ret.append(v['content'])
    return ret

In [57]:
all_contents = get_all_raw_content(law_data)

In [58]:
all_contents[:10]

['2022年12月30日, 中国证券投资基金业协会（“基金业协会”）发布通知, 就《私募投资基金登记备案办法（征求意见稿）草案/征求意见稿》及配套指引公开征求意见。基金业协会先后召开多次研讨会, 就各方反馈意见进行讨论评估, 认真听取业内各界的意见和建议。时隔两个多月, 基金业协会在认真研究并吸收采纳了各方面反馈的意见建议基础上, 于2023年2月24日正式发布了《私募投资基金登记备案办法》(“《私募登记备案办法》”)及配套指引。《私募登记备案办法》及其配套指引旨在对私募基金管理人登记和私募基金备案的标准和流程等重要问题进行修订完善, 在基础自律规则层面做出完整和集中的规定。《私募登记备案办法》对私募基金备案及业务规范进行了系统规定和细化更新, 从募、投、管、退等关键环节强化行业合规运作要求, 后续基金业协会将针对私募基金备案出台指引, 对相关规则要求进一步细化明确。本文结合现行规定及《私募登记备案办法》与《私募投资基金登记备案办法（征求意见稿）草案/征求意见稿》(“《征求意见稿草案/征求意见稿》”)的异同, 就《私募登记备案办法》对私募证券投资基金的主要影响进行简要介绍。为免疑义, 除非另有说明, 本文提及私募基金均指私募证券投资基金。一、增加基金文件必备要素《私募登记备案办法》第二十八条、第二十九条对现行规定中募集推介材料、风险揭示书以及基金合同的必备要素进行了梳理和重申, 其中, 对于风险揭示书及基金合同的必备要素, 《私募登记备案办法》在现行规定基础上有如下新变化。1. 基金合同《私募登记备案办法》第二十九条保留了《征求意见稿草案/征求意见稿》中相关表述, 在现行规定基础上, 进一步明确和细化了基金合同关于关联交易机制及私募基金管理人实际不能履职情况下的退出安排要求。基金合同中关联交易条款应当包含《私募登记备案办法》第三十八条规定的关联交易识别认定、交易决策、对价确定、信息披露和回避等机制。此外, 基金合同中应当明确私募基金管理人因失联、注销私募基金管理人登记、破产等原因无法履行或者怠于履行管理职责等情况时的处理机制。《私募登记备案办法》第五十八条为前述情形引入市场化退出机制, 即私募基金因管理人失联、注销私募基金管理人登记或出现重大风险等情形无法履职或怠于履职导致无法正常退出的, 管理人、托管人、份额持有人大会、或一定比例的投资者, 可以按照基金合同约定

In [59]:
def convert2tokens(contents):
    ret = []
    for content in contents:
        if content is not None:
            ret.append(tokenizer.encode(content))
    return ret

In [60]:
import os
if not os.path.exists('tokens.json'):
    all_tokens = convert2tokens(all_contents)
    with open('tokens.json','w') as f:
        json.dump(all_tokens,f)
else:
    with open('tokens.json','r') as f:
        all_tokens = json.load(f)

In [61]:
tokenizer.vocab_size

50257

In [62]:
import random

In [63]:
def get_random_batch(tokens,batch_size,maxlength):
    ret = []
    idxs = list(range(len(tokens)))
    random.shuffle(idxs)
    for _idx in idxs[:batch_size]:
        arr = tokens[_idx]
        t = 0
        while t + maxlength < len(arr):
            ret.append(arr[t:t+maxlength])
            t += maxlength
    return ret  

In [64]:
def predict(model,inputs,tokenizer,device):
    ids = tokenizer.encode(inputs)
    print(ids)
    y = torch.LongTensor([ids])
    y = y.to(device)
    print('yyyy shape',y.shape)
    for _ in range(10):
        logits = model(y)
        ### logits B,T,vocab_size
        logits = logits[:,-1,:]
        ### logits B,T,vocab_size
        predicts = logits.argmax(dim=-1,keepdim=True) # B,1
        y = torch.cat((y,predicts),dim=-1)
    print(y.shape)
    for b in range(y.shape[0]):
        print(tokenizer.decode(list(y[b])))

In [65]:
n = 10
input_dim = 64
head_size = 4
hidden_dim = input_dim // head_size
output_vocab_size = tokenizer.vocab_size

In [66]:
output_vocab_size

50257

In [67]:
if not os.path.exists('simple_model_state_dict.pth'):
    gpt = GPT(n,input_dim,head_size,hidden_dim,output_vocab_size)
else:
    gpt = GPT(n,input_dim,head_size,hidden_dim,output_vocab_size)
    gpt.load_state_dict(torch.load('simple_model_state_dict.pth'))



  gpt.load_state_dict(torch.load('simple_model_state_dict.pth'))


In [68]:
criterion = nn.CrossEntropyLoss()

In [69]:
optim = AdamW(gpt.parameters(),lr=1e-3)

In [70]:
import torch

def get_device():
    if torch.backends.mps.is_available():
        device = "mps"
    elif torch.cuda.is_available():
        device = "cuda"
    else:
        device = "cpu"
    return device

device = get_device()
print(f"当前设备: {device}")

当前设备: mps


In [71]:
def train(gpt,res):
    gpt.to(device)
    for _ in range(100):
        gpt.train()
        y = get_random_batch(res,2,1024)
        if y and y[0]:
            y = torch.LongTensor(y)
            y = y.to(device)
            y_inputs = y[:,:-1]
            y_targets = y[:,1:]
            logits = gpt(y_inputs)
            B,T = y_targets.shape
            loss = criterion(logits.reshape(-1, logits.size(-1)), y_targets.reshape(-1))
            print(loss)
            optim.zero_grad()
            loss.backward()
            optim.step()
        
        gpt.eval()
        predict(gpt,'律师',tokenizer,device)
        torch.save(gpt.state_dict(), 'simple_model_state_dict.pth')
        

In [72]:
train(gpt,all_tokens)

tensor(7.4001, device='mps:0', grad_fn=<NllLossBackward0>)
[36181, 233, 30585, 230]
yyyy shape torch.Size([1, 4])
torch.Size([1, 14])
律师����������
tensor(7.3379, device='mps:0', grad_fn=<NllLossBackward0>)
[36181, 233, 30585, 230]
yyyy shape torch.Size([1, 4])
torch.Size([1, 14])
律师����������


KeyboardInterrupt: 

In [74]:
!pip list | grep torch

[0m