In [None]:
import time

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
from datasets import load_dataset
import matplotlib.pyplot as plt

#
# # softmax对于-inf的计算结果
# a = torch.tensor([1, 2, float('-inf')]).float()
# print(F.softmax(a))
#
# # K: (B, T, H)
# # Q: (B, T, H)
# # K @ Q.transpose(-2, -1): (B, T, T)
# scores = torch.randn(1, 4, 4)
# print(scores)
#
# # 定义下三角矩阵
# tril = torch.tril(torch.ones(4, 4))
# s = scores.masked_fill(tril == 0, float('-inf'))
# print(s)
#
# # 定义权重分布
# print(s.shape)
# w = F.softmax(s, dim = -1)
# print(w)
#
# # softmax对方差的敏感性
# x1 = torch.randn(1, 8)
# print(x1.std(), F.softmax(x1, dim = -1))
#
# x2 = 1000* x1
# print(x2.std(), F.softmax(x2, dim = -1))
#
# # 对齐分数的方差变化
# B, T, H = 32, 100, 100
# K = torch.randn(B, T, H)
# Q = torch.randn(B, T, H)
# scores = K @ Q.transpose(-2, -1) / H ** 0.5  # 归一化处理降低方差
# print(scores.std())


def attention(query, key, value, dropout, mask=None):
    """Scaled dot-product attention with an optional mask.

    Args:
        query, key, value: tensors whose last two dimensions are (T, H);
            the callers in this file pass (B, T, H).
        dropout: callable applied to the attention weights (for example an
            ``nn.Dropout`` module), or ``None`` to skip dropout.
        mask: optional tensor broadcastable to the (T, T) score matrix.
            Positions where ``mask == 0`` receive zero attention weight.

    Returns:
        The attention-weighted combination of ``value``; (B, T, H) for
        (B, T, H) inputs.
    """
    # Divide by sqrt(H) so the variance of the scores does not grow with the
    # head size (large scores would push softmax towards a one-hot output).
    head_size = query.size(-1)
    scores = query @ key.transpose(-2, -1) / head_size ** 0.5
    if mask is not None:
        # -inf scores become exactly 0 after softmax.
        scores = scores.masked_fill(mask == 0, float('-inf'))
    w_att = F.softmax(scores, dim=-1)  # (B, T, T)
    if dropout is not None:
        # The dropout argument was previously accepted but never applied.
        w_att = dropout(w_att)
    return w_att @ value               # (B, T, H)


class MaskedAttention(nn.Module):
    """Single causal self-attention head.

    Projects the input to key/query/value vectors of width ``head_size`` and
    applies masked attention so that position t only attends to positions
    <= t.

    Args:
        emb_size: width C of the input embeddings.
        head_size: width H of the key/query/value projections.
        max_len: longest sequence the causal mask supports. When omitted,
            the module-level ``sequence_len`` hyperparameter is used, which
            matches the previous behavior.
    """

    def __init__(self, emb_size, head_size, max_len=None):
        super().__init__()
        if max_len is None:
            # Backward-compatible fallback to the global hyperparameter.
            max_len = sequence_len
        self.key = nn.Linear(emb_size, head_size, bias=False)
        self.query = nn.Linear(emb_size, head_size, bias=False)
        self.value = nn.Linear(emb_size, head_size, bias=False)
        # Lower-triangular mask; registered as a buffer so it follows the
        # module across devices without being a trainable parameter.
        self.register_buffer('tril', torch.tril(torch.ones(max_len, max_len)))
        self.dp = nn.Dropout(0.4)

    def forward(self, x):
        """Map x of shape (B, T, C) to attention output of shape (B, T, H)."""
        B, T, C = x.shape
        if T > self.tril.size(0):
            raise ValueError(
                f'sequence length {T} exceeds the mask size {self.tril.size(0)}'
            )
        k = self.key(x)    # (B, T, H)
        q = self.query(x)  # (B, T, H)
        v = self.value(x)  # (B, T, H)
        mask = self.tril[:T, :T]
        return attention(q, k, v, self.dp, mask)

# m = MaskedAttention(3, 4)
# x = torch.randn(5, 10, 3)
# print(m(x).shape)


class MaskedMultiHeadAttention(nn.Module):
    """Several causal attention heads run side by side and then mixed.

    ``emb_size // head_size`` heads are created; their outputs are
    concatenated back to width ``emb_size``, passed through a linear
    projection and then dropout.

    Args:
        emb_size: width C of the input and output embeddings.
        head_size: width H of every individual head.

    Raises:
        ValueError: if ``emb_size`` is not a multiple of ``head_size``. The
            concatenated head outputs would then not have width
            ``emb_size`` and the projection would fail in ``forward``.
    """

    def __init__(self, emb_size, head_size):
        super().__init__()
        if emb_size % head_size != 0:
            raise ValueError(
                f'emb_size ({emb_size}) must be divisible by head_size ({head_size})'
            )
        # Number of single-head attention modules.
        n_head = emb_size // head_size
        self.heads = nn.ModuleList(
            [MaskedAttention(emb_size, head_size) for _ in range(n_head)]
        )
        # Output projection and dropout.
        self.proj = nn.Linear(emb_size, emb_size)
        self.dp = nn.Dropout(0.4)

    def forward(self, x):
        """Map x of shape (B, T, C) to an output of shape (B, T, C)."""
        out = torch.cat([h(x) for h in self.heads], dim=-1)  # (B, T, C)
        return self.dp(self.proj(out))


class FeedForward(nn.Module):
    """Position-wise feed-forward network.

    Expands each embedding to four times its width, applies GELU, projects
    back to the original width and applies dropout.
    """

    def __init__(self, emb_size):
        super().__init__()
        hidden_size = 4 * emb_size
        self.ln1 = nn.Linear(emb_size, hidden_size)
        self.ln2 = nn.Linear(hidden_size, emb_size)
        self.dp = nn.Dropout(0.4)

    def forward(self, x):
        """Map x of shape (B, T, C) to an output of shape (B, T, C)."""
        hidden = self.ln1(x)             # (B, T, 4C)
        activated = F.gelu(hidden)       # (B, T, 4C)
        projected = self.ln2(activated)  # (B, T, C)
        return self.dp(projected)


class Block(nn.Module):
    """Decoder block: layer norm + multi-head attention, then layer norm +
    feed-forward, each wrapped in a residual connection."""

    def __init__(self, emb_size, head_size):
        super().__init__()
        self.l1 = nn.LayerNorm(emb_size)
        self.mha = MaskedMultiHeadAttention(emb_size, head_size)
        self.l2 = nn.LayerNorm(emb_size)
        self.ff = FeedForward(emb_size)

    def forward(self, x):
        """Map x of shape (B, T, C) to an output of shape (B, T, C)."""
        # The residual sums are written as `x + ...` rather than `x += ...`:
        # the original author notes that the in-place form causes problems
        # in PyTorch.
        attended = self.mha(self.l1(x))
        x = x + attended
        transformed = self.ff(self.l2(x))
        return x + transformed

# The attention mechanism by itself ignores how tokens relate to each other by position in the text.

class CharGPT(nn.Module):
    """Character-level GPT: token + position embeddings, a stack of decoder
    blocks, a final layer norm and a linear head over the vocabulary.

    Args:
        vs: vocabulary size.
        emb_size: embedding width C.
        sequence_len: number of positions in the position-embedding table.
        head_size: width of each attention head.
        n_layer: number of decoder blocks.
    """

    def __init__(self, vs, emb_size, sequence_len, head_size, n_layer):
        super().__init__()
        self.token_emb = nn.Embedding(vs, emb_size)
        self.pos_emb = nn.Embedding(sequence_len, emb_size)
        layers = [Block(emb_size, head_size) for _ in range(n_layer)]
        self.blocks = nn.Sequential(*layers)
        self.l = nn.LayerNorm(emb_size)
        self.lm = nn.Linear(emb_size, vs)

    def forward(self, x):
        """Map token indices x of shape (B, T) to logits of shape (B, T, vs)."""
        _, steps = x.shape
        positions = torch.arange(0, steps, dtype=torch.long, device=x.device)
        tokens = self.token_emb(x)          # (B, T, C)
        offsets = self.pos_emb(positions)   # (T, C), broadcast over the batch
        hidden = self.blocks(tokens + offsets)  # (B, T, C)
        normalized = self.l(hidden)
        return self.lm(normalized)          # (B, T, vs)


class char_tokenizer:
    """Character-level tokenizer.

    The vocabulary is every distinct character found in ``data``, sorted and
    numbered from 1. Index 0 is reserved for the end-of-text marker
    ``'<|e|>'``.
    """

    def __init__(self, data):
        vocabulary = sorted(set(''.join(data)))
        # Indices start at 1 because 0 is kept for the end marker.
        self.char2ind = dict(zip(vocabulary, range(1, len(vocabulary) + 1)))
        self.char2ind['<|e|>'] = 0
        self.ind2char = {index: symbol for symbol, index in self.char2ind.items()}

    def encode(self, text):
        """Return the list of indices for the characters of ``text``."""
        lookup = self.char2ind
        return [lookup[symbol] for symbol in text]

    def decode(self, enc):
        """Return one symbol for an int, or a list of symbols for an iterable of ints."""
        if not isinstance(enc, int):
            return [self.ind2char[index] for index in enc]
        return self.ind2char[enc]


@torch.no_grad()
def generate_batch(model, idx, sequence_len, max_new_tokens=300):
    """Generate text by repeatedly sampling the next token from the model.

    Args:
        model: module mapping indices of shape (1, T) to logits of shape
            (1, T, vocab_size), e.g. a ``CharGPT``.
        idx: ``torch.LongTensor`` of shape (1, T) holding the prompt's token
            indices. Only a single sequence is supported because the stop
            check converts the sampled token to a Python scalar.
        sequence_len: maximum context length fed to the model; older tokens
            are dropped from the left.
        max_new_tokens: upper bound on the number of generated tokens.

    Returns:
        list[int]: the prompt followed by the generated token indices.
        Generation stops early after token 0 (end of text) is sampled; that
        token is included in the result.
    """
    # Remember the current mode so it can be restored afterwards instead of
    # unconditionally forcing the model into training mode.
    was_training = model.training
    model.eval()
    try:
        for _ in range(max_new_tokens):
            # Trim the context, otherwise the model receives more positions
            # than it was built for.
            context = idx[:, -sequence_len:]
            # The whole context is re-evaluated at every step, which is
            # simple but repeats a lot of computation.
            logits = model(context)
            # Only the prediction for the last position is needed.
            probs = F.softmax(logits[:, -1, :], dim=-1)
            # Sample the next token from the predicted distribution.
            ix = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, ix), dim=1)
            if ix.item() == 0:
                break
    finally:
        model.train(was_training)
    return idx.tolist()[0]

def process(data, sequence_len=64, tokenizer=None):
    """Turn a batch of source texts into next-character training pairs.

    Args:
        data: mapping with a ``'whole_func_string'`` entry holding a list of
            strings.
        sequence_len: length of every training window (default 64, the value
            that used to be hard-coded here).
        tokenizer: object with an ``encode(text) -> list[int]`` method. When
            omitted, the module-level ``tok`` is used, which matches the
            previous behavior.

    Returns:
        dict with ``'inputs'`` and ``'labels'``: parallel lists of windows of
        length ``sequence_len``, where each label window is the input window
        shifted one position to the right. Texts whose encoding (including
        the end marker) is not longer than ``sequence_len`` yield no windows.
    """
    if tokenizer is None:
        tokenizer = tok
    inputs, labels = [], []
    for text in data['whole_func_string']:
        # 0 marks the end of the text.
        enc = [*tokenizer.encode(text), 0]
        # Slide a window over the text; the label is the next character, so
        # the label window is just the input window moved by one.
        for start in range(len(enc) - sequence_len):
            inputs.append(enc[start: start + sequence_len])
            labels.append(enc[start + 1: start + 1 + sequence_len])
    return {'inputs': inputs, 'labels': labels}


def estimate_loss(model):
    """Estimate the model's loss on the training and test data.

    Reads the module-level ``train_loader`` and ``test_loader``.

    Args:
        model: the model to evaluate.

    Returns:
        dict with keys ``'train'`` and ``'test'`` holding the mean loss
        reported by ``_loss`` for the respective loader.
    """
    # Evaluate with dropout disabled, then put the model back into whatever
    # mode it was in (even if evaluation raises) instead of always forcing
    # training mode.
    was_training = model.training
    model.eval()
    try:
        return {
            'train': _loss(model, train_loader),
            'test': _loss(model, test_loader),
        }
    finally:
        model.train(was_training)

@torch.no_grad()
def _loss(model, data_loader):
    """Return the mean cross-entropy of ``model`` over ``eval_iters`` batches.

    Batches are taken from ``data_loader`` in the order it yields them; when
    the loader runs out, iteration restarts from the beginning.
    """
    batches = iter(data_loader)
    per_batch = []
    for _ in range(eval_iters):
        batch = next(batches, None)
        if batch is None:
            # Loader exhausted: start over.
            batches = iter(data_loader)
            batch = next(batches, None)
        inputs, labels = batch['inputs'], batch['labels']
        # cross_entropy expects the class dimension second, so move the
        # vocabulary axis in front of the time axis.
        predictions = model(inputs).transpose(-2, -1)
        per_batch.append(F.cross_entropy(predictions, labels).item())
    return torch.tensor(per_batch).mean().item()


def train_gpt(model, optimizer, data_loader, epochs=10):
    """Train ``model`` and report the estimated losses after every epoch.

    Args:
        model: module mapping ``inputs`` to logits of shape (B, T, vocab).
        optimizer: optimizer already bound to the model's parameters.
        data_loader: iterable of batches, each a mapping with ``'inputs'``
            and ``'labels'``.
        epochs: number of passes over ``data_loader``.

    Returns:
        list[float]: the training loss of every optimization step, in order.
    """
    lossi = []
    # Make sure dropout is active even if the caller handed over a model
    # that was left in evaluation mode.
    model.train()
    for epoch in range(epochs):
        for data in data_loader:
            inputs, labels = data['inputs'], data['labels']
            optimizer.zero_grad()
            # cross_entropy expects the class dimension second, so move the
            # vocabulary axis in front of the time axis.
            logits = model(inputs).transpose(-2, -1)
            loss = F.cross_entropy(logits, labels)
            lossi.append(loss.item())
            loss.backward()
            optimizer.step()
        # Evaluate the model and print the result.
        stats = estimate_loss(model)
        print(f"epoch {epoch:>2}: train loss {stats['train']}, test loss {stats['test']}")
    return lossi

# Hyperparameters
emb_size = 128        # embedding width
head_size = 8         # width of one attention head
n_layer = 12          # number of decoder blocks
sequence_len = 64     # context length passed to CharGPT and generate_batch
learning_rate = 1e-3
eval_iters = 20       # number of batches _loss averages over
batch_size=500


# Use the GPU for computation when one is available
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Load the dataset and keep only functions from the apache/spark repository
# NOTE(review): the recorded run of this cell prompted interactively for
# permission to run the dataset's custom loading code; the printed message
# says passing trust_remote_code=True avoids the prompt.
raw_datasets = load_dataset('code_search_net', 'python')
datasets = raw_datasets['train'].filter(lambda x: 'apache/spark' in x['repository_name'])

# Build the character vocabulary
tok = char_tokenizer(datasets['whole_func_string'])
# print(len(tok.char2ind))

# Instantiate the model
model = CharGPT(len(tok.char2ind), emb_size, sequence_len, head_size, n_layer).to(device)

# Count the model's parameters
# print(f'{sum(p.numel() for p in model.parameters())} parameters')
# print(model)

# Generate text with the model (the generation call itself is commented out)
begin_text = torch.tensor(tok.encode('def'), device = device).unsqueeze(0)
# print(begin_text)
# print(tok.encode('def'))
# time.sleep(111)
# print(''.join(tok.decode(generate_batch(model, begin_text, sequence_len = sequence_len))))


# Split the data into a training set and a test set
tokenized = datasets.train_test_split(test_size=0.1, seed=1024, shuffle=True)
# Convert the texts into training data containing inputs and labels
tokenized = tokenized.map(process, batched=True, remove_columns=datasets.column_names)
tokenized.set_format(type='torch', device=device)

print(tokenized['train']['inputs'].shape, tokenized['train']['labels'].shape)

# Build the data loaders
train_loader = DataLoader(tokenized['train'], batch_size=batch_size, shuffle=True)
test_loader = DataLoader(tokenized['test'], batch_size=batch_size, shuffle=True)
# Fetch one batch (the result is not used; presumably a sanity check)
next(iter(test_loader))


print("loss before trained:\t", estimate_loss(model))
l = train_gpt(model, optim.AdamW(model.parameters(), lr=learning_rate), train_loader)
print("loss after trained:\t", estimate_loss(model))
# Plot the training loss averaged over consecutive groups of 10 steps.
# NOTE(review): view(-1, 10) requires the number of recorded steps to be a
# multiple of 10.
plt.plot(torch.tensor(l).view(-1, 10).mean(1).numpy())


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/12.9k [00:00<?, ?B/s]

code_search_net.py:   0%|          | 0.00/8.44k [00:00<?, ?B/s]

The repository for code_search_net contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/code_search_net.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


python.zip:   0%|          | 0.00/941M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/412178 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/22176 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/23107 [00:00<?, ? examples/s]

Filter:   0%|          | 0/412178 [00:00<?, ? examples/s]

Map:   0%|          | 0/617 [00:00<?, ? examples/s]

Map:   0%|          | 0/69 [00:00<?, ? examples/s]

torch.Size([605913, 64]) torch.Size([605913, 64])
loss before trained:	 {'train': 4.761275291442871, 'test': 4.757631778717041}
epoch  0: train loss0.8010479807853699, test loss1.0632340908050537
epoch  1: train loss0.6124584674835205, test loss1.0055476427078247
epoch  2: train loss0.5213474035263062, test loss0.9848357439041138


In [None]:
device

NameError: name 'device' is not defined

In [None]:
import pickle
# Serialize the whole model object to CharGPT2.pickle.
# NOTE(review): pickling the full module (rather than its state_dict) ties
# the file to the class definitions in this notebook — confirm this is how
# the file is meant to be loaded. The recorded run failed with a NameError
# because `model` was not defined in the session.
with open("CharGPT2.pickle", "wb") as file:
    pickle.dump(model, file)

NameError: name 'model' is not defined

In [None]:
import pickle