In [None]:
#@title 链接Google Drive
from google.colab import drive
drive.mount('/content/drive')


In [5]:
#@title 自定义GPT2模型
import sys

sys.path.append("..")
from transformers import BertTokenizer, GPT2LMHeadModel,GPT2Config
from torch import nn

# from utils.utils import get_project_rootpath
import os


class GPT2(nn.Module):
    def __init__(self):
        super(GPT2, self).__init__()

        # self.gpt = GPT2LMHeadModel.from_pretrained(os.path.join(get_project_rootpath(), "gpt2-chinese-cluecorpussmall"))

        # self.gpt = GPT2LMHeadModel.from_pretrained("uer/gpt2-chinese-cluecorpussmall")
        config = GPT2Config()
        print(config)
        config.n_embd = 30
        config.n_head = 6
        config.n_layer = 6
        self.gpt = GPT2LMHeadModel(config)


    def forward(self, batch_inputs):
        outputs = self.gpt(input_ids=batch_inputs)
        return outputs

    @property
    def config(self):
        # 返回模型的配置
        return self.gpt.config


    @property
    def device(self):
        # Provide the device attribute for the model
        return next(self.parameters()).device

    def to(self, device):
        # Move the model and its parameters to the specified device
        self.gpt.to(device)
        return self



## 数据

In [6]:
#@title 数据加载
import json
import torch
import torch.utils.data as Data
from torch import nn, optim
import numpy as np

# 将文本数据转换为模型输入的数字编码
def make_data(file_path, tokenizer):
    # 读取文件内容
    with open(file_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    train_datas = []  # 初始化列表存储处理后的数据
    for line in lines:
        line = line.strip()  # 去除每行的前后空白字符
        # 将文本行中的制表符（\t）替换为[SEP]，并在行尾加上[SEP]
        train_data = [i if i != '\t' else "[SEP]" for i in line] + ['[SEP]']
        # 使用tokenizer将文本数据编码为数字序列
        # print("train_data: ", train_data)
        train_num_data = tokenizer.encode(train_data)
        train_num_data = train_num_data[:-1]  # 去掉最后一个标记（通常是[SEP]）
        train_datas.append(train_num_data)  # 将编码后的数据添加到列表中

    return train_datas  # 返回所有处理后的数据


# 自定义数据集类，继承自torch.utils.data.Dataset
class MyDataSet(Data.Dataset):
    def __init__(self, datas, vocab2id):
        self.datas = datas  # 保存数据
        self.vocab2id = vocab2id  # 保存词汇到ID的映射

    def __getitem__(self, item):
        data = self.datas[item]  # 获取指定索引的样本数据
        # 划分数据为输入和输出
        decoder_input = data[:-1]  # 输入数据，去掉最后一个标记
        decoder_output = data[1:]  # 输出数据，从第二个标记开始

        # print("decoder_input: ", decoder_input, " decoder_output: ", decoder_output)

        # 计算输入和输出的长度
        decoder_input_len = len(decoder_input)
        decoder_output_len = len(decoder_output)

        # 返回样本的输入和输出以及它们的长度
        return {"decoder_input": decoder_input, "decoder_input_len": decoder_input_len,
                "decoder_output": decoder_output, "decoder_output_len": decoder_output_len}

    def __len__(self):
        return len(self.datas)  # 返回数据集的总长度

    def padding_batch(self, batch):
        # 获取批次中每个样本的输入和输出长度
        decoder_input_lens = [d["decoder_input_len"] for d in batch]
        decoder_output_lens = [d["decoder_output_len"] for d in batch]

        # 找到输入和输出的最大长度
        decoder_input_maxlen = max(decoder_input_lens)
        decoder_output_maxlen = max(decoder_output_lens)

        # 对每个样本进行填充，使其长度一致
        for d in batch:
            d["decoder_input"].extend([self.vocab2id["[PAD]"]] * (decoder_input_maxlen - d["decoder_input_len"]))
            d["decoder_output"].extend([self.vocab2id["[PAD]"]] * (decoder_output_maxlen - d["decoder_output_len"]))
            # print("decoder_inputsdecoder_inputs: ", d["decoder_input"], "decoder_outputdecoder_output:", d["decoder_output"])

        # 将填充后的输入和输出转换为张量
        decoder_inputs = torch.tensor([d["decoder_input"] for d in batch], dtype=torch.long)
        decoder_outputs = torch.tensor([d["decoder_output"] for d in batch], dtype=torch.long)



        return decoder_inputs, decoder_outputs  # 返回填充后的输入和输出张量




In [None]:
#@title 自定义数据

%%writefile selfTxt.txt
谢谢你所做的一切
你开心就好
开心

你们宿舍都是这么厉害的人吗
是的
又高又厉害

今天好点了吗？
一天比一天严重
吃药不管用，去打一针。别拖着

是的。下辈子想做只萤火虫
可是萤火虫太容易被抓了还是改一个吧
不，我只想奋不顾身扑火

加油，三月动起来，五月笑起来
正解你为什么就那么厉害呢
哈哈，没办法，智商就是这么高

好身材，秀出来
哈哈哈其实我是胖的
谢谢


## 训练

In [3]:
#@title AverageMeter
class AverageMeter(object):
    """Computes and stores the average and current value"""

    def __init__(self):
        # 初始化函数，设置各变量初始值为None，并调用reset函数进行重置
        self.count = None  # 记录数据的数量
        self.sum = None  # 记录数据的总和
        self.avg = None  # 记录当前的平均值
        self.val = None  # 记录当前值
        self.reset()  # 重置所有变量

    def reset(self):
        # 重置所有变量的值为初始状态
        self.val = 0  # 当前值设为0
        self.avg = 0  # 平均值设为0
        self.sum = 0  # 总和设为0
        self.count = 0  # 数据的数量设为0

    def update(self, val, n=1):
        # 更新函数，用于更新当前值、总和、数量及重新计算平均值
        self.val = val  # 更新当前值为传入的值
        self.sum += val * n  # 根据传入的数量n，将总和增加val * n
        self.count += n  # 更新数据的数量
        self.avg = self.sum / self.count  # 计算新的平均值


In [4]:
#@title 训练过程
import json
import os
import torch
import sys
from torch import nn, optim
import numpy as np
import time
from tqdm import tqdm
from transformers import BertTokenizer

# 添加上级目录到系统路径中
sys.path.append("..")

# 定义训练参数的类
class TrainArgs:
    def __init__(self):
        self.device = "cpu"  # 训练设备（默认为CPU）
        self.batch_size = 4  # 批次大小
        self.epochs = 1  # 训练轮数
        self.print_every = 10  # 每隔多少步打印一次信息
        self.clip = 1  # 梯度裁剪的阈值
        # 训练数据文件路径
        self.train_file_path = "/content/drive/MyDrive/train.txt"
        self.save_path = "GPT2.pt"  # 模型保存路径
        self.lr = 1e-4  # 学习率

# 实例化 TrainArgs 类
train_args = TrainArgs()

# 设置训练参数（这些设置在 TrainArgs 类中已经定义）
train_args.device = "cuda"
train_args.batch_size = 1
train_args.epochs = 10
train_args.print_every = 10
train_args.clip = 1
# train_args.train_file_path = "/content/drive/MyDrive/train.txt"
train_args.train_file_path = "selfTxt.txt"
train_args.save_path = "GPT2.pt"
train_args.lr = 1e-4

# 计算训练时间的函数
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time  # 计算总时间
    elapsed_mins = int(elapsed_time / 60)  # 转换为分钟
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))  # 剩余秒数
    return elapsed_mins, elapsed_secs

# 单步训练函数
def train_step(model, data_loader, epoch, optimizer, criterion, clip=1, print_every=None):
    model.train()  # 设置模型为训练模式

    if print_every is None:
        print_every = 1  # 如果未指定打印频率，则默认每步打印一次

    epoch_loss = 0  # 初始化本轮的总损失
    losses = AverageMeter()  # 用于记录损失的类实例
    temp_time = time.time()  # 记录当前时间

    # 遍历数据加载器中的每个批次
    for step, (dec_inputs, dec_outputs) in enumerate(data_loader):
        '''
        dec_inputs: [batch_size, tgt_len]
        dec_outputs: [batch_size, tgt_len]
        '''
        # print("dec_inputs: ", dec_inputs, "dec_outputs: ", dec_outputs)
        optimizer.zero_grad()  # 清除之前的梯度
        dec_inputs, dec_outputs = dec_inputs.to(device), dec_outputs.to(device)  # 将数据移动到设备上

        # 使用模型进行前向传播，输出：[batch_size * tgt_len, tgt_vocab_size]
        outputs = model(dec_inputs)
        outputs = outputs.logits  # 获取模型的输出
        outputs = outputs.view(-1, outputs.size(-1))  # 调整输出的维度
        loss = criterion(outputs, dec_outputs.view(-1))  # 计算损失
        epoch_loss += loss.item()  # 累加损失
        losses.update(loss.item(), batch_size)  # 更新损失记录

        loss.backward()  # 反向传播计算梯度

        # 梯度裁剪，防止梯度爆炸
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

        optimizer.step()  # 更新模型参数

        # 打印训练进度
        if print_every and (step + 1) % print_every == 0:
            minutes, seconds = epoch_time(temp_time, time.time())
            print('Epoch: [{0}][{1}/{2}] '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  'Elapsed {minutes:s}min {seconds:s}s '
                  .format(epoch, step + 1, len(data_loader),
                          minutes=minutes.__str__(),
                          seconds=seconds.__str__(),
                          loss=losses))
            temp_time = time.time()  # 重置计时器

    return epoch_loss / len(data_loader)  # 返回每轮的平均损失

# 训练函数
def train(model, dataloader, train_args):
    criterion = nn.CrossEntropyLoss(ignore_index=0).to(device)  # 定义损失函数
    lr = train_args.lr  # 学习率
    CLIP = train_args.clip  # 梯度裁剪的阈值
    print_every = train_args.print_every  # 打印频率
    save_path = train_args.save_path  # 模型保存路径
    optimizer = optim.Adam(model.parameters(), lr=lr)  # 定义优化器

    for epoch in range(train_args.epochs):  # 遍历每个训练轮次
        start_time = time.time()  # 记录轮次开始时间
        train_loss = train_step(model, dataloader, epoch, optimizer, criterion, CLIP, print_every=print_every)  # 进行训练
        end_time = time.time()  # 记录轮次结束时间

        torch.save(model.state_dict(), save_path)  # 保存模型参数

        torch.save(model.gpt.state_dict(), f'simpleGPT2.pt')  # 保存模型参数

        epoch_mins, epoch_secs = epoch_time(start_time, end_time)  # 计算本轮时间
        print(f'Epoch: {epoch + 1:02} | Time: {epoch_mins}m {epoch_secs}s')
        print(f'\tTrain Loss: {train_loss:.3f}')  # 打印训练损失

# 打印模型参数的总数和可训练参数的数量
def print_num_parameters(model):
    # 计算总参数数
    total_params = sum(p.numel() for p in model.parameters())
    print(f'{total_params:,} total parameters.')
    # 计算可训练参数数
    total_trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f'{total_trainable_params:,} training parameters.')

# 主函数
if __name__ == '__main__':
    device = train_args.device  # 获取设备
    # 初始化分词器
    tokenizer = BertTokenizer.from_pretrained("uer/gpt2-chinese-cluecorpussmall")
    # torch.save(tokenizer.state_dict(), f'tokenizer.pt')  # 保存模型参数# 保存分词器
    tokenizer.save_pretrained('./tokenizer')  # 保存到指定目录

    epochs = train_args.epochs  # 训练轮次
    batch_size = train_args.batch_size  # 批次大小

    train_file_path = train_args.train_file_path  # 训练数据文件路径
    datas = make_data(train_file_path, tokenizer)  # 处理数据
    dataset = MyDataSet(datas, tokenizer.vocab)  # 创建数据集实例
    dataloader = Data.DataLoader(dataset, batch_size=batch_size, collate_fn=dataset.padding_batch)  # 创建数据加载器

    model = GPT2().to(device)  # 初始化模型，并将其移动到设备上
    train(model, dataloader, train_args)  # 开始训练





GPT2Config {
  "activation_function": "gelu_new",
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "transformers_version": "4.44.2",
  "use_cache": true,
  "vocab_size": 50257
}



OutOfMemoryError: CUDA out of memory. Tried to allocate 20.00 MiB. GPU 0 has a total capacity of 1.95 GiB of which 1.06 MiB is free. Including non-PyTorch memory, this process has 1.94 GiB memory in use. Of the allocated memory 1.76 GiB is allocated by PyTorch, and 135.33 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
#@title 训练参数参考
import argparse

import sys

sys.path.append("..")
# from utils.utils import get_project_rootpath
import os

# checkpoints_dir = os.path.join(get_project_rootpath(), "model_checkpoints")


def train_parse_args():
    parser = argparse.ArgumentParser(description="训练参数配置")
    parser.add_argument("--device", type=str, default="cuda", help="batch size")
    parser.add_argument("--batch_size", type=int, default=4, help="batch size")
    parser.add_argument("--epochs", type=int, default=1, help="epochs")
    parser.add_argument("--print_every", type=int, default=10, help="print every")
    parser.add_argument("--clip", type=int, default=1, help="clip")


    parser.add_argument("--train_file_path", type=str, default=os.path.join("","/content/drive/MyDrive/train.txt"),
                        help="train_file_path")

    parser.add_argument('--save_path', type=str, default=os.path.join("", "GPT2.pt"),
                        help='decay step')
    parser.add_argument('--lr', type=float, default=1e-4, help='learning rate')


    return parser.parse_args()


## 验证



In [None]:
def get_project_rootpath():
    """
    获取项目根目录。此函数的能力体现在，不论当前module被import到任何位置，都可以正确获取项目根目录
    :return:
    """
    path = os.path.realpath(os.curdir)
    while True:
        # PyCharm项目中，'.idea'是必然存在的，且名称唯一
        if '.idea' in os.listdir(path):
            return path
        path = os.path.dirname(path)


In [None]:
#@title validate result

from transformers import BertTokenizer, GPT2LMHeadModel, TextGenerationPipeline
tokenizer = BertTokenizer.from_pretrained("uer/gpt2-chinese-cluecorpussmall")
model = GPT2().to(device)
# 加载模型权重
model.load_state_dict(torch.load('GPT2.pt'))

# 设置模型为评估模式
model.eval()

# 创建文本生成管道
text_generator = TextGenerationPipeline(model.gpt, tokenizer)

# 使用模型进行文本生成
result = text_generator("今天好点了吗？", max_length=100, do_sample=True)
print(result)

# 使用模型进行文本生成
result = text_generator("好身材，秀出来", max_length=100, do_sample=True)
print(result)


In [None]:
#@title validate result
from transformers import BertTokenizer, GPT2LMHeadModel, TextGenerationPipeline
import torch

# 加载tokenizer和模型
tokenizer = BertTokenizer.from_pretrained("uer/gpt2-chinese-cluecorpussmall")
model = GPT2LMHeadModel.from_pretrained("uer/gpt2-chinese-cluecorpussmall").to('cpu')

# 加载训练后的模型权重
# model.load_state_dict(torch.load('GPT2.pt', map_location=torch.device('cpu')))

# 设置模型为评估模式
model.eval()

# 创建文本生成管道
text_generator = TextGenerationPipeline(model, tokenizer)

# 对话互动
history = []

while True:
    # 用户输入
    user_input = input("你: ")

    # 将用户输入加入到对话历史中
    history.append(user_input)

    # 构造输入给模型
    input_text = " ".join(history)

    # 使用模型生成响应
    response = text_generator(input_text, max_length=1000, do_sample=True, top_k=50, top_p=0.95)[0]['generated_text']

    # 提取模型生成的响应
    generated_text = response[len(input_text):].strip()

    # 打印模型的响应
    print("AI:", generated_text)

    # 将模型的响应加入到对话历史中
    history.append(generated_text)

    # 结束对话条件（可选）
    if user_input.lower() in ["exit", "quit", "再见", "拜拜"]:
        print("AI: 再见！")
        break


## 安卓加载pt模型文件

In [None]:
#@title 简单加载pt模型文件

import torch
from transformers import GPT2LMHeadModel, BertTokenizer

# 1. 定义或加载 GPT-2 模型架构
model = GPT2LMHeadModel.from_pretrained("uer/gpt2-chinese-cluecorpussmall")

# 2. 加载保存的模型权重（如果有的话）
model.load_state_dict(torch.load('simpleGPT2.pt'))

# 3. 设置模型为推理模式
model.eval()

# 4. 进行推理示例
tokenizer = BertTokenizer.from_pretrained("/content/tokenizer")
input_text = "你好"
input_ids = tokenizer(input_text, return_tensors='pt')['input_ids']

# 推理
with torch.no_grad():
    outputs = model(input_ids)

# 获取生成的 token ID 序列
generated_ids = torch.argmax(outputs.logits, dim=-1)

# 结果解码
decoded_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

# 打印解码后的文本
print(decoded_text)


In [None]:
!pip install onnx


In [None]:
#@title 转成安卓加载

import torch
from transformers import GPT2LMHeadModel

# 定义并加载 GPT-2 模型架构
model = GPT2LMHeadModel.from_pretrained("uer/gpt2-chinese-cluecorpussmall")

# 将模型转换为 ONNX 格式
model.eval()
dummy_input = torch.randint(0, 1000, (1, 10))  # 修改为适当的输入大小

torch.onnx.export(
    model,
    dummy_input,
    "model.onnx",
    export_params=True,
    opset_version=14,  # 使用较高的 opset 版本
    input_names=['input_ids'],
    output_names=['output']
)




In [None]:
!cp model.onnx /content/drive/MyDrive/