In [4]:
from transformers import AutoTokenizer

# 设置本地模型路径
local_model_path = "models/gpt2-chinese-cluecorpussmall"  # 请替换为你的本地路径

# 加载本地的分词器
tokenizer = AutoTokenizer.from_pretrained(local_model_path)

print(tokenizer)

# 编码试算
encoded = tokenizer.batch_encode_plus([
    '欲出未出光辣达,千山万山如火发.须臾走向天上来,逐却残星赶却月.',
    '满目江山四望幽,白云高卷嶂烟收.日回禽影穿疏木,风递猿声入小楼.远岫似屏横碧落,断帆如叶截中流.'
])

print(encoded)


BertTokenizerFast(name_or_path='models/gpt2-chinese-cluecorpussmall', vocab_size=21128, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
{'input_ids': [[101, 3617, 1139, 3313, 1139, 1045, 6793, 6809, 117, 1283, 2255, 674, 2255, 1963, 4125, 1355,

In [6]:
class Dataset(torch.utils.data.Dataset):

    def __init__(self):
        # 使用utf-8编码打开文件
        with open('chinese_poems.txt', encoding='utf-8') as f:
            lines = f.readlines()
        lines = [i.strip() for i in lines]

        self.lines = lines

    def __len__(self):
        return len(self.lines)

    def __getitem__(self, i):
        return self.lines[i]


dataset = Dataset()

len(dataset), dataset[0]


(304752, '欲出未出光辣达,千山万山如火发.须臾走向天上来,逐却残星赶却月.')

In [7]:
import torch
import os
import pandas as pd


#更多数据数据集
class Dataset(torch.utils.data.Dataset):

    def __init__(self):
        data = []
        for i in os.listdir('more_datas'):
            if i == '.ipynb_checkpoints':
                continue
            data.append(pd.read_csv('more_datas/%s' % i))

        data = pd.concat(data).reset_index()

        data = data['内容']

        data = data.str.strip()

        #移除一些标点符号
        data = data.str.replace('[《》“”「」]', '', regex=True)

        #正则过滤
        select = data.str.match('^[\w，。？、！：；]+$', na=False)
        data = data[select]

        #标点符号合并
        data = data.str.replace('[？！；]', '。', regex=True)
        data = data.str.replace('[、：]', '，', regex=True)

        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, i):
        return self.data.iloc[i]


dataset = Dataset()

len(dataset), dataset[0]

(839587, '云髻高梳鬓不分，扫除虚室事元君。新糊白纸屏风上，尽画蓬莱五色云。')

In [8]:
def collate_fn(data):
    data = tokenizer.batch_encode_plus(data,
                                       padding=True,
                                       truncation=True,
                                       max_length=512,
                                       return_tensors='pt')

    data['labels'] = data['input_ids'].clone()

    return data


#数据加载器
loader = torch.utils.data.DataLoader(
    dataset=dataset,
    batch_size=8,
    collate_fn=collate_fn,
    shuffle=True,
    drop_last=True,
)

for i, data in enumerate(loader):
    break

for k, v in data.items():
    print(k, v.shape)

len(loader)

input_ids torch.Size([8, 158])
token_type_ids torch.Size([8, 158])
attention_mask torch.Size([8, 158])
labels torch.Size([8, 158])


104948

In [11]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# 设置本地模型路径
local_model_path = "models/gpt2-chinese-cluecorpussmall"

# 加载本地预训练模型和tokenizer
model = AutoModelForCausalLM.from_pretrained(local_model_path)
tokenizer = AutoTokenizer.from_pretrained(local_model_path)

# 统计模型参数数量
param_count = sum(i.numel() for i in model.parameters()) / 10000
print(f'Model parameter count (in 10k): {param_count}')

# 准备输入数据
text = "今天的天气真好！"
data = tokenizer(text, return_tensors="pt")

# 在不计算梯度的情况下进行推理
with torch.no_grad():
    out = model(**data)

# 输出logits的形状
print(out.logits.shape)


Model parameter count (in 10k): 10206.8736
torch.Size([1, 10, 21128])


In [14]:
def generate(text, row, col):

    def generate_loop(data):
        with torch.no_grad():
            out = model(**data)

        #取最后一个字
        #[5, b, 50257]
        out = out['logits']
        #[5, 50257]
        out = out[:, -1]

        #第50大的值,以此为分界线,小于该值的全部赋值为负无穷
        #[5, 50257] -> [5, 50]
        topk_value = torch.topk(out, 50).values
        #[5, 50] -> [5] -> [5, 1]
        topk_value = topk_value[:, -1].unsqueeze(dim=1)

        #赋值
        #[5, 50257]
        out = out.masked_fill(out < topk_value, -float('inf'))

        #不允许写特殊符号
        out[:, tokenizer.sep_token_id] = -float('inf')
        out[:, tokenizer.unk_token_id] = -float('inf')
        out[:, tokenizer.pad_token_id] = -float('inf')
        for i in '，。':
            out[:, tokenizer.get_vocab()[i]] = -float('inf')

        #根据概率采样,无放回,所以不可能重复
        #[5, 50257] -> [5, 1]
        out = out.softmax(dim=1)
        out = out.multinomial(num_samples=1)

        #强制添加标点符号
        c = data['input_ids'].shape[1] / (col + 1)
        if c % 1 == 0:
            if c % 2 == 0:
                out[:, 0] = tokenizer.get_vocab()['。']
            else:
                out[:, 0] = tokenizer.get_vocab()['，']

        data['input_ids'] = torch.cat([data['input_ids'], out], dim=1)
        data['attention_mask'] = torch.ones_like(data['input_ids'])
        data['token_type_ids'] = torch.zeros_like(data['input_ids'])
        data['labels'] = data['input_ids'].clone()

        if data['input_ids'].shape[1] >= row * col + row + 1:
            return data

        return generate_loop(data)

    #重复3遍
    data = tokenizer.batch_encode_plus([text] * 3, return_tensors='pt')
    data['input_ids'] = data['input_ids'][:, :-1]
    data['attention_mask'] = torch.ones_like(data['input_ids'])
    data['token_type_ids'] = torch.zeros_like(data['input_ids'])
    data['labels'] = data['input_ids'].clone()

    data = generate_loop(data)

    for i in range(3):
        print(i, tokenizer.decode(data['input_ids'][i]))


generate('秋', row=4, col=5)

0 [CLS] 秋 冬 感 应 的 ， 感 应 是 你 的 。 感 应 的 用 途 ， 感 应 的 用 途 。
1 [CLS] 秋 冬 季 节 交 ， 和 田 子 头 的 。 一 直 都 喜 欢 ， 今 年 的 元 首 。
2 [CLS] 秋 冬 季 节 时 ， 人 们 有 哪 些 。 ？ 秋 冬 季 节 ， 人 们 如 何 享 。


In [19]:
from transformers import AutoTokenizer, GPT2LMHeadModel, AdamW
from transformers.optimization import get_scheduler
import torch

# 1. 设置本地模型路径
local_model_path = "models/gpt2-chinese-cluecorpussmall"

# 2. 加载 GPT2LMHeadModel 和 tokenizer
model = GPT2LMHeadModel.from_pretrained(local_model_path)
tokenizer = AutoTokenizer.from_pretrained(local_model_path)

# 训练数据 loader
# loader = ...

# 训练函数
from tqdm import tqdm

def train():
    global model
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model = model.to(device)

    optimizer = AdamW(model.parameters(), lr=5e-5)
    scheduler = get_scheduler(name='linear',
                              num_warmup_steps=0,
                              num_training_steps=len(loader),
                              optimizer=optimizer)

    model.train()
    
    # 使用 tqdm 包装数据加载器，显示进度条
    for i, data in tqdm(enumerate(loader), total=len(loader)):
        for k in data.keys():
            data[k] = data[k].to(device)
        
        # 前向传播和损失计算
        out = model(**data)
        loss = out.loss

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()

        optimizer.zero_grad()
        model.zero_grad()

        if i % 1000 == 0:
            labels = data['labels'][:, 1:]
            out = out.logits.argmax(dim=2)[:, :-1]

            select = labels != 0
            labels = labels[select]
            out = out[select]
            del select

            accuracy = (labels == out).sum().item() / labels.numel()

            lr = optimizer.state_dict()['param_groups'][0]['lr']

            # 输出当前训练状态
            print(i, loss.item(), lr, accuracy)

    print("Training finished, saving model...")
    torch.save(model, 'save.model')
    print("Model saved successfully.")


# 调用训练函数
train()

  0%|                                                                                                            | 1/104948 [00:03<98:23:39,  3.38s/it]

0 9.139518737792969 4.99995235735793e-05 0.12335526315789473


  0%|                                                                                                          | 28/104948 [01:38<102:47:29,  3.53s/it]


KeyboardInterrupt: 

In [16]:
model = torch.load('save.model')

generate('秋', row=4, col=5)

AttributeError: 'GPT2Model' object has no attribute '_attn_implementation'