#### gemma  text-generation

In [2]:
from transformers import pipeline

prompt = "Hugging Face is a community-based open-source platform for machine learning."
generator = pipeline(task="text-generation", model="models/gemma-2b")
generator(prompt)
prompt = "You are very smart"
generator = pipeline(task="text-generation", model="models/gemma-2b", num_return_sequences=1)
generator(prompt)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

[{'generated_text': 'You are very smart very smart very smart very smart very smart very smart very smart very smart very smart'}]

In [3]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

In [5]:
generator(prompt, num_return_sequences=1)

[{'generated_text': 'You are very smart very smart very smart very smart very smart very smart very smart very smart very smart'}]

#### 设置文本生成最大长度

In [6]:
generator(prompt, num_return_sequences=1, max_length=16)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[{'generated_text': 'You are very smart very smart very smart very smart very smart very smart very smart'}]

### 使用 nlp_bert_backbone_large_std 实现中文补全

In [7]:
from transformers import pipeline

fill_mask = pipeline(task="fill-mask", model="models/nlp_bert_backbone_large_std")

Some weights of BertForMaskedLM were not initialized from the model checkpoint at models/nlp_bert_backbone_large_std and are newly initialized: ['cls.predictions.bias', 'cls.predictions.decoder.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
text = "人民是[MASK]可战胜的"

fill_mask(text, top_k=1)

[{'score': 0.9783893823623657,
  'token': 679,
  'token_str': '不',
  'sequence': '人 民 是 不 可 战 胜 的'}]

In [9]:
text = "美国的首都是[MASK]"

fill_mask(text, top_k=1)

[{'score': 0.6861461997032166,
  'token': 511,
  'token_str': '。',
  'sequence': '美 国 的 首 都 是 。'}]

In [10]:
text = "巴黎是[MASK]国的首都。"
fill_mask(text, top_k=1)

[{'score': 0.9895037412643433,
  'token': 3791,
  'token_str': '法',
  'sequence': '巴 黎 是 法 国 的 首 都 。'}]

In [11]:
text = "美国的首都是[MASK]"
fill_mask(text, top_k=3)

[{'score': 0.6861461997032166,
  'token': 511,
  'token_str': '。',
  'sequence': '美 国 的 首 都 是 。'},
 {'score': 0.27111881971359253,
  'token': 8038,
  'token_str': '：',
  'sequence': '美 国 的 首 都 是 ：'},
 {'score': 0.016462570056319237,
  'token': 131,
  'token_str': ':',
  'sequence': '美 国 的 首 都 是 :'}]

In [12]:
text = "美国的首都是[MASK][MASK][MASK]"

fill_mask(text, top_k=1)

[[{'score': 0.8753869533538818,
   'token': 5294,
   'token_str': '纽',
   'sequence': '[CLS] 美 国 的 首 都 是 纽 [MASK] [MASK] [SEP]'}],
 [{'score': 0.884882926940918,
   'token': 5276,
   'token_str': '约',
   'sequence': '[CLS] 美 国 的 首 都 是 [MASK] 约 [MASK] [SEP]'}],
 [{'score': 0.8815923929214478,
   'token': 511,
   'token_str': '。',
   'sequence': '[CLS] 美 国 的 首 都 是 [MASK] [MASK] 。 [SEP]'}]]

## 使用 AutoClass 高效管理 `Tokenizer` 和 `Model`

### 

In [13]:
from transformers import AutoTokenizer, AutoModel

model_name = "models/nlp_bert_backbone_large_std"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

### 分词

In [14]:
# 第一步：分词
sequence = "美国的首都是华盛顿特区"
tokens = tokenizer.tokenize(sequence)
print(tokens)

['美', '国', '的', '首', '都', '是', '华', '盛', '顿', '特', '区']


In [15]:
# 第二步：映射
token_ids = tokenizer.convert_tokens_to_ids(tokens)

In [16]:
print(token_ids)

[5401, 1744, 4638, 7674, 6963, 3221, 1290, 4670, 7561, 4294, 1277]


In [17]:
token_ids_e2e = tokenizer.encode(sequence)

In [18]:
token_ids_e2e

[101, 5401, 1744, 4638, 7674, 6963, 3221, 1290, 4670, 7561, 4294, 1277, 102]

In [19]:
tokenizer.decode(token_ids)

'美 国 的 首 都 是 华 盛 顿 特 区'

In [20]:
tokenizer.decode(token_ids_e2e)

'[CLS] 美 国 的 首 都 是 华 盛 顿 特 区 [SEP]'

### muti text

In [21]:
sequence_batch = ["美国的首都是华盛顿特区", "中国的首都是北京"]

In [22]:
token_ids_batch = tokenizer.encode(sequence_batch)

In [23]:
tokenizer.decode(token_ids_batch)

'[CLS] 美 国 的 首 都 是 华 盛 顿 特 区 [SEP] 中 国 的 首 都 是 北 京 [SEP]'

### i

In [24]:
embedding_batch = tokenizer("美国的首都是华盛顿特区", "中国的首都是北京")
print(embedding_batch)

{'input_ids': [101, 5401, 1744, 4638, 7674, 6963, 3221, 1290, 4670, 7561, 4294, 1277, 102, 704, 1744, 4638, 7674, 6963, 3221, 1266, 776, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [25]:
# 优化下输出结构
for key, value in embedding_batch.items():
    print(f"{key}: {value}\n")

input_ids: [101, 5401, 1744, 4638, 7674, 6963, 3221, 1290, 4670, 7561, 4294, 1277, 102, 704, 1744, 4638, 7674, 6963, 3221, 1266, 776, 102]

token_type_ids: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1]

attention_mask: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]



In [26]:
len(tokenizer.vocab.keys())

21128

In [27]:
from itertools import islice

# 使用 islice 查看词表部分内容
for key, value in islice(tokenizer.vocab.items(), 10):
    print(f"{key}: {value}")

##赁: 19652
##肯: 18564
鑾: 7149
镕: 7258
30g: 12008
benz: 13065
叛: 1361
##墅: 14920
牺: 4295
she: 9374


In [28]:
new_tokens = ["天干", "地支"]

In [29]:
# 将集合作差结果添加到词表中
new_tokens = set(new_tokens) - set(tokenizer.vocab.keys())

In [None]:
new_tokens

In [None]:
tokenizer.add_tokens(list(new_tokens))

In [None]:
# 新增加了2个Token，词表总数由 21128 增加到 21130
len(tokenizer.vocab.keys())

In [None]:
new_special_token = {"sep_token": "NEW_SPECIAL_TOKEN"}

In [None]:
tokenizer.add_special_tokens(new_special_token)

In [None]:
# 新增加了1个特殊Token，词表总数由 21128 增加到 21131
len(tokenizer.vocab.keys())

### save new models

In [None]:
tokenizer.save_pretrained("./models/new-nlp_bert_backbone_large_std")

In [None]:
model.save_pretrained("./models/new-nlp_bert_backbone_large_std")