In [1]:
from transformers import pipeline

# Build a sentiment-analysis pipeline without naming a checkpoint, so the
# library falls back to its default model for the task (see the notice printed
# below this cell). Name a model and revision explicitly outside of demos.
classifier = pipeline(task="sentiment-analysis")

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [2]:
# Score a single sentence; the result is a list holding one
# {'label', 'score'} dict for the input.
greeting = "We are very happy to show you the 🤗 Transformers library."
classifier(greeting)

[{'label': 'POSITIVE', 'score': 0.9997795224189758}]

In [4]:
# Multiple inputs: pass a list of sentences and get one result per sentence.

sentences = [
    "We hope you don't hate it.",
    "That's disgusting.",
]
results = classifier(sentences)

for result in results:
    label = result["label"]
    score = round(result["score"], 4)  # keep four decimals for readability
    print(f"label: {label}, score: {score}")

label: NEGATIVE, score: 0.5309
label: NEGATIVE, score: 0.9996


In [7]:
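# The cells above run text sentiment analysis with the pipeline's default model.
# pipeline() is the factory call that instantiates a ready-to-use model for a given task.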

In [8]:
# 2. Automatic speech recognition (audio -> text)
# Here a specific model checkpoint is named and instantiated.

import torch
from transformers import pipeline

speech_recon = pipeline(
    task="automatic-speech-recognition",
    model="facebook/wav2vec2-base-960h",
)

config.json:   0%|          | 0.00/1.60k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


model.safetensors:   0%|          | 0.00/378M [00:00<?, ?B/s]

Some weights of the model checkpoint at facebook/wav2vec2-base-960h were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.masked_spec_embed']
You sho

tokenizer_config.json:   0%|          | 0.00/163 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/291 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/159 [00:00<?, ?B/s]

In [9]:
# Load the audio data

from datasets import load_dataset, Audio


# NOTE(review): the loader output below asks for `trust_remote_code=True` in
# future `datasets` releases - confirm against the installed version.
dataset = load_dataset("PolyAI/minds14", name="en-US", split="train")

# The sampling rate of the loaded audio must match the rate the model was
# trained with, so resample the "audio" column to the rate the pipeline's
# feature extractor reports.
target_rate = speech_recon.feature_extractor.sampling_rate
dataset = dataset.cast_column("audio", Audio(sampling_rate=target_rate))

# Run speech recognition on the first 4 samples
first_clips = dataset[:4]["audio"]
result = speech_recon(first_clips)


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/5.90k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/5.29k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/471M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [12]:
# Show only the transcribed text of each recognition result.
transcripts = [entry["text"] for entry in result]
print(transcripts)

['I WOULD LIKE TO SET UP A JOINT ACCOUNT WITH MY PARTNER HOW DO I PROCEED WITH DOING THAT', "FONDERING HOW I'D SET UP A JOIN TO HELL T WITH MY WIFE AND WHERE THE AP MIGHT BE", "I I'D LIKE TOY SET UP A JOINT ACCOUNT WITH MY PARTNER I'M NOT SEEING THE OPTION TO DO IT ON THE APSO I CALLED IN TO GET SOME HELP CAN I JUST DO IT OVER THE PHONE WITH YOU AND GIVE YOU THE INFORMATION OR SHOULD I DO IT IN THE AP AN I'M MISSING SOMETHING UQUETTE HAD PREFERRED TO JUST DO IT OVER THE PHONE OF POSSIBLE THINGS", 'HOW DO I FURN A JOINA COUT']


In [18]:
# 3. Tokenizer
# Load the model and its tokenizer from the same checkpoint name and hand both
# to the pipeline explicitly.

from transformers import AutoTokenizer, AutoModelForSequenceClassification

model_name = "nlptown/bert-base-multilingual-uncased-sentiment"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

classifier = pipeline(task="sentiment-analysis", model=model, tokenizer=tokenizer)
classifier("Hugging Face is a French company based in New York City.")

[{'label': '3 stars', 'score': 0.2712884843349457}]

In [14]:
# The tokenizer is responsible for preprocessing text into arrays of numbers
# that serve as the model's input. Several rules govern tokenization, including
# how words are split and at what level they are split.
# Most importantly, instantiate the tokenizer with the same model name as the
# model, so the same tokenization rules are used as during pretraining.

from transformers import AutoTokenizer

model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

sample_sentence = "We are very happy to show you the Transformers library."
encoding = tokenizer(sample_sentence)
print(encoding)

{'input_ids': [101, 11312, 10320, 12495, 19308, 10114, 11391, 10855, 10103, 58263, 13299, 119, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [15]:
# The tokenizer returns a dictionary containing:
# input_ids: the numeric representation of the tokens.
# token_type_ids: also present in the printed output (all zeros here).
# attention_mask: indicates which tokens should be attended to.

short_text = "We are very happy"
encoding = tokenizer(short_text)
print(encoding)

{'input_ids': [101, 11312, 10320, 12495, 19308, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1]}


In [16]:
# The tokenizer also accepts a list of inputs, and can pad and truncate the
# texts so that a batch of uniform length is returned:
batch_sentences = [
    "We are very happy to show you the 🤗 Transformers library.",
    "We hope you don't hate it.",
]
pt_batch = tokenizer(
    batch_sentences,
    padding=True,
    truncation=True,
    max_length=512,
    return_tensors="pt",  # return PyTorch tensors
)

In [17]:
# Inspect the batch: in the printed output the shorter sentence is filled up
# with 0 ids and its attention_mask is 0 at those padded positions.
print(pt_batch)

{'input_ids': tensor([[  101, 11312, 10320, 12495, 19308, 10114, 11391, 10855, 10103,   100,
         58263, 13299,   119,   102],
        [  101, 11312, 18763, 10855, 11530,   112,   162, 39487, 10197,   119,
           102,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]])}


In [19]:
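# 4. AutoModel
# Transformers provides a simple, unified way to load pretrained instances: an AutoModel is loaded just like an AutoTokenizer.
# The only difference is choosing the right AutoModel for the task. For text (or sequence) classification, load AutoModelForSequenceClassification: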

In [20]:
import torch
from torch import nn
from transformers import AutoModelForSequenceClassification

model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
pt_model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Pass the preprocessed batch straight to the model; `**` unpacks the
# dictionary into keyword arguments. This is inference only, so run it under
# torch.no_grad() to avoid building an autograd graph for the outputs.
with torch.no_grad():
    pt_outputs = pt_model(**pt_batch)

# The model exposes its final activations in the `logits` attribute. Apply
# softmax over the last dimension to turn the logits into probabilities.
pt_predictions = nn.functional.softmax(pt_outputs.logits, dim=-1)
print(pt_predictions)

tensor([[0.0021, 0.0018, 0.0115, 0.2121, 0.7725],
        [0.2084, 0.1826, 0.1969, 0.1755, 0.2365]], grad_fn=<SoftmaxBackward0>)


In [22]:
# 5. Save the model
# The tokenizer and the model are written to the same directory.

pt_save_directory = "./pt_save_pretrained"
for artifact in (tokenizer, pt_model):
    artifact.save_pretrained(pt_save_directory)

In [23]:
# Load the model back from disk.
# Reuse `pt_save_directory` (set in the save cell) instead of repeating the
# path literal, so the save and load locations cannot drift apart.

pt_model = AutoModelForSequenceClassification.from_pretrained(pt_save_directory)

In [24]:
# 6. Building a custom model
# You can modify a model's configuration class to change how the model is
# built. The configuration specifies model attributes such as the number of
# hidden layers or attention heads. A model initialized from a custom
# configuration starts from scratch: its attributes are randomly initialized,
# and it must be trained before it can give meaningful results.

from transformers import AutoConfig, AutoModel

# (1) Load the configuration of the pretrained model to modify. Attributes to
# change, such as the number of attention heads, are passed to
# AutoConfig.from_pretrained():
my_config = AutoConfig.from_pretrained(
    "distilbert/distilbert-base-uncased",
    n_heads=12,
)

# (2) Create the model from the custom configuration with AutoModel.from_config():
my_model = AutoModel.from_config(my_config)

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [10]:
# 7. Trainer - a PyTorch-optimized training loop

# All models are standard torch.nn.Module objects, so they can be used in any
# typical training loop. You can write your own loop, but Transformers provides
# a Trainer class for PyTorch that contains the basic training loop and adds
# support for features such as distributed training and mixed precision.

from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    TrainingArguments,
)

# Model and tokenizer are loaded from the same checkpoint name.
base_checkpoint = "distilbert/distilbert-base-uncased"

# (1) Load the pretrained model
model = AutoModelForSequenceClassification.from_pretrained(base_checkpoint)

# (2) Set the training hyperparameters, such as learning rate, batch size and
# number of epochs. Any training argument left unspecified keeps its default.
hyperparameters = {
    "output_dir": "quick_tour_train_folder/",
    "learning_rate": 2e-5,
    "per_device_train_batch_size": 8,
    "per_device_eval_batch_size": 8,
    "num_train_epochs": 2,
}
training_args = TrainingArguments(**hyperparameters)

# (3) Load the preprocessing class, e.g. a tokenizer, image processor, feature
# extractor or processor
tokenizer = AutoTokenizer.from_pretrained(base_checkpoint)

# (4) Load the dataset

from datasets import load_dataset

dataset = load_dataset("rotten_tomatoes")  # doctest: +IGNORE_RESULT

# (5) Preprocess the dataset: define a tokenization function, then use map()
# to apply it to the entire dataset.

def tokenize_dataset(dataset):
    """Tokenize the "text" field of `dataset` with the notebook-level `tokenizer`.

    Called by `map(..., batched=True)` below; the parameter shadows the
    notebook-level `dataset` variable inside this function only.
    """
    texts = dataset["text"]
    return tokenizer(texts)

dataset = dataset.map(tokenize_dataset, batched=True)

# (6) Create batches of examples from the dataset. Per the Transformers docs,
# DataCollatorWithPadding pads the tokenized features of each batch to a
# common length and returns them as tensors.

from transformers import DataCollatorWithPadding, Trainer

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# (7) Start training the model

trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)
trainer = Trainer(**trainer_kwargs)  # doctest: +SKIP

trainer.train()



Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
500,0.452
1000,0.3853
1500,0.2638
2000,0.2801


Checkpoint destination directory quick_tour_train_folder/checkpoint-500 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory quick_tour_train_folder/checkpoint-1000 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory quick_tour_train_folder/checkpoint-1500 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory quick_tour_train_folder/checkpoint-2000 already exists and is non-empty. Saving will proceed but saved results may be invalid.


Exception: Impossible to guess which tokenizer to use. Please provide a PreTrainedTokenizer class or a path/identifier to a pretrained tokenizer.

In [12]:
# Write the current `tokenizer` and `model` into one directory.
# NOTE(review): presumably `model` is the one trained in the Trainer cell -
# confirm the execution order before relying on this checkpoint.
pt_save_directory_01 = "./pt_save_pretrained_01"
for artifact in (tokenizer, model):
    artifact.save_pretrained(pt_save_directory_01)

In [7]:
from transformers import DistilBertTokenizer, DistilBertModel

# Load the tokenizer and the DistilBertModel from the same hub checkpoint.
hub_id = "distilbert-base-uncased"
tokenizer = DistilBertTokenizer.from_pretrained(hub_id)
model = DistilBertModel.from_pretrained(hub_id)

# Encode one sentence as PyTorch tensors and run a forward pass.
text = "Replace me by any text you'd like."
encoded_input = tokenizer(text, return_tensors="pt")
output = model(**encoded_input)

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

In [8]:
# Display the forward-pass result; a bare last expression is rendered as its
# repr (a BaseModelOutput holding `last_hidden_state`, as shown below).
output

BaseModelOutput(last_hidden_state=tensor([[[ 4.4111e-04, -2.6241e-01, -1.0192e-01,  ..., -6.2764e-02,
           2.7584e-01,  3.7014e-01],
         [ 7.2233e-01,  1.6449e-01,  4.0025e-01,  ...,  1.9161e-01,
           4.0458e-01, -5.8094e-02],
         [ 2.8198e-01, -1.7430e-01,  3.9075e-02,  ...,  2.7681e-02,
           1.1886e-01,  9.1439e-01],
         ...,
         [ 6.8016e-01,  7.9712e-02,  8.3603e-01,  ..., -4.8959e-01,
          -2.5017e-01, -2.3519e-01],
         [ 3.8105e-02, -8.1751e-01, -3.4076e-01,  ...,  4.4815e-01,
           9.6726e-02, -2.0311e-01],
         [ 3.5750e-01,  1.9968e-01,  1.7437e-01,  ...,  1.5028e-01,
          -2.3665e-01,  5.4390e-02]]], grad_fn=<NativeLayerNormBackward0>), hidden_states=None, attentions=None)

In [14]:
from transformers import DistilBertTokenizer, DistilBertModel

# Same forward pass as with the hub checkpoint, but loading the tokenizer and
# the model from the locally saved directory.
# NOTE(review): that directory was written from the `model` variable in the
# save cell; if that was the sequence-classification model, DistilBertModel
# presumably loads only the encoder weights - confirm this is intended.
local_dir = "./pt_save_pretrained_01"
tokenizer = DistilBertTokenizer.from_pretrained(local_dir)
model = DistilBertModel.from_pretrained(local_dir)

text = "Replace me by any text you'd like."
encoded_input = tokenizer(text, return_tensors="pt")
output = model(**encoded_input)

In [15]:
# Display the result of the forward pass through the locally loaded model;
# the values differ from the hub-checkpoint output shown earlier.
output

BaseModelOutput(last_hidden_state=tensor([[[ 1.3635e-01,  6.5187e-01,  2.1305e-01,  ...,  3.8233e-01,
           5.2273e-01,  1.1455e+00],
         [ 6.8016e-01,  1.0462e+00,  6.2537e-01,  ...,  3.7748e-01,
           6.6270e-01,  2.6323e-01],
         [ 1.1680e-03,  5.2970e-01,  1.6268e-01,  ...,  3.8032e-01,
           6.2372e-01,  1.2062e+00],
         ...,
         [ 3.6534e-01,  2.7263e-01,  7.4363e-01,  ..., -4.2071e-01,
           7.0869e-02,  2.4367e-01],
         [-1.1648e-01,  2.2981e-01, -1.7811e-01,  ...,  6.2127e-01,
           2.8514e-01,  7.1928e-01],
         [-2.4972e-01,  7.3815e-01,  2.3068e-01,  ...,  5.7425e-02,
           1.5427e-01,  5.3942e-01]]], grad_fn=<NativeLayerNormBackward0>), hidden_states=None, attentions=None)

In [None]:
# `output` is a BaseModelOutput (see its repr in the previous cell), not a
# callable, so `output()` raises TypeError. Inspect the tensor it carries
# instead; showing the shape avoids dumping the whole tensor.
output.last_hidden_state.shape