In [3]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
input_ids = tokenizer("It was a dark and stormy", return_tensors="pt").input_ids
input_ids



tensor([[1026,  373,  257, 3223,  290, 6388,   88]])

In [4]:
for t in input_ids[0]:
    print(t, "\t:", tokenizer.decode(t))

tensor(1026) 	: It
tensor(373) 	:  was
tensor(257) 	:  a
tensor(3223) 	:  dark
tensor(290) 	:  and
tensor(6388) 	:  storm
tensor(88) 	: y


In [5]:
from transformers import AutoModelForCausalLM

gpt2 = AutoModelForCausalLM.from_pretrained("gpt2")

In [6]:
outputs = gpt2(input_ids)
outputs.logits.shape

torch.Size([1, 7, 50257])

In [7]:
final_logits = gpt2(input_ids).logits[0, -1]
final_logits.argmax()

tensor(1755)

In [8]:
tokenizer.decode(final_logits.argmax())

' night'

In [10]:
import torch

top10_logits = torch.topk(final_logits, 10)

for index in top10_logits.indices:
    print(tokenizer.decode(index))

 night
 day
 evening
 morning
 afternoon
 summer
 time
 winter
 weekend
,


In [11]:
top10 = torch.topk(final_logits.softmax(dim=0), 10) 
for value, index in zip(top10.values, top10.indices): 
    print(f"{tokenizer.decode(index):<10} {value.item():.2%}")

 night     46.18%
 day       23.46%
 evening   5.87%
 morning   4.42%
 afternoon 4.11%
 summer    1.34%
 time      1.33%
 winter    1.22%
 weekend   0.39%
,          0.38%


In [12]:
output_ids = gpt2.generate(input_ids, max_new_tokens=20)
decoded_text = tokenizer.decode(output_ids[0])

print("Input IDs", input_ids[0]) 
print("Output IDs", output_ids) 
print(f"Generated text: {decoded_text}")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input IDs tensor([1026,  373,  257, 3223,  290, 6388,   88])
Output IDs tensor([[ 1026,   373,   257,  3223,   290,  6388,    88,  1755,    13,   383,
          2344,   373, 19280,    11,   290,   262, 15114,   547,  7463,    13,
           383,  2344,   373, 19280,    11,   290,   262]])
Generated text: It was a dark and stormy night. The wind was blowing, and the clouds were falling. The wind was blowing, and the


In [16]:
beam_output = gpt2.generate(
    input_ids, 
    num_beams=5, 
    max_new_tokens=30, 
) 

print(tokenizer.decode(beam_output[0], skip_special_tokens=True))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


It was a dark and stormy night.

"It was dark and stormy," he said.

"It was dark and stormy," he said.




In [17]:
beam_output = gpt2.generate( 
    input_ids, 
    num_beams=5, 
    repetition_penalty=1.2, 
    max_new_tokens=38, 
)
 
print(tokenizer.decode(beam_output[0], skip_special_tokens=True))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


It was a dark and stormy night.

"There was a lot of rain," he said. "It was very cold."

He said he saw a man with a gun in his hand.




In [18]:
from transformers import set_seed 

# Setting the seed ensures we get the same results every time we run this code 
set_seed(70) 

sampling_output = gpt2.generate( 
    input_ids, 
    do_sample=True, 
    max_length=34, 
    top_k=0, # We'll come back to this parameter 
) 

print(tokenizer.decode(sampling_output[0], skip_special_tokens=True))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


It was a dark and stormy day until it broke down the big canvas on my sleep station, making me money dilapidated, and, with a big soothing mug


In [19]:
sampling_output = gpt2.generate( 
    input_ids, 
    do_sample=True, 
    temperature=0.4, 
    max_length=40, 
    top_k=0, 
) 

print(tokenizer.decode(sampling_output[0], skip_special_tokens=True))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


It was a dark and stormy night, and I was alone. I was in the middle of the night, and I was suddenly awakened bygoodness, and I was thinking of the old man


In [23]:
sampling_output = gpt2.generate( 
    input_ids, 
    do_sample=True, 
    temperature=0.001, 
    max_length=40, 
    top_k=0, 
)
 
print(tokenizer.decode(sampling_output[0], skip_special_tokens=True))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


It was a dark and stormy night. The wind was blowing, and the clouds were falling. The wind was blowing, and the clouds were falling. The wind was blowing, and the clouds were


In [20]:
sampling_output = gpt2.generate( 
    input_ids, 
    do_sample=True, 
    temperature=3.0, 
    max_length=40, 
    top_k=0, 
)
 
print(tokenizer.decode(sampling_output[0], skip_special_tokens=True))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


It was a dark and stormyfleet verteutorial took possession contingt containing Carol Rhino soils titsfastKEY 07 Deaths od paradeCONT WEEK Barclays Reviskish6 EdwingarPosition serv blat Imperial licenseium Bot


In [21]:
sampling_output = gpt2.generate( 
    input_ids, 
    do_sample=True, 
    max_length=40, 
    top_k=10, 
) 

print(tokenizer.decode(sampling_output[0], skip_special_tokens=True))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


It was a dark and stormy night and there were some things we didn't understand. I didn't understand the nature of the problem. I was afraid and scared."

In response to the


In [24]:
gpt2.generation_config.pad_token_id = tokenizer.pad_token_id

sampling_output = gpt2.generate( 
    input_ids, 
    do_sample=True, 
    max_length=40, 
    top_p=0.94, 
    top_k=0, 
) 

print(tokenizer.decode(sampling_output[0], skip_special_tokens=True))

It was a dark and stormy night from 7PM. Tony attended the Rangers game with his boys. He overheard them singing. He recorded a video with Jasper in the booth. He rented an empty


In [25]:
tokenizer.encode(" positive"), tokenizer.encode(" negative")

([3967], [4633])

In [26]:
def score(review):
    """Predict whether it is positive or negative

    This function predicts whether a review is positive or negative
    using a bit of clever prompting. It looks at the logits for the
    tokens ' positive' and ' negative' (note the space before the
    words), and returns the label with the highest score.
    """
    prompt = f"""Question: Is the following review positive or
negative about the movie?
Review: {review} Answer:"""

    input_ids = tokenizer(prompt, return_tensors="pt").input_ids  # 对提示词进行分词
    final_logits = gpt2(input_ids).logits[0, -1]  # 从词汇中获取每个词元的logit，注意我们使用的是gpt2()而不是gpt2.generate()，因为前者返回词中的每个分词的logit，而后者仅返回所选定的分词
    if final_logits[3967] > final_logits[4633]:  # 检测正向分词的logit是否大于负向分词的logit
        print("Positive")
    else:
        print("Negative")

In [27]:
score("This movie was terrible!")

Negative


In [28]:
score("That was a delight to watch, 10/10 would recommend :)")

Positive


In [29]:
score("A complex yet wonderful film about the depravity of man") 

Positive


In [30]:
model = AutoModelForCausalLM.from_pretrained("EleutherAI/gpt-neo-1.3B")

In [32]:
model.generation_config.pad_token_id = tokenizer.pad_token_id

prompt = """\
Translate English to Spanish:

English: I do not speak Spanish.
Spanish: No hablo español.

English: See you later!
Spanish: ¡Hasta luego!

English: Where is a good restaurant?
Spanish: ¿Dónde hay un buen restaurante?

English: What rooms do you have available?
Spanish: ¿Qué habitaciones tiene disponibles?

English: I like soccer
Spanish:"""
inputs = tokenizer(prompt, return_tensors="pt").input_ids
output = model.generate(
    inputs,
    do_sample=False,
    max_new_tokens=10,
)

print(tokenizer.decode(output[0], skip_special_tokens=True))

Translate English to Spanish:

English: I do not speak Spanish.
Spanish: No hablo español.

English: See you later!
Spanish: ¡Hasta luego!

English: Where is a good restaurant?
Spanish: ¿Dónde hay un buen restaurante?

English: What rooms do you have available?
Spanish: ¿Qué habitaciones tiene disponibles?

English: I like soccer
Spanish: Me gusta el fútbol




In [33]:
from transformers import pipeline

fill_masker = pipeline(model="bert-base-uncased")
fill_masker("The [MASK] is made of milk.")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[{'score': 0.19546717405319214,
  'token': 9841,
  'token_str': 'dish',
  'sequence': 'the dish is made of milk.'},
 {'score': 0.1290748566389084,
  'token': 8808,
  'token_str': 'cheese',
  'sequence': 'the cheese is made of milk.'},
 {'score': 0.10590700060129166,
  'token': 6501,
  'token_str': 'milk',
  'sequence': 'the milk is made of milk.'},
 {'score': 0.041120897978544235,
  'token': 4392,
  'token_str': 'drink',
  'sequence': 'the drink is made of milk.'},
 {'score': 0.03712378069758415,
  'token': 7852,
  'token_str': 'bread',
  'sequence': 'the bread is made of milk.'}]

In [34]:
from transformers import pipeline

classifier = pipeline(model="distilbert-base-uncased-finetuned-sst-2-english")
classifier("This movie is disgustingly good !")

[{'label': 'POSITIVE', 'score': 0.9998536109924316}]

In [35]:
unmasker = pipeline("fill-mask", model="bert-base-uncased")
result = unmasker("This man works as a [MASK] during summer.")
print([r["token_str"] for r in result])

result = unmasker("This woman works as a [MASK] during summer.")
print([r["token_str"] for r in result])

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


['farmer', 'carpenter', 'gardener', 'fisherman', 'miner']
['maid', 'nurse', 'servant', 'waitress', 'cook']


In [37]:
!pip install sentence_transformers

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting sentence_transformers
  Downloading sentence_transformers-3.0.0-py3-none-any.whl.metadata (10 kB)
Collecting scikit-learn (from sentence_transformers)
  Downloading scikit_learn-1.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting joblib>=1.2.0 (from scikit-learn->sentence_transformers)
  Downloading joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn->sentence_transformers)
  Downloading threadpoolctl-3.5.0-py3-none-any.whl.metadata (13 kB)
Downloading sentence_transformers-3.0.0-py3-none-any.whl (224 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.7/224.7 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading scikit_learn-1.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.3/13.3 MB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDow

In [38]:
from sentence_transformers import SentenceTransformer, util

sentences = ["I'm happy", "I'm full of happiness"]
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Compute embedding for both lists
embedding_1 = model.encode(sentences[0], convert_to_tensor=True)
embedding_2 = model.encode(sentences[1], convert_to_tensor=True)

util.pytorch_cos_sim(embedding_1, embedding_2)



tensor([[0.6003]], device='cuda:0')