In [1]:
!pip uninstall pyarrow -q
!pip install pyarrow -q
!pip install transformers -q
!pip install datasets -q
!pip install evaluate -q

Proceed (Y/n)? Y
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m39.9/39.9 MB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
cudf-cu12 24.6.1 requires pyarrow<16.2.0a0,>=16.1.0, but you have pyarrow 17.0.0 which is incompatible.[0m[31m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m2

In [2]:
from transformers import pipeline, AutoTokenizer, AutoModel, AutoModelForSequenceClassification, AutoModelForCausalLM
from transformers import BertModel, BertConfig, BertTokenizer
from transformers import AdamW
from transformers import DataCollatorWithPadding
from transformers import Trainer, TrainingArguments
from datasets import load_dataset
import evaluate

import torch
from torch.utils.data import DataLoader

import numpy as np

from tqdm.auto import tqdm

Chapter 1: Transformer Models

In [6]:
model = pipeline(task="text-generation", model='distilgpt2')

In [7]:
model('Large Language Models work by ')

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'Large Language Models work by ickyjain.org on Language Models for Learning.\n\n\nWe are here to help with one of more important languages (learning grammar). The English-language models are already in the development stages and have great potential'}]

Chapter 2: Using HuggingFace Transformers

In [8]:
checkpoint = 'distilbert-base-uncased-finetuned-sst-2-english'
print(checkpoint)

distilbert-base-uncased-finetuned-sst-2-english


In [9]:
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

In [10]:
raw_text = ["I am really, really very tired", "This sux", "This rox"]
model_inputs = tokenizer(raw_text, padding=True, return_tensors='pt')
outputs = model(**model_inputs)
print(outputs.logits.shape)


torch.Size([3, 2])


In [11]:
out = torch.nn.functional.softmax(outputs.logits, dim=-1)
print(out)

tensor([[9.9975e-01, 2.4557e-04],
        [9.1807e-01, 8.1928e-02],
        [4.6261e-02, 9.5374e-01]], grad_fn=<SoftmaxBackward0>)


In [13]:
config = BertConfig()
model = BertModel(config)

In [15]:
checkpoint = 'bert-base-cased'
model = BertModel.from_pretrained(checkpoint)

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

In [17]:
tokenizer = BertTokenizer.from_pretrained(checkpoint)

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]



In [18]:
tokenizer('Using a Transformer network is simple')

{'input_ids': [101, 7993, 170, 13809, 23763, 2443, 1110, 3014, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [19]:
sequence = 'Using a Transformer network is simple'

In [21]:
tokens = tokenizer.tokenize(sequence)
print(tokens)

['Using', 'a', 'Trans', '##former', 'network', 'is', 'simple']


In [22]:
ids = tokenizer.convert_tokens_to_ids(tokens)
print(ids)

[7993, 170, 13809, 23763, 2443, 1110, 3014]


In [24]:
words = tokenizer.decode(ids)
print(words)

Using a Transformer network is simple


In [25]:
model_inputs = tokenizer(sequence, return_tensors='pt')
model_inputs

{'input_ids': tensor([[  101,  7993,   170, 13809, 23763,  2443,  1110,  3014,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [26]:
model(**model_inputs)

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[ 0.6153,  0.1568,  0.0985,  ..., -0.0759,  0.2038,  0.0482],
         [ 0.2562,  0.1889,  0.2532,  ...,  0.2686,  0.1463,  0.1009],
         [ 0.3518,  0.1104, -0.2133,  ...,  0.6982,  0.3967,  0.0754],
         ...,
         [ 0.2848,  0.2423,  0.6766,  ...,  0.1633, -0.2540,  0.6792],
         [ 0.4576,  0.1060,  0.1058,  ..., -0.1095, -0.3214,  0.0611],
         [ 1.2791,  0.8940,  0.2299,  ...,  0.3554,  0.1184, -0.6299]]],
       grad_fn=<NativeLayerNormBackward0>), pooler_output=tensor([[-0.6330,  0.4648,  0.9998, -0.9881,  0.9364,  0.8437,  0.9561, -0.9878,
         -0.9470, -0.6955,  0.9649,  0.9972, -0.9978, -0.9997,  0.7242, -0.9568,
          0.9893, -0.5918, -0.9999, -0.7342, -0.4607, -0.9997,  0.2724,  0.9573,
          0.9318,  0.0596,  0.9829,  0.9999,  0.8708, -0.4161,  0.2261, -0.9891,
          0.7678, -0.9985,  0.1639,  0.1432,  0.6955, -0.2830,  0.7901, -0.9460,
         -0.7101, -0.8831,  0.50

In [11]:
def generate_output(checkpoint: 'str', input_text: 'str'):
  tokenizer = AutoTokenizer.from_pretrained(checkpoint)
  model = AutoModelForCausalLM.from_pretrained(checkpoint)
  model_inputs = tokenizer(input_text, return_tensors='pt')
  return model(**model_inputs)

In [12]:
generate_output('openai-community/gpt2', "Large Language Models work by ")

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

CausalLMOutputWithCrossAttentions(loss=None, logits=tensor([[[ -28.5177,  -27.4272,  -30.4489,  ...,  -34.5793,  -34.9349,
           -28.1923],
         [ -91.4501,  -88.7579,  -91.6674,  ..., -100.2363, -100.8487,
           -90.9075],
         [ -96.9319,  -95.0614,  -98.9610,  ..., -109.1990, -106.9119,
           -95.6563],
         [-107.9515, -110.0738, -115.3217,  ..., -118.0129, -119.5956,
          -109.5073],
         [-114.6116, -114.0121, -118.7762,  ..., -115.8259, -118.1942,
          -113.8287],
         [ -73.8617,  -74.1852,  -76.2654,  ...,  -78.9766,  -81.2103,
           -76.1828]]], grad_fn=<UnsafeViewBackward0>), past_key_values=((tensor([[[[-1.2234e+00,  1.8855e+00,  5.7002e-01,  ..., -9.9190e-01,
           -8.3353e-01,  9.4686e-01],
          [-2.5999e+00,  2.4775e+00,  7.0382e-02,  ..., -4.9635e-01,
           -4.7001e-01,  7.0825e-01],
          [-2.0318e+00,  2.4009e+00,  1.4435e+00,  ...,  9.6439e-02,
           -1.1927e+00,  1.2404e+00],
          [-2.986

#Chapter 3: Fine-Tuning A Pretrained Model

In [4]:
raw_datasets = load_dataset(path='glue', name='mrpc')
raw_datasets

train-00000-of-00001.parquet:   0%|          | 0.00/649k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/75.7k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/308k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3668 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/408 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1725 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
})

In [5]:
raw_train_dataset = raw_datasets["train"]
raw_train_dataset[0]

{'sentence1': 'Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .',
 'sentence2': 'Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .',
 'label': 1,
 'idx': 0}

In [8]:
classifier("Je pense rien")

[{'label': 'POSITIVE', 'score': 0.8128553032875061}]

In [10]:
classifier = pipeline("zero-shot-classification")

No model was supplied, defaulted to facebook/bart-large-mnli and revision c626438 (https://huggingface.co/facebook/bart-large-mnli).
Using a pipeline without specifying a model name and revision in production is not recommended.


config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



In [11]:
classifier("J'ai pas du temps", candidate_labels=["english", "french", "german"])

{'sequence': "J'ai pas du temps",
 'labels': ['french', 'english', 'german'],
 'scores': [0.8471136093139648, 0.1312875598669052, 0.021598881110548973]}

In [16]:
generator = pipeline(task='text-generation', model='distilgpt2')

config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



In [18]:
generator("Large Language Models work by")

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'Large Language Models work by connecting the two components to one another, using a combination of one single language and a language. It is then used to perform a computation of the three or more languages with various components: the parser and the parser language. The first implementation of this approach is a parser in the Scala language, which also implements the syntax for the parser that the parser uses. The parser language is called a lambda with lambda in the syntax (see the text provided below), using different methods, including the compiler and then the parser language, in the Scala language.\n\nHere are the three functions being used to define the parser in a standard Scala language:\nThe example code below is adapted from the Scala language. In Scala, there would be two different languages in the language.\nThe lambda syntax is based on some of the terms in the example (and one another), and is based on any number of other language.\nListing 1 :: (1, 

In [3]:
raw_datasets = load_dataset("glue", "mrpc")
raw_datasets

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/35.3k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/649k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/75.7k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/308k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3668 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/408 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1725 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
})

In [4]:
raw_datasets["train"][0]["sentence1"], raw_datasets["train"][0]["sentence2"]

('Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .',
 'Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .')

In [5]:
checkpoint = 'bert-base-uncased'

In [6]:
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
tokenized_dataset = tokenizer(raw_datasets["train"]["sentence1"], raw_datasets["train"]["sentence1"], padding=True, truncation=True)

In [9]:
tokenized_dataset.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [10]:
def tokenize_function(example):
  return tokenizer(example["sentence1"], example["sentence2"], truncation=True)

In [11]:
tokenized_datasets =raw_datasets.map(tokenize_function, batched=True)

Map:   0%|          | 0/3668 [00:00<?, ? examples/s]

Map:   0%|          | 0/408 [00:00<?, ? examples/s]

Map:   0%|          | 0/1725 [00:00<?, ? examples/s]

In [12]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1725
    })
})

In [13]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [14]:
training_args = TrainingArguments("test-trainer", eval_strategy='epoch')

In [15]:
def compute_metrics(eval_preds):
  metric = evaluate.load("glue", "mrpc")
  logits, labels = eval_preds
  predictions = np.argmax(logits, axis=-1)
  return metric.compute(predictions=predictions, references=labels)

In [16]:
trainer = Trainer(model,
                  training_args,
                  train_dataset = tokenized_datasets["train"],
                  eval_dataset=tokenized_datasets["validation"],
                  data_collator=data_collator,
                  tokenizer=tokenizer,
                  compute_metrics=compute_metrics)

In [17]:
trainer.train()

Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [18]:
tokenized_datasets = tokenized_datasets.remove_columns(["sentence1", "sentence2", "idx"])
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets.set_format("torch")

In [19]:
tokenized_datasets["train"].column_names

['labels', 'input_ids', 'token_type_ids', 'attention_mask']

In [21]:
train_dataloader = DataLoader(tokenized_datasets["train"], shuffle=True, batch_size=8, collate_fn=data_collator)
eval_dataloader = DataLoader(tokenized_datasets["validation"], shuffle=False, batch_size=8, collate_fn=data_collator)

In [24]:
batch = next(iter(train_dataloader))
for k,v in batch.items():
  print(k, v.shape)

labels torch.Size([8])
input_ids torch.Size([8, 72])
token_type_ids torch.Size([8, 72])
attention_mask torch.Size([8, 72])


In [25]:
outputs = model(**batch)
outputs.loss, outputs.logits.shape

(tensor(0.6483, grad_fn=<NllLossBackward0>), torch.Size([8, 2]))

In [26]:
outputs

SequenceClassifierOutput(loss=tensor(0.6483, grad_fn=<NllLossBackward0>), logits=tensor([[-0.5202,  0.4605],
        [-0.6166,  0.5362],
        [-0.5159,  0.2246],
        [-0.6418,  0.3636],
        [-0.6553,  0.5790],
        [-0.4572,  0.5843],
        [-0.2727,  0.3627],
        [-0.4342,  0.4530]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [27]:
opt = torch.optim.AdamW(model.parameters(), lr=1e-5)

In [30]:
num_epochs = 3
num_training_steps = 1377
progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
  for batch in train_dataloader:
    outputs = model(**batch)
    loss = outputs.loss
    loss.backward()

    opt.step()
    opt.zero_grad()
    progress_bar.upate(1)

  0%|          | 0/1377 [00:00<?, ?it/s]

AttributeError: 'tqdm' object has no attribute 'upate'

In [22]:
tokenized_sentences = tokenizer(raw_datasets["train"]["sentence1"],
                                raw_datasets["train"]["sentence2"],
                                padding=True,
                                truncation=True)

In [19]:
tokenized_sentences1 = tokenizer(sentences1)
tokenized_sentences2 = tokenizer(sentences2)

In [6]:
sequences = ["This is terrible.",
             "I despise this.",
             "I am confused.",
             "great work!"]

batch = tokenizer(sequences, padding=True, truncation=True, return_tensors='pt')

In [8]:
batch["labels"] = torch.tensor([0, 0, 0, 1])
print(batch.keys())

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'labels'])


In [11]:
opt = torch.optim.AdamW(model.parameters())
loss = model(**batch).loss
loss.backward()
print(loss.detach().cpu().item())
opt.step()

3.6277594566345215


In [8]:
config = BertConfig()
model = BertModel(config)

In [10]:
print(config)

BertConfig {
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.44.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}



In [12]:
model = BertModel.from_pretrained("bert-base-cased")

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

In [13]:
sequences = ["Hello!", "Cool.", "Nice!"]

In [15]:
checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]



In [17]:
encoded_sequences = tokenizer(sequences, padding=True, truncation=True, return_tensors='pt')
encoded_sequences

{'input_ids': tensor([[  101,  8667,   106,   102],
        [  101, 13297,   119,   102],
        [  101,  8835,   106,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0],
        [0, 0, 0, 0],
        [0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1],
        [1, 1, 1, 1],
        [1, 1, 1, 1]])}

In [20]:
outputs = model(**encoded_sequences)
outputs

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[ 0.6283,  0.2166,  0.5605,  ...,  0.0136,  0.6158, -0.1712],
         [ 0.6108, -0.2253,  0.9263,  ..., -0.3028,  0.4500, -0.0714],
         [ 0.8040,  0.1809,  0.7076,  ..., -0.0685,  0.4837, -0.0774],
         [ 1.3290,  0.2360,  0.4567,  ...,  0.1509,  0.9621, -0.4841]],

        [[ 0.3128,  0.1718,  0.2099,  ..., -0.0721,  0.4919, -0.1383],
         [ 0.1545, -0.3757,  0.7187,  ..., -0.3130,  0.2822,  0.1883],
         [ 0.4123,  0.3721,  0.5484,  ...,  0.0788,  0.5681, -0.2757],
         [ 0.8356,  0.3964, -0.4121,  ...,  0.1838,  1.6365, -0.4806]],

        [[ 0.5399,  0.2564,  0.2511,  ..., -0.1760,  0.6063, -0.1803],
         [ 0.2609, -0.3164,  0.5548,  ..., -0.3439,  0.3909,  0.0900],
         [ 0.5161,  0.0721,  0.5606,  ...,  0.0077,  0.3685, -0.2272],
         [ 0.6560,  0.8475, -0.1606,  ..., -0.0468,  1.6309, -0.5047]]],
       grad_fn=<NativeLayerNormBackward0>), pooler_output=tensor([[-0.7105,  0.

In [21]:
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)



In [22]:
sequence = "I've been waiting for a HuggingFace course my whole life."

In [23]:
tokens = tokenizer.tokenize(sequence)
print(tokens)
ids = tokenizer.convert_tokens_to_ids(tokens)
print(ids)
words = tokenizer.convert_ids_to_tokens(ids)
print(words)

['i', "'", 've', 'been', 'waiting', 'for', 'a', 'hugging', '##face', 'course', 'my', 'whole', 'life', '.']
[1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012]
['i', "'", 've', 'been', 'waiting', 'for', 'a', 'hugging', '##face', 'course', 'my', 'whole', 'life', '.']


In [25]:
input = torch.tensor(ids)
input_batch = torch.stack([input, input])
input_batch.shape

torch.Size([2, 14])

In [27]:
outs = model(input_batch)
outs

SequenceClassifierOutput(loss=None, logits=tensor([[-2.7276,  2.8789],
        [-2.7276,  2.8789]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [4]:
raw_inputs = [
        "I've been waiting for a HuggingFace course my whole life.",
        "I hate this so much!",
    ]

inputs = tokenizer(raw_inputs, padding=True, truncation=True, return_tensors='pt')
print(inputs)

{'input_ids': tensor([[  101,  1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,
          2607,  2026,  2878,  2166,  1012,   102],
        [  101,  1045,  5223,  2023,  2061,  2172,   999,   102,     0,     0,
             0,     0,     0,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]])}


In [5]:
outputs = model(**inputs)
with torch.inference_mode():
  fin = torch.nn.functional.softmax(outputs.logits, dim=-1)
print(fin)

tensor([[4.0195e-02, 9.5980e-01],
        [9.9946e-01, 5.4418e-04]])


In [6]:
outputs

SequenceClassifierOutput(loss=None, logits=tensor([[-1.5607,  1.6123],
        [ 4.1692, -3.3464]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [None]:
channels = 3000
samples = 20

weight = torch.ones((1, channels, 1, 1))
image = torch.rand((samples, channels, 10, 10))

noise = torch.randn((samples, 1, 10, 10))
temp = weight * noise
image_noisy = image + temp
print(image.shape, temp.shape, image_noisy.shape)

torch.Size([20, 3000, 10, 10]) torch.Size([20, 3000, 10, 10]) torch.Size([20, 3000, 10, 10])


In [None]:
print(torch.abs(image - image_noisy).std(0).mean())
print(torch.abs(image - image_noisy).std(1).mean())
print(torch.abs(image - image_noisy).std(2).mean())
print(torch.abs(image - image_noisy).std(3).mean())

tensor(0.5843)
tensor(2.4045e-08)
tensor(0.5671)
tensor(0.5714)
