In [45]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [1]:
%pip -q install transformers

In [2]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(
    "sshleifer/distilbart-cnn-12-6", # sentiment analysis model
)

raw_inputs = [
    "My name is Anuj kumar",
    "I wanna gain the expertise in the Field of AI/ML."
]

inputs = tokenizer(
    raw_inputs,
    padding = True,
    truncation = True,
    return_tensor = "pt"
)

print(inputs)

config.json: 0.00B [00:00, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

{'input_ids': [[0, 2387, 766, 16, 660, 11591, 449, 16160, 2, 1, 1, 1, 1, 1, 1], [0, 100, 23126, 2364, 5, 6424, 11, 5, 4754, 9, 4687, 73, 10537, 4, 2]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}


In [9]:
# print(f"{raw_inputs[0]}\nTokenizer-\n{inputs['input_ids'][0]}\n")
# print(f"Attention Masks-\n{inputs['attention_mask'][0]}\n")

for i, sentence in enumerate(raw_inputs):
    print("\n")
    print(f"Sentence: {sentence}")
    print(f"Tokens/Input Ids: {inputs['input_ids'][i]}") # token id of sentence
    print(f"Attention Mask: {inputs['attention_mask'][i]}") # this tells model which token to give attention to
    print(f"Sentence length: {len(sentence.split(" "))}\ninput_ids length {len(inputs['input_ids'][i])}")
    print("\n")



Sentence: My name is Anuj kumar
Tokens/Input Ids: [0, 2387, 766, 16, 660, 11591, 449, 16160, 2, 1, 1, 1, 1, 1, 1]
Attention Mask: [1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0]
Sentence length: 5
input_ids length 15




Sentence: I wanna gain the expertise in the Field of AI/ML.
Tokens/Input Ids: [0, 100, 23126, 2364, 5, 6424, 11, 5, 4754, 9, 4687, 73, 10537, 4, 2]
Attention Mask: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Sentence length: 10
input_ids length 15




## Tokenizer Under the Hood

In [11]:
tokens = tokenizer.tokenize(raw_inputs[0])
tokens ## G represents space, u continuation

['My', 'Ġname', 'Ġis', 'ĠAn', 'uj', 'Ġk', 'umar']

In [12]:
token_ids = tokenizer.convert_tokens_to_ids(tokens)
token_ids

[2387, 766, 16, 660, 11591, 449, 16160]

In [14]:
decode_token = tokenizer.decode(token_ids)
decode_token

'My name is Anuj kumar'

In [19]:
# model_prepped_ids = tokenizer.prepare_for_model(token_ids)
model_prepped_ids = tokenizer(raw_inputs[0])
model_prepped_ids

{'input_ids': [0, 2387, 766, 16, 660, 11591, 449, 16160, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}

## Working with Models

### Pipeline

In [21]:
from transformers import pipeline

In [22]:
## classifier
classifier = pipeline("sentiment-analysis")
classifier(raw_inputs)

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f.
Using a pipeline without specifying a model name and revision in production is not recommended.


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/104 [00:00<?, ?it/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

[{'label': 'POSITIVE', 'score': 0.9984731078147888},
 {'label': 'NEGATIVE', 'score': 0.985593855381012}]

In [23]:
## text-generation
text_generation = pipeline("text-generation")
text_generation(raw_inputs)

No model was supplied, defaulted to openai-community/gpt2 and revision 607a30d.
Using a pipeline without specifying a model name and revision in production is not recommended.


config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/148 [00:00<?, ?it/s]

GPT2LMHeadModel LOAD REPORT from: openai-community/gpt2
Key                  | Status     |  | 
---------------------+------------+--+-
h.{0...11}.attn.bias | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Both `max_new_tokens` (=256) and `max_length`(=50) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Both `max_new_tokens` (=256) and `max_length`(=50) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


[[{'generated_text': 'My name is Anuj kumar and I am the founder of the Foundation for Science and Technology (FST). I am a retired engineer at the National Science Foundation in New Delhi.\n\nI am also a PhD candidate at the SRI University of Technology, Mumbai.\n\nI am a member of the Indian Institute of Technology\'s Board of Scientific Research (BIR) and is a member of the International Scientific Board.\n\nI am the principal investigator and project leader of the FST and the first research co-author. The BRI is an interdisciplinary network of scientists working together to create and develop a better understanding of the natural world.\n\nSince 2001, I have been awarded the BRI\'s prestigious Distinguished Scientist Award for "Best Scientific Research in the World" (M.S.). The award is also given to the co-author of the book "World\'s Most Influential Scientists", published in 2007 by the World Science Foundation.\n\nThe BRI\'s scientific advisory council (SRI) is the body of scie

### Accessing Pre-trained Model

In [36]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased-finetuned-sst-2-english") ## text classification
model = AutoModelForSequenceClassification.from_pretrained("distilbert/distilbert-base-uncased-finetuned-sst-2-english")

Loading weights:   0%|          | 0/104 [00:00<?, ?it/s]

In [38]:
inputs = tokenizer(raw_inputs, return_tensors='pt', padding=True)
inputs

{'input_ids': tensor([[  101,  2026,  2171,  2003,  2019, 23049,  9600,   102,     0,     0,
             0,     0,     0,     0,     0],
        [  101,  1045, 10587,  5114,  1996, 11532,  1999,  1996,  2492,  1997,
          9932,  1013, 19875,  1012,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [39]:
outputs = model(**inputs)
outputs

SequenceClassifierOutput(loss=None, logits=tensor([[-3.0620,  3.4210],
        [ 2.2146, -2.0110]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

### Model Embeddings

In [43]:
from transformers import AutoModel
model = AutoModel.from_pretrained("distilbert/distilbert-base-uncased-finetuned-sst-2-english")

inputs = tokenizer(raw_inputs, return_tensors="pt", padding=True)
outputs = model(**inputs)
outputs.last_hidden_state.shape ## the token embeddings
# torch.Size([2, 15, 768]) 
# 2 -> number of sentence
# 15 -> input_ids size
# 768 -> dimension

Loading weights:   0%|          | 0/100 [00:00<?, ?it/s]

DistilBertModel LOAD REPORT from: distilbert/distilbert-base-uncased-finetuned-sst-2-english
Key                   | Status     |  | 
----------------------+------------+--+-
pre_classifier.bias   | UNEXPECTED |  | 
pre_classifier.weight | UNEXPECTED |  | 
classifier.bias       | UNEXPECTED |  | 
classifier.weight     | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


torch.Size([2, 15, 768])

In [44]:
## to get the full context vector for the sequence
context_vectors = outputs.last_hidden_state.mean(dim=1)
context_vectors.shape

torch.Size([2, 768])

In [47]:
### Accessing Model Config & Creating Custom Models
from transformers import GPT2Config, GPT2Model
config = GPT2Config()
config

GPT2Config {
  "activation_function": "gelu_new",
  "add_cross_attention": false,
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "pad_token_id": null,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "tie_word_embeddings": true,
  "transformers_version": "5.0.0",
  "use_cache": true,
  "vocab_size": 50257
}

In [48]:
## Building the model from the config
gpt_model = GPT2Model(config)

In [49]:
## saving new model we created from config
gpt_model.save_pretrained("/content/drive/MyDrive/gpt2_model")

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]