## Pipeline function

In [30]:
import os
from transformers import pipeline
import transformers
import tensorflow as tf 
import torch 

In [31]:
import sys 
print(sys.executable)

/data/u_kamm_software/EEG data analysis/eeg_llm/.venv/bin/python


In [32]:
print(tf.__version__)
print(torch.__version__)
print(transformers.__version__)

2.11.0
2.4.0+cu121
4.44.0


In [33]:
os.environ['TRANSFORMERS_CACHE'] = "/data/u_kamm_software/EEG data analysis/eeg_llm"

In [34]:
gen = pipeline("text-generation", model='distilgpt2')
res = gen(
    "In this course, we will teach you how to",
    max_length=30,
    num_return_sequences=2,
)
print(res)

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'In this course, we will teach you how to use Windows RT 8 R2 to build a new OS. For example, we will teach you how'}, {'generated_text': 'In this course, we will teach you how to best practice basic, fundamental, and practical practice in this class, taking you very seriously. Once you'}]


## Tokenizer

In [35]:
from transformers import AutoTokenizer 

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

raw_inputs = [
    "I've been waiting for a HuggingFace course my whole life",
    "I hate this so much!",
]
inputs = tokenizer(raw_inputs, padding=True, truncation=True, return_tensors="pt")
print(inputs)

{'input_ids': tensor([[  101,  1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,
          2607,  2026,  2878,  2166,   102],
        [  101,  1045,  5223,  2023,  2061,  2172,   999,   102,     0,     0,
             0,     0,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0]])}


## Model

In [36]:
from transformers import AutoModel 

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
model = AutoModel.from_pretrained(checkpoint)
outputs = model(**inputs)
print(outputs.last_hidden_state.shape)

torch.Size([2, 15, 768])


## Transformers

In [37]:
from transformers import AutoModel 

# different models available, others are available as well 
bert_model = AutoModel.from_pretrained("bert-base-cased")
print((type(bert_model)))

gpt_model = AutoModel.from_pretrained("gpt2")
print(type(gpt_model))

bart_model = AutoModel.from_pretrained("facebook/bart-base")
print(type(bart_model))



<class 'transformers.models.bert.modeling_bert.BertModel'>
<class 'transformers.models.gpt2.modeling_gpt2.GPT2Model'>
<class 'transformers.models.bart.modeling_bart.BartModel'>


In [38]:
from transformers import BertConfig, BertModel

# conifiguration of pretrained BERT model
bert_config = BertConfig.from_pretrained("bert-base-cased")
print(bert_config)

BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.44.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 28996
}



In [39]:
# change certain hyperparameters and create new bert model 
bert_config_mod = BertConfig.from_pretrained("bert-base-cased", num_hidden_layers=10)
bert_model = BertModel(bert_config_mod)

In [40]:
# save model 
bert_model.save_pretrained("my-bert-model")

In [41]:
# reload trained model 
bert_model = BertModel.from_pretrained("my-bert-model")

## Walk-through

In [42]:
from datasets import load_dataset

# load the tiny Shakespeare dataset 
dataset = load_dataset("tiny_shakespeare")

texts = dataset['train']['text']

In [43]:
from transformers import GPT2Tokenizer 

# tokenize the data 
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

def tokenize_function(examples):
    return tokenizer(examples["text"], return_special_tokens_mask=True)

tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=["text"])

Map:   0%|          | 0/1 [00:00<?, ? examples/s]Token indices sequence length is longer than the specified maximum sequence length for this model (18066 > 1024). Running this sequence through the model will result in indexing errors
Map: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1/1 [00:00<00:00,  8.70 examples/s]


In [47]:
from transformers import GPT2Config, GPT2LMHeadModel 

gpt_config = GPT2Config(
    n_embd=512, 
    n_layer=6, 
    n_head=8, 
    n_positions=512, 
    resid_dropout=0.1, 
    attn_pdrop=0.1,
)

gpt = GPT2LMHeadModel(gpt_config)
gpt.resize_token_embeddings(len(tokenizer))

Embedding(50258, 512)

In [45]:
import accelerate
print(accelerate.__version__)

0.33.0


In [48]:
from transformers import Trainer, TrainingArguments 

training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True, 
    num_train_epochs=3, 
    per_device_eval_batch_size=4, 
    save_steps=500, 
    save_total_limit=2, 
    logging_dir="./logs",
    logging_steps=100,
)

trainer = Trainer(
    model=gpt, 
    args=training_args, 
    train_dataset=tokenized_datasets['train'],
)

ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.21.0`: Please run `pip install transformers[torch]` or `pip install accelerate -U`