# Pytorch installation check 

The following code can be used to check that the pytorch dependency has been installed correctly.

It will also load a default dataset and train a model in order to test that the hardware setup works.

In [2]:
import sys
import platform
import torch
import pandas as pd
import sklearn as sk
from datasets import load_dataset, load_metric

has_gpu = torch.cuda.is_available()
has_mps = getattr(torch,'has_mps',False)
device = "mps" if getattr(torch,'has_mps',False) \
    else "gpu" if torch.cuda.is_available() else "cpu"

print(f"Python Platform: {platform.platform()}")
print(f"PyTorch Version: {torch.__version__}")
print(f"Python {sys.version}")
print(f"Pandas {pd.__version__}")
print(f"Scikit-Learn {sk.__version__}")
print("GPU is", "available" if has_gpu else "NOT AVAILABLE")
print("MPS (Apple Metal) is", "AVAILABLE" if has_mps else "NOT AVAILABLE")
print(f"Target device is {device}")

Python Platform: macOS-14.4.1-arm64-arm-64bit
PyTorch Version: 2.3.0
Python 3.9.19 (main, Mar 21 2024, 12:07:41) 
[Clang 14.0.6 ]
Pandas 2.2.1
Scikit-Learn 1.3.0
GPU is NOT AVAILABLE
MPS (Apple Metal) is AVAILABLE
Target device is mps


  has_mps = getattr(torch,'has_mps',False)
  device = "mps" if getattr(torch,'has_mps',False) \


In [3]:
torch.backends.mps.is_available()

True

### Dataset loading

This section loads the cola dataset (Corpus of Linguistic Acceptability) where each sentence is labeled as either 1 or 0 indicating whether it is gramatically correct or not. This dataset will be used for training the model.

In [4]:
model_checkpoint = "distilbert-base-uncased"

task = "cola"
actual_task = "mnli" if task == "mnli-mm" else task
dataset = load_dataset("glue", actual_task)
metric = load_metric('glue', actual_task)

  metric = load_metric('glue', actual_task)
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


In [5]:
pd.Series(dataset["train"][0:]["label"]).value_counts()

1    6023
0    2528
Name: count, dtype: int64

dataset sample

In [6]:
dataset["train"][0:5]

{'sentence': ["Our friends won't buy this analysis, let alone the next one we propose.",
  "One more pseudo generalization and I'm giving up.",
  "One more pseudo generalization or I'm giving up.",
  'The more we study verbs, the crazier they get.',
  'Day by day the facts are getting murkier.'],
 'label': [1, 1, 1, 1, 1],
 'idx': [0, 1, 2, 3, 4]}

### Data preprocessing

Load a tokenizer and use it to transform the dataset

In [7]:
from transformers import AutoTokenizer    
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

# usage
tokenizer("Hello, this one sentence!", "And this sentence goes with it.")

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

{'input_ids': [101, 7592, 1010, 2023, 2028, 6251, 999, 102, 1998, 2023, 6251, 3632, 2007, 2009, 1012, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [8]:
key = "sentence"

def preprocess_function(examples):    
    return tokenizer(examples[key], padding="max_length", truncation=True)

encoded_dataset = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/8551 [00:00<?, ? examples/s]

Map:   0%|          | 0/1043 [00:00<?, ? examples/s]

Map:   0%|          | 0/1063 [00:00<?, ? examples/s]

In [9]:
encoded_dataset = encoded_dataset.remove_columns(["sentence", "idx"])
encoded_dataset = encoded_dataset.rename_column("label", "labels")
encoded_dataset.set_format("torch")

In [21]:
# create smaller samples

small_train_dataset = encoded_dataset["train"].shuffle(seed=42).select(range(500))
small_eval_dataset = encoded_dataset["validation"].shuffle(seed=42).select(range(100))

In [22]:
small_eval_dataset["labels"][:10]

tensor([0, 1, 1, 1, 1, 1, 1, 1, 0, 1])

#### Data Loaders

In [23]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(small_train_dataset, shuffle=True, batch_size=8)
eval_dataloader = DataLoader(small_eval_dataset, batch_size=8)

### Model loading and setting

Load a model, define hyperparameters and start training loop

In [24]:
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=2)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [25]:
torch.mps.empty_cache()

In [26]:
from torch.optim import AdamW
optimizer = AdamW(model.parameters(), lr=5e-5)

In [29]:
from transformers import get_scheduler

num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

In [30]:
# Device selection

import torch

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")
model.to(device)

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [31]:
## training loop
from tqdm.auto import tqdm

progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

  0%|          | 0/189 [00:00<?, ?it/s]

In [32]:
# evaluation

import evaluate

metric = evaluate.load("accuracy")
model.eval()
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])

metric.compute()

{'accuracy': 0.73}

In [33]:
batch["labels"]

tensor([0, 0, 1, 0], device='mps:0')