In [None]:
pip install transformers[torch] datasets

In [None]:
import numpy as np
import torch
from datasets import load_dataset

In [None]:
# Load the "sst2" configuration of the "glue" dataset.
raw_datasets = load_dataset("glue", "sst2")
# List the attributes and methods available on the 'train' split object.
dir(raw_datasets['train'])

### Inspecting the dataset

In [None]:
# Check which class represents the 'train' split.
type(raw_datasets['train'])

In [None]:
# Display the split's `data` attribute (presumably the underlying table — TODO confirm).
raw_datasets['train'].data

In [None]:
# Peek at three consecutive training examples (indices 50000-50002).
raw_datasets['train'][50000:50003]

In [None]:
# Show the `features` attribute of the 'train' split
# (presumably the column names and their types — TODO confirm).
raw_datasets['train'].features

In [None]:
from transformers import AutoTokenizer

### Setting up the Tokenizer

In [None]:
# Name of the pretrained checkpoint; the same name is reused later when the
# classification model is loaded.
checkpoint = "distilbert-base-uncased"
# Load the tokenizer that belongs to this checkpoint.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [None]:
# Tokenize the 'sentence' field of the first three training examples and
# pretty-print the result to see what the tokenizer returns.
tokenized_sentences = tokenizer(raw_datasets['train'][0:3]['sentence'])
from pprint import pprint
pprint(tokenized_sentences)

In [None]:
def tokenize_fn(batch):
  """Tokenize the 'sentence' entries of `batch` with truncation enabled.

  Args:
    batch: mapping that provides the texts under the 'sentence' key.

  Returns:
    Whatever the module-level `tokenizer` returns for those texts.
  """
  sentences = batch['sentence']
  encoded = tokenizer(sentences, truncation=True)
  return encoded

In [None]:
# Run tokenize_fn over the dataset; batched=True hands it batches of examples
# instead of one example at a time.
tokenized_datasets = raw_datasets.map(tokenize_fn, batched=True)

In [None]:
from transformers import TrainingArguments

In [None]:
# Training configuration: evaluate and save once per epoch, train for a
# single epoch. "my_trainer" is the first positional argument (presumably the
# output directory — confirm against the installed transformers version).
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy`; confirm which spelling the installed version accepts.
training_args = TrainingArguments(
    "my_trainer",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=1
)

In [None]:
from transformers import AutoModelForSequenceClassification

In [None]:
# Load the pretrained checkpoint as a sequence-classification model
# configured for two labels.
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=2
)

In [None]:
# Check which concrete model class was instantiated for this checkpoint.
type(model)

In [None]:
# Display the model's repr to inspect its structure.
model

In [None]:
!pip install torchinfo

In [None]:
from torchinfo import summary

# Display torchinfo's summary of the model.
summary(model)

### Snapshot the parameters before fine-tuning, to check afterwards that training actually changed the model

In [None]:
# Snapshot every parameter as an independent NumPy array.
# For a tensor that already lives on the CPU, `.cpu()` returns the same tensor
# and `.numpy()` returns a view sharing its memory. Without `.copy()`, in-place
# weight updates during CPU training would also change this "before" snapshot,
# and the before/after comparison would then report no difference.
params_before = []
for name, p in model.named_parameters():
  params_before.append(p.detach().cpu().numpy().copy())

In [None]:
from transformers import Trainer

In [None]:
from datasets import load_metric

In [None]:
# Load the metric matching the "glue"/"sst2" task.
# NOTE(review): `load_metric` is imported from `datasets`; newer releases move
# metrics to the separate `evaluate` package — confirm the installed version
# still provides it.
metric = load_metric("glue", "sst2")

In [None]:
# Sanity-check the metric on toy data in which two of three predictions match
# their references.
metric.compute(predictions=[1, 0, 1], references=[1, 0, 0])

In [None]:
def compute_metrics(logits_and_labels):
  """Score model outputs against the reference labels.

  Args:
    logits_and_labels: a pair `(logits, labels)`.

  Returns:
    The result of `metric.compute` for the arg-max predictions (taken over
    the last axis of the logits) versus the labels.
  """
  scores, references = logits_and_labels
  predicted = np.argmax(scores, axis=-1)
  return metric.compute(predictions=predicted, references=references)

In [None]:
# Wire everything together: the model, the training arguments, the tokenized
# train/validation splits, the tokenizer, and the metric callback.
trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [None]:
# Run fine-tuning with the configuration held by the trainer.
trainer.train()

In [None]:
# Save the trained model under 'my_saved_model'; the pipeline below loads it
# from that directory.
trainer.save_model('my_saved_model')

In [None]:
# List the files that were written to the save directory.
!ls my_saved_model

In [None]:
from transformers import pipeline

# Build a text-classification pipeline from the saved model. Use the first GPU
# when CUDA is available and fall back to the CPU (-1) otherwise, so the cell
# also runs on machines without a GPU.
newmodel = pipeline(
    'text-classification',
    model='my_saved_model',
    device=0 if torch.cuda.is_available() else -1,
)

In [None]:
# Try the reloaded model on a clearly positive sentence.
newmodel('This movie is great!')

In [None]:
# Try the reloaded model on a clearly negative sentence.
newmodel('This movie sucks')

In [None]:
# Print the saved model's config file to inspect its contents.
!cat my_saved_model/config.json

In [None]:
import json

### Modifying the labels so they display properly

In [None]:
# Rewrite the label names stored in the saved model's config so predictions
# are reported as 'negative'/'positive'.
config_path = "my_saved_model/config.json"
with open(config_path) as f:
  j = json.load(f)

j['id2label'] = {0: 'negative', 1: 'positive'}
# Keep the reverse mapping in sync; otherwise `label2id` would still hold the
# previous label names (presumably the auto-generated ones — see the config
# printed above) and disagree with `id2label`.
j['label2id'] = {label: idx for idx, label in j['id2label'].items()}

with open(config_path, 'w') as f:
  json.dump(j, f, indent=2)

In [None]:
# Print the config again to confirm the edited labels were written.
!cat my_saved_model/config.json

In [None]:
# Rebuild the pipeline so it picks up the edited config. Use the first GPU when
# CUDA is available and fall back to the CPU (-1) otherwise.
newmodel = pipeline(
    'text-classification',
    model='my_saved_model',
    device=0 if torch.cuda.is_available() else -1,
)

In [None]:
# Re-run the positive example to check the label shown after the config edit.
newmodel('This movie is great!')

In [None]:
# Collect the current parameter values as NumPy arrays. They come from the same
# `named_parameters()` iteration as `params_before`, so the two lists line up
# element by element.
params_after = [
  weights.detach().cpu().numpy()
  for _name, weights in model.named_parameters()
]

In [None]:
# For each parameter tensor, print the summed absolute difference between the
# snapshot taken before training and the values collected after it.
for before, after in zip(params_before, params_after):
  total_change = np.abs(before - after).sum()
  print(total_change)