In [1]:
!pip install datasets

from datasets import load_dataset

# Load 1% of the training and test dataset
initial_data = load_dataset("imdb", split={"train": "train[:1%]", "test": "test[:1%]"})

# Calculate 10% of the loaded subset (which is 0.1% of the entire dataset)
train_subset = initial_data['train'].shuffle(seed=42).select(range(len(initial_data['train']) // 10))
test_subset = initial_data['test'].shuffle(seed=42).select(range(len(initial_data['test']) // 10))

# Store subsets in a dictionary
dataset = {"train": train_subset, "test": test_subset}

splits = ["train", "test"]

## dataset["train"]

Defaulting to user installation because normal site-packages is not writeable


In [2]:
# Inspect the first example. Positive or Negative?
dataset["train"][0]

{'text': 'This is really a new low in entertainment. Even though there are a lot worse movies out.<br /><br />In the Gangster / Drug scene genre it is hard to have a convincing storyline (this movies does not, i mean Sebastians motives for example couldn\'t be more far fetched and worn out cliché.) Then you would also need a setting of character relationships that is believable (this movie does not.) <br /><br />Sure Tristan is drawn away from his family but why was that again? what\'s the deal with his father again that he has to ask permission to go out at his age? interesting picture though to ask about the lack and need of rebellious behavior of kids in upper class family. But this movie does not go in this direction. Even though there would be the potential judging by the random Backflashes. Wasn\'t he already down and out, why does he do it again? <br /><br />So there are some interesting questions brought up here for a solid socially critic drama (but then again, this movie is j

In [3]:
# Pre-process dataset

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# Use a lambda function to tokenize

tokenized_dataset = {}
for split in splits:
    tokenized_dataset[split] = dataset[split].map(
        lambda x: tokenizer(x["text"], truncation=True, padding=True),
        batched=True
    )

# Inspect the available columns in the dataset
tokenized_dataset["train"]

Dataset({
    features: ['text', 'label', 'input_ids', 'attention_mask'],
    num_rows: 25
})

In [4]:
# Load and set up the model

In [5]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2,
    id2label={0:"NEG", 1:"POS"},
    label2id={"NEG":0, "POS":1},
)

for param in model.parameters():
    param.required_grad=True

## print(model)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'pre_classifier.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
print(model)

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [7]:
# Train the model

In [8]:
import numpy as np
from transformers import DataCollatorWithPadding, Trainer, TrainingArguments
 
def compute_metrics (eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return {"accuracy": (predictions == labels).mean()}

trainer = Trainer(
    model=model,
    args=TrainingArguments(
        output_dir=".data/imdb_review",
        learning_rate=2e-5,
        per_device_train_batch_size=1,
        per_device_eval_batch_size=1,
        
        evaluation_strategy="epoch",
        save_strategy="epoch",
        
        num_train_epochs=1,
        weight_decay=0.01,
        load_best_model_at_end=True,
    ),    
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)

trainer.train()

  warn("The installed version of bitsandbytes was compiled without GPU support. "


/opt/conda/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cpu.so: undefined symbol: cadam32bit_grad_fp32


You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.140675,1.0


TrainOutput(global_step=25, training_loss=0.3222134399414063, metrics={'train_runtime': 197.8121, 'train_samples_per_second': 0.126, 'train_steps_per_second': 0.126, 'total_flos': 3311684966400.0, 'train_loss': 0.3222134399414063, 'epoch': 1.0})

In [9]:
# Evaluate the model

In [10]:
trainer.evaluate()

{'eval_loss': 0.14067542552947998,
 'eval_accuracy': 1.0,
 'eval_runtime': 40.4786,
 'eval_samples_per_second': 0.618,
 'eval_steps_per_second': 0.618,
 'epoch': 1.0}

In [11]:
# View the results

In [12]:
import pandas as pd

In [14]:
items_for_manual_review = tokenized_dataset["test"].select(
    [0,1,7]
)

results = trainer.predict(items_for_manual_review)
df=pd.DataFrame(
    {
        "text": [item["text"] for item in items_for_manual_review],
        "predictions": results.predictions.argmax(axis=1),
        "labels": results.label_ids,
    }
)

pd.set_option("display.max_colwidth", None)
df

Unnamed: 0,text,predictions,labels
0,"Blake Edwards' legendary fiasco, begins to seem pointless after just 10 minutes. A combination of The Eagle Has Landed, Star!, Oh! What a Lovely War!, and Edwards' Pink Panther films, Darling Lili never engages the viewer; the aerial sequences, the musical numbers, the romance, the comedy, and the espionage are all ho hum. At what point is the viewer supposed to give a damn? This disaster wavers in tone, never decides what it wants to be, and apparently thinks it's a spoof, but it's pathetically and grindingly square. Old fashioned in the worst sense, audiences understandably stayed away in droves. It's awful. James Garner would have been a vast improvement over Hudson who is just cardboard, and he doesn't connect with Andrews and vice versa. And both Andrews and Hudson don't seem to have been let in on the joke and perform with a miscalculated earnestness. Blake Edwards' SOB isn't much more than OK, but it's the only good that ever came out of Darling Lili. The expensive and professional look of much of Darling Lili, only make what it's all lavished on even more difficult to bear. To quote Paramount chief Robert Evans, ""24 million dollars worth of film and no picture"".",0,0
1,"OK, so my summary line is a cheap trick. But the movie is full of them and it gets absurdly praised, so...<br /><br />I caught this one on TV (uncut, as TV here shows all movies, that's for you Americans who might say I didn't like it because I saw a cut TV version - fortunately that's only an US thing), and had no idea about what it was. I switched on, caught the last minutes of a show, and the movie began. Within a minute, I was begging it was a comedy, given the particularly ridiculous clichéd beginning (yes, it's a bad movie-within-the-movie, I know, but what a way to try to keep the viewer interested! I don't even know why I didn't switch channels). And, yes, in fact the movie turned out to be a comedy, albeit an unintentional one.<br /><br />Marina Zudina is pretty enough, but gosh, what a dreadful performance! While casting a foreigner in the role is smart enough (she doesn't talk so bye bye language barrier), yet, sorry, Marina baby, playing mute doesn't mean impersonating Harpo Marx. Her acting is unintentionally funny in many moments, just look at her when she draws an X in the air while stalked by the killer. He wants to kill you, it's no time to play Zorro. We get plenty of ""running upstairs"" stuff passing for tension, as in the worst slashers, and things like pulling a carpet and a bad guy shots the other. Ugh! Will Hollywood ever learn? Yet the best/worst pearl is having a guy electrocuted in a bathtub and... Well, I have never seen anyone being electrocuted to death in a bathtub, but I'm sure you can't see the blue cartoon rays in real life, do you? And how about immediately trusting a mean-looking guy because he SAYS he's a cop, and not asking him to show you his credentials? OK, so he turns out to be a real cop. But still, not asking for the badge makes no sense (plot-wise, we could always think the credentials might be phony or he might be a crooked cop. Screen writing 101). And how about the big twist? Don't tell me you didn't see that coming from 200 miles away...<br /><br />I feel sorry for poor old Alec Guinness and his useless stock footage cameo. Now I think about this, what's the point in giving him a ""Mystery Guest Star"" credit... in the END titles? The movie's over, there's no mystery anymore, and everybody and their brother have identified Guinness (even non-movie buffs will recognize ""the old guy from 'Star Wars'""). Yet better off this way, so we can pretend it's not the late great actor.<br /><br />People keep comparing this to, of all people, Hitchcock. I suppose it has to be John Hitchcock the milkman, as the late Sir Alfred would feel embarrassed out of watching this, let alone making it. And this gets a 6.8/10???? It's Bottom 100 material! But then, we're talking a rating system that allows 'The Lord of the Rings: The Return of the King' to appear as the third best movie ever made (check Top 100), so...<br /><br />2/10.",0,0
2,"STAR RATING: ***** Saturday Night **** Friday Night *** Friday Morning ** Sunday Night * Monday Morning <br /><br />Former New Orleans homicide cop Jack Robideaux (Jean Claude Van Damme) is re-assigned to Columbus, a small but violent town in Mexico to help the police there with their efforts to stop a major heroin smuggling operation into their town. The culprits turn out to be ex-military, lead by former commander Benjamin Meyers (Stephen Lord, otherwise known as Jase from East Enders) who is using a special method he learned in Afghanistan to fight off his opponents. But Jack has a more personal reason for taking him down, that draws the two men into an explosive final showdown where only one will walk away alive.<br /><br />After Until Death, Van Damme appeared to be on a high, showing he could make the best straight to video films in the action market. While that was a far more drama oriented film, with The Shepherd he has returned to the high-kicking, no brainer action that first made him famous and has sadly produced his worst film since Derailed. It's nowhere near as bad as that film, but what I said still stands.<br /><br />A dull, predictable film, with very little in the way of any exciting action. What little there is mainly consists of some limp fight scenes, trying to look cool and trendy with some cheap slo-mo/sped up effects added to them that sadly instead make them look more desperate. Being a Mexican set film, director Isaac Florentine has tried to give the film a Robert Rodriguez/Desperado sort of feel, but this only adds to the desperation.<br /><br />VD gives a particularly uninspired performance and given he's never been a Robert De Niro sort of actor, that can't be good. As the villain, Lord shouldn't expect to leave the beeb anytime soon. He gets little dialogue at the beginning as he struggles to muster an American accent but gets mysteriously better towards the end. All the supporting cast are equally bland, and do nothing to raise the films spirits at all.<br /><br />This is one shepherd that's strayed right from the flock. *",0,0


In [15]:
# Perform PEFT

In [24]:
from peft import get_peft_model
from peft import LoraConfig
from peft import TaskType

config = LoraConfig(
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=['q_lin', 'k_lin','v_lin'],
    bias="none",
    fan_in_fan_out=True,
    task_type='SEQ_CLS'
)

lora_model = get_peft_model(model, config)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
lora_model.to(device)



PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): DistilBertForSequenceClassification(
      (distilbert): DistilBertModel(
        (embeddings): Embeddings(
          (word_embeddings): Embedding(30522, 768, padding_idx=0)
          (position_embeddings): Embedding(512, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (transformer): Transformer(
          (layer): ModuleList(
            (0-5): 6 x TransformerBlock(
              (attention): MultiHeadSelfAttention(
                (dropout): Dropout(p=0.1, inplace=False)
                (q_lin): Linear(
                  in_features=768, out_features=768, bias=True
                  (lora_dropout): ModuleDict(
                    (default): Dropout(p=0.1, inplace=False)
                  )
                  (lora_A): ModuleDict(
                    (default): Linear(in_features=768, out_features=8, bias=Fals

In [25]:
lora_model.print_trainable_parameters()

trainable params: 1,405,444 || all params: 67,768,324 || trainable%: 2.073895172617815


In [27]:
import numpy as np
from transformers import DataCollatorWithPadding, Trainer, TrainingArguments
 
def compute_metrics (eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return {"accuracy": (predictions == labels).mean()}

trainer = Trainer(
    model=model,
    args=TrainingArguments(
        output_dir=".data/imdb_review",
        learning_rate=2e-5,
        per_device_train_batch_size=1,
        per_device_eval_batch_size=1,
        
        evaluation_strategy="epoch",
        save_strategy="epoch",
        
        num_train_epochs=1,
        weight_decay=0.01,
        load_best_model_at_end=True,
    ),    
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)

trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.040019,1.0


Checkpoint destination directory .data/imdb_review/checkpoint-25 already exists and is non-empty.Saving will proceed but saved results may be invalid.


TrainOutput(global_step=25, training_loss=0.054693808555603025, metrics={'train_runtime': 139.8973, 'train_samples_per_second': 0.179, 'train_steps_per_second': 0.179, 'total_flos': 3374147481600.0, 'train_loss': 0.054693808555603025, 'epoch': 1.0})

In [28]:
lora_model.save_pretrained("bert-imdb")

In [29]:
# Performing Inference with a PEFT Model

In [30]:
from peft import AutoPeftModelForSequenceClassification
lora_model = AutoPeftModelForSequenceClassification.from_pretrained("bert-imdb", num_labels=2)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'pre_classifier.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
lora_model.config.pad_token_id = tokenizer.eos_token_id

In [None]:
trainer = Trainer(
    model=lora_model,
    args=TrainingArguments(
        output_dir='./results',
        do_train=False,
        do_eval=True,
        per_device_eval_batch_size=1
    ),
    eval_dataset=tokenized_dataset["test"],
    compute_metrics=compute_metrics
)

trainer.evaluate()