# Lab 1 of Data Centric AI course

[https://github.com/dcai-course/dcai-lab/tree/master](https://github.com/dcai-course/dcai-lab/tree/master)

At the end of Lab 1, as a bonus it recommends to do a Transformer classification model of "good and bad reviews" from the magazine dataset that you study with basic ML techniques.

You are supposed to do 2 versions:

1. train a model on "as-is" dataset
2. train a model on the "cleaned" dataset: remove all reviews that contain HTML

In [1]:
import numpy as np
import transformers
from transformers import AutoTokenizer, AutoModel
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
import datasets
from datasets import Dataset, DatasetDict, ClassLabel

2024-06-09 13:58:25.329272: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-09 13:58:25.329377: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-09 13:58:25.450691: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [4]:
import pandas as pd

In [10]:
train = pd.read_csv('/kaggle/input/reviews/reviews_train.csv')
test = pd.read_csv('/kaggle/input/reviews/reviews_test.csv')

train.head(10)

Unnamed: 0,review,label
0,Based on all the negative comments about Taste...,good
1,I still have not received this. Obviously I c...,bad
2,</tr>The magazine is not worth the cost of sub...,good
3,This magazine is basically ads. Kindve worthle...,bad
4,"The only thing I've recieved, so far, is the b...",bad
5,"The magazines are great, but I never received ...",good
6,This is one magazine I really love. It has pri...,good
7,Did not. Open.,bad
8,Forever the best magazine! Love it!!,good
9,Very disappointed. It's nothing more than an a...,bad


# First model - trained on "as-is" dataset, include all reviews in training data

In [11]:
label_map = {"bad": 0, "good": 1}
dataset_train = Dataset.from_dict({"label": train["label"].map(label_map), "text": train["review"].values})
dataset_test = Dataset.from_dict({"label": test["label"].map(label_map), "text": test["review"].values})

In [19]:
model_name = "distilbert-base-uncased"  # which pretrained neural network weights to load for fine-tuning on our data
# other options you could try: "bert-base-uncased", "bert-base-cased", "google/electra-small-discriminator"

max_training_steps = 1000  # how many iterations our network will be trained for
# Here set to a tiny value to ensure quick runtimes, set to higher values if you have a GPU to run this code on.

model_folder = "test_trainer"  # file where model will be saved after training

In [20]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

train_tokenized_dataset = dataset_train.map(tokenize_function, batched=True)
train_tokenized_dataset = train_tokenized_dataset.cast_column("label", ClassLabel(names = ["0", "1"]))

test_tokenized_dataset = dataset_test.map(tokenize_function, batched=True)
test_tokenized_dataset = test_tokenized_dataset.cast_column("label", ClassLabel(names = ["0", "1"]))

training_args = TrainingArguments(max_steps=max_training_steps, output_dir=model_folder, report_to="none")
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized_dataset,
)

Map:   0%|          | 0/6666 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/6666 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1000 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
max_steps is given, it will override any value given in num_train_epochs


In [21]:
trainer.train()

Step,Training Loss
500,0.2511
1000,0.1152


TrainOutput(global_step=1000, training_loss=0.18315346145629882, metrics={'train_runtime': 237.893, 'train_samples_per_second': 33.629, 'train_steps_per_second': 4.204, 'total_flos': 1058944384856064.0, 'train_loss': 0.18315346145629882, 'epoch': 1.1990407673860912})

In [22]:
pred_probs = trainer.predict(test_tokenized_dataset).predictions
pred_classes = np.argmax(pred_probs, axis=1)
print(f"Error rate of predictions: {np.mean(pred_classes != test_tokenized_dataset['label'])}")

Error rate of predictions: 0.022


# Seems that Transformer model with 1000 steps is OK

Not sure what benefit from removing the "bad data"

Will try anyway:

In [23]:
def is_bad_data(review: str) -> bool:
    if any(c in {'<', '>'} for c in review):
        return True
    return False

In [25]:
train_clean = train[~train['review'].map(is_bad_data)]

In [26]:
clean_dataset_train = Dataset.from_dict({"label": train_clean["label"].map(label_map), "text": train_clean["review"].values})
dataset_test = Dataset.from_dict({"label": test["label"].map(label_map), "text": test["review"].values})

In [27]:
clean_model_folder = "clean_test_trainer"

In [28]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

# HERE CHANGED: using "cleaned" dataset as the train dataset
train_tokenized_dataset = clean_dataset_train.map(tokenize_function, batched=True)
train_tokenized_dataset = train_tokenized_dataset.cast_column("label", ClassLabel(names = ["0", "1"]))

test_tokenized_dataset = dataset_test.map(tokenize_function, batched=True)
test_tokenized_dataset = test_tokenized_dataset.cast_column("label", ClassLabel(names = ["0", "1"]))

training_args = TrainingArguments(max_steps=max_training_steps, output_dir=clean_model_folder, report_to="none")
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

clean_trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized_dataset,
)

Map:   0%|          | 0/3998 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/3998 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1000 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
max_steps is given, it will override any value given in num_train_epochs


In [29]:
clean_trainer.train()

Step,Training Loss
500,0.1684
1000,0.0647


TrainOutput(global_step=1000, training_loss=0.11655619430541993, metrics={'train_runtime': 234.5845, 'train_samples_per_second': 34.103, 'train_steps_per_second': 4.263, 'total_flos': 1059209319653376.0, 'train_loss': 0.11655619430541993, 'epoch': 2.0})

In [30]:
clean_pred_probs = clean_trainer.predict(test_tokenized_dataset).predictions
clean_pred_classes = np.argmax(clean_pred_probs, axis=1)
print(f"Error rate of predictions: {np.mean(clean_pred_classes != test_tokenized_dataset['label'])}")

Error rate of predictions: 0.027


# So here error rate actually goes UP from 0.022 to 0.027