In [1]:
## from hugging face
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import evaluate

## python libraries
import numpy as np
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
dataset = load_dataset("amazon_polarity")

In [3]:
print("Type of dataset: ", type(dataset))
print("Structure of dataset: \n", str(dataset))

print("\n First record in training dataset:")
dataset["train"][0]

## this dataset has 2 sections - train and test.
## each set has dictionaries with 3 features - label, title, and content.
## training dataset has 3,600,000 records whereas testing dataset has 400,000

Type of dataset:  <class 'datasets.dataset_dict.DatasetDict'>
Structure of dataset: 
 DatasetDict({
    train: Dataset({
        features: ['label', 'title', 'content'],
        num_rows: 3600000
    })
    test: Dataset({
        features: ['label', 'title', 'content'],
        num_rows: 400000
    })
})

 First record in training dataset:


{'label': 1,
 'title': 'Stuning even for the non-gamer',
 'content': 'This sound track was beautiful! It paints the senery in your mind so well I would recomend it even to people who hate vid. game music! I have played the game Chrono Cross but out of all of the games I have ever played it has the best music! It backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras. It would impress anyone who cares to listen! ^_^'}

In [4]:
model_name = "distilbert-base-uncased"

model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2, token=False)

## I'm adding a classification head to this model with 2 outputs - because I want to have only 2 outcome sentiments
## This builds a new layer: Linear(hidden_size → num_labels)
## Please note that this will not freeze the existing parameters (weights and biases) - they will have to be frozen separately if we want only the last layer to be trained. That might speed up the training process.

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
tokenizer = AutoTokenizer.from_pretrained(model_name, token=False)

def tokenize_function(examples):
    # Combine title + content for input
    texts = [t + " " + c for t, c in zip(examples["title"], examples["content"])]
    return tokenizer(texts, padding="max_length", truncation=True, max_length=128)

In [6]:
train_subset = dataset["train"].shuffle(seed = 20).select(range(2000))   ## taking 2K records from 3.6M
test_subset = dataset["test"].shuffle(seed = 20).select(range(500))      ## taking 500 records from 400K

In [7]:
tokenized_train = train_subset.map(tokenize_function, batched=True)
tokenized_test = test_subset.map(tokenize_function, batched=True)

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map: 100%|██████████| 500/500 [00:00<00:00, 5496.68 examples/s]


In [8]:
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return accuracy.compute(predictions=predictions, references=labels)

In [9]:
training_args = TrainingArguments(
                                    output_dir="./results",
                                    eval_strategy="epoch",
                                    per_device_train_batch_size=16,
                                    per_device_eval_batch_size=16,
                                    num_train_epochs=2,
                                    logging_dir="./logs",
                                    save_strategy="no",  # to save time
                                    load_best_model_at_end=False
                                )

In [10]:
trainer = Trainer(
                    model=model,
                    args=training_args,
                    train_dataset=tokenized_train,
                    eval_dataset=tokenized_test,
                    compute_metrics=compute_metrics,
                )

In [11]:
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.258892,0.902
2,No log,0.327917,0.91




TrainOutput(global_step=250, training_loss=0.2240973205566406, metrics={'train_runtime': 103.9077, 'train_samples_per_second': 38.496, 'train_steps_per_second': 2.406, 'total_flos': 132467398656000.0, 'train_loss': 0.2240973205566406, 'epoch': 2.0})

In [None]:
model.save_pretrained("model/")
tokenizer.save_pretrained("model/")

('yk_model/tokenizer_config.json',
 'yk_model/special_tokens_map.json',
 'yk_model/vocab.txt',
 'yk_model/added_tokens.json',
 'yk_model/tokenizer.json')

In [None]:
## To use the pretrained model in this github repo

# model = AutoModelForSequenceClassification.from_pretrained("model/")
# tokenizer = AutoTokenizer.from_pretrained("model/")