In [1]:
!pip install transformers datasets -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import transformers

# Only surface ERROR-level log messages from transformers, which keeps the
# library's warnings out of the cell outputs below.
transformers.logging.set_verbosity_error()

In [14]:
from datasets import load_dataset

# Name of the pretrained Hugging Face checkpoint; reused further down for
# both the tokenizer and the classification model.
model_name = "distilbert-base-uncased"

# Poem sentiment dataset from Hugging Face (the data is stored in Apache
# Arrow format).
poem_sentiments = load_dataset("poem_sentiment")

# Show the available splits, followed by a few rows of the test split.
print(poem_sentiments, poem_sentiments["test"][20:25], sep="\n")

# The "label" feature carries the human-readable name of every class id.
print("\nSentiment Labels used", poem_sentiments["train"].features["label"].names)

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


DatasetDict({
    train: Dataset({
        features: ['id', 'verse_text', 'label'],
        num_rows: 892
    })
    validation: Dataset({
        features: ['id', 'verse_text', 'label'],
        num_rows: 105
    })
    test: Dataset({
        features: ['id', 'verse_text', 'label'],
        num_rows: 104
    })
})
{'id': [20, 21, 22, 23, 24], 'verse_text': ["as o'er the earth it wanders wide,", 'how hearts were answering to his own,', 'glad on its stone-built hearth; and thorough the wide-mouthed smoke-flue', 'sees the clouds reel and roll above our head,', '’tis to behold his vengeance for my son.'], 'label': [2, 1, 2, 2, 0]}

Sentiment Labels used ['negative', 'positive', 'no_impact', 'mixed']


In [15]:
# Encoding text from the Dataset

from transformers import DistilBertTokenizer

# Load the tokenizer for the same checkpoint (model_name) that the
# classification model is later loaded from.
tokenizer = DistilBertTokenizer.from_pretrained(model_name)

def tokenize(batch):
    """Tokenize the ``verse_text`` entries of a batch of examples.

    Args:
        batch: Mapping with a ``"verse_text"`` key holding the verses to
            encode.

    Returns:
        The result of calling the module-level ``tokenizer`` on those
        verses with ``padding=True`` and ``truncation=True``.
    """
    verses = batch["verse_text"]
    encoding_options = {"padding": True, "truncation": True}
    return tokenizer(verses, **encoding_options)

# batched=True with batch_size=None passes each split to tokenize() in one
# go, so all verses of a split end up padded to a common length.
enc_poem_sentiments = poem_sentiments.map(tokenize, batched=True, batch_size=None)

# Peek at the first five encoded training examples.
print(enc_poem_sentiments["train"][:5])

{'id': [0, 1, 2, 3, 4], 'verse_text': ['with pale blue berries. in these peaceful shades--', 'it flows so long as falls the rain,', 'and that is why, the lonesome day,', 'when i peruse the conquered fame of heroes, and the victories of mighty generals, i do not envy the generals,', 'of inward strife for truth and liberty.'], 'label': [1, 2, 0, 3, 3], 'input_ids': [[101, 2007, 5122, 2630, 22681, 1012, 1999, 2122, 9379, 13178, 1011, 1011, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [101, 2009, 6223, 2061, 2146, 2004, 4212, 1996, 4542, 1010, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [101, 1998, 2008, 2003, 2339, 1010, 1996, 10459, 14045, 2154, 1010, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [101, 2043, 1045, 7304, 3366, 1996, 11438, 4476, 1997, 7348, 1010, 1998, 1996, 9248, 1997, 10478, 11593, 1010, 1045, 2079, 2025, 21103, 1996, 11593, 1010, 102, 0, 0], [101, 1997, 20546, 27865, 2005, 3606, 1998, 7044, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [8]:
# Display the encoded splits; the tokenizer output shows up as the extra
# 'input_ids' and 'attention_mask' columns.
enc_poem_sentiments

DatasetDict({
    train: Dataset({
        features: ['id', 'verse_text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 892
    })
    validation: Dataset({
        features: ['id', 'verse_text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 105
    })
    test: Dataset({
        features: ['id', 'verse_text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 104
    })
})

In [16]:
# Exploring the data

# Look up one encoded training example once and reuse its fields below.
sample_row = enc_poem_sentiments["train"][1]
sample_ids = sample_row.get("input_ids")
sample_mask = sample_row.get("attention_mask")

print("Text :", sample_row.get("verse_text"))
print("\nInput Map :", sample_ids)
print("\nAttention Mask :", sample_mask)

# Count the entries greater than zero in each list; in the printed example
# the trailing zeros of both lists are the padding positions.
print("\nTotal tokens: ", len(sample_ids))
print("Non Zero tokens: ", sum(1 for token_id in sample_ids if token_id > 0))
print("Attention = 1: ", sum(1 for flag in sample_mask if flag > 0))

Text : it flows so long as falls the rain,

Input Map : [101, 2009, 6223, 2061, 2146, 2004, 4212, 1996, 4542, 1010, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

Attention Mask : [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

Total tokens:  28
Non Zero tokens:  11
Attention = 1:  11


In [9]:
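# Optional: drop the columns the model does not consume. Left disabled here
# because the training cell passes explicit `columns` / `label_cols` to
# `to_tf_dataset`, which already selects what is fed to the model. The output
# shown below appears to come from an earlier run with these lines enabled.
# dataset = enc_poem_sentiments.remove_columns(['id', 'verse_text'])
# dataset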

DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 892
    })
    validation: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 105
    })
    test: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 104
    })
})

In [17]:
# Pick the ready-made train and validation splits out of the encoded data.
training_dataset = enc_poem_sentiments["train"]
validation_dataset = enc_poem_sentiments["validation"]

print(f"\nColumn Names :  {training_dataset.column_names}")
print(f"\nFeatures :  {training_dataset.features}")

# The label feature holds the class names; their count is the number of
# outputs the classifier needs.
labels = training_dataset.features["label"]
num_labels = len(labels.names)


Column Names :  ['id', 'verse_text', 'label', 'input_ids', 'attention_mask']

Features :  {'id': Value(dtype='int32', id=None), 'verse_text': Value(dtype='string', id=None), 'label': ClassLabel(names=['negative', 'positive', 'no_impact', 'mixed'], id=None), 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None), 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}


## Creating the Model Architecture

In [18]:
from transformers import TFAutoModelForSequenceClassification

# Load the pretrained checkpoint from Hugging Face with a sequence
# classification head sized to the number of sentiment labels.
sentiment_model = TFAutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
)

# Display the configuration of the loaded model.
sentiment_model.get_config()

{'vocab_size': 30522,
 'max_position_embeddings': 512,
 'sinusoidal_pos_embds': False,
 'n_layers': 6,
 'n_heads': 12,
 'dim': 768,
 'hidden_dim': 3072,
 'dropout': 0.1,
 'attention_dropout': 0.1,
 'activation': 'gelu',
 'initializer_range': 0.02,
 'qa_dropout': 0.1,
 'seq_classif_dropout': 0.2,
 'return_dict': True,
 'output_hidden_states': False,
 'output_attentions': False,
 'torchscript': False,
 'torch_dtype': None,
 'use_bfloat16': False,
 'tf_legacy_loss': False,
 'pruned_heads': {},
 'tie_word_embeddings': True,
 'is_encoder_decoder': False,
 'is_decoder': False,
 'cross_attention_hidden_size': None,
 'add_cross_attention': False,
 'tie_encoder_decoder': False,
 'max_length': 20,
 'min_length': 0,
 'do_sample': False,
 'early_stopping': False,
 'num_beams': 1,
 'num_beam_groups': 1,
 'diversity_penalty': 0.0,
 'temperature': 1.0,
 'top_k': 50,
 'top_p': 1.0,
 'typical_p': 1.0,
 'repetition_penalty': 1.0,
 'length_penalty': 1.0,
 'no_repeat_ngram_size': 0,
 'encoder_no_repeat_ng

In [19]:
# Fine-tune the whole network: the first layer (the DistilBERT backbone in
# the summary below) is kept trainable. Set this to False instead to freeze
# it and train only the classification layers.
sentiment_model.layers[0].trainable = True

# Add/remove layers if needed.
# sentiment_model.layers [append()/insert()/remove()]

# summary() prints the table itself and returns None, so call it directly;
# wrapping it in print() would append a stray "None" line to the output.
sentiment_model.summary()

Model: "tf_distil_bert_for_sequence_classification_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 distilbert (TFDistilBertMa  multiple                  66362880  
 inLayer)                                                        
                                                                 
 pre_classifier (Dense)      multiple                  590592    
                                                                 
 classifier (Dense)          multiple                  3076      
                                                                 
 dropout_39 (Dropout)        multiple                  0         
                                                                 
Total params: 66956548 (255.42 MB)
Trainable params: 66956548 (255.42 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None


## Training the Sentiment Model

In [20]:
import tensorflow as tf

batch_size = 64
# Names of the inputs the tokenizer produces for the model.
tokenizer_columns = tokenizer.model_input_names

# Conversion options common to both splits; only the shuffle flag differs.
shared_tf_options = dict(columns=tokenizer_columns, batch_size=batch_size)

# Training batches are shuffled, validation batches keep their order.
train = training_dataset.to_tf_dataset(
    label_cols=["label"],
    shuffle=True,
    **shared_tf_options,
)
val = validation_dataset.to_tf_dataset(
    label_cols=["label"],
    shuffle=False,
    **shared_tf_options,
)

# The loss is configured with from_logits=True, i.e. it is applied to the
# model's raw scores; labels are integer class ids.
sentiment_model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=tf.metrics.SparseCategoricalAccuracy(),
)

sentiment_model.fit(train, validation_data=val, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x7f1e2d7234f0>

In [22]:
from datasets import Dataset, DatasetDict

# Two hand-labelled verses used to sanity-check the fine-tuned model
# (label ids follow the training data: 0 = negative, 1 = positive).
infer_data = {'id':[0,1],
             'verse_text':['and be glad in the summer morning when the kindred ride on their way',
                           'that sometime they put themself in danger'],
             'label':[1,0]}

infer_dataset = Dataset.from_dict(infer_data)

ds_dict = DatasetDict()
ds_dict["infer"] = infer_dataset

print(ds_dict)

# Encoding the text
enc_dataset = ds_dict.map(tokenize, batched = True, batch_size = None)

# Converting to Tensors.
# shuffle must stay False here: the predictions are later matched back to
# infer_data by position, so the rows have to keep their original order.
infer_final_dataset = enc_dataset["infer"].to_tf_dataset(
    columns = tokenizer_columns,  shuffle = False,
    batch_size = batch_size)

print(infer_final_dataset)

# Prediction
predictions = sentiment_model.predict(infer_final_dataset)

DatasetDict({
    infer: Dataset({
        features: ['id', 'verse_text', 'label'],
        num_rows: 2
    })
})


Map:   0%|          | 0/2 [00:00<?, ? examples/s]

<_PrefetchDataset element_spec={'input_ids': TensorSpec(shape=(None, 17), dtype=tf.int64, name=None), 'attention_mask': TensorSpec(shape=(None, 17), dtype=tf.int64, name=None)}>


In [23]:
# Raw model scores (logits): one row per input verse, one column per
# sentiment label.
predictions.logits

array([[-1.5647929 ,  2.9026542 , -0.79876226, -0.5324803 ],
       [ 2.533812  , -2.0192611 , -1.057152  , -0.05806513]],
      dtype=float32)

In [24]:
import numpy as np

# The predicted class of each verse is the index of its largest logit.
pred_label_ids = np.argmax(predictions.logits, axis=1)

# Print every verse next to its predicted and its hand-assigned label name.
for idx, pred_id in enumerate(pred_label_ids):
    verse = infer_data["verse_text"][idx]
    actual_id = infer_data["label"][idx]
    print("Poem:", verse,
          " Predicted:", labels.names[pred_id],
          " | Actual Label:", labels.names[actual_id])

Poem: and be glad in the summer morning when the kindred ride on their way  Predicted: positive  | Actual Label: positive
Poem: that sometime they put themself in danger  Predicted: negative  | Actual Label: negative
