<a href="https://colab.research.google.com/github/aayushpe/Political-Sentiment-Analysis/blob/main/Sentiment_Analysis_BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [None]:
# Import dependencies
!pip install datasets
from transformers import TFAutoModel, AutoTokenizer, AutoModelForSequenceClassification
from datasets import load_dataset # In order to import test dataset, may have to run !pip install datasets
import tensorflow as tf
import torch

In [None]:
# Load the pretrained BERT encoder. The model and the tokenizer are both loaded
# from the same checkpoint name so that the vocabulary always matches the weights.
# NOTE: 'bert-base-uncased' is a public checkpoint on the Hugging Face Hub, so
# downloading it should not require an account token.
MODEL_CHECKPOINT = 'bert-base-uncased'
model = TFAutoModel.from_pretrained(MODEL_CHECKPOINT)

# Load the matching tokenizer, which transforms a sentence into model features
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

# Tokenized String Examples

In [None]:
# Encode one sample sentence as TensorFlow tensors to try out the BERT model.
inputs = tokenizer(
    'It was ok, it was not too great',
    return_tensors='tf',
    padding=True,
    truncation=True,
)
print(inputs)  # Show the tokenized sentence

In [None]:
# Run the tokenized sample sentence (`inputs`, built in the previous cell)
# through the pretrained BERT model.
output = model(inputs)
print(output) # Visualize the raw model output

In [None]:
# Fetch the 'SetFit/emotion' dataset with the `datasets` library
emotions = load_dataset(path='SetFit/emotion')
print(emotions)  # Show the loaded dataset object

In [None]:
# Batch tokenization helper
def tokenize(batch):
    """Tokenize the "text" entries of `batch` with the notebook's `tokenizer`.

    Padding and truncation are both switched on; the tokenizer's result is
    returned unchanged.
    """
    texts = batch["text"]
    encoded = tokenizer(texts, padding=True, truncation=True)
    return encoded

# Apply `tokenize` to the dataset in batched mode.
emotions_encoded = emotions.map(
    function=tokenize,
    batched=True,
    batch_size=None,  # no fixed batch size: see `datasets` docs for `map`
)

In [None]:
# Switch 'input_ids', 'attention_mask', 'token_type_ids' and 'label' to the
# TensorFlow format, so that accessing the dataset yields these columns as
# `tf.Tensor` objects.
emotions_encoded.set_format(
    type='tf',
    columns=['input_ids', 'attention_mask', 'token_type_ids', 'label'],
)

# Batch size used when building the tf.data pipelines.
BATCH_SIZE = 64

def order(inp):
    '''
    Group all the inputs of BERT into a single dictionary and return it
    together with the labels.

    Columns are looked up by name instead of by position: the order of
    `inp.values()` follows the column order of the encoded dataset, which
    is not guaranteed to be label / input_ids / attention_mask /
    token_type_ids (tokenizers commonly emit `token_type_ids` before
    `attention_mask`), so positional indexing can silently swap inputs.

    Args:
        inp: mapping containing the keys 'input_ids', 'attention_mask',
            'token_type_ids' and 'label'.

    Returns:
        A `(features, labels)` tuple where `features` is a dict with the
        keys 'input_ids', 'attention_mask' and 'token_type_ids', and
        `labels` is `inp['label']`.

    Raises:
        KeyError: if any of the expected keys is missing from `inp`.
    '''
    feature_names = ('input_ids', 'attention_mask', 'token_type_ids')
    features = {name: inp[name] for name in feature_names}
    return features, inp['label']

# converting train split of `emotions_encoded` to tensorflow format
train_dataset = tf.data.Dataset.from_tensor_slices(emotions_encoded['train'][:])
# Shuffle individual examples BEFORE batching. Shuffling after `.batch()` only
# permutes whole batches, so every batch would contain the same examples in
# every epoch. The buffer covers the whole split for a uniform shuffle.
train_dataset = train_dataset.shuffle(len(emotions_encoded['train'])).batch(BATCH_SIZE)
# map the `order` function to get (features, labels) pairs
train_dataset = train_dataset.map(order, num_parallel_calls=tf.data.AUTOTUNE)

# ... doing the same for test set (no shuffling needed for evaluation) ...
test_dataset = tf.data.Dataset.from_tensor_slices(emotions_encoded['test'][:])
test_dataset = test_dataset.batch(BATCH_SIZE)
test_dataset = test_dataset.map(order, num_parallel_calls=tf.data.AUTOTUNE)

In [None]:
# Pull a single element from `train_dataset` to inspect it: `inp` is the
# dictionary of BERT inputs and `out` the labels, as returned by `order`.
inp, out = next(iter(train_dataset)) # a batch from train_dataset
print(inp, '\n\n', out)

In [None]:
# Classification model definition
class BERTForClassification(tf.keras.Model):
    """Keras model that puts a softmax classification layer on top of a BERT model.

    Args:
        bert_model: the model whose output feeds the classification layer.
        num_classes: number of units (classes) of the final dense layer.
    """

    def __init__(self, bert_model, num_classes):
        super().__init__()
        self.bert = bert_model
        self.fc = tf.keras.layers.Dense(units=num_classes, activation='softmax')

    def call(self, inputs):
        """Run `inputs` through the wrapped model and classify its output.

        Element 1 of the wrapped model's output is fed to the dense layer
        (for Hugging Face BERT models this is presumably the pooled output;
        confirm if another backbone is used).
        """
        bert_outputs = self.bert(inputs)
        pooled = bert_outputs[1]
        probabilities = self.fc(pooled)
        return probabilities

In [None]:
# Build the classifier on top of the pretrained BERT model.
# NOTE(review): 6 is assumed to be the number of label classes in the dataset; confirm.
classifier = BERTForClassification(bert_model=model, num_classes=6)

# Configure training: Adam optimizer, sparse categorical cross-entropy computed
# on the softmax probabilities produced by the model, and accuracy as metric.
classifier.compile(
    metrics=['accuracy'],
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),
)

In [None]:
# Train for 3 epochs on the training pipeline (author's estimate: around
# 20 minutes); the object returned by `fit` is kept in `history`.
history = classifier.fit(x=train_dataset, epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [None]:
# Evaluate model on test dataset.
# With the loss and the single 'accuracy' metric configured in `compile`,
# the displayed result is [loss, accuracy].
classifier.evaluate(test_dataset)



[0.19533739984035492, 0.9200000166893005]