<a href="https://colab.research.google.com/github/babupallam/TensorFlow-Applications-with-Pre-trained-Models/blob/main/05_Sentiment_Analysis_using_BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Sentiment Analysis using BERT

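This notebook performs text classification (sentiment analysis) using a pretrained BERT model. BERT stands for Bidirectional Encoder Representations from Transformers; it was introduced in the paper "BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding" by Jacob Devlin et al.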

In [1]:
# Install the necessary packages: Hugging Face `transformers` and `datasets`.
!pip install transformers datasets




In [2]:
# Import libraries

import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset, load_metric
import numpy as np


In [None]:
# Loading the dataset
# Fetch the "imdb" dataset through the Hugging Face `datasets` library.
dataset = load_dataset("imdb")

# Pull out the two splits used by the rest of the notebook.
train_dataset, test_dataset = dataset['train'], dataset['test']


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# Tokenizing data

# Tokenizer that belongs to the 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


def tokenize_function(examples):
    """Tokenize the 'text' field of a batch of examples.

    Truncation is enabled and padding is set to 'max_length', as passed to
    the notebook-level `tokenizer`.

    Args:
        examples: A mapping with a 'text' entry holding the raw text(s).

    Returns:
        Whatever the tokenizer call returns for those texts.
    """
    texts = examples['text']
    encoded = tokenizer(texts, truncation=True, padding='max_length')
    return encoded

# Apply the tokenizer to both splits, batch by batch.
train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

# Have both splits return the listed columns as PyTorch tensors.
for split in (train_dataset, test_dataset):
    split.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])


In [19]:
# Loading the pre-trained BERT model
# Start from the 'bert-base-uncased' checkpoint and request a
# sequence-classification model configured with two labels.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2,
)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [20]:
# Setting Up the Trainer

# Define the accuracy metric.
# Accuracy is computed directly with NumPy instead of `datasets.load_metric`,
# which is deprecated (removed in recent `datasets` releases) and needs a
# network download of the metric script.
def compute_metrics(eval_pred):
    """Compute classification accuracy for an evaluation pass.

    Args:
        eval_pred: A pair `(logits, labels)`. `logits` holds one row of class
            scores per example; `labels` holds the reference class ids.

    Returns:
        dict: `{"accuracy": <fraction of examples whose arg-max class equals
        the label>}` as a plain Python float.
    """
    logits, labels = eval_pred
    # The predicted class is the index of the largest score on the last axis.
    predictions = np.argmax(logits, axis=-1)
    matches = predictions == np.asarray(labels)
    return {"accuracy": float(np.mean(matches))}

# Training configuration: output location, evaluation schedule,
# optimisation settings, and per-device batch sizes.
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
)

# Bundle the model, its configuration, both dataset splits and the metric
# callback into a single Trainer instance.
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)




In [None]:
# Training the Model

# Run training with the `trainer` object built in the setup cell.
trainer.train()


In [None]:
# Evaluating the Model
# Run evaluation through the trainer and keep the returned metrics.
results = trainer.evaluate()

# Report the 'eval_accuracy' entry, formatted to two decimal places.
print(f"Accuracy: {results['eval_accuracy']:.2f}")


In [None]:
# Making Predictions

def predict(text):
    """Classify the sentiment of a single piece of text.

    Uses the notebook-level `tokenizer` and `model`.

    Args:
        text: The text to classify.

    Returns:
        str: "Positive" when the predicted class index is 1, otherwise
        "Negative".
    """
    # Tokenize the input text
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    # Training may have moved the model to an accelerator; the input tensors
    # must live on the same device as the model or the forward pass fails.
    inputs = {name: tensor.to(model.device) for name, tensor in inputs.items()}
    # Inference mode: disable dropout and skip gradient tracking.
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    label = "Positive" if predictions.item() == 1 else "Negative"
    return label

# Example prediction: classify one sample review and show the label.
text = "This movie was fantastic! The acting was great, and the story was very compelling."
predicted_label = predict(text)
print(predicted_label)


In [None]:
# Saving the Model

# Write the model and the tokenizer into the same directory.
save_directory = "sentiment-analysis-bert"
model.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)
