<a href="https://colab.research.google.com/github/aryanarora07/ML-AI/blob/main/BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install required libraries
# The leading `!` is IPython/Colab shell-escape syntax: this line only works
# inside a notebook cell and is not valid in a plain Python script.
!pip install transformers datasets torch scikit-learn

# Import necessary libraries
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, f1_score
import numpy as np

# Step 1: Load and Prepare the Dataset
DATASET_NAME = "imdb"                   # IMDb dataset for sentiment analysis
PRETRAINED_CHECKPOINT = "bert-base-uncased"

dataset = load_dataset(DATASET_NAME)

# Tokenizer that belongs to the pre-trained checkpoint named above
tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_CHECKPOINT)

# Tokenize the dataset
def tokenize_function(examples):
    """Tokenize the ``"text"`` field of a batch of examples.

    The module-level ``tokenizer`` is applied with truncation enabled and
    padding set to ``"max_length"``; its output is returned unchanged.
    """
    texts = examples["text"]
    return tokenizer(texts, truncation=True, padding="max_length")

# Tokenize every split in batches, then drop the raw "text" column and rename
# "label" to "labels" so only tokenizer outputs and labels remain.
tokenized_datasets = (
    dataset
    .map(tokenize_function, batched=True)
    .remove_columns(["text"])
    .rename_column("label", "labels")
)

# Have the datasets return PyTorch tensors (set_format works in place)
tokenized_datasets.set_format("torch")

# Pull out the train and test splits
train_dataset, test_dataset = (
    tokenized_datasets["train"],
    tokenized_datasets["test"],
)

# Step 2: Load Pre-trained BERT Model
# Sequence-classification model configured for two output labels.
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=2,
)

# Step 3: Define Training Arguments
# Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`.
# The pip install in this notebook is unpinned, so use whichever keyword the
# installed TrainingArguments dataclass actually declares.
_eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in getattr(TrainingArguments, "__dataclass_fields__", {})
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="./results",          # Directory to save model checkpoints
    learning_rate=5e-5,              # Learning rate
    per_device_train_batch_size=16,  # Batch size for training
    per_device_eval_batch_size=16,   # Batch size for evaluation
    num_train_epochs=3,              # Number of epochs
    weight_decay=0.01,               # Weight decay for regularization
    logging_dir="./logs",            # Directory to save logs
    logging_steps=10,                # Log every 10 steps
    # load_best_model_at_end requires the save strategy to match the
    # evaluation strategy; the default step-based saving combined with
    # per-epoch evaluation is rejected when the arguments are constructed.
    save_strategy="epoch",           # Save a checkpoint at the end of each epoch
    save_total_limit=2,              # Keep only the last 2 checkpoints
    load_best_model_at_end=True,     # Load best model at the end of training
    **{_eval_strategy_key: "epoch"},  # Evaluate at the end of each epoch
)

# Step 4: Define Metrics for Evaluation
def compute_metrics(eval_pred):
    """Compute accuracy and weighted F1 from a ``(logits, labels)`` pair.

    The predicted class for each row is the argmax of the logits along the
    last axis. Returns a dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    raw_scores, gold = eval_pred
    predicted = np.argmax(raw_scores, axis=-1)
    return {
        "accuracy": accuracy_score(gold, predicted),
        "f1": f1_score(gold, predicted, average="weighted"),
    }

# Step 5: Initialize Trainer API
# NOTE(review): the test split is passed as the evaluation set, so the
# per-epoch evaluation (and any best-checkpoint selection driven by it) sees
# the same data that is later reported as the test result. Consider carving a
# validation split out of the training data instead.
trainer_config = dict(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**trainer_config)

# Step 6: Train the Model
trainer.train()

# Step 7: Save the Fine-Tuned Model and Tokenizer
# Both are written to the same directory, model first and tokenizer second.
for artifact in (model, tokenizer):
    artifact.save_pretrained("./fine_tuned_bert")

# Step 8: Evaluate the Model on Test Set
predictions = trainer.predict(test_dataset)
predicted_labels = np.argmax(predictions.predictions, axis=-1)

# Use the labels returned alongside the predictions rather than re-reading the
# "labels" column from the dataset twice: they are already aligned with
# `predictions.predictions` and need no further conversion.
true_labels = predictions.label_ids

accuracy = accuracy_score(true_labels, predicted_labels)
f1 = f1_score(true_labels, predicted_labels, average="weighted")

print(f"Test Accuracy: {accuracy}")
print(f"Test F1 Score: {f1}")

# Step 9: Run Inference on New Data (Optional)
def predict_sentiment(text):
    """Classify a single piece of text as ``"Positive"`` or ``"Negative"``.

    Uses the module-level ``tokenizer`` and ``model``. The model is switched
    to evaluation mode as a side effect.

    Args:
        text: The text to classify.

    Returns:
        ``"Positive"`` when the highest-scoring class index is 1, otherwise
        ``"Negative"``.
    """
    import torch  # local import: the file does not import torch at top level

    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    # After training, the model may live on an accelerator while the tokenizer
    # output is on the CPU; move the inputs to the model's device.
    inputs = {name: tensor.to(model.device) for name, tensor in inputs.items()}

    model.eval()  # disable dropout so predictions are deterministic
    with torch.no_grad():  # inference only, no gradients needed
        outputs = model(**inputs)

    # .cpu() is required before .numpy() when the logits are on an accelerator.
    predicted_class = np.argmax(outputs.logits.detach().cpu().numpy(), axis=-1)[0]
    sentiment = "Positive" if predicted_class == 1 else "Negative"
    return sentiment

# Example usage: classify one sample review and print the result
example_text = "This movie was fantastic! I loved it."
example_sentiment = predict_sentiment(example_text)
print(f"Sentiment Prediction: {example_sentiment}")
