<a href="https://colab.research.google.com/github/aristriana/KI_Tasks/blob/main/TaskWeek7.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Nama: Aristriana Muhamad<br>
NPM: 2106709043

Model: https://huggingface.co/finiteautomata/bertweet-base-sentiment-analysis

In [16]:
# Install the necessary libraries
!pip install transformers torch evaluate --quiet


In [17]:
# Import the necessary libraries from transformers and evaluate
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import evaluate

# Load the pre-trained model and tokenizer
model_name = 'finiteautomata/bertweet-base-sentiment-analysis'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Define the sentiment analysis function to show POS, NEU, NEG scores
def analyze_sentiment(text):
    # Tokenize the input text
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)

    # Get the model outputs
    with torch.no_grad():
        outputs = model(**inputs)

    # Get the logits and apply softmax to get probabilities
    logits = outputs.logits
    probabilities = torch.softmax(logits, dim=1).flatten().tolist()

    # Map the probabilities to each sentiment
    sentiment_mapping = {0: "Negative", 1: "Neutral", 2: "Positive"}
    sentiment_scores = {
        "NEG": probabilities[0],
        "NEU": probabilities[1],
        "POS": probabilities[2]
    }

    # Get the predicted sentiment
    sentiment_label = torch.argmax(logits, dim=1).item()
    predicted_sentiment = sentiment_mapping[sentiment_label]

    return predicted_sentiment, sentiment_scores

# Test the function with a sample text
sample_text = "I love using this model for sentiment analysis!"
predicted_sentiment, sentiment_scores = analyze_sentiment(sample_text)
print(f"Sentiment: {predicted_sentiment}")
print(f"Scores: {sentiment_scores}")


emoji is not installed, thus not converting emoticons or emojis into text. Install emoji: pip3 install emoji==0.6.0


Sentiment: Positive
Scores: {'NEG': 0.0016961999936029315, 'NEU': 0.006148709449917078, 'POS': 0.992155134677887}


In [18]:
# Function to calculate accuracy using the evaluate library
accuracy_metric = evaluate.load("accuracy")

def calculate_accuracy(texts, labels):
    predictions = []

    for text, actual_label in zip(texts, labels):
        predicted_sentiment, _ = analyze_sentiment(text)
        # Convert sentiment string to label (0 for NEG, 1 for NEU, 2 for POS)
        label_mapping = {"Negative": 0, "Neutral": 1, "Positive": 2}
        predicted_label = label_mapping[predicted_sentiment]
        predictions.append(predicted_label)

        # Print the predicted and actual labels
        actual_sentiment = list(label_mapping.keys())[list(label_mapping.values()).index(actual_label)]
        #print(f"Text: {text}")
        #print(f"Predicted: {predicted_sentiment} (Label: {predicted_label}), Actual: {actual_sentiment} (Label: {actual_label})\n")

    # Calculate accuracy
    results = accuracy_metric.compute(predictions=predictions, references=labels)
    return results["accuracy"]

# Test accuracy calculation
test_texts = [
    "I am extremely happy with the results!",
    "The service was okay, nothing special.",
    "I am very disappointed with the quality."
]

test_labels = [2, 1, 0]  # POS, NEU, NEG
accuracy = calculate_accuracy(test_texts, test_labels)
print(f"Accuracy: {accuracy * 100:.2f}%")



Accuracy: 66.67%


##Check the model’s accuracy using a synthetic dataset

In [19]:
# Create a synthetic dataset with 300 examples (100 for each sentiment class)
import random

# Sample synthetic texts for each sentiment
positive_texts = [
    "I absolutely love this product!",
    "This is the best experience I’ve ever had.",
    "Amazing service, I’m very satisfied!",
    "The quality exceeded my expectations.",
    "I'm thrilled with the results!",
    "This is fantastic and I’m very happy.",
    "I couldn’t be more pleased with how this turned out.",
    "Outstanding job, well done!",
    "This is perfect, just what I needed.",
    "I’m super happy with this decision."
] * 10  # Repeat to get 100 samples

neutral_texts = [
    "The product is fine, nothing special.",
    "This is okay, just as expected.",
    "It's an average experience overall.",
    "The quality is neither good nor bad.",
    "It’s a standard service, nothing to complain about.",
    "I'm indifferent about the results.",
    "This is adequate but not outstanding.",
    "I guess it's alright, not too impressive.",
    "The experience was neither here nor there.",
    "It's pretty much what I expected."
] * 10  # Repeat to get 100 samples

negative_texts = [
    "I’m very disappointed with this product.",
    "The service was terrible and unsatisfactory.",
    "This is the worst experience I’ve ever had.",
    "The quality is far below my expectations.",
    "I'm unhappy with how things turned out.",
    "This is not good, I'm really frustrated.",
    "A very poor experience overall.",
    "This didn’t meet my expectations at all.",
    "I'm upset and will not recommend this.",
    "It was a waste of time and money."
] * 10  # Repeat to get 100 samples

# Combine the datasets
synthetic_texts = positive_texts + neutral_texts + negative_texts
synthetic_labels = [2] * 100 + [1] * 100 + [0] * 100  # 2 for POS, 1 for NEU, 0 for NEG

# Shuffle the dataset to mix the samples
combined = list(zip(synthetic_texts, synthetic_labels))
random.shuffle(combined)
synthetic_texts, synthetic_labels = zip(*combined)

# Calculate accuracy on the synthetic dataset
accuracy = calculate_accuracy(synthetic_texts, synthetic_labels)
print(f"Accuracy on synthetic dataset: {accuracy * 100:.2f}%")


Accuracy on synthetic dataset: 76.67%


##Implement attention transformer and compare with the original model

In [20]:
# Import train_test_split from sklearn
from sklearn.model_selection import train_test_split

# Prepare the dataset using PyTorch
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModelForSequenceClassification, AdamW

# Define a custom dataset class for PyTorch
class SentimentDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        # Return the encoded inputs and the label
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Create the training and evaluation datasets
train_texts, eval_texts, train_labels, eval_labels = train_test_split(
    synthetic_texts, synthetic_labels, test_size=0.2, random_state=42
)

train_dataset = SentimentDataset(train_texts, train_labels, tokenizer)
eval_dataset = SentimentDataset(eval_texts, eval_labels, tokenizer)

# Define the model, optimizer, and loss function
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

optimizer = AdamW(model.parameters(), lr=2e-5)
criterion = torch.nn.CrossEntropyLoss()

# Train the model using a PyTorch training loop
def train_model(model, train_dataset, eval_dataset, epochs=3, batch_size=8):
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    eval_loader = DataLoader(eval_dataset, batch_size=batch_size)

    for epoch in range(epochs):
        # Training phase
        model.train()
        total_loss = 0
        for batch in train_loader:
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            # Forward pass
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_loss += loss.item()

            # Backward pass and optimization
            loss.backward()
            optimizer.step()

        avg_train_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch + 1}/{epochs}, Training Loss: {avg_train_loss:.4f}")

        # Evaluation phase
        model.eval()
        correct_predictions = 0
        total_predictions = 0
        with torch.no_grad():
            for batch in eval_loader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)

                # Forward pass
                outputs = model(input_ids, attention_mask=attention_mask)
                _, preds = torch.max(outputs.logits, dim=1)

                # Calculate accuracy
                correct_predictions += torch.sum(preds == labels)
                total_predictions += labels.size(0)

        accuracy = correct_predictions.double() / total_predictions
        print(f"Epoch {epoch + 1}/{epochs}, Evaluation Accuracy: {accuracy:.4f}")

# Train the model
train_model(model, train_dataset, eval_dataset)

# Evaluate the fine-tuned model's accuracy on the entire synthetic dataset
def evaluate_model_pytorch(model, texts, labels):
    dataset = SentimentDataset(texts, labels, tokenizer)
    data_loader = DataLoader(dataset, batch_size=8)
    model.eval()
    correct_predictions = 0
    total_predictions = 0
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            # Forward pass
            outputs = model(input_ids, attention_mask=attention_mask)
            _, preds = torch.max(outputs.logits, dim=1)

            # Calculate accuracy
            correct_predictions += torch.sum(preds == labels)
            total_predictions += labels.size(0)

    accuracy = correct_predictions.double() / total_predictions
    return accuracy.item()

# Evaluate the fine-tuned model
fine_tuned_accuracy = evaluate_model_pytorch(model, synthetic_texts, synthetic_labels)
print(f"Fine-tuned Model Accuracy on synthetic dataset: {fine_tuned_accuracy * 100:.2f}%")

# Compare with original model accuracy
original_model_accuracy = calculate_accuracy(synthetic_texts, synthetic_labels)
print("\nComparison of Accuracies:")
print(f"Original Model Accuracy: {original_model_accuracy * 100:.2f}%")
print(f"Fine-tuned Transformer Model Accuracy: {fine_tuned_accuracy * 100:.2f}%")




Epoch 1/3, Training Loss: 0.0735
Epoch 1/3, Evaluation Accuracy: 1.0000
Epoch 2/3, Training Loss: 0.0121
Epoch 2/3, Evaluation Accuracy: 1.0000
Epoch 3/3, Training Loss: 0.0077
Epoch 3/3, Evaluation Accuracy: 1.0000
Fine-tuned Model Accuracy on synthetic dataset: 100.00%

Comparison of Accuracies:
Original Model Accuracy: 100.00%
Fine-tuned Transformer Model Accuracy: 100.00%
