# Sentiment Analysis Model Training and Accuracy Visualization

In this Jupyter Notebook, we will train a sentiment analysis model and visualize the accuracy scores over the training process.

### Importing Necessary Libraries
First, we need to import the required libraries:

In [38]:
import os
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
import torch.nn.functional as F
from sklearn.metrics import accuracy_score
import os
import matplotlib.pyplot as plt

# Model Training Function
Next, we will define a function to train the model and collect the accuracy scores during the training process:

In [39]:
def train_and_save_model(data_path: str, model_output_path: str):
    """Train a sentiment analysis model using provided data and save it."""
    
    # Load dataset
    df = pd.read_csv(data_path)
    texts = df['text'].tolist()
    labels = df['sentiment'].tolist()

    # Convert sentiment labels to numerical values
    label_mapping = {'olumlu': 1, 'notr': 0, 'olumsuz': 2}
    labels = [label_mapping[label] for label in labels]

    # Select first 50 and last 50 samples for training
    train_texts = texts[:50] + texts[-50:]
    train_labels = labels[:50] + labels[-50:]

    # Load tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained("saribasmetehan/bert-base-turkish-sentiment-analysis")
    model = AutoModelForSequenceClassification.from_pretrained("saribasmetehan/bert-base-turkish-sentiment-analysis", num_labels=3)

    # Tokenize data
    def tokenize_function(texts):
        return tokenizer(texts, padding=True, truncation=True, return_tensors='pt', max_length=128)

    train_encodings = tokenize_function(train_texts)

    # Create dataset
    class Dataset(torch.utils.data.Dataset):
        def __init__(self, encodings, labels):
            self.encodings = encodings
            self.labels = labels

        def __getitem__(self, idx):
            item = {key: val[idx].contiguous() for key, val in self.encodings.items()}
            item['labels'] = torch.tensor(self.labels[idx])
            return item

        def __len__(self):
            return len(self.labels)

    train_dataset = Dataset(train_encodings, train_labels)

    # List to store accuracy scores
    accuracies = []

    # Define compute_metrics function
    def compute_metrics(p):
        preds = p.predictions.argmax(axis=1)
        acc = accuracy_score(p.label_ids, preds)
        accuracies.append(acc)  # Append accuracy to the list
        return {"accuracy": acc}

    # Training arguments
    training_args = TrainingArguments(
        output_dir=model_output_path,
        num_train_epochs=4,
        per_device_train_batch_size=16,
        learning_rate=5e-5,
        logging_dir='./logs',
        logging_steps=10,
        save_steps=500,
        save_total_limit=1,
        seed=42,
        evaluation_strategy="steps",
        eval_steps=500,
        load_best_model_at_end=True,
        metric_for_best_model="accuracy",
    )

    # Trainer object
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        compute_metrics=compute_metrics
    )

    # Train the model
    trainer.train()

    # Save the model and tokenizer
    model.save_pretrained(model_output_path)
    tokenizer.save_pretrained(model_output_path)

    # Return the accuracy scores
    return accuracies


# Training the Model
Now, let's train the model.

In [40]:
# Paths for the data and model
data_path = r'C:\Users\Ali Riza Ercan\Desktop\Data Science\PassoAssist\PassoAssist\data\processed\cleaned_df.csv'
model_output_path = r'C:\Users\Ali Riza Ercan\Desktop\Data Science\PassoAssist\PassoAssist\data\models\sentiment\saribasmetehan_sentiment_model'

# Train the model and collect accuracy scores
accuracies = train_and_save_model(data_path, model_output_path)



  0%|          | 0/28 [00:00<?, ?it/s]

{'loss': 1.4026, 'grad_norm': 18.722679138183594, 'learning_rate': 3.2142857142857144e-05, 'epoch': 1.43}
{'loss': 0.163, 'grad_norm': 4.068999767303467, 'learning_rate': 1.4285714285714285e-05, 'epoch': 2.86}
{'train_runtime': 259.2748, 'train_samples_per_second': 1.543, 'train_steps_per_second': 0.108, 'train_loss': 0.5881467844758715, 'epoch': 4.0}


This code will train the model, collect accuracy scores at each evaluation step, and plot them using *matplotlib*.

# Testing the Model
Finally, we add a function to test the trained model with new input texts:

In [41]:
# Function to test the trained model
def test_model(model_path: str, text: str):
    """Load a saved model and test with a given input text, showing prediction probabilities."""

    # Load tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForSequenceClassification.from_pretrained(model_path)

    # Tokenize the input text
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=128)

    # Make prediction
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        predicted_probs = F.softmax(logits, dim=1)[0]
        predicted_class = torch.argmax(predicted_probs).item()

    # Map the predicted class to the sentiment label
    label_mapping = {1: 'olumlu', 0: 'notr', 2: 'olumsuz'}
    sentiment = label_mapping[predicted_class]
    probability = predicted_probs[predicted_class].item()

    print(f"Input: {text}")
    print(f"Predicted Sentiment: {sentiment} (Confidence: {probability:.2f})")

# Test the model with a sample sentence
test_model(model_output_path, "Bu bir deneme cümlesidir.")

Input: Bu bir deneme cümlesidir.
Predicted Sentiment: notr (Confidence: 1.00)


This function allows you to test the saved model with a new text input and outputs the predicted sentiment and the confidence level.