In [2]:
%load_ext autoreload
%autoreload 2

# Use HuggingFace's datasets library to access the Emotion dataset
from datasets import load_dataset
import numpy as np

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
from datasets import load_dataset
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

# Function to load datasets using HuggingFace's 'datasets' library
def load_emotion_data(cache_dir="./data_cache"):
    """Load the train, validation and test splits of the TweetEval "emotion" config.

    Args:
        cache_dir: Directory handed to ``load_dataset`` as its local cache.
            Defaults to ``"./data_cache"``, the value that was previously
            hard-coded, so existing zero-argument calls behave the same.

    Returns:
        A ``(train_dataset, val_dataset, test_dataset)`` tuple, in that order.
    """
    # One request per split; the loop replaces three copy-pasted calls.
    splits = {
        split_name: load_dataset(
            "tweet_eval", name="emotion", split=split_name, cache_dir=cache_dir
        )
        for split_name in ("train", "validation", "test")
    }
    train_dataset = splits["train"]
    val_dataset = splits["validation"]
    test_dataset = splits["test"]

    print(f"Training dataset loaded with {len(train_dataset)} instances.")
    print(f"Validation dataset loaded with {len(val_dataset)} instances.")
    print(f"Test dataset loaded with {len(test_dataset)} instances.")

    return train_dataset, val_dataset, test_dataset

# Fetch the three dataset splits
train_data, val_data, test_data = load_emotion_data()

# Separate the tweet texts (inputs) from the labels (targets) for every split
X_train, X_val, X_test = train_data['text'], val_data['text'], test_data['text']
y_train, y_val, y_test = train_data['label'], val_data['label'], test_data['label']

# Baseline model: word counts (English stop words removed) -> multinomial Naive Bayes
pipeline = Pipeline(steps=[
    ('vect', CountVectorizer(stop_words='english')),
    ('clf', MultinomialNB()),
])

# Fit on the training split only
pipeline.fit(X_train, y_train)

# Validation split: predict, then report plain accuracy
y_val_pred = pipeline.predict(X_val)
val_accuracy = accuracy_score(y_true=y_val, y_pred=y_val_pred)
print(f"Validation Accuracy: {val_accuracy}")

# Test split: accuracy plus the per-class precision/recall/F1 breakdown
y_test_pred = pipeline.predict(X_test)
test_accuracy = accuracy_score(y_true=y_test, y_pred=y_test_pred)
print(f"Test Accuracy: {test_accuracy}")
print(classification_report(y_true=y_test, y_pred=y_test_pred))


Downloading readme:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]


Downloading data:   0%|                                                                     | 0.00/233k [00:00<?, ?B/s][A
Downloading data: 100%|██████████████████████████████████████████████████████████████| 233k/233k [00:00<00:00, 452kB/s][A

Downloading data:   0%|                                                                     | 0.00/105k [00:00<?, ?B/s][A
Downloading data: 100%|██████████████████████████████████████████████████████████████| 105k/105k [00:00<00:00, 598kB/s][A

Downloading data:   0%|                                                                    | 0.00/28.6k [00:00<?, ?B/s][A
Downloading data: 100%|████████████████████████████████████████████████████████████| 28.6k/28.6k [00:00<00:00, 205kB/s][A


Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/3257 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1421 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/374 [00:00<?, ? examples/s]

Training dataset loaded with 3257 instances.
Validation dataset loaded with 374 instances.
Test dataset loaded with 1421 instances.
Validation Accuracy: 0.6524064171122995
Test Accuracy: 0.6537649542575651
              precision    recall  f1-score   support

           0       0.62      0.89      0.73       558
           1       0.75      0.49      0.60       358
           2       0.70      0.15      0.25       123
           3       0.66      0.61      0.63       382

    accuracy                           0.65      1421
   macro avg       0.68      0.54      0.55      1421
weighted avg       0.67      0.65      0.63      1421



In [5]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from torch.nn.utils.rnn import pad_sequence

# Use the GPU when CUDA is available, otherwise stay on the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Load every split of the dataset in one call, then bind each split to its own name
dataset = load_dataset("tweet_eval", "emotion")
train_dataset = dataset['train']
val_dataset = dataset['validation']
test_dataset = dataset['test']

# Tokenization and Encoding the data
# Tokenizers already fetched through torch.hub, keyed by pretrained model name.
_TOKENIZER_CACHE = {}


def _get_tokenizer(model_name='bert-base-uncased'):
    """Return the torch.hub tokenizer for ``model_name``, loading it at most once."""
    if model_name not in _TOKENIZER_CACHE:
        _TOKENIZER_CACHE[model_name] = torch.hub.load(
            'huggingface/pytorch-transformers', 'tokenizer', model_name
        )
    return _TOKENIZER_CACHE[model_name]


def tokenize_and_encode(sentences):
    """Encode each sentence into a 1-D tensor of token ids (special tokens included).

    The tokenizer is fetched through ``torch.hub`` on the first call and reused
    afterwards, instead of being re-loaded for every split.

    NOTE: the recorded run of this notebook failed inside ``torch.hub.load``
    with "Missing dependencies: sentencepiece, sacremoses"; those packages
    have to be installed in the kernel's environment for the load to succeed.

    Args:
        sentences: Iterable of strings to encode.

    Returns:
        A list with one ``torch.long`` tensor per input sentence, in input order.
    """
    tokenizer = _get_tokenizer()
    return [
        # Explicit dtype: an empty id list would otherwise become a float tensor.
        torch.tensor(tokenizer.encode(sentence, add_special_tokens=True), dtype=torch.long)
        for sentence in sentences
    ]

# Dataset Class
class EmotionDataset(Dataset):
    """Map-style dataset that pairs each encoded text with its label.

    Args:
        texts: Raw sentences; they are encoded once, up front, by
            ``tokenize_and_encode``.
        labels: One label per text, in the same order.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels):
        self.labels = labels
        # tokenize_and_encode already returns tensors; re-wrapping each one in
        # torch.tensor() only made a copy and triggered a UserWarning.
        self.texts = tokenize_and_encode(texts)
        if len(self.texts) != len(self.labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(self.texts)} and {len(self.labels)}"
            )

    def __len__(self):
        """Number of (text, label) pairs."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the ``(encoded_text, label)`` pair stored at position ``idx``."""
        return self.texts[idx], self.labels[idx]

# Create custom datasets
train_data = EmotionDataset(train_dataset['text'], train_dataset['label'])
val_data = EmotionDataset(val_dataset['text'], val_dataset['label'])
test_data = EmotionDataset(test_dataset['text'], test_dataset['label'])


def collate_batch(batch, padding_value=0):
    """Turn a list of ``(token_ids, label)`` samples into two batch tensors.

    The previous lambda passed the list of tuples straight to ``pad_sequence``,
    which only accepts tensors, and it never returned the labels even though the
    training loop unpacks ``(text, labels)`` from every batch.

    Args:
        batch: List of ``(1-D token-id tensor, label)`` pairs from the dataset.
        padding_value: Id used to right-pad shorter sequences. The default of 0
            matches the ``padding_idx=0`` used by the model's embedding layer.

    Returns:
        ``(padded_texts, labels)`` where ``padded_texts`` has shape
        ``(batch, longest_sequence)`` and ``labels`` is a ``torch.long`` tensor
        of shape ``(batch,)``.
    """
    texts, labels = zip(*batch)
    padded_texts = pad_sequence(list(texts), batch_first=True, padding_value=padding_value)
    return padded_texts, torch.tensor(labels, dtype=torch.long)


# Dataloader
# A named, module-level collate function (unlike a lambda) can also be pickled
# if worker processes are enabled later.
train_loader = DataLoader(train_data, batch_size=32, shuffle=True, collate_fn=collate_batch)
val_loader = DataLoader(val_data, batch_size=32, shuffle=False, collate_fn=collate_batch)
test_loader = DataLoader(test_data, batch_size=32, shuffle=False, collate_fn=collate_batch)

# Model definition
class LSTMClassifier(nn.Module):
    """Embedding -> single-layer LSTM -> linear layer over token-id sequences.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each token embedding.
        hidden_dim: Size of the LSTM hidden state.
        output_dim: Number of logits produced per sequence.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        # Index 0 is the padding id: its embedding stays at zero and gets no gradient.
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, text):
        """Return class logits for a batch of token-id sequences.

        Args:
            text: Integer tensor of shape ``(batch, seq_len)``, or ``(seq_len,)``
                for a single unbatched sequence. Padding is assumed to use the
                embedding's ``padding_idx`` and to sit at the end of each row.

        Returns:
            Logits of shape ``(batch, output_dim)``, or ``(output_dim,)`` for
            unbatched input.
        """
        unbatched = text.dim() == 1
        if unbatched:
            text = text.unsqueeze(0)

        embedded = self.embedding(text)

        # True (unpadded) length of every row. Clamp to 1 because packing
        # rejects zero-length sequences; lengths must live on the CPU.
        lengths = (text != self.embedding.padding_idx).sum(dim=1).clamp(min=1).cpu()

        # Packing makes the LSTM stop at each sequence's last real token, so the
        # final hidden state no longer depends on how much padding the batch has.
        packed = nn.utils.rnn.pack_padded_sequence(
            embedded, lengths, batch_first=True, enforce_sorted=False
        )
        _, (hidden, _) = self.lstm(packed)

        # hidden is (num_layers, batch, hidden_dim); take the last (only) layer.
        logits = self.fc(hidden[-1])
        return logits.squeeze(0) if unbatched else logits

# Hyperparameters for the classifier
vocab_size = 30522  # Size of BERT's vocabulary
embedding_dim, hidden_dim = 100, 256
output_dim = 4  # Number of emotion classes

# Build the network, then move it onto the selected device
model = LSTMClassifier(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    hidden_dim=hidden_dim,
    output_dim=output_dim,
)
model = model.to(device)

# Cross-entropy loss, minimized with Adam
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training function
def train(model, iterator, optimizer, criterion):
    """Run one training epoch and return the mean per-batch loss.

    Args:
        model: Module to train; it is switched to training mode here.
        iterator: Iterable yielding ``(text, labels)`` tensor batches. It does
            not need to support ``len()``.
        optimizer: Optimizer stepped once per batch.
        criterion: Loss callable invoked as ``criterion(predictions, labels)``.

    Returns:
        The sum of the batch losses divided by the number of batches seen, or
        ``0.0`` when ``iterator`` yields no batches.
    """
    model.train()

    # Send batches to wherever the model's weights live rather than depending
    # on a notebook-level ``device`` variable being defined.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else None

    total_loss = 0.0
    num_batches = 0
    for text, labels in iterator:
        if target_device is not None:
            text, labels = text.to(target_device), labels.to(target_device)
        optimizer.zero_grad()
        predictions = model(text)
        loss = criterion(predictions, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        num_batches += 1

    # Guard against an empty iterator instead of dividing by zero.
    if num_batches == 0:
        return 0.0
    return total_loss / num_batches

# Run the optimizer over the training set for a fixed number of passes
num_epochs = 10  # Number of epochs
for epoch in range(num_epochs):
    # One full pass over train_loader; `loss` is the mean batch loss for that pass
    loss = train(model, train_loader, optimizer, criterion)
    print(f'Epoch: {epoch+1}, Loss: {loss:.4f}')

# TODO: add an evaluation function (and related utilities) to assess model performance on the validation and test sets


Downloading: "https://github.com/huggingface/pytorch-transformers/zipball/main" to C:\Users\ayofa/.cache\torch\hub\main.zip


RuntimeError: Missing dependencies: sentencepiece, sacremoses