In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, TensorDataset
import torchsummary
import torchtext
from torchvision import datasets, transforms
import numpy as np
import matplotlib.pyplot as plt
from torchmetrics import Accuracy, Precision
from tqdm import tqdm 
import pandas as pd

import os, math

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [4]:
sentiment_mapping = {"positive": 0, "neutral": 1, "negative": 2}

train_df = pd.read_csv("data/tweet-sentiment-extraction/train.csv").drop(columns=['textID'])
train_df['text'] = train_df['text'].str.lower().fillna('')
train_df['selected_text'] = train_df['selected_text'].str.lower()
train_df['sentiment'] = train_df['sentiment'].apply(lambda x: sentiment_mapping[x])

test_df = pd.read_csv("data/tweet-sentiment-extraction/test.csv").drop(columns=['textID'])
test_df['text'] = test_df['text'].str.lower().fillna('')
test_df['sentiment'] = test_df['sentiment'].apply(lambda x: sentiment_mapping[x])




Tokenization


In [5]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
vocab_size = len(tokenizer)

def get_encoded_text(df, tokenizer, cache_path):
    # Check if the cache file exists
    if os.path.exists(cache_path):
        # Load the encoded text from cache
        encoded_data = torch.load(cache_path)
        print("Loaded encoded text from cache.")
        return encoded_data
    else:
        # Encode the text and save it to cache
        encoded_sentences = df['text'].apply(lambda x: tokenizer.encode_plus(
            x,
            add_special_tokens=True,       # Add '[CLS]' and '[SEP]'
            max_length=128,                # Pad & truncate all sentences to max length
            padding='max_length',          # Pad all sentences to max length
            truncation=True,               # Truncate long sentences to max length
            return_attention_mask=False,    # Return attention mask
            return_tensors='pt'            # Return pytorch tensors
        ))
        
        # Convert the list of encoded sentences to a dictionary of tensors
        input_ids = torch.cat([item['input_ids'] for item in encoded_sentences])
        
        # Save the tensors to cache
        encoded_data = input_ids
        torch.save(encoded_data, cache_path)
        print("Encoded text and saved to cache.")
        return encoded_data

# Example usage with train and test DataFrames
train_encoded = get_encoded_text(train_df, tokenizer, "train_encoded_sentences.pt")
test_encoded = get_encoded_text(test_df, tokenizer, "test_encoded_sentences.pt")

train_encoded

Loaded encoded text from cache.
Loaded encoded text from cache.


tensor([[  101,  1045,  1036,  ...,     0,     0,     0],
        [  101, 17111,  2080,  ...,     0,     0,     0],
        [  101,  2026,  5795,  ...,     0,     0,     0],
        ...,
        [  101,  8038,  2100,  ...,     0,     0,     0],
        [  101,  2021,  2009,  ...,     0,     0,     0],
        [  101,  2035,  2023,  ...,     0,     0,     0]])

In [6]:
# from minbpe import RegexTokenizer
# tokenizer = RegexTokenizer()
# tokenizer.load("toy.model")
# vocab_size = len(tokenizer.vocab)

# def get_encoded_text(df,tokenizer, cache_path):
#     # Check if the cache file exists
#     if os.path.exists(cache_path):
#         # Load the encoded text from cache
#         encoded_sentences = torch.load(cache_path)
#         print("Loaded encoded text from cache.")
#         return encoded_sentences
#     else:
#         # Encode the text and save it to cache
#         encoded_sentences = df['text'].apply(lambda x: tokenizer.encode(x))
#         torch.save(encoded_sentences, cache_path)
#         print("Encoded text and saved to cache.")
#         return encoded_sentences
#     return encoded_sentences

# train_encoded : pd.Series = get_encoded_text(train_df, tokenizer, "train_encoded_sentences.pt")
# test_encoded : pd.Series = get_encoded_text(test_df, tokenizer, "test_encoded_sentences.pt")


# class TextDataset(Dataset):
#     def __init__(self, encoded_texts: pd.Series, labels: pd.Series, max_seq_length: int):
#         self.max_seq_length = max_seq_length
#         self.encoded_texts = self.pad_and_truncate(encoded_texts)
#         self.labels = torch.tensor(labels.values, dtype=torch.long)

#     def pad_and_truncate(self, encoded_texts: pd.Series) -> torch.Tensor:
#         padded_texts = []
#         for encoded_text in encoded_texts:
#             if len(encoded_text) > self.max_seq_length:
#                 padded_text = encoded_text[:self.max_seq_length]
#             else:
#                 padded_text = encoded_text + [0] * (self.max_seq_length - len(encoded_text))
#             padded_texts.append(padded_text)
#         return torch.tensor(padded_texts, dtype=torch.long)

#     def __len__(self):
#         return len(self.encoded_texts)

#     def __getitem__(self, idx):
#         encoded_text = self.encoded_texts[idx]
#         label = self.labels[idx]
#         return encoded_text, label

### Transformer stuff

In [7]:
# Subclass an appropriate PyTorch class 
class PositionalEncoder(nn.Module):
    def __init__(self, d_model, max_length):
        super(PositionalEncoder, self).__init__()
        self.d_model = d_model
        self.max_length = max_length
        
        # Initialize the positional encoding matrix
        pe = torch.zeros(max_length, d_model)

        position = torch.arange(0, max_length, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2, dtype=torch.float) * -(math.log(10000.0) / d_model))
        
        # Calculate and assign position encodings to the matrix
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)
    
    # Update the embeddings tensor adding the positional encodings
    def forward(self, x):
        x = x + self.pe[:, :x.size(1)]
        return x

In [8]:
class MultiHeadAttention(nn.Module):
	def __init__(self, d_model: int, nums_heads: int):
		super().__init__()


		self.num_heads = nums_heads
		self.d_model = d_model
		self.head_dim = d_model // nums_heads

		self.query_linear = nn.Linear(d_model, d_model)
		self.key_linear = nn.Linear(d_model, d_model)
		self.value_linear = nn.Linear(d_model, d_model)
		self.output_linear = nn.Linear(d_model, d_model)

	def split_heads(self, x: torch.Tensor, batch_size: int):
		x = x.view(batch_size, -1, self.num_heads, self.head_dim)

		#rearrange for batched matmul
		return x.permute(0, 2, 1, 3).contiguous()\
			.view(batch_size * self.num_heads, -1, self.head_dim)
		
	def compute_attention(self, query: torch.Tensor, key: torch.Tensor, mask=None):
		scores = torch.matmul(query, key.transpose(-2, -1)) * self.d_model**-.5

		if mask is not None:
			scores = scores.masked_fill(mask == 0, float('-inf'))
		attention_weights = torch.softmax(scores, dim=-1)
		return attention_weights
	
	def forward(self, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, mask=None):
		batch_size = query.size(0)

		query = self.split_heads(self.query_linear(query), batch_size)
		key = self.split_heads(self.key_linear(key), batch_size)
		value = self.split_heads(self.value_linear(value), batch_size)

		attention_weights = self.compute_attention(query, key, mask)

		output = torch.matmul(attention_weights, value)

		output = output.view(batch_size, self.num_heads, -1, self.head_dim).permute(0, 2, 1, 3).contiguous().view(batch_size, -1, self.d_model)
		return self.output_linear(output)
	
class FeedForwardSubLayer(nn.Module):
	def __init__(self, d_model, d_ff):
		super().__init__()
		self.model = nn.Sequential(
			nn.Linear(d_model, d_ff),
			nn.ReLU(),
			nn.Linear(d_ff, d_model),
		)
	def forward(self, x):
		return self.model(x)

In [9]:
class EncoderLayer(nn.Module):
	def __init__(self, d_model, num_heads, d_ff, dropout) -> None:
		super().__init__()
		

		self.self_attention = MultiHeadAttention(d_model, num_heads)
		self.feedforward = FeedForwardSubLayer(d_model, d_ff)
		self.norm1 = nn.LayerNorm(d_model)
		self.norm2 = nn.LayerNorm(d_model)
		self.dropoutLayer = nn.Dropout(dropout)

	def forward(self, x, mask):
		outputs = self.self_attention(x, x, x)
		x = self.norm1(x + self.dropoutLayer(outputs))

		ff_outputs = self.feedforward(x)

		x = self.norm2(x + self.dropoutLayer(ff_outputs))
		return x
	

class TransformerEncoder(nn.Module):
	def __init__(self,
			vocab_size: int,
			d_model: int,
			num_layers: int,
			num_heads: int,
			d_ff: int,
			dropout: float,
			max_seq_length: int
		):
		super().__init__()

		self.embedding = nn.Embedding(vocab_size, d_model)
		self.positional_encoding = PositionalEncoder(d_model, max_seq_length)
		self.layers = nn.ModuleList([EncoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])

	def forward(self, x, mask):
		x = self.embedding(x)
		x = self.positional_encoding(x)
		for layer in self.layers:
			x = layer(x, mask)
		return x

In [43]:
class ClassifierHead(nn.Module):
    def __init__(self, d_model, num_classes):
        super(ClassifierHead, self).__init__()
        # Add linear layer for multiple-class classification
        self.fc = nn.Linear(d_model,  num_classes)

    def forward(self, x,):
        logits = self.fc(x[:, 0, :])
        return logits

تعليم بنية نموذج المحول علي معلومات تويتات و تصنيفها من حيث الشعور


In [45]:
num_classes = 3
vocab_size = len(tokenizer.vocab)
batch_size = 32
d_model = 256
num_heads = 32
num_layers = 3
d_ff = 4 * d_model
sequence_length = 128
dropout = 0.5

train_dataset = TensorDataset(train_encoded, torch.tensor(train_df['sentiment'].values))
test_dataset = TensorDataset(test_encoded, torch.tensor(test_df['sentiment'].values))

train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size)

encoder = TransformerEncoder(
	vocab_size=vocab_size,
	d_model=d_model,
	num_layers=num_layers,
	num_heads=num_heads,
	d_ff=d_ff,
	dropout=dropout,
	max_seq_length=sequence_length
	).to(device)
classifier = ClassifierHead(d_model, num_classes).to(device)

criterion = nn.CrossEntropyLoss()


In [42]:
accuracy_metric = Accuracy(task='multiclass', num_classes=num_classes).to(device)  # Ensure the metric is on the same device as your models
optimizer = torch.optim.Adam(list(encoder.parameters()) + list(classifier.parameters()), lr=0.0001)
# optimizer = torch.optim.Adam(list(encoder.parameters()), lr=0.0001)

epochs = 5

for epoch in range(epochs):
    # encoder.train()
    encoder.eval()
    classifier.train()
    epoch_loss = 0
    
    accuracy_metric.reset()
    # Training phase
    for sequences, labels in tqdm(train_dataloader):
        sequences, labels = sequences.to(device), labels.to(device)
        
        encoded_sequences = encoder(sequences, mask=mask)
        logits = classifier(encoded_sequences)
        loss = criterion(logits, labels)
        
        # Backward pass and optimization step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        accuracy_metric.update(logits, labels)
        epoch_loss += loss.item()
    
    train_accuracy = accuracy_metric.compute()
    accuracy_metric.reset()

    # Validation phase
    encoder.eval()
    classifier.eval()
    val_loss = 0
    
    with torch.no_grad():
        for sequences, labels in test_dataloader:
            sequences, labels = sequences.to(device), labels.to(device)
            
            encoded_sequences = encoder(sequences, mask=mask)
            logits = classifier(encoded_sequences)
            loss = criterion(logits, labels)
            
            val_loss += loss.item()
            accuracy_metric(logits, labels)
    val_accuracy = accuracy_metric.compute()
    val_loss /= len(test_dataloader)
    
    print(f"Epoch {epoch}, Training LossZ = {epoch_loss / len(train_dataloader)}, Validation Loss = {val_loss}, Validation Accuracy = {val_accuracy}")
    accuracy_metric.reset()


100%|██████████| 859/859 [01:00<00:00, 14.09it/s]


Epoch 0, Training LossZ = 1.0885120265975126, Validation Loss = 0.9857209856445724, Validation Accuracy = 0.5121675133705139


100%|██████████| 859/859 [00:59<00:00, 14.52it/s]


Epoch 1, Training LossZ = 0.8470303180678482, Validation Loss = 0.8310850676115569, Validation Accuracy = 0.6129032373428345


100%|██████████| 859/859 [00:59<00:00, 14.53it/s]


Epoch 2, Training LossZ = 0.7152467985855409, Validation Loss = 0.7611070731738666, Validation Accuracy = 0.6598755121231079


100%|██████████| 859/859 [00:59<00:00, 14.55it/s]


Epoch 3, Training LossZ = 0.6059964245772889, Validation Loss = 0.7498277490203445, Validation Accuracy = 0.6743067502975464


100%|██████████| 859/859 [00:59<00:00, 14.55it/s]


Epoch 4, Training LossZ = 0.5183625632378774, Validation Loss = 0.8006908936543508, Validation Accuracy = 0.6717600226402283


In [30]:
from torchmetrics import ConfusionMatrix


confusion_matrix_metric = ConfusionMatrix(task='multiclass', num_classes=num_classes).to(device)

# Validation phase
encoder.eval()
classifier.eval()
val_loss = 0
all_preds = []
all_labels = []
misclassified_examples = []
test_dataloader = DataLoader(test_dataset,batch_size=1000)

with torch.no_grad():
    sequences, labels = next(iter(test_dataloader))
    sequences, labels = sequences.to(device), labels.to(device)
    
    encoded_sequences = encoder(sequences, mask=mask)
    logits = classifier(encoded_sequences)
    loss = F.cross_entropy(logits, labels, reduce=False)
    
    val_loss += loss.mean().item()
    accuracy_metric(logits, labels)
    
    preds = torch.argmax(logits, dim=1)
    all_preds.append(preds)
    all_labels.append(labels)

    _, max_indices = torch.sort(loss, descending=True)
    misclassified_examples.extend(zip(sequences[max_indices].tolist(), labels[max_indices], preds[max_indices].tolist(), loss[max_indices].tolist()))

val_accuracy = accuracy_metric.compute()
val_loss /= len(test_dataloader)

# Stack all predictions and labels
all_preds = torch.cat(all_preds)
all_labels = torch.cat(all_labels)

print(f"Epoch {epoch}, Training Loss = {epoch_loss / len(train_dataloader)}, Validation Loss = {val_loss}, Validation Accuracy = {val_accuracy}")

# Compute confusion matrix
confusion_matrix = confusion_matrix_metric(all_preds, all_labels)

# Define labels
labels = ['Positive', 'Neutral', 'Negative']

# Convert confusion matrix to pandas DataFrame for better readability
cm_df = pd.DataFrame(confusion_matrix.cpu().numpy(), index=labels, columns=labels)
print("Confusion Matrix:")
print(cm_df)

# Sort misclassified examples by loss values

# Get top n misclassified examples
top_n = 10  # Change to the desired value
top_n_misclassified = misclassified_examples[:top_n]

# Print top n misclassified examples
print(f"\nTop {top_n} Misclassified Examples (ordered by loss):")
for sequence, true_label, pred_label, loss_value in top_n_misclassified:
    print(f"Sequence: {tokenizer.decode(sequence, True)}, True Label: {labels[true_label]}, Predicted Label: {labels[pred_label]}, Loss: {loss_value}")

# Reset metrics for next epoch
accuracy_metric.reset()
confusion_matrix_metric.reset()




Epoch 4, Training Loss = 0.5701334268826128, Validation Loss = 0.17670556902885437, Validation Accuracy = 0.7080000042915344
Confusion Matrix:
          Positive  Neutral  Negative
Positive       210       99        18
Neutral         29      313        58
Negative         8       80       185

Top 10 Misclassified Examples (ordered by loss):
Sequence: oh! i ate pizza last night too! i stupidly feel closer to you somehow!, True Label: Positive, Predicted Label: Negative, Loss: 5.4190545082092285
Sequence: getting my phone back this week yeeeewww, True Label: Positive, Predicted Label: Neutral, Loss: 5.2540602684021
Sequence: prom is so over rated!! irritated... going to bed goodnight, True Label: Negative, Predicted Label: Positive, Loss: 4.931923866271973
Sequence: the hotel should be thankful, True Label: Positive, Predicted Label: Neutral, Loss: 4.784107208251953
Sequence: _ other excellent analogy, True Label: Positive, Predicted Label: Neutral, Loss: 4.611337661743164
Sequence: th