In [1]:
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [2]:
# Try reading the CSV file with a different encoding
df = pd.read_csv('dataset_of text classification.csv', encoding='ISO-8859-1')

# Display the DataFrame
df.head()


Unnamed: 0,UserName,ScreenName,Location,TweetAt,Text,Sentiment
0,3799,48751,London,16-03-2020,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral
1,3800,48752,UK,16-03-2020,advice Talk to your neighbours family to excha...,Positive
2,3801,48753,Vagabonds,16-03-2020,Coronavirus Australia: Woolworths to give elde...,Positive
3,3802,48754,,16-03-2020,My food stock is not the only one which is emp...,Positive
4,3803,48755,,16-03-2020,"Me, ready to go at supermarket during the #COV...",Extremely Negative


In [3]:
df.shape

(41157, 6)

In [4]:
df.drop_duplicates(inplace=True)

In [5]:
df['Text'] = df['Text'].apply(lambda x: x.lower()) #for converting it to lower case

In [6]:
#Removing stop words and Special Characters
import nltk
nltk.download('punkt')

nltk.download('stopwords')

stop_words = set(stopwords.words('english'))
df['Text'] = df['Text'].apply(lambda x: ' '.join ([word for word in word_tokenize(x) if word.isalnum() and word not in stop_words]))



[nltk_data] Downloading package punkt to
[nltk_data]     /Users/ralphonseraj/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ralphonseraj/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Step 2 - Model Architecture and Set-up

In [7]:
import torch
import torch.nn as nn
from transformers import GPT2Model, GPT2Config

class CustomGPT2(nn.Module):
    def __init__(self):
        super(CustomGPT2, self).__init__()
        config = GPT2Config.from_pretrained('gpt2')
        self.gpt2 = GPT2Model(config)

    def forward(self, input_ids, attention_mask):
        return self.gpt2(input_ids=input_ids, attention_mask=attention_mask)
    
# Define the CustomGPT2 model
# class CustomGPT2(nn.Module):
#     def __init__(self, num_labels):
#         super(CustomGPT2, self).__init__()
#         self.gpt2 = GPT2Model.from_pretrained('gpt2')
#         self.dropout = nn.Dropout(0.1)
#         self.classifier = nn.Linear(self.gpt2.config.hidden_size, num_labels)

#     def forward(self, input_ids, attention_mask=None):
#         outputs = self.gpt2(input_ids=input_ids, attention_mask=attention_mask)
#         sequence_output = outputs.last_hidden_state #whatever outputs your gpt2 is giving that you are picking the last hidden state.
#         pooled_output = sequence_output[:, -1, :]
#         pooled_output = self.dropout(pooled_output)
#         logits = self.classifier(pooled_output)
#         return logits

Step 3 - Training Script Development

In [8]:
# # Define a mapping from sentiment labels to numerical labels
# sentiment_mapping = {'positive': 0, 'neutral': 1, 'negative': 2}

# Check unique values in the 'Sentiment' column
unique_sentiments = df['Sentiment'].unique()
print(unique_sentiments)

['Neutral' 'Positive' 'Extremely Negative' 'Negative' 'Extremely Positive']


In [9]:
from torch.utils.data import DataLoader, TensorDataset
from transformers import GPT2Config, GPT2Model, GPT2Tokenizer
from torch.nn.utils.rnn import pad_sequence


# Define a mapping from sentiment labels to numerical labels
sentiment_mapping = {
    'Extremely Negative': 0,
    'Negative': 1,
    'Neutral': 2,
    'Positive': 3,
    'Extremely Positive': 4
}

# Tokenize the text using the GPT-2 tokenizer and cast to torch.LongTensor
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenized_text = df['Text'].apply(lambda x: tokenizer.encode(x, add_special_tokens=True))
max_length = 64
padded_text = pad_sequence([torch.tensor(seq, dtype=torch.long) for seq in tokenized_text], batch_first=True, padding_value=0)[:, :max_length]

In [10]:
# # Assuming sentiment_mapping is a dictionary mapping sentiment labels to numerical labels
# sentiment_mapping = {'Negative': 0, 'Neutral': 1, 'Positive': 2}


# Convert sentiment labels to numerical labels and cast to torch.LongTensor
labels_tensor = torch.tensor(df['Sentiment'].map(sentiment_mapping).values, dtype=torch.long)

# Create a PyTorch dataset and DataLoader
dataset = TensorDataset(padded_text, labels_tensor)
data_loader = DataLoader(dataset, batch_size=32, shuffle=True)

# Define your CustomGPT2 model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = CustomGPT2().to(device)


In [11]:
# Define the optimizer and criterion
import torch.optim as optim
import logging
from torch.utils.tensorboard import SummaryWriter

# Initialize logging and TensorBoard writer
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
writer = SummaryWriter()

optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

checkpoint_interval = 1  # For example, save the model every epoch

# Training loop with accuracy calculation and checkpointing
num_epochs = 2
global_step = 0

for epoch in range(num_epochs):  # Loop over the dataset multiple times, defined by num_epochs.
    model.train()  # Set the model to training mode. This enables dropout and batch normalization layers.
    total_loss = 0  # Initialize total loss for the epoch.
    correct_predictions = 0  # Initialize the count of correct predictions for accuracy calculation.
    total_predictions = 0  # Initialize the total number of predictions made.
    
    for batch_idx, (input_ids, labels) in enumerate(data_loader):  # Iterate over batches of data.
        input_ids, labels = input_ids.to(device), labels.to(device)  # Move input and labels to the configured device (GPU/CPU).

        optimizer.zero_grad()  # Clear the gradients before computing them. Prevents accumulation from previous iterations.
        outputs = model(input_ids, attention_mask=None)  # Forward pass through the model. Attention mask is optional here.
        
        # Extract logits from model outputs. Assumes outputs are structured with logits as the first element.
        logits = outputs[0][:, -1, :]  # Get logits for the last token positions across all examples in the batch.

        loss = criterion(logits, labels)  # Compute the loss between model predictions and true labels.
        loss.backward()  # Backpropagate the error through the model.
        optimizer.step()  # Update model parameters based on gradients.

        total_loss += loss.item()  # Accumulate the loss.

        _, predicted_labels = torch.max(logits, 1)  # Get the predicted labels by finding the max logit value across columns.
        correct_predictions += (predicted_labels == labels).sum().item()  # Count correct predictions.
        total_predictions += labels.size(0)  # Update the total number of predictions made.

        if batch_idx % 10 == 0:  # Optionally log information every 10 batches.
            logger.info(f'Epoch {epoch}, Batch {batch_idx}, Loss: {loss.item()}')  # Log current epoch, batch, and loss.

    epoch_loss = total_loss / len(data_loader)  # Calculate average loss for the epoch.
    epoch_accuracy = correct_predictions / total_predictions  # Calculate accuracy for the epoch.

    logger.info(f'Epoch {epoch}, Loss: {epoch_loss}, Accuracy: {epoch_accuracy}')  # Log the epoch's average loss and accuracy.
    writer.add_scalar('Loss', epoch_loss, epoch)  # Write the epoch loss to TensorBoard.
    writer.add_scalar('Accuracy', epoch_accuracy, epoch)  # Write the epoch accuracy to TensorBoard.

    # Checkpointing: save the model state_dict every `checkpoint_interval` epochs.
    if epoch % checkpoint_interval == 0:
        checkpoint_path = f'model_checkpoint_epoch_{epoch}.pth'  # Define the checkpoint file path.
        torch.save(model.state_dict(), checkpoint_path)  # Save the model's state_dict.
        logger.info(f"Saved checkpoint to '{checkpoint_path}'.")  # Log that the model was successfully saved.

    global_step += len(data_loader)  # Update the global step by the number of batches processed.

writer.close()  # Close the TensorBoard writer.


INFO:__main__:Epoch 0, Batch 0, Loss: 7.358343601226807
INFO:__main__:Epoch 0, Batch 10, Loss: 1.8029873371124268
INFO:__main__:Epoch 0, Batch 20, Loss: 1.8454762697219849
INFO:__main__:Epoch 0, Batch 30, Loss: 1.7500286102294922
INFO:__main__:Epoch 0, Batch 40, Loss: 1.844016194343567
INFO:__main__:Epoch 0, Batch 50, Loss: 1.9717369079589844
INFO:__main__:Epoch 0, Batch 60, Loss: 1.6280494928359985
INFO:__main__:Epoch 0, Batch 70, Loss: 1.7735522985458374
INFO:__main__:Epoch 0, Batch 80, Loss: 1.9819430112838745
INFO:__main__:Epoch 0, Batch 90, Loss: 1.6509259939193726
INFO:__main__:Epoch 0, Batch 100, Loss: 1.6819223165512085
INFO:__main__:Epoch 0, Batch 110, Loss: 1.579703688621521
INFO:__main__:Epoch 0, Batch 120, Loss: 1.7213611602783203
INFO:__main__:Epoch 0, Batch 130, Loss: 1.682446002960205
INFO:__main__:Epoch 0, Batch 140, Loss: 1.6632447242736816
INFO:__main__:Epoch 0, Batch 150, Loss: 1.5399305820465088
INFO:__main__:Epoch 0, Batch 160, Loss: 1.6873606443405151
INFO:__main_

Step 4. Monitoring and Logging:


In [12]:
# import logging

# logging.basicConfig(level=logging.INFO)
# logger = logging.getLogger(__name__)

# # Inside the training loop
# logger.info(f'Epoch {epoch}, Batch {batch}, Loss: {loss.item()}, Accuracy: {accuracy.item()}')
# from torch.utils.tensorboard import SummaryWriter

# # Create a TensorBoard writer
# writer = SummaryWriter()

# # Inside the training loop
# writer.add_scalar('Loss', loss.item(), global_step)
# writer.add_scalar('Accuracy', accuracy.item(), global_step)

Step 5: Utilizing GPU/TPU resources:

In [13]:
#Using Tesorflow

import tensorflow as tf

# Check GPU availability
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

# Explicitly specify GPU device
physical_devices = tf.config.list_physical_devices('GPU')
tf.config.experimental.set_memory_growth(physical_devices[0], True)

Num GPUs Available:  1


Step 6 : Regular Checkpoints

In [14]:
# sample_texts = ['This product has been great in my experience!', 'I really did not like this movie at all.']
# predictions = [predict(text) for text in sample_texts]
# for text, pred in zip(sample_texts, predictions):
#     logger.info(f'Text: {text}, Predicted Sentiment: {pred}')

Step 7 : Ethical and Quality control in Training

In [15]:
#Sample code for bias monitoring and data Quality control
#Implement your own logic and bias monitoring for the code

Step 8: Documentation

In [16]:
#Sample code for Code Documentation