# Multi-Column Sentiment-Analysis:

In [1]:
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import Dataset, DataLoader
import torch
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Initialize the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Load the dataset
excel_file = 'expanded_data.xlsx'
df = pd.read_excel(excel_file)

In [3]:
# Fill NaN values in relevant columns with empty strings
df['Observation Name'] = df['Observation Name'].fillna('')
df['Description'] = df['Description'].fillna('')
df['RPRemarks'] = df['RPRemarks'].fillna('')

# Map sentiment labels
label_mapping = {'Positive Zone': 2, 'Neutral Zone': 1, 'Negative Zone': 0}
df['labels'] = df['Region'].map(label_mapping) 

In [4]:
df.head()

Unnamed: 0,ActDate,ActNo,ObservationID,Observation Name,CategoryID,Category,SubCategoryID,SubCatName,DepartID,DeptName,Description,Region,RPRemarks,UnitCode,Status,labels
0,2024-01-01,1001,,Example Observation,,Category A,,SubCategory X,,Department 1,"Cooking microwave pizzas, yummy",Positive Zone,Sample remark,Sample Unit,Active,2
1,2024-01-01,1001,,Test Observation,,Category A,,SubCategory X,,Department 1,Any plans of allowing sub tasks to show up in ...,Neutral Zone,Sample remark,Sample Unit,Active,1
2,2024-01-01,1001,,Detailed Sample Observation,,Category A,,SubCategory X,,Department 1,"I love the humor, I just reworded it. Like sa...",Positive Zone,Sample remark,Sample Unit,Active,2
3,2024-01-01,1001,,Example Observation,,Category B,,SubCategory Y,,Department 2,naw idk what ur talkin about,Negative Zone,Sample remark,Sample Unit,Active,0
4,2024-01-01,1001,,Test Observation,,Category B,,SubCategory Y,,Department 2,That sucks to hear. I hate days like that,Negative Zone,Sample remark,Sample Unit,Active,0


In [5]:
# Tokenize each column separately
tokens_obs = tokenizer(df['Observation Name'].tolist(), padding=True, truncation=True, return_tensors='pt')
tokens_desc = tokenizer(df['Description'].tolist(), padding=True, truncation=True, return_tensors='pt')
tokens_rpremarks = tokenizer(df['RPRemarks'].tolist(), padding=True, truncation=True, return_tensors='pt')

In [6]:
# Custom Dataset class for handling multiple text columns
class MultiColumnSentimentDataset(Dataset):
    def __init__(self, encodings_obs, encodings_desc, encodings_rpremarks, labels):
        self.encodings_obs = encodings_obs
        self.encodings_desc = encodings_desc
        self.encodings_rpremarks = encodings_rpremarks
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {
            'input_ids_obs': self.encodings_obs['input_ids'][idx],
            'attention_mask_obs': self.encodings_obs['attention_mask'][idx],
            'input_ids_desc': self.encodings_desc['input_ids'][idx],
            'attention_mask_desc': self.encodings_desc['attention_mask'][idx],
            'input_ids_rpremarks': self.encodings_rpremarks['input_ids'][idx],
            'attention_mask_rpremarks': self.encodings_rpremarks['attention_mask'][idx],
            'labels': torch.tensor(self.labels[idx])
        }
        return item

In [7]:
# Create dataset and dataloader
dataset = MultiColumnSentimentDataset(tokens_obs, tokens_desc, tokens_rpremarks, df['labels'].tolist())
train_loader = DataLoader(dataset, batch_size=2, shuffle=True)

# Initialize model and optimizer
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)
optimizer = AdamW(model.parameters(), lr=2e-5)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# Set model to training mode and move to device (if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.train()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [9]:
# Custom forward pass for handling multiple columns
for batch in train_loader:
    # Move inputs to device
    input_ids_obs = batch['input_ids_obs'].to(device)
    attention_mask_obs = batch['attention_mask_obs'].to(device)
    input_ids_desc = batch['input_ids_desc'].to(device)
    attention_mask_desc = batch['attention_mask_desc'].to(device)
    input_ids_rpremarks = batch['input_ids_rpremarks'].to(device)
    attention_mask_rpremarks = batch['attention_mask_rpremarks'].to(device)
    labels = batch['labels'].to(device)
    
    # Forward pass for each column
    outputs_obs = model(input_ids=input_ids_obs, attention_mask=attention_mask_obs).logits
    outputs_desc = model(input_ids=input_ids_desc, attention_mask=attention_mask_desc).logits
    outputs_rpremarks = model(input_ids=input_ids_rpremarks, attention_mask=attention_mask_rpremarks).logits
    
    # Aggregate outputs (here we use mean, but other techniques like concatenation or weighted sum can also be tried)
    outputs = (outputs_obs + outputs_desc + outputs_rpremarks) / 3
    
    # Calculate loss (cross entropy)
    loss = torch.nn.CrossEntropyLoss()(outputs, labels)
    
    # Backward pass and optimization
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

In [10]:
# Example code snippet to monitor training loss
num_epochs = 3
for epoch in range(num_epochs):
    total_loss = 0
    for batch in train_loader:
        
        # Forward and backward pass
        outputs_obs = model(input_ids=input_ids_obs, attention_mask=attention_mask_obs).logits
        outputs_desc = model(input_ids=input_ids_desc, attention_mask=attention_mask_desc).logits
        outputs_rpremarks = model(input_ids=input_ids_rpremarks, attention_mask=attention_mask_rpremarks).logits
        outputs = (outputs_obs + outputs_desc + outputs_rpremarks) / 3
        loss = torch.nn.CrossEntropyLoss()(outputs, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss/len(train_loader)}")

Epoch 1/3, Loss: 0.5504085719585419
Epoch 2/3, Loss: 0.12391160527865092
Epoch 3/3, Loss: 0.03712490126490593


In [11]:
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Subset

# Step 1: Split the dataset into training and validation sets
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# Step 2: Tokenize each column separately for both train and validation sets
train_tokens_obs = tokenizer(train_df['Observation Name'].tolist(), padding=True, truncation=True, return_tensors='pt')
train_tokens_desc = tokenizer(train_df['Description'].tolist(), padding=True, truncation=True, return_tensors='pt')
train_tokens_rpremarks = tokenizer(train_df['RPRemarks'].tolist(), padding=True, truncation=True, return_tensors='pt')

val_tokens_obs = tokenizer(val_df['Observation Name'].tolist(), padding=True, truncation=True, return_tensors='pt')
val_tokens_desc = tokenizer(val_df['Description'].tolist(), padding=True, truncation=True, return_tensors='pt')
val_tokens_rpremarks = tokenizer(val_df['RPRemarks'].tolist(), padding=True, truncation=True, return_tensors='pt')

# Step 3: Define the training and validation datasets
train_dataset = MultiColumnSentimentDataset(train_tokens_obs, train_tokens_desc, train_tokens_rpremarks, train_df['labels'].tolist())
val_dataset = MultiColumnSentimentDataset(val_tokens_obs, val_tokens_desc, val_tokens_rpremarks, val_df['labels'].tolist())

# Step 4: Create DataLoaders for both training and validation sets
train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=2, shuffle=False)

In [12]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

model.eval()  # Set model to evaluation mode
predictions, true_labels = [], []

with torch.no_grad():  # Disable gradient computation for inference
    for batch in val_loader:
        input_ids_obs = batch['input_ids_obs'].to(device)
        attention_mask_obs = batch['attention_mask_obs'].to(device)
        input_ids_desc = batch['input_ids_desc'].to(device)
        attention_mask_desc = batch['attention_mask_desc'].to(device)
        input_ids_rpremarks = batch['input_ids_rpremarks'].to(device)
        attention_mask_rpremarks = batch['attention_mask_rpremarks'].to(device)
        labels = batch['labels'].to(device)
        
        outputs_obs = model(input_ids=input_ids_obs, attention_mask=attention_mask_obs).logits
        outputs_desc = model(input_ids=input_ids_desc, attention_mask=attention_mask_desc).logits
        outputs_rpremarks = model(input_ids=input_ids_rpremarks, attention_mask=attention_mask_rpremarks).logits
        outputs = (outputs_obs + outputs_desc + outputs_rpremarks) / 3
        
        preds = torch.argmax(outputs, dim=1).cpu().numpy()
        predictions.extend(preds)
        true_labels.extend(labels.cpu().numpy())

# Calculate accuracy and other metrics
accuracy = accuracy_score(true_labels, predictions)
precision, recall, f1, _ = precision_recall_fscore_support(true_labels, predictions, average='weighted')
print(f"Validation Accuracy: {accuracy}")
print(f"Precision: {precision}, Recall: {recall}, F1 Score: {f1}")

Validation Accuracy: 0.8333333333333334
Precision: 0.6944444444444445, Recall: 0.8333333333333334, F1 Score: 0.7575757575757575


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [13]:
model.eval()  # Set model to evaluation mode
sample_data = {
    'Observation Name': "Sample observation text",
    'Description': "Sample description text",
    'RPRemarks': "Sample remarks text"
}

# Tokenize each column of sample data
sample_tokens_obs = tokenizer([sample_data['Observation Name']], padding=True, truncation=True, return_tensors='pt')
sample_tokens_desc = tokenizer([sample_data['Description']], padding=True, truncation=True, return_tensors='pt')
sample_tokens_rpremarks = tokenizer([sample_data['RPRemarks']], padding=True, truncation=True, return_tensors='pt')

with torch.no_grad():
    outputs_obs = model(input_ids=sample_tokens_obs['input_ids'].to(device),
                        attention_mask=sample_tokens_obs['attention_mask'].to(device)).logits
    outputs_desc = model(input_ids=sample_tokens_desc['input_ids'].to(device),
                         attention_mask=sample_tokens_desc['attention_mask'].to(device)).logits
    outputs_rpremarks = model(input_ids=sample_tokens_rpremarks['input_ids'].to(device),
                              attention_mask=sample_tokens_rpremarks['attention_mask'].to(device)).logits
    outputs = (outputs_obs + outputs_desc + outputs_rpremarks) / 3
    pred = torch.argmax(outputs, dim=1).item()

sentiment_mapping = {0: "Negative Zone", 1: "Neutral Zone", 2: "Positive Zone"}
print(f"Predicted Sentiment: {sentiment_mapping[pred]}")

Predicted Sentiment: Negative Zone
