In [1]:
!nvidia-smi


Tue Apr 23 16:14:03 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.129.03             Driver Version: 535.129.03   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla P100-PCIE-16GB           Off | 00000000:00:04.0 Off |                    0 |
| N/A   31C    P0              25W / 250W |      0MiB / 16384MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [2]:
import pandas as pd

## Importing Dataset

In [3]:
from torch.utils.data import DataLoader, Dataset
from datasets import load_dataset

# Location of the EmoInHindi CSV under the Kaggle input directory; only a "train" file is given.
file_dict = {
  "train" : "/kaggle/input/cs689p/LREC_EmoInHindi.csv"
}
# column_names assigns explicit names to the six CSV columns.
# NOTE(review): the label mapping defined later in this notebook contains an
# 'emotions' class, which suggests the file's own header row is being read as a
# data row here -- TODO confirm and skip the header row if so.
go_emotions = load_dataset('csv',data_files=file_dict,delimiter=',',column_names=['dialogueId','utterance_no','authorRole','utterance','emotions','emoIntensity'])
# The train/test split (test_size=0.1) is done in the next cell.

Generating train split: 0 examples [00:00, ? examples/s]

In [4]:
#split data into train and test
from sklearn.model_selection import train_test_split
# Hold out 10% of the rows as a test split. The fixed seed makes the split
# identical on every Restart & Run All, so results are comparable between runs.
data = go_emotions['train'].train_test_split(test_size=0.1, seed=42)

## Splitting

In [5]:
from datasets import DatasetDict
# Re-wrap the two splits in a DatasetDict keyed by split name.
ds = DatasetDict({split_name: data[split_name] for split_name in ('train', 'test')})

In [6]:
# Inspect the splits. TODO: convert 'emotions' to a list per row (it is a pandas Series of strings; check rows that hold a single value).
ds

DatasetDict({
    train: Dataset({
        features: ['dialogueId', 'utterance_no', 'authorRole', 'utterance', 'emotions', 'emoIntensity'],
        num_rows: 39823
    })
    test: Dataset({
        features: ['dialogueId', 'utterance_no', 'authorRole', 'utterance', 'emotions', 'emoIntensity'],
        num_rows: 4425
    })
})

## All emotions

In [7]:
# Label id -> emotion name. The id of an emotion is its position in the tuple below.
mapping = dict(enumerate((
    'joy',
    'anticipation',
    'neutral',
    'anger',
    'disgusted',
    'confident',
    'annoyed',
    'hopeful',
    'apprehensive',
    'grateful',
    'sad',
    'compassion',
    'fear',
    'guilty',
    'surprised',
    'impressed',
    'confused',
    'emotions',
)))
# Emotion name -> label id (inverse lookup used when encoding the label column).
reverse_mapping = {name: label_id for label_id, name in mapping.items()}
print(reverse_mapping)
# Number of label columns produced by the one-hot encoding.
n_labels = len(mapping)

{'joy': 0, 'anticipation': 1, 'neutral': 2, 'anger': 3, 'disgusted': 4, 'confident': 5, 'annoyed': 6, 'hopeful': 7, 'apprehensive': 8, 'grateful': 9, 'sad': 10, 'compassion': 11, 'fear': 12, 'guilty': 13, 'surprised': 14, 'impressed': 15, 'confused': 16, 'emotions': 17}


In [8]:
# Materialise each split as a pandas DataFrame for the label preprocessing below.
train = ds["train"].to_pandas()
test = ds["test"].to_pandas()

In [9]:
train.shape

(39823, 6)

## Splitting emotion column 

In [10]:
def split_emotions(df):
    """Return a copy of ``df`` whose 'emotions' column holds a list of label names.

    Each cell is expected to be a comma-separated string such as ``"joy, sad"``;
    it becomes ``['joy', 'sad']`` (surrounding whitespace removed, empty tokens
    from stray commas dropped).

    Compared with a plain ``.str.split(',')`` this function:
      * does not modify the caller's DataFrame (a copy is returned);
      * maps a missing value (None / NaN) to an empty list instead of raising;
      * accepts cells that are already lists, so re-running the cell is harmless.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an 'emotions' column.

    Returns
    -------
    pandas.DataFrame
        New frame; only the 'emotions' column differs from the input.
    """
    def _to_labels(value):
        # Normalise one cell to a clean list of label strings.
        if isinstance(value, str):
            parts = value.split(',')
        elif isinstance(value, (list, tuple)):
            parts = value  # already split (e.g. the cell was run twice)
        elif pd.isna(value):
            return []
        else:
            parts = [value]
        stripped = [str(part).strip() for part in parts]
        return [label for label in stripped if label]

    result = df.copy()
    result['emotions'] = result['emotions'].apply(_to_labels)
    return result

train = split_emotions(train)

In [11]:
train['emotions'][1]

['anticipation']

In [12]:
# NOTE(review): this cell repeats, entry for entry, the mapping / reverse_mapping /
# n_labels definitions from the earlier "All emotions" cell; re-running it
# rebinds the same names to equal values.
# Label id -> emotion name.
mapping = {
    0: 'joy', 
    1: 'anticipation', 
    2: 'neutral', 
    3: 'anger', 
    4: 'disgusted', 
    5: 'confident', 
    6: 'annoyed', 
    7: 'hopeful', 
    8: 'apprehensive', 
    9: 'grateful', 
    10: 'sad', 
    11: 'compassion', 
    12: 'fear', 
    13: 'guilty', 
    14: 'surprised', 
    15: 'impressed', 
    16: 'confused',
    17: 'emotions'
}
# Emotion name -> label id.
reverse_mapping = {v:k for k,v in mapping.items()}
print(reverse_mapping)
# Number of distinct labels (width of the one-hot encoding).
n_labels = len(mapping)

{'joy': 0, 'anticipation': 1, 'neutral': 2, 'anger': 3, 'disgusted': 4, 'confident': 5, 'annoyed': 6, 'hopeful': 7, 'apprehensive': 8, 'grateful': 9, 'sad': 10, 'compassion': 11, 'fear': 12, 'guilty': 13, 'surprised': 14, 'impressed': 15, 'confused': 16, 'emotions': 17}


## Converting emotion names to integer label ids



In [13]:
def change_to_int(df, label_to_id=None):
    """Return a copy of ``df`` with every emotion name replaced by its integer id.

    The 'emotions' column must hold a list of label names per row (as produced
    by ``split_emotions``). Rows are processed by value, so the function works
    for any DataFrame index, not only the default 0..n-1 index.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose 'emotions' cells are lists of label names.
    label_to_id : mapping, optional
        Name -> id lookup. Defaults to the notebook-level ``reverse_mapping``.

    Returns
    -------
    pandas.DataFrame
        New frame; the input is not modified.

    Raises
    ------
    KeyError
        If a label is not present in ``label_to_id``.
    """
    if label_to_id is None:
        label_to_id = reverse_mapping

    def _encode(labels):
        try:
            return [label_to_id[label] for label in labels]
        except KeyError as err:
            # Same exception type as a bare dict lookup, but says what went wrong.
            raise KeyError(
                f"unknown emotion label {err.args[0]!r}; known labels: {sorted(label_to_id)}"
            ) from err

    new_df = df.copy()  # keep the caller's frame unchanged
    new_df['emotions'] = new_df['emotions'].apply(_encode)
    return new_df

train = change_to_int(train)

In [14]:
test = split_emotions(test)

In [15]:
test = change_to_int(test)

## Converting to one-hot encodings

In [16]:
from tqdm.notebook import tqdm

def one_hot_encoder(df):
    """Build a multi-hot label matrix from the 'emotions' column of ``df``.

    Every row of ``df`` must hold a list of integer label ids in 'emotions'.
    The result has one column per label id (``0 .. n_labels-1``) with a 1 where
    the row carries that label and 0 elsewhere; repeated ids in a row simply set
    the same column again.

    The returned frame reuses ``df.index`` so that a later
    ``pd.concat([df, labels], axis=1)`` lines the rows up even when ``df`` does
    not have the default 0..n-1 index, and it always has ``n_labels`` columns,
    also for an empty ``df``.

    Returns
    -------
    pandas.DataFrame
        Integer frame of shape ``(len(df), n_labels)``.
    """
    rows = []
    # Iterate over the column directly; df.iloc[i] would build a full row Series per step.
    for label_indices in tqdm(df["emotions"], total=len(df)):
        row = [0] * n_labels
        for index in label_indices:
            row[index] = 1
        rows.append(row)
    return pd.DataFrame(rows, index=df.index, columns=range(n_labels))
train_ohe_labels = one_hot_encoder(train)
test_ohe_labels = one_hot_encoder(test)

  0%|          | 0/39823 [00:00<?, ?it/s]

  0%|          | 0/4425 [00:00<?, ?it/s]

## Concatenating the one-hot columns to the dataframe

In [17]:
# axis=1 places the one-hot label columns next to the existing columns. Rows are
# matched on index labels, so each pair of frames must share the same index.
train = pd.concat([train, train_ohe_labels], axis=1)
test = pd.concat([test, test_ohe_labels], axis=1)

In [18]:
train

Unnamed: 0,dialogueId,utterance_no,authorRole,utterance,emotions,emoIntensity,0,1,2,3,...,8,9,10,11,12,13,14,15,16,17
0,1465,20,bot,रक्षक से संपर्क करने के लिए धन्यवाद। आपका दिन ...,"[9, 0]",11,1,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,1771,12,bot,क्या आप ऑनलाइन या व्यक्तिगत रूप से कानूनी सलाह...,[1],1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1738,13,user,"हां, लेकिन फिर से वह एक अलग आईडी से ईमेल भेज र...",[12],2,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,782,14,bot,आपकी झुंझलाहट के लिए खेद है। क्या आपको कुछ और ...,"[13, 2]",10,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
4,1354,13,user,मुझे सलाह न दें जल्दी से मुझे मेरे स्थान के पा...,[7],1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39818,777,25,user,"कोई रास्ता नहीं, बस खो जाओ।",[3],1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
39819,2316,30,bot,आप बस उस पर एक ईमेल भेज सकते हैं और आपका ईमेल ...,[5],2,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
39820,1415,17,user,धन्यवाद ।,[2],0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
39821,2429,10,bot,यह बहुत दयनीय है। अकेला महसूस न करें हम यहां आ...,"[11, 11, 1]",221,0,1,0,0,...,0,0,0,1,0,0,0,0,0,0


In [19]:
# Strip the raw label / metadata columns from both frames in place, leaving only
# the utterance text and the one-hot label columns.
for split_frame in (train, test):
    split_frame.drop(columns=['emotions','emoIntensity','dialogueId','utterance_no','authorRole'], inplace=True)

In [20]:
train

Unnamed: 0,utterance,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
0,रक्षक से संपर्क करने के लिए धन्यवाद। आपका दिन ...,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
1,क्या आप ऑनलाइन या व्यक्तिगत रूप से कानूनी सलाह...,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,"हां, लेकिन फिर से वह एक अलग आईडी से ईमेल भेज र...",0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
3,आपकी झुंझलाहट के लिए खेद है। क्या आपको कुछ और ...,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
4,मुझे सलाह न दें जल्दी से मुझे मेरे स्थान के पा...,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39818,"कोई रास्ता नहीं, बस खो जाओ।",0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
39819,आप बस उस पर एक ईमेल भेज सकते हैं और आपका ईमेल ...,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
39820,धन्यवाद ।,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
39821,यह बहुत दयनीय है। अकेला महसूस न करें हम यहां आ...,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0


In [21]:
train.columns

Index(['utterance', 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
       17],
      dtype='object')

## Renaming the one-hot columns to emotion names

In [22]:
# replace column name with mapping except utterance
# Column 0 stays 'utterance'; the integer-named one-hot columns get their emotion
# names, listed in label-id order. The same names are applied to both frames.
renamed_columns = ['utterance', 'joy', 'anticipation', 'neutral', 'anger', 'disgusted', 'confident', 'annoyed', 'hopeful', 'apprehensive', 'grateful', 'sad', 'compassion', 'fear', 'guilty', 'surprised', 'impressed', 'confused','emotions']
train.columns = list(renamed_columns)
test.columns = list(renamed_columns)

print(train)

                                               utterance  joy  anticipation  \
0      रक्षक से संपर्क करने के लिए धन्यवाद। आपका दिन ...    1             0   
1      क्या आप ऑनलाइन या व्यक्तिगत रूप से कानूनी सलाह...    0             1   
2      हां, लेकिन फिर से वह एक अलग आईडी से ईमेल भेज र...    0             0   
3      आपकी झुंझलाहट के लिए खेद है। क्या आपको कुछ और ...    0             0   
4      मुझे सलाह न दें जल्दी से मुझे मेरे स्थान के पा...    0             0   
...                                                  ...  ...           ...   
39818                        कोई रास्ता नहीं, बस खो जाओ।    0             0   
39819  आप बस उस पर एक ईमेल भेज सकते हैं और आपका ईमेल ...    0             0   
39820                                          धन्यवाद ।    0             0   
39821  यह बहुत दयनीय है। अकेला महसूस न करें हम यहां आ...    0             1   
39822     रक्षक पधारने के लिए धन्यवाद। आपका दिन शुभ हो !    1             0   

       neutral  anger  disgusted  confident  annoye

## Parameters for model

In [23]:
# Token length every utterance is padded / truncated to by the tokenizer.
MAX_LEN = 64
# Batch sizes for the training and validation DataLoaders.
TRAIN_BATCH_SIZE = 32
VALID_BATCH_SIZE = 32
# Number of passes over the training data (passed to train_model as n_epochs).
EPOCHS = 2
# Learning rate given to the Adam optimizer.
LEARNING_RATE = 1e-05

In [24]:
from transformers import BertTokenizer, BertModel


KeyboardInterrupt: 

## Importing Tokenizer

In [None]:
# Tokenizer for the 'bert-base-uncased' checkpoint, the same checkpoint name the model class loads.
# NOTE(review): 'bert-base-uncased' is an English, lower-casing checkpoint while the
# utterances are Hindi (Devanagari) -- presumably a multilingual checkpoint fits
# better; if it is changed, the model's from_pretrained name must change with it.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


In [None]:
# Emotion names in label-id order; these are the label column names read by the dataset class.
target_list = [mapping[label_id] for label_id in mapping]
print(target_list)

## Class for Dataset

In [None]:
import torch
class CustomDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes one utterance per item.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an 'utterance' column plus one 0/1 column per label.
    tokenizer
        Object exposing ``encode_plus`` (e.g. a Hugging Face BERT tokenizer).
    max_len : int
        Length every utterance is padded / truncated to.
    target_cols : list of str, optional
        Names of the label columns. Defaults to the notebook-level ``target_list``.

    Each item is a dict with 1-D tensors 'input_ids', 'attention_mask',
    'token_type_ids' and a float tensor 'targets' holding the label row.
    """

    def __init__(self, df, tokenizer, max_len, target_cols=None):
        self.tokenizer = tokenizer
        self.df = df
        self.title = df['utterance']
        cols = target_list if target_cols is None else list(target_cols)
        # Stored as float32 once, so every item is already in the dtype the loss expects.
        self.targets = self.df[cols].to_numpy(dtype='float32')
        self.max_len = max_len

    def __len__(self):
        return len(self.title)

    def __getitem__(self, index):
        # A DataLoader asks for positions 0..len-1, so look the row up by position
        # (.iloc); a label lookup would break on a frame whose index was not reset.
        title = str(self.title.iloc[index])
        # Collapse runs of whitespace into single spaces.
        title = " ".join(title.split())

        inputs = self.tokenizer.encode_plus(
            title,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        # flatten() turns the tokenizer's batched tensors into 1-D tensors per item.
        return {
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'token_type_ids': inputs["token_type_ids"].flatten(),
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }

In [None]:
train_size = 0.8
# Sample the training rows but keep their ORIGINAL index labels for now: those
# labels identify which rows of `train` were taken, so dropping them leaves a
# validation set that shares no row with the training set. Resetting the index
# before the drop would remove rows 0..len(train_df)-1 instead, and the
# validation set would then overlap the training sample (data leakage).
train_df = train.sample(frac=train_size, random_state=200)
val_df = train.drop(train_df.index).reset_index(drop=True)
train_df = train_df.reset_index(drop=True)

In [None]:
test.shape

## Creating train and valid dataset

In [None]:
# Wrap the training frame; utterances are tokenized to MAX_LEN tokens on access.
train_dataset = CustomDataset(train_df, tokenizer, MAX_LEN)

# Same wrapping for the validation frame.
valid_dataset = CustomDataset(val_df, tokenizer, MAX_LEN)

## Data Loader

In [None]:
# Training batches are reshuffled every epoch; batches are loaded in the main process.
train_data_loader = torch.utils.data.DataLoader(
    train_dataset, batch_size=TRAIN_BATCH_SIZE, shuffle=True, num_workers=0
)

# Validation batches keep the dataset order.
val_data_loader = torch.utils.data.DataLoader(
    valid_dataset, batch_size=VALID_BATCH_SIZE, shuffle=False, num_workers=0
)

In [None]:
train_dataset

In [None]:
# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


In [None]:
device

## Loading and saving checkpoints

In [None]:
import shutil
def load_ckp(checkpoint_fpath, model, optimizer, map_location=None):
    """Restore model and optimizer state from a checkpoint file.

    The checkpoint is expected to be a dict with the keys 'state_dict',
    'optimizer', 'epoch' and 'valid_loss_min'.

    Parameters
    ----------
    checkpoint_fpath : path of the checkpoint file to load
    model : model that receives the saved parameters (updated in place)
    optimizer : optimizer that receives the saved state (updated in place)
    map_location : optional, forwarded to ``torch.load``; pass e.g. ``'cpu'`` to
        load a checkpoint written on a GPU machine where no GPU is available.
        ``None`` keeps ``torch.load``'s default behaviour.

    Returns
    -------
    (model, optimizer, epoch, valid_loss_min) where ``epoch`` is the value stored
    in the checkpoint and ``valid_loss_min`` is a plain Python number.
    """
    checkpoint = torch.load(checkpoint_fpath, map_location=map_location)
    model.load_state_dict(checkpoint['state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer'])

    valid_loss_min = checkpoint['valid_loss_min']
    # The stored loss may be a Python float or a 0-d tensor / NumPy scalar;
    # only the latter have .item(), so calling it unconditionally would fail.
    if hasattr(valid_loss_min, 'item'):
        valid_loss_min = valid_loss_min.item()

    return model, optimizer, checkpoint['epoch'], valid_loss_min

def save_ckp(state, is_best, checkpoint_path, best_model_path):
    """Write ``state`` to ``checkpoint_path`` and optionally keep a "best" copy.

    state: checkpoint object to serialise with torch.save
    is_best: when true, the file just written is also copied to best_model_path
    checkpoint_path: destination of the regular checkpoint
    best_model_path: destination of the best-model copy
    """
    torch.save(state, checkpoint_path)
    if not is_best:
        return
    # Duplicate the freshly written checkpoint as the best model so far.
    shutil.copyfile(checkpoint_path, best_model_path)

In [None]:
class BERTClass(torch.nn.Module):
    """BERT encoder followed by dropout and one linear layer for multi-label output.

    Parameters
    ----------
    model_name : str
        Checkpoint passed to ``BertModel.from_pretrained``. It must be the same
        checkpoint the tokenizer was loaded from.
    num_labels : int
        Width of the output layer (one logit per label).
    dropout : float
        Dropout probability applied to the pooled encoder output.

    The defaults reproduce the original fixed configuration
    ('bert-base-uncased', 18 labels, dropout 0.3).
    """

    def __init__(self, model_name='bert-base-uncased', num_labels=18, dropout=0.3):
        super().__init__()
        self.bert_model = BertModel.from_pretrained(model_name, return_dict=True)
        self.dropout = torch.nn.Dropout(dropout)
        # Read the encoder width from its config instead of assuming 768, so other
        # checkpoint sizes work without editing this class.
        self.linear = torch.nn.Linear(self.bert_model.config.hidden_size, num_labels)
    
    def forward(self, input_ids, attn_mask, token_type_ids):
        """Return one raw logit per label; no sigmoid is applied here."""
        output = self.bert_model(
            input_ids, 
            attention_mask=attn_mask, 
            token_type_ids=token_type_ids
        )
        # pooler_output is the encoder's pooled sequence representation.
        output_dropout = self.dropout(output.pooler_output)
        output = self.linear(output_dropout)
        return output

model = BERTClass()
model.to(device)

In [None]:
def loss_fn(outputs, targets):
    """Binary cross-entropy computed on raw logits with BCEWithLogitsLoss defaults."""
    criterion = torch.nn.BCEWithLogitsLoss()
    return criterion(outputs, targets)

optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

In [None]:

# Module-level lists that train_model extends during validation:
# val_targets receives the ground-truth label rows, val_outputs the sigmoid probabilities.
val_targets=[]
val_outputs=[]

## Training Model

In [None]:
def train_model(n_epochs, training_loader, validation_loader, model, 
                optimizer, checkpoint_path, best_model_path):
    """Fine-tune ``model`` for ``n_epochs`` and checkpoint after every epoch.

    Each epoch runs one pass over ``training_loader`` (forward, loss, backward,
    optimizer step) followed by one pass over ``validation_loader`` without
    gradients. A checkpoint is written to ``checkpoint_path`` after every epoch
    and copied to ``best_model_path`` whenever the validation loss is less than
    or equal to the best value seen so far.

    Relies on the notebook-level names ``device``, ``loss_fn``, ``save_ckp``,
    ``tqdm``, ``val_targets`` and ``val_outputs``.

    Side effects: ``val_targets`` / ``val_outputs`` are emptied at the start of
    every validation pass and refilled, so after the call they hold exactly one
    entry per validation example, taken from the last epoch.

    Returns
    -------
    The same ``model`` object, trained in place.
    """
    # float('inf') instead of np.Inf: that alias was removed in NumPy 2.0.
    valid_loss_min = float('inf')

    for epoch in range(1, n_epochs + 1):
        train_loss = 0.0

        ###################
        # train the model #
        ###################
        model.train()
        with tqdm(total=len(training_loader)) as pbar:
            for batch_idx, data in enumerate(training_loader):
                ids = data['input_ids'].to(device)
                mask = data['attention_mask'].to(device)
                token_type_ids = data['token_type_ids'].to(device)
                targets = data['targets'].to(device)

                outputs = model(ids, mask, token_type_ids)

                optimizer.zero_grad()
                loss = loss_fn(outputs, targets)
                loss.backward()
                optimizer.step()

                train_loss += loss.item()
                pbar.update(1)
                # Running mean of the batch losses seen so far in this epoch.
                pbar.set_description(f'Training Loss: {train_loss / (batch_idx + 1):.4f}')

        print('############# Epoch {}: Validation Start   #############'.format(epoch))
        ######################
        # validate the model #
        ######################
        model.eval()
        valid_loss_sum = 0.0
        # Start from empty lists so the stored predictions line up with a single
        # pass over the validation set instead of growing with every epoch.
        val_targets.clear()
        val_outputs.clear()

        with torch.no_grad():
            for data in validation_loader:
                ids = data['input_ids'].to(device, dtype=torch.long)
                mask = data['attention_mask'].to(device, dtype=torch.long)
                token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
                targets = data['targets'].to(device, dtype=torch.float)
                outputs = model(ids, mask, token_type_ids)

                valid_loss_sum += loss_fn(outputs, targets).item()
                val_targets.extend(targets.cpu().detach().numpy().tolist())
                val_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())

        print('############# Epoch {}: Validation End     #############'.format(epoch))
        # Mean loss per batch. The validation loss is divided by the number of
        # batches exactly once (it used to be averaged twice, which under-reported it).
        train_loss = train_loss / len(training_loader)
        valid_loss = valid_loss_sum / len(validation_loader)
        print('Epoch: {} \tAvgerage Training Loss: {:.6f} \tAverage Validation Loss: {:.6f}'.format(
            epoch,
            train_loss,
            valid_loss
            ))

        is_best = valid_loss <= valid_loss_min
        if is_best:
            print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(valid_loss_min,valid_loss))
            valid_loss_min = valid_loss

        # 'epoch' stores the next epoch number; 'valid_loss_min' the best loss so far.
        checkpoint = {
            'epoch': epoch + 1,
            'valid_loss_min': valid_loss_min,
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict()
        }

        # One write per epoch; save_ckp also copies it to best_model_path when is_best.
        save_ckp(checkpoint, is_best, checkpoint_path, best_model_path)

        print('############# Epoch {}  Done   #############\n'.format(epoch))

    return model

In [None]:
import os
# Kaggle's working directory. NOTE(review): kaggle_dir is not used by the paths
# below, which are relative to the current working directory -- presumably that
# is /kaggle/working on Kaggle; TODO confirm or join the paths with kaggle_dir.
kaggle_dir = '/kaggle/working'

# Checkpoint file written after every epoch, and the copy kept for the best validation loss.
ckpt_path = 'checkpoint.pt'
best_model_path = 'best_model.pt'

In [None]:
import numpy as np
trained_model = train_model(EPOCHS,train_data_loader, val_data_loader, model, optimizer, ckpt_path, best_model_path)

## For Testing

In [None]:
# Run the trained model on one held-out utterance and print its most probable emotion.
# .iloc picks the 8th row by position, independent of the frame's index labels.
example = test['utterance'].iloc[7]
print(example)
print(type(example))
# Tokenize exactly as the training dataset does (same max length and padding).
encodings = tokenizer.encode_plus(
    example,
    None,
    add_special_tokens=True,
    max_length=MAX_LEN,
    padding='max_length',
    return_token_type_ids=True,
    truncation=True,
    return_attention_mask=True,
    return_tensors='pt'
)
model.eval()
with torch.no_grad():
    input_ids = encodings['input_ids'].to(device, dtype=torch.long)
    attention_mask = encodings['attention_mask'].to(device, dtype=torch.long)
    token_type_ids = encodings['token_type_ids'].to(device, dtype=torch.long)
    output = model(input_ids, attention_mask, token_type_ids)
    # Nested list with one row of per-label probabilities for the single example.
    final_output = torch.sigmoid(output).cpu().detach().numpy().tolist()
    # argmax(axis=1) returns a 1-element array; take element 0 explicitly because
    # converting an array with ndim > 0 straight to int is deprecated in NumPy.
    top_label_idx = int(np.argmax(final_output, axis=1)[0])
    # Columns after 'utterance' are the label names, in label-id order.
    print(train_df.columns[1:].to_list()[top_label_idx])