In [None]:
#mporting the necessary libraries
import torch
import time
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset,DataLoader
import pandas as pd
#!pip install transformers
from transformers import DistilBertModel, DistilBertTokenizer
from torch import cuda
device = 'cuda' if torch.cuda.is_available() else 'cpu'


In [26]:
df = pd.read_csv('/content/drive/My Drive/KanCMD/kannada_sentiment.csv',delimiter=',',
                 header=None,names=['sentiment','sentence'])
#df.sentiment = df.sentiment.apply({'Positive':0,'Negative':1,'Mixed feelings':2,'unknown state':3,'not-Kannada':4}.get)
#df.head(9)
my_dict = {
    'p' : 'Positive',
    'n' : 'Negative',
    'm' : 'Mixed feelings',
    'u' : 'unknown state',
    'nk': 'not-Kannada'
}


encode_dict = {}

def encode_cat(x):
  if x not in encode_dict.keys():
    encode_dict[x] = len(encode_dict)
  return encode_dict[x]

df['encode_cat'] = df['sentiment'].apply(lambda x: encode_cat(x))
df.head(6)


Unnamed: 0,sentiment,sentence,encode_cat
0,Negative,Bari olu nan makla,0
1,Negative,Sir nivu news helida hage heltiri sir,0
2,unknown state,Idu riyel rar,1
3,Negative,ಕಥೆ ಅರ್ದ ಅಗಿದೆ.ಅಶ್ವತ್ತಾಮ ತನ್ಣಗಾಯಗೊಂಡ ದೇಹವನ್ನು ...,0
4,Mixed feelings,ಕಥೆ ಸರಿಯಾಗಿ ತಿಳಿದುಕೊಳ್ಳಿ..... please don't wor...,2
5,Positive,Ashwathama. Vadhukkidhe.evathu. video. Nodithu...,3


In [22]:
#Initializing the key variables which will be later used in the training

MAX_LEN = 512
TRAIN_BATCH_SIZE = 16
VALID_BATCH_SIZE = 16
EPOCHS = 10
LEARNING_RATE = 2e-5
distilbert_multilingual = 'distilbert-base-multilingual-cased'
distilbert_base_uncased = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizer.from_pretrained(distilbert_multilingual)

In [4]:
class SentimentDataset(Dataset):

  def __init__(self,dataframe,tokenizer,max_len):
    self.len = len(dataframe)
    self.data = dataframe
    self.tokenizer = tokenizer
    self.max_len = max_len 
  

  def __getitem__(self,index):
    sentence = str(self.data.sentence[index])
    sentence = " ".join(sentence.split())
    encoding = self.tokenizer.encode_plus(
        sentence,
        add_special_tokens = True,
        max_length = self.max_len,
        padding = 'max_length',
        return_token_type_ids = False,
        return_tensors = 'pt',
        truncation = True
    )
    #ids = encoding['input_ids']
    #mask = encoding['attention_mask']
    return {
        'ids' : encoding['input_ids'].flatten(),
        'mask': encoding['attention_mask'].flatten(),
        'targets': torch.tensor(self.data.encode_cat[index],dtype=torch.long)
    }

  def __len__(self):
    return self.len

In [5]:
#Creating the dataset and dataloader for training 
train_size = 0.9
train_dataset = df.sample(frac=train_size,random_state=42)
test_dataset = df.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)

print('Total no of entities in the dataset: {}'.format(df.shape))
print('Train dataset:{}'.format(train_dataset.shape))
print('Test dataset: {}'.format(test_dataset.shape))

training_set = SentimentDataset(train_dataset,tokenizer,MAX_LEN)
testing_set = SentimentDataset(test_dataset,tokenizer,MAX_LEN)

Total no of entities in the dataset: (7671, 3)
Train dataset:(6904, 3)
Test dataset: (767, 3)


In [6]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [7]:
# Fine-Tuning DistilBERT by adding a dropout and a dense layer on top of it to get the final output

class DistillBERTClass(torch.nn.Module):
    def __init__(self):
        super(DistillBERTClass, self).__init__()
        self.l1 = DistilBertModel.from_pretrained("distilbert-base-multilingual-cased")
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.3)
        self.classifier = torch.nn.Linear(768, 5)

    def forward(self, input_ids, attention_mask):
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = output_1[0]
        pooler = hidden_state[:, 0]
        pooler = self.pre_classifier(pooler)
        pooler = torch.nn.ReLU()(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        return output

In [None]:
model = DistillBERTClass()
model.to(device)
 

In [23]:
#Defining the loss function and optimizer
loss_function = nn.CrossEntropyLoss().to(device)
optimizer = optim.Adam(params= model.parameters(),lr = LEARNING_RATE)
#loss_function.to(device)

In [10]:
#Fine-Tuning DistilBERT
def calcuate_accuracy(big_idx, targets):
    n_correct = (big_idx==targets).sum().item()
    return n_correct

In [11]:
import time

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [24]:
# Defining the training function on the 80% of the dataset for tuning the distilbert model

def train(epoch):
  
  tr_loss = 0
  n_correct = 0
  nb_tr_steps = 0
  nb_tr_examples = 0
  model.train()
  start_time = time.time()
  for _,data in enumerate(training_loader, 0):
      ids = data['ids'].to(device, dtype = torch.long)
      mask = data['mask'].to(device, dtype = torch.long)
      targets = data['targets'].to(device, dtype = torch.long)

      outputs = model(ids, mask)
      loss = loss_function(outputs, targets)
      tr_loss += loss.item()
      big_val, big_idx = torch.max(outputs.data, dim=1)
      n_correct += calcuate_accuracy(big_idx, targets)

      nb_tr_steps += 1
      nb_tr_examples+=targets.size(0)
      
      optimizer.zero_grad()
      loss.backward()
      #When using GPU
      optimizer.step()

  print(f'The Total Accuracy for Epoch {epoch}: {(n_correct*100)/nb_tr_examples}')
  epoch_loss = tr_loss/nb_tr_steps
  epoch_accu = (n_correct*100)/nb_tr_examples
  print(f"Training Loss Epoch: {epoch_loss}")
  print(f"Training Accuracy Epoch: {epoch_accu}")
  end_time = time.time()
  epoch_mins, epoch_secs = epoch_time(start_time, end_time)
  print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')

  return 

In [25]:
for epoch in range(EPOCHS):
  train(epoch)
  print()

The Total Accuracy for Epoch 0: 85.8632676709154
Training Loss Epoch: 0.40450390463660435
Training Accuracy Epoch: 85.8632676709154
Epoch: 01 | Epoch Time: 3m 12s

The Total Accuracy for Epoch 1: 89.28157589803013
Training Loss Epoch: 0.31484513093838123
Training Accuracy Epoch: 89.28157589803013
Epoch: 02 | Epoch Time: 3m 12s

The Total Accuracy for Epoch 2: 91.93221320973349
Training Loss Epoch: 0.2405983853817883
Training Accuracy Epoch: 91.93221320973349
Epoch: 03 | Epoch Time: 3m 12s

The Total Accuracy for Epoch 3: 93.1199304750869
Training Loss Epoch: 0.19959371864442127
Training Accuracy Epoch: 93.1199304750869
Epoch: 04 | Epoch Time: 3m 12s

The Total Accuracy for Epoch 4: 94.17728852838934
Training Loss Epoch: 0.1626206540073596
Training Accuracy Epoch: 94.17728852838934
Epoch: 05 | Epoch Time: 3m 12s

The Total Accuracy for Epoch 5: 95.36500579374275
Training Loss Epoch: 0.12951855679070665
Training Accuracy Epoch: 95.36500579374275
Epoch: 06 | Epoch Time: 3m 12s

The Total 

In [16]:
def valid(model,testing_loader):
  model.eval()
  n_correct = 0
  n_wrong = 0
  total = 0
  tr_loss = 0
  nb_tr_steps = 0
  nb_tr_examples = 0
  with torch.no_grad():
    for _,data in enumerate(testing_loader,0):
      ids = data['ids'].to(device,dtype = torch.long)
      mask = data['mask'].to(device,dtype = torch.long)
      targets = data['targets'].to(device,dtype=torch.long)
      outputs = model(ids,mask).squeeze()
      loss = loss_function(outputs,targets)
      tr_loss += loss.item()
      big_val,big_idx = torch.max(outputs.data,dim=1)
      n_correct += calcuate_accuracy(big_idx,targets)
      nb_tr_steps += 1
      nb_tr_examples += targets.size(0)

    epoch_loss = tr_loss/nb_tr_steps
    epoch_accuracy = (n_correct*100)/nb_tr_examples
    print(f"Validation Loss Epoch:{epoch_loss}")
    print(f"Validation Accuracy Epoch:{epoch_accuracy}")

    return epoch_accuracy
      

In [17]:
print('This is the validation section to print the accuracy and see how it performs')
print('Here we are leveraging on the dataloader crearted for the validation dataset, the approcah is using more of pytorch')

acc = valid(model, testing_loader)
print("Accuracy on test data = %0.2f%%" % acc)

This is the validation section to print the accuracy and see how it performs
Here we are leveraging on the dataloader crearted for the validation dataset, the approcah is using more of pytorch
Validation Loss Epoch:1.2726746344317992
Validation Accuracy Epoch:58.279009126466754
Accuracy on test data = 58.28%


In [18]:
def get_predictions(model, data_loader):
  model = model.eval()
  sentence = []
  predictions = []
  prediction_probs = []
  real_values = []
  with torch.no_grad():
    for d in data_loader:
      #texts = d["sentences"]
      ids = d["ids"].to(device)
      mask = d["mask"].to(device)
      targets = d["targets"].to(device)
      outputs = model(
        input_ids=ids,
        attention_mask=mask
      )
      _, preds = torch.max(outputs, dim=1)
      #sentence.extend(texts)
      predictions.extend(preds)
      prediction_probs.extend(outputs)
      real_values.extend(targets)
  predictions = torch.stack(predictions).cpu()
  prediction_probs = torch.stack(prediction_probs).cpu()
  real_values = torch.stack(real_values).cpu()
  return sentence, predictions, prediction_probs, real_values

In [19]:

y_review_texts, y_pred, y_pred_probs, y_test = get_predictions(
  model,
  testing_loader
)

In [20]:
class_name = ['Positive','Negative','Mixed_feelings','unknown_state','not-Kannada']

In [21]:
from sklearn.metrics import classification_report,confusion_matrix
print(classification_report(y_test, y_pred, target_names=class_name,zero_division=0))

                precision    recall  f1-score   support

      Positive       0.59      0.59      0.59       152
      Negative       0.36      0.37      0.36        84
Mixed_feelings       0.16      0.08      0.11        75
 unknown_state       0.68      0.74      0.71       347
   not-Kannada       0.56      0.58      0.57       109

      accuracy                           0.58       767
     macro avg       0.47      0.47      0.47       767
  weighted avg       0.56      0.58      0.57       767

