In [1]:
# Install the `transformers` and `sentencepiece` packages into the runtime;
# `-q` keeps pip's output quiet.
# NOTE(review): no versions are pinned, so a fresh run may pull different releases.
!pip install transformers -q
!pip install sentencepiece -q

[K     |████████████████████████████████| 3.1 MB 15.4 MB/s 
[K     |████████████████████████████████| 61 kB 408 kB/s 
[K     |████████████████████████████████| 895 kB 88.6 MB/s 
[K     |████████████████████████████████| 3.3 MB 49.4 MB/s 
[K     |████████████████████████████████| 596 kB 61.3 MB/s 
[K     |████████████████████████████████| 1.2 MB 15.0 MB/s 
[?25h

In [2]:
# Mount Google Drive at /content/drive so the project files stored there can be
# read (datasets, model checkpoint) and written (evaluation CSV) by this notebook
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Base directory of the project data on the mounted Drive; used as the prefix for
# the dev/test CSVs that are read and for the evaluation CSV that is written
D = '/content/drive/My Drive/W266_Project_Data/pmi_data'

In [4]:
import pandas as pd
import csv
from transformers import AutoTokenizer, AutoModel
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import numpy as np
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix, classification_report

In [5]:
# Read the deduplicated dev and test paraphrase sets, discarding the
# 'Unnamed: 0' column carried by each CSV
p_dev_new = pd.read_csv(D+"/xml/dedup_para_dev.csv").drop(columns=['Unnamed: 0'])
p_test_new = pd.read_csv(D+"/xml/dedup_para_test.csv").drop(columns=['Unnamed: 0'])

In [38]:
# Keep every row whose language task is not English-to-Punjabi
p_dev_new = p_dev_new.loc[p_dev_new['language_task'].ne('translate English to Punjabi')]
p_test_new = p_test_new.loc[p_test_new['language_task'].ne('translate English to Punjabi')]

In [39]:
# List the distinct language tasks remaining in the test set
p_test_new['language_task'].unique()

array(['translate English to Hindi', 'translate English to Malayalam',
       'translate English to Tamil'], dtype=object)

In [40]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [41]:
# Load the pretrained tokenizer and model from the 'ai4bharat/indic-bert' checkpoint
# (Indic BERT). Both are later handed to the dataset, which uses them to build
# sentence embeddings.
# NOTE(review): keep_accents=True presumably stops the tokenizer from stripping
# accent/diacritic marks from the input text — confirm against the tokenizer docs.
tokenizer = AutoTokenizer.from_pretrained('ai4bharat/indic-bert', keep_accents=True)
model = AutoModel.from_pretrained('ai4bharat/indic-bert')

Some weights of the model checkpoint at ai4bharat/indic-bert were not used when initializing AlbertModel: ['predictions.decoder.weight', 'predictions.decoder.bias', 'predictions.dense.bias', 'predictions.dense.weight', 'predictions.LayerNorm.weight', 'sop_classifier.classifier.bias', 'sop_classifier.classifier.weight', 'predictions.bias', 'predictions.LayerNorm.bias']
- This IS expected if you are initializing AlbertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [42]:
class SiameseNet(torch.nn.Module):
    """Siamese classification head over a pair of sentence embeddings.

    Both inputs pass through the same projection (``self.fc``). The two projected
    vectors and the absolute value of their difference are concatenated and fed
    through a small feed-forward stack that ends in a single output unit. No
    sigmoid is applied inside the network.
    """

    def __init__(self):
        super().__init__()
        nn = torch.nn

        # Shared projection applied to each side: 768 -> 128, LayerNorm, ReLU
        self.fc = nn.Sequential(
            nn.Linear(768, 128),
            nn.LayerNorm(128),
            nn.ReLU(inplace=True),
        )

        self.relu = nn.ReLU(inplace=True)
        # 384 = 3 * 128, the width of the concatenation [u, v, |u - v|].
        # Despite the "batch" names, these two attributes are LayerNorm modules.
        self.batch2 = nn.LayerNorm(384)
        self.ffn = nn.Linear(384, 128)
        self.batch3 = nn.LayerNorm(128)
        self.layer_out = nn.Linear(128, 1)

        # Dropout with p=0.2, applied just before the output layer
        self.drop = nn.Dropout(0.2)

    def forward(self, left, right):
        """Score a pair of embedding tensors.

        Parameters:
        left: embedding tensor for the first sentence; last dimension must be 768
        right: embedding tensor for the second sentence; last dimension must be 768

        Returns:
        The output layer's result indexed with ``[:, -1, :]``, so the inputs must
        be 3-dimensional and only the last position along axis 1 is returned.
        """
        # Project both sides with the shared layers
        u = self.fc(left)
        v = self.fc(right)

        # Concatenate u, v and the element-wise absolute difference
        merged = torch.cat((u, v, (u - v).abs()), dim=-1)

        # LayerNorm + ReLU on the concatenation
        hidden = self.relu(self.batch2(merged))

        # Dense layer back to 128 features, then LayerNorm, ReLU and dropout
        hidden = self.drop(self.relu(self.batch3(self.ffn(hidden))))

        # Single-unit output layer, then drop the middle axis so the result
        # lines up with the (batch, 1) target labels
        logits = self.layer_out(hidden)
        return logits[:, -1, :]

In [43]:
# Instantiate the Siamese head and move it to the selected device
net = SiameseNet().to(device)

# Restore the weights and biases saved from the last round of training.
# map_location remaps the stored tensors onto `device`, so a checkpoint that was
# saved from a GPU session can still be loaded when only the CPU is available.
net.load_state_dict(torch.load("/content/drive/My Drive/W266_Project_Data/models/siamese_model/siamese_models_used/three_dense_model_wopb_layernorm.pt", map_location=device))

<All keys matched successfully>

In [44]:
# Mean pooling that honours the attention mask, so masked-out positions do not
# contribute to the average
def mean_pooling(model_output, attention_mask):
    """Average the token embeddings over axis 1, weighted by the attention mask.

    Parameters:
    model_output: indexable model output whose first element holds the token embeddings
    attention_mask: mask with one entry per token position; it is broadcast to the
      embedding shape and cast to float before being used as a weight

    Returns:
    The masked sum over axis 1 divided by the mask total, with the divisor
    clamped to at least 1e-9 to avoid division by zero.
    """
    hidden = model_output[0]
    mask = attention_mask.unsqueeze(-1).expand(hidden.size()).float()
    masked_total = (hidden * mask).sum(dim=1)
    mask_total = mask.sum(dim=1).clamp(min=1e-9)
    return masked_total / mask_total

def sentence_embeddings(device, tokenizer, model, sentences):
  """Build mean-pooled sentence embeddings for the given text.

  Parameters:
  device: torch device that the tokenized input and the model are moved to
  tokenizer: transformers tokenizer used to turn the text into model inputs
  model: transformers model whose first output is pooled by `mean_pooling`
  sentences: sentence(s) to embed, passed straight to the tokenizer

  Returns:
  The result of `mean_pooling` over the model output and the attention mask.
  """

  # Tokenize with padding and truncation to at most 128 tokens, as PyTorch
  # tensors, and move the encoded batch to the target device
  batch = tokenizer(sentences, padding=True, truncation=True,
                    max_length=128, return_tensors='pt').to(device)

  # Forward pass through the model without tracking gradients
  with torch.no_grad():
    model.to(device)
    outputs = model(**batch)

  # Collapse the token embeddings into one vector per sequence
  return mean_pooling(outputs, batch['attention_mask'])

In [45]:
# Dataset that exposes each dataframe row as a pair of sentence embeddings plus
# its language task and label
class SiameseNetworkDataset(Dataset):
  """Row-wise view over a paraphrase dataframe.

  The frame must provide the columns 'sentence_1', 'sentence_2',
  'language_task' and 'label'; each is copied into a plain list at construction
  time. Embeddings are computed on demand in `__getitem__`.
  """

  def __init__(self,training_df, device, tokenizer, model):
    self.training_df = training_df
    frame = self.training_df
    self.sent1 = frame['sentence_1'].tolist()
    self.sent2 = frame['sentence_2'].tolist()
    self.language_task = frame['language_task'].tolist()
    self.label = frame['label'].tolist()
    # Stored so that __getitem__ can embed the sentences
    self.device, self.tokenizer, self.model = device, tokenizer, model

  def __getitem__(self, index):
    """Return (sentence_1 embedding, sentence_2 embedding, task, label tensor)."""
    left_text, right_text = self.sent1[index], self.sent2[index]
    task, raw_label = self.language_task[index], self.label[index]

    # Embed both sentences of the row with the shared tokenizer and model
    left_vec, right_vec = (
        sentence_embeddings(self.device, self.tokenizer, self.model, text)
        for text in (left_text, right_text)
    )
    return left_vec, right_vec, task, torch.from_numpy(np.array(raw_label)).long()

  def __len__(self):
    """Number of rows in the wrapped dataframe."""
    return len(self.training_df)

In [46]:
# Wrap the dev and test frames in datasets that share the device, tokenizer and model
siamese_dev, siamese_test = (
    SiameseNetworkDataset(frame, device, tokenizer, model)
    for frame in (p_dev_new, p_test_new)
)

In [47]:
# Batch the dev and test datasets, 16 examples per batch
val_loader, test_loader = (
    DataLoader(dataset, batch_size=16)
    for dataset in (siamese_dev, siamese_test)
)

In [48]:
def binary_acc(y_pred, y_test):
    """Percentage of predictions that match the targets, rounded to a whole number.

    Parameters:
    y_pred: raw model outputs; a sigmoid is applied and the result rounded to 0/1
    y_test: target labels compared element-wise with the rounded predictions

    Returns:
    A tensor holding the rounded accuracy in percent, computed relative to
    the size of the first dimension of `y_test`.
    """
    predicted = torch.sigmoid(y_pred).round()
    n_correct = predicted.eq(y_test).sum().float()
    return torch.round(n_correct / y_test.shape[0] * 100)

In [49]:
# Loss used for evaluation: BCEWithLogitsLoss, i.e. binary cross-entropy computed
# on raw logits (the sigmoid is applied inside the loss), which fits the network's
# single un-activated output unit
criterion = torch.nn.BCEWithLogitsLoss()

In [50]:
# Validation pass: accumulate the per-batch loss and accuracy
val_epoch_loss = 0
val_epoch_acc = 0

# Put the network into evaluation mode
net.eval()

# Gradients are not needed for evaluation
with torch.no_grad():
  for left_val, right_val, _, target_val in val_loader:
    # Float labels shaped (batch, 1) to line up with the network output
    target_val = target_val.type(torch.FloatTensor).reshape((target_val.shape[0], 1))

    # Move the batch to the evaluation device
    left_val = left_val.to(device)
    right_val = right_val.to(device)
    target_val = target_val.to(device)

    # Forward pass, then loss and accuracy for this batch
    val_pred = net(left_val, right_val)
    val_epoch_loss += criterion(val_pred, target_val).item()
    val_epoch_acc += binary_acc(val_pred, target_val).item()

# Report the means over the number of batches
n_val_batches = len(val_loader)
print(f'Average validation accuracy: {val_epoch_acc/n_val_batches}')
print(f'Average validation loss: {val_epoch_loss/n_val_batches}')

Average validation accuracy: 77.62650602409639
Average validation loss: 0.509097362857267


In [51]:
# Run the network over the test set, collecting the true labels, language tasks,
# predicted labels, predicted probabilities and the summed per-batch loss
y_pred_list, y_actual_list = [], []
y_language_task, probability_of_paraphrase = [], []
y_loss = 0

# Put the network into evaluation mode
net.eval()

with torch.no_grad():
    for left_test, right_test, task_test, label_test in test_loader:

        # Keep the language tasks and true labels for the later performance checks
        y_actual_list.extend(label_test.tolist())
        y_language_task.extend(task_test)

        # Float labels shaped (batch, 1) to line up with the network output
        target_test = label_test.type(torch.FloatTensor).reshape((label_test.shape[0], 1))
        target_test = target_test.to(device)

        # Forward pass on the evaluation device
        test_pred = net(left_test.to(device), right_test.to(device))

        # Add this batch's loss to the running total
        y_loss += criterion(test_pred, target_test).item()

        # Probabilities from the sigmoid; labels by rounding them to 0/1.
        # The lists receive one tensor per example (converted with .item() later).
        batch_probs = torch.sigmoid(test_pred).float()
        y_pred_list.extend(torch.round(batch_probs).long())
        probability_of_paraphrase.extend(batch_probs)

# Mean loss over the number of test batches
y_avg_loss = y_loss/len(test_loader)

In [52]:
# Print the average test loss: the per-batch losses summed in y_loss, divided by
# the number of test batches
print(f'Average test loss: {y_avg_loss}')

Average test loss: 0.5150788989232247


In [53]:
# Convert the collected one-element tensors to plain Python numbers and assemble
# one row per test example
y_pred = [pred.item() for pred in y_pred_list]
y_probs = [prob.item() for prob in probability_of_paraphrase]
test_performance = pd.DataFrame({
    'language': y_language_task,
    'actual_label': y_actual_list,
    'predicted_label': y_pred,
    'para_prob': y_probs,
})

In [54]:
# Check the first few rows of the per-example results (language task, actual
# label, predicted label, predicted paraphrase probability)
test_performance.head()

Unnamed: 0,language,actual_label,predicted_label,para_prob
0,translate English to Hindi,0,0,0.301309
1,translate English to Hindi,0,0,0.077849
2,translate English to Hindi,1,1,0.925789
3,translate English to Hindi,0,0,0.037609
4,translate English to Hindi,1,1,0.981072


In [55]:
# Attach the sentence pairs to the per-example predictions (the CSV itself is
# written in a later cell).
# p_test_new still carries its original row labels after the Punjabi filter, while
# test_performance has a fresh 0..n-1 index. pd.concat(axis=1) aligns rows by index
# label, so the sentence columns are re-indexed positionally first. The dataset
# reads rows in positional order and the loader is created without shuffling, so
# position i in both frames refers to the same example.
test_sentences = p_test_new[['sentence_1', 'sentence_2']].reset_index(drop=True)
assert len(test_sentences) == len(test_performance), "prediction and sentence row counts differ"
final = pd.concat([test_performance, test_sentences], axis=1)

In [56]:
# Inspect the first rows of the combined predictions + sentences frame
final.head()

Unnamed: 0,language,actual_label,predicted_label,para_prob,sentence_1,sentence_2
0,translate English to Hindi,0,0,0.301309,निशा प्रतिदिन दादी को सीतारामजी के मंदिर में द...,जयपुर-कोटा के बीच 248 किलोमीटर की दूरी होने के...
1,translate English to Hindi,0,0,0.077849,मनरेगा के मजदूर पीएम मोदी को लौटाएंगे पांच रुपए,मिलान कोर्ट ऑफ अपील्स के आदेश के बाद पहली बा...
2,translate English to Hindi,1,1,0.925789,पाकिस्तान को दिया करारा जवाब,पाक को दिया मुंहतोड़ जवाब
3,translate English to Hindi,0,0,0.037609,स्कूल में दो बिल्डिंग हैं।,"जिस समय पुलिस स्कूल पहुंची, उस समय करीब पांच ह..."
4,translate English to Hindi,1,1,0.981072,रेप के मामले में वह पीड़िता से समझौते का दवाब ...,वह पीड़िता से रेप के मामले में समझौते का दवाब ब...


In [57]:
# Overall test accuracy via sklearn's accuracy_score
# Accuracy is approximately 77%
accuracy_score(y_true=y_actual_list, y_pred=y_pred)

0.7696374622356495

In [58]:
# Overall F1-score on the test set
f1_score(y_true=y_actual_list, y_pred=y_pred)

0.7075743048897413

In [59]:
# Hindi subset of the test predictions
hi_test = final.loc[final['language'] == 'translate English to Hindi']
hi_actual = hi_test['actual_label'].tolist()
hi_predicted = hi_test['predicted_label'].tolist()
# Accuracy on Hindi -- the model performs very well on this language
print(accuracy_score(hi_actual, hi_predicted))
# F1-score on Hindi
print(f1_score(hi_actual, hi_predicted))

0.8571428571428571
0.8383838383838383


In [60]:
# Tamil subset of the test predictions
ta_test = final.loc[final['language'] == 'translate English to Tamil']
ta_actual = ta_test['actual_label'].tolist()
ta_predicted = ta_test['predicted_label'].tolist()
# Accuracy on Tamil
print(accuracy_score(ta_actual, ta_predicted))
# F1-score on Tamil
print(f1_score(ta_actual, ta_predicted))

0.6690140845070423
0.5017667844522968


In [61]:
# Malayalam subset of the test predictions
ml_test = final.loc[final['language'] == 'translate English to Malayalam']
ml_actual = ml_test['actual_label'].tolist()
ml_predicted = ml_test['predicted_label'].tolist()
# Accuracy on Malayalam
print(accuracy_score(ml_actual, ml_predicted))
# F1-score on Malayalam
print(f1_score(ml_actual, ml_predicted))

0.7777777777777778
0.7252747252747254


In [63]:
# Confusion matrix over the whole test set (all language tasks combined), to see
# how many mistakes the model makes; the per-language matrices follow in the
# next cells
cm = confusion_matrix(y_actual_list, y_pred)
cm

array([[650, 112],
       [193, 369]])

In [64]:
# Check the confusion matrix on Hindi
confusion_matrix(
    y_true=hi_test['actual_label'].tolist(),
    y_pred=hi_test['predicted_label'].tolist(),
)

array([[218,  33],
       [ 31, 166]])

In [65]:
# Check the confusion matrix on Tamil
confusion_matrix(
    y_true=ta_test['actual_label'].tolist(),
    y_pred=ta_test['predicted_label'].tolist(),
)

array([[214,  39],
       [102,  71]])

In [66]:
# Check the confusion matrix on Malayalam
confusion_matrix(
    y_true=ml_test['actual_label'].tolist(),
    y_pred=ml_test['predicted_label'].tolist(),
)

array([[218,  40],
       [ 60, 132]])

In [67]:
# Re-display the first rows of `final` as a last look before it is saved
final.head()

Unnamed: 0,language,actual_label,predicted_label,para_prob,sentence_1,sentence_2
0,translate English to Hindi,0,0,0.301309,निशा प्रतिदिन दादी को सीतारामजी के मंदिर में द...,जयपुर-कोटा के बीच 248 किलोमीटर की दूरी होने के...
1,translate English to Hindi,0,0,0.077849,मनरेगा के मजदूर पीएम मोदी को लौटाएंगे पांच रुपए,मिलान कोर्ट ऑफ अपील्स के आदेश के बाद पहली बा...
2,translate English to Hindi,1,1,0.925789,पाकिस्तान को दिया करारा जवाब,पाक को दिया मुंहतोड़ जवाब
3,translate English to Hindi,0,0,0.037609,स्कूल में दो बिल्डिंग हैं।,"जिस समय पुलिस स्कूल पहुंची, उस समय करीब पांच ह..."
4,translate English to Hindi,1,1,0.981072,रेप के मामले में वह पीड़िता से समझौते का दवाब ...,वह पीड़िता से रेप के मामले में समझौते का दवाब ब...


In [68]:
# Persist the predictions and sentence pairs under the data directory for error analysis
eval_csv_path = D + "/predicted_text/siamese_evaluations_of_translations/3dense_wopb_para_test_eval.csv"
final.to_csv(eval_csv_path)