In [1]:
from google.colab import drive

drive.mount("/content/gdrive")

Mounted at /content/gdrive


In [2]:
!pip install transformers
!pip install -U -q PyDrive
!pip install nlpaug

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.29.2-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m95.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m29.8 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m128.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.29.2
Looking in i

### Imports

In [3]:
# Import required libraries
import torch
import pandas as pd
import numpy as np
import re
import random
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup, RobertaTokenizer, RobertaForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw
import random

In [4]:

import torch

# Confirm that the GPU is detected

assert torch.cuda.is_available()

# Get the GPU device name.
device_name = torch.cuda.get_device_name()
n_gpu = torch.cuda.device_count()
print(f"Found device: {device_name}, n_gpu: {n_gpu}")
device = torch.device("cuda")


Found device: Tesla T4, n_gpu: 1


In [5]:
def set_seed(seed):
  random.seed(seed)
  np.random.seed(seed)
  torch.manual_seed(seed)
  if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)

set_seed(42)

### Data download and preprocessing

In [6]:
import re
import string
import nltk
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('stopwords')

def clean_text(text):
    # Convert all text to lowercase
    text = text.lower()

    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Remove numbers
    text = re.sub(r'\d+', '', text)

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = nltk.word_tokenize(text)
    tokens = [token for token in tokens if token not in stop_words]
    text = ' '.join(tokens)

    # Remove extra whitespaces
    text = re.sub(' +', ' ', text)

    return text

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [7]:
df = pd.read_csv('/content/gdrive/MyDrive/NLP Project/data/IMDB Dataset.csv')
df = df.rename(columns={'review':'text'})
df = df[['text', 'sentiment']]

 
# How much of the dataset to use
data_size = 0.2
df = df.sample(frac=data_size, random_state=42)

df['text'] = df['text'].apply(clean_text)

# Convert the sentiment labels into numerical values
sentiment_map = {'positive': 0, 'negative': 1}
df['sentiment'] = df['sentiment'].replace(sentiment_map)

# Find and delete any empty rows
empty_rows = df[df['text'].apply(lambda x: isinstance(x, str) and len(x.strip()) == 0)]
df.drop(empty_rows.index, inplace=True)

### Noise Addition

In [8]:
# Noise funcs 

char_action = ['insert',
        'substitute',
        'delete',
        'swap',
]

word_action = ['substitute',
        'delete',
        'swap',
]


def get_action(type):
  if type=="char":
    return random.choice(char_action)
  elif type=="word":
    return random.choice(word_action)


def augment_tweet(tweet, p=0.7):
    """
    Augment a tweet with character-level and word-level noise.

    Args:
        tweet (str): The original tweet.
        p (float): The probability of applying the char level augmentation.

    Returns:
        str: The augmented tweet.
    """
    # Define a list of character-level augmentation techniques
    char_augmenters = [
        nac.OcrAug(),
        nac.KeyboardAug(aug_char_p=0.2, aug_word_p=0.2, include_special_char=False),
        nac.RandomCharAug(action=get_action("char"), aug_char_p=0.1, aug_word_p=0.1),
    ]

    # Define a list of word-level augmentation techniques
    word_augmenters = [
        naw.SpellingAug(),
        naw.SplitAug(),
        naw.SynonymAug(),
        naw.RandomWordAug(aug_p=0.2, action=get_action("word")),
    ]

    # Randomly apply a character-level or word-level augmentation with probability p
    if random.random() < p:
        aug = random.choice(char_augmenters)
        augmented_tweet = aug.augment(tweet)
    else:
        aug = random.choice(word_augmenters)
        augmented_tweet = aug.augment(tweet)
        
    return augmented_tweet


In [9]:
def add_noise(df, augmentation_percentage, task):

  if task=="sentiment_analysis":
    # Sample 10% of the rows in the DataFrame
    augment_indices = df.sample(frac=augmentation_percentage).index

    # Apply the augment_tweet function to each tweet in the sampled rows
    for index in augment_indices:
        tweet = df.loc[index, 'text']
        augmented_tweet = augment_tweet(tweet)
        df.loc[index, 'text'] = augmented_tweet
    
    return df
  
  elif task=="question_answering":

    # TODO - noise functions for QA

    return df

In [10]:
df = add_noise(df, augmentation_percentage=0.1, task="sentiment_analysis")

# Randomly shuffle all rows
df = df.sample(frac=1).reset_index(drop=True)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


### Train/val/test splits

In [11]:
# Separate the dataset into three subsets based on the sentiment labels
positive_reviews = df[df['sentiment'] == sentiment_map['positive']]
negative_reviews = df[df['sentiment'] == sentiment_map['negative']]

# Shuffle each of the two subsets randomly
positive_reviews = positive_reviews.sample(frac=1, random_state=42)
negative_reviews = negative_reviews.sample(frac=1, random_state=42)

print(len(positive_reviews), len(negative_reviews))
# Divide each subset into training, validation, and test sets with a 70/20/10 ratio
train_pos, val_pos_test_pos = train_test_split(positive_reviews, test_size=0.3, random_state=42)
val_pos, test_pos = train_test_split(val_pos_test_pos, test_size=0.33, random_state=42)

train_neg, val_neg_test_neg = train_test_split(negative_reviews, test_size=0.3, random_state=42)
val_neg, test_neg = train_test_split(val_neg_test_neg, test_size=0.33, random_state=42)

# Merge the corresponding subsets from each sentiment back together to form the final training, validation, and test sets
train_set = pd.concat([train_pos, train_neg], ignore_index=True)
val_set = pd.concat([val_pos, val_neg], ignore_index=True)
test_set = pd.concat([test_pos, test_neg], ignore_index=True)

5039 4961


In [12]:
from transformers import ElectraTokenizer

def tokenize_and_format(sentences):
  tokenizer = ElectraTokenizer.from_pretrained("google/electra-base-discriminator")

  # Tokenize all of the sentences and map the tokens to thier word IDs.
  input_ids = []
  attention_masks = []

  # For every sentence...
  for sentence in sentences:
      # `encode_plus` will:
      #   (1) Tokenize the sentence.
      #   (2) Prepend the `[CLS]` token to the start.
      #   (3) Append the `[SEP]` token to the end.
      #   (4) Map tokens to their IDs.
      #   (5) Pad or truncate the sentence to `max_length`
      #   (6) Create attention masks for [PAD] tokens.
      encoded_dict = tokenizer.encode_plus(
                          sentence,                      # Sentence to encode.
                          add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                          max_length = 64,           # Pad & truncate all sentences.
                          padding = 'max_length',
                          truncation = True,
                          return_attention_mask = True,   # Construct attn. masks.
                          return_tensors = 'pt',     # Return pytorch tensors.
                    )

      # Add the encoded sentence to the list.
      input_ids.append(encoded_dict['input_ids'])

      # And its attention mask (simply differentiates padding from non-padding).
      attention_masks.append(encoded_dict['attention_mask'])
  return input_ids, attention_masks

In [13]:
#tokenize train, test and val individually

# For train

texts = train_set.text.values
labels = train_set.sentiment.values

### tokenize_and_format() is a helper function provided in helpers.py ###
input_ids, attention_masks = tokenize_and_format(texts)

# Convert the lists into tensors.
input_ids_train = torch.cat(input_ids, dim=0)
attention_masks_train = torch.cat(attention_masks, dim=0)
labels_train = torch.tensor(labels)

# Print sentence 0, now as a list of IDs.
print('Original: ', texts[0])
print('Token IDs:', input_ids[0])

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/27.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/666 [00:00<?, ?B/s]

Original:  movies loved almost everyone come across yet happen boxoffice failures andaz apna apna intelligent hilarious comedy falls catogory indian director kept mind sensibilities audience churned kader khan type stereotypical hoax movie two guys dream riches try accomplish wooing millionaires daughter humorous drama unfolds lot complexities surface story complexities add sheer comedy entire plot aamir khan plays streetsmart guy salman khan gives unexpectedly good performance dumb guy villian played paresh rawaland henchmen junior ajit kaliaa make laugh sleep although movie borrows lot movies despite shoddy camerawork despite loud times remains one scarce funny movies bombay come movies like padosan golmal amol plaekar movies sad didnt well boxoffice means producers turn back formulas creativity abandoned
Token IDs: tensor([[  101,  5691,  3866,  2471,  3071,  2272,  2408,  2664,  4148,  3482,
          7245,  6610, 15428,  1998, 10936,  9706,  2532,  9706,  2532,  9414,
         263

In [14]:
# For test

texts = test_set.text.values
labels = test_set.sentiment.values

### tokenize_and_format() is a helper function provided in helpers.py ###
input_ids, attention_masks = tokenize_and_format(texts)

# Convert the lists into tensors.
input_ids_test = torch.cat(input_ids, dim=0)
attention_masks_test = torch.cat(attention_masks, dim=0)
labels_test = torch.tensor(labels)

# Print sentence 0, now as a list of IDs.
print('Original: ', texts[0])
print('Token IDs:', input_ids[0])

Original:  actually good surreal mystery movie despite description tries sell scifi movie balkan stars woman haunted mysterious visions lost memories trying piece together spends majority movie trying make sense visions atmospheric effective true kinski appear much film staring actors good english dubbed version available us dubbing leaves something desired actors good job cinematography academy award winner vittorio storaro excellent earlier giallo director bazzoni fifth cord also excellent also lensed storarro
Token IDs: tensor([[  101,  2941,  2204, 16524,  6547,  3185,  2750,  6412,  5363,  5271,
         16596,  8873,  3185, 17581,  3340,  2450, 11171,  8075, 12018,  2439,
          5758,  2667,  3538,  2362, 15970,  3484,  3185,  2667,  2191,  3168,
         12018, 12483,  4621,  2995, 12631,  5488,  3711,  2172,  2143,  4582,
          5889,  2204,  2394,  9188,  2544,  2800,  2149, 12931, 10472,  3727,
          2242,  9059,  5889,  2204,  3105, 16434,  2914,  2400,  3453, 2591

In [15]:
# For val

texts = val_set.text.values
labels = val_set.sentiment.values

### tokenize_and_format() is a helper function provided in helpers.py ###
input_ids, attention_masks = tokenize_and_format(texts)

# Convert the lists into tensors.
input_ids_val = torch.cat(input_ids, dim=0)
attention_masks_val = torch.cat(attention_masks, dim=0)
labels_val = torch.tensor(labels)

# Print sentence 0, now as a list of IDs.
print('Original: ', texts[0])
print('Token IDs:', input_ids[0])

Original:  far best war documentary ever made beginning first episode sir laurence olivier described horrific events oradoursurglane day soldiers came final days war mushroom clouds appeared japan never missed second classic series remember well even though screened way back every aspect tragedy covered detail whole series compulsory viewing many worlds children possible tragedy world war two repeated bigotry hatred greed intolerance confused patriotism religious zeal
Token IDs: tensor([[  101,  2521,  2190,  2162,  4516,  2412,  2081,  2927,  2034,  2792,
          2909, 10883, 14439,  2649, 23512,  2824,  2030,  9365,  9236, 12514,
         20644,  2154,  3548,  2234,  2345,  2420,  2162, 18565,  8044,  2596,
          2900,  2196,  4771,  2117,  4438,  2186,  3342,  2092,  2130,  2295,
         12238,  2126,  2067,  2296,  7814, 10576,  3139,  6987,  2878,  2186,
         14770, 10523,  2116,  8484,  2336,  2825, 10576,  2088,  2162,  2048,
          5567,  2502,  4140,   102]])


In [16]:
#printing out len of train,test val
total = len(df)
num_train = len(train_set)
num_val = len(val_set)
num_test = len(test_set)

print('Train Set Size: ',num_train)
print('Validation Set Size: ',num_val)
print('Test Set Size: ',num_test)

Train Set Size:  6999
Validation Set Size:  2010
Test Set Size:  991


In [17]:
# make lists of 3-tuples 

train_dataset=[]
for i in range(num_train):
  train_dataset.append((input_ids_train[i], attention_masks_train[i], labels_train[i]))

val_dataset=[]
for i in range(num_val):
  val_dataset.append((input_ids_val[i], attention_masks_val[i], labels_val[i]))


test_dataset=[]
for i in range(num_test):
  test_dataset.append((input_ids_test[i], attention_masks_test[i], labels_test[i]))

### Training

In [18]:
from transformers import AdamW, ElectraForSequenceClassification, get_linear_schedule_with_warmup

model = ElectraForSequenceClassification.from_pretrained('google/electra-base-discriminator', num_labels=2)
#model.cuda()

# Set the optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=3e-5, eps=1e-8)
total_steps = len(train_dataset) * 4
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at google/electra-base-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.o

In [None]:
# # Define the training loop
# from tqdm.auto import tqdm

# model.to(device)

# train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True)
# val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=32, shuffle=False)

# best_val_loss = float("inf")
# num_epochs = 3
# train_loss = []
# val_loss = []
# for epoch in range(num_epochs):
#   # Train the model
#   model.train()
#   epoch_loss = 0

#   train_loop = tqdm(train_loader, desc=f'Training Epoch {epoch+1}', leave=True)

#   for batch in train_loop:
      
#     inputs = {'input_ids': batch[0].to(device),
#               'attention_mask': batch[1].to(device),
#               'labels': batch[2].to(device)}
#     optimizer.zero_grad()
#     outputs = model(**inputs)
#     loss = outputs[0]

#     loss.backward()
#     optimizer.step()
#     scheduler.step()

#     epoch_loss += loss.item()
#     train_loop.set_postfix(loss=loss.item())

#   epoch_loss /= len(train_loader)
#   train_loss.append(epoch_loss)

#   # Evaluate the model on the validation set
#   model.eval()
#   val_preds = []
#   val_labels = []
#   epoch_val_loss = 0

#   with torch.no_grad():
#     for batch in val_loader:
        
#       inputs = {'input_ids': batch[0].to(device),
#                 'attention_mask': batch[1].to(device),
#                 'labels': batch[2].to(device)}
      
#       outputs = model(**inputs)
#       loss = outputs[0]
#       epoch_val_loss += loss.item()

#       logits = outputs[1]
#       preds = torch.argmax(logits, axis=1)
#       val_preds.extend(preds.cpu().numpy())
#       val_labels.extend(batch[2].cpu().numpy())

#     epoch_val_loss /= len(val_loader)

#   if epoch_val_loss < best_val_loss:
#     best_val_loss = epoch_val_loss
#     # torch.save(model.state_dict(), "t5_sentiment_model.pt")
#     path = '/content/gdrive/MyDrive/NLP Project/models/ELECTRA_SA_NOISY10'

#     torch.save(model.state_dict(), path+'/model_parameters.pth')

#   # Compute the evaluation metrics
#   val_accuracy = accuracy_score(val_labels, val_preds)
#   val_report = classification_report(val_labels, val_preds, target_names=['positive', 'negative'])
  

#   # Print the results for the current epoch
#   print('Epoch:', epoch+1, ', Training Loss:', epoch_loss/len(train_loader), ', Validation Loss:', epoch_val_loss, ', Validation Accuracy:', val_accuracy)
#   print('Validation Classification Report:')
#   print(val_report)
    

Training Epoch 1:   0%|          | 0/219 [00:00<?, ?it/s]

Epoch: 1 , Training Loss: 0.0022512169364957155 , Validation Loss: 0.4419419609365009 , Validation Accuracy: 0.7825870646766169
Validation Classification Report:
              precision    recall  f1-score   support

    positive       0.88      0.66      0.75      1013
    negative       0.72      0.91      0.81       997

    accuracy                           0.78      2010
   macro avg       0.80      0.78      0.78      2010
weighted avg       0.80      0.78      0.78      2010



Training Epoch 2:   0%|          | 0/219 [00:00<?, ?it/s]

Epoch: 2 , Training Loss: 0.001625876047675811 , Validation Loss: 0.38835807452126153 , Validation Accuracy: 0.8054726368159204
Validation Classification Report:
              precision    recall  f1-score   support

    positive       0.82      0.78      0.80      1013
    negative       0.79      0.83      0.81       997

    accuracy                           0.81      2010
   macro avg       0.81      0.81      0.81      2010
weighted avg       0.81      0.81      0.81      2010



Training Epoch 3:   0%|          | 0/219 [00:00<?, ?it/s]

Epoch: 3 , Training Loss: 0.001119243048613196 , Validation Loss: 0.408074625546024 , Validation Accuracy: 0.8218905472636816
Validation Classification Report:
              precision    recall  f1-score   support

    positive       0.83      0.82      0.82      1013
    negative       0.82      0.83      0.82       997

    accuracy                           0.82      2010
   macro avg       0.82      0.82      0.82      2010
weighted avg       0.82      0.82      0.82      2010



In [None]:
# Save model state dict in folder

#path = '/content/gdrive/MyDrive/NLP Project/models/ELECTRA_SA_CLEAN100'

#torch.save(model.state_dict(), path+'/model_parameters.pth')

### EVALUATION ON CLEAN DATA

In [None]:
# Download data for Testing
df_test = pd.read_csv('/content/gdrive/MyDrive/NLP Project/data/IMDB Dataset.csv')
df_test = df_test.rename(columns={'review':'text'})
df_test = df_test[['text', 'sentiment']]

 
# How much of the dataset to use
data_size = 0.2
df_test = df_test.sample(frac=data_size, random_state=42)

df_test['text'] = df_test['text'].apply(clean_text)

# Convert the sentiment labels into numerical values
sentiment_map = {'positive': 0, 'negative': 1}
df_test['sentiment'] = df_test['sentiment'].replace(sentiment_map)

# Find and delete any empty rows
empty_rows = df_test[df_test['text'].apply(lambda x: isinstance(x, str) and len(x.strip()) == 0)]
df_test.drop(empty_rows.index, inplace=True)


# Separate the dataset into three subsets based on the sentiment labels
positive_reviews = df_test[df_test['sentiment'] == sentiment_map['positive']]
negative_reviews = df_test[df_test['sentiment'] == sentiment_map['negative']]

# Shuffle each of the two subsets randomly
positive_reviews = positive_reviews.sample(frac=1, random_state=42)
negative_reviews = negative_reviews.sample(frac=1, random_state=42)

print(len(positive_reviews), len(negative_reviews))
# Divide each subset into training, validation, and test sets with a 70/20/10 ratio
train_pos, val_pos_test_pos = train_test_split(positive_reviews, test_size=0.3, random_state=42)
val_pos, test_pos = train_test_split(val_pos_test_pos, test_size=0.33, random_state=42)

train_neg, val_neg_test_neg = train_test_split(negative_reviews, test_size=0.3, random_state=42)
val_neg, test_neg = train_test_split(val_neg_test_neg, test_size=0.33, random_state=42)

# Merge the corresponding subsets from each sentiment back together to form the final training, validation, and test sets
train_set = pd.concat([train_pos, train_neg], ignore_index=True)
val_set = pd.concat([val_pos, val_neg], ignore_index=True)
test_set = pd.concat([test_pos, test_neg], ignore_index=True)

# For test

texts = test_set.text.values
labels = test_set.sentiment.values

### tokenize_and_format() is a helper function provided in helpers.py ###
input_ids, attention_masks = tokenize_and_format(texts)

# Convert the lists into tensors.
input_ids_test = torch.cat(input_ids, dim=0)
attention_masks_test = torch.cat(attention_masks, dim=0)
labels_test = torch.tensor(labels)


test_dataset=[]
for i in range(num_test):
  test_dataset.append((input_ids_test[i], attention_masks_test[i], labels_test[i]))

5039 4961


In [None]:
import torch

# Specify device
device = torch.device('cpu')

# Load model for evaluations, comment for finetuning
path = '/content/gdrive/MyDrive/NLP Project/models/ELECTRA_SA_NOISY10'
model.load_state_dict(torch.load(path+'/model_parameters.pth', map_location=device))

model.to(device)
input_ids_test = input_ids_test.to(device)
attention_masks_test = attention_masks_test.to(device)
labels_test = labels_test.to(device)

# Set model to evaluation mode
model.eval()

# Generate predictions
with torch.no_grad():
    outputs = model(input_ids=input_ids_test, attention_mask=attention_masks_test)
    logits = outputs.logits

# Apply softmax to obtain probabilities
probs = torch.softmax(logits, dim=1)
preds = torch.argmax(probs, dim=1)

# Move predictions and labels back to CPU for evaluation
preds = preds.detach().cpu().numpy()
labels_test = labels_test.cpu().numpy()

# Calculate accuracy
accuracy = (preds == labels_test).mean()
print(f'Test Accuracy: {accuracy:.4f}')

from sklearn.metrics import f1_score

f1 = f1_score(labels_test, preds, average='weighted')
print(f'F1 Score: {f1:.4f}')

Test Accuracy: 0.9122
F1 Score: 0.9122


### EVALUATION ON NOISY DATA

In [None]:
import random
random.seed(42)
random_noise = random.uniform(0.05, 0.15)
print(random_noise)

# Download data for Testing
df_test = pd.read_csv('/content/gdrive/MyDrive/NLP Project/data/IMDB Dataset.csv')
df_test = df_test.rename(columns={'review':'text'})
df_test = df_test[['text', 'sentiment']]

 
# How much of the dataset to use
data_size = 0.2
df_test = df_test.sample(frac=data_size, random_state=42)

df_test['text'] = df_test['text'].apply(clean_text)

# Convert the sentiment labels into numerical values
sentiment_map = {'positive': 0, 'negative': 1}
df_test['sentiment'] = df_test['sentiment'].replace(sentiment_map)

# Find and delete any empty rows
empty_rows = df_test[df_test['text'].apply(lambda x: isinstance(x, str) and len(x.strip()) == 0)]
df_test.drop(empty_rows.index, inplace=True)

df_test = add_noise(df_test, augmentation_percentage=random_noise, task="sentiment_analysis")

# Separate the dataset into three subsets based on the sentiment labels
positive_reviews = df_test[df_test['sentiment'] == sentiment_map['positive']]
negative_reviews = df_test[df_test['sentiment'] == sentiment_map['negative']]

# Shuffle each of the two subsets randomly
positive_reviews = positive_reviews.sample(frac=1, random_state=42)
negative_reviews = negative_reviews.sample(frac=1, random_state=42)

print(len(positive_reviews), len(negative_reviews))
# Divide each subset into training, validation, and test sets with a 70/20/10 ratio
train_pos, val_pos_test_pos = train_test_split(positive_reviews, test_size=0.3, random_state=42)
val_pos, test_pos = train_test_split(val_pos_test_pos, test_size=0.33, random_state=42)

train_neg, val_neg_test_neg = train_test_split(negative_reviews, test_size=0.3, random_state=42)
val_neg, test_neg = train_test_split(val_neg_test_neg, test_size=0.33, random_state=42)

# Merge the corresponding subsets from each sentiment back together to form the final training, validation, and test sets
train_set = pd.concat([train_pos, train_neg], ignore_index=True)
val_set = pd.concat([val_pos, val_neg], ignore_index=True)
test_set = pd.concat([test_pos, test_neg], ignore_index=True)

# For test

texts = test_set.text.values
labels = test_set.sentiment.values

### tokenize_and_format() is a helper function provided in helpers.py ###
input_ids, attention_masks = tokenize_and_format(texts)

# Convert the lists into tensors.
input_ids_test = torch.cat(input_ids, dim=0)
attention_masks_test = torch.cat(attention_masks, dim=0)
labels_test = torch.tensor(labels)


test_dataset=[]
for i in range(num_test):
  test_dataset.append((input_ids_test[i], attention_masks_test[i], labels_test[i]))


0.11394267984578837


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


5039 4961


In [None]:
import torch

# Specify device
device = torch.device('cpu')

# Load model for evaluations, comment for finetuning
path = '/content/gdrive/MyDrive/NLP Project/models/ELECTRA_SA_NOISY10'
model.load_state_dict(torch.load(path+'/model_parameters.pth', map_location=device))

model.to(device)
input_ids_test = input_ids_test.to(device)
attention_masks_test = attention_masks_test.to(device)
labels_test = labels_test.to(device)

# Set model to evaluation mode
model.eval()

# Generate predictions
with torch.no_grad():
    outputs = model(input_ids=input_ids_test, attention_mask=attention_masks_test)
    logits = outputs.logits

# Apply softmax to obtain probabilities
probs = torch.softmax(logits, dim=1)
preds = torch.argmax(probs, dim=1)

# Move predictions and labels back to CPU for evaluation
preds = preds.detach().cpu().numpy()
labels_test = labels_test.cpu().numpy()

# Calculate accuracy
accuracy = (preds == labels_test).mean()
print(f'Test Accuracy: {accuracy:.4f}')

from sklearn.metrics import f1_score

f1 = f1_score(labels_test, preds, average='weighted')
print(f'F1 Score: {f1:.4f}')

Test Accuracy: 0.8678
F1 Score: 0.8675
