In [187]:
# Mount Google Drive at /content/gdrive; the dataset and the fine-tuned
# checkpoints read later in this notebook live under that mount point.
from google.colab import drive

drive.mount("/content/gdrive")

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [190]:
!pip install transformers
!pip install -U -q PyDrive
!pip install 
!pip install keras
!pip install sentencepiece
!pip install nlpaug

In [191]:
# Import required libraries
import torch
import pandas as pd
import numpy as np
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from transformers import XLNetTokenizer, XLNetForSequenceClassification, ElectraForSequenceClassification
from transformers import T5Tokenizer, T5ForConditionalGeneration, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, f1_score
import random
from tqdm.auto import tqdm
import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw
import sys
import argparse
import textwrap
from sklearn import metrics
from sklearn.model_selection import train_test_split
import re

In [192]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
# Authenticate the Colab user and create the PyDrive client from the
# application-default credentials.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
# NOTE: this rebinds `drive`, which the first cell imported from google.colab;
# from here on `drive` is the PyDrive client, not the Colab mount module.
drive = GoogleDrive(gauth)
print('success!')

import os
import zipfile

# Download the helper-functions file from Drive (by file id) and save it as
# helpers.py in the working directory so later cells can `from helpers import ...`.
helper_file = drive.CreateFile({'id': '16HW-z9Y1tM3gZ_vFpJAuwUDohz91Aac-'})
helper_file.GetContentFile('helpers.py')
print('helper file downloaded! (helpers.py)')

success!
helper file downloaded! (helpers.py)


## Noise Function

In [240]:
# Noise funcs 

# Edit operations that may be requested from the random character-level augmenter.
char_action = ['insert', 'substitute', 'delete', 'swap']

# Edit operations that may be requested from the random word-level augmenter.
word_action = ['substitute', 'delete', 'swap']


def get_action(type):
  """Pick a random edit operation for the requested augmentation level.

  Args:
      type (str): "char" to draw from ``char_action``, "word" to draw from
          ``word_action``.

  Returns:
      str or None: One randomly chosen action name, or ``None`` when ``type``
      is neither "char" nor "word".
  """
  if type == "char":
    candidates = char_action
  elif type == "word":
    candidates = word_action
  else:
    # Unknown level: same implicit None the caller has always received.
    return None
  return random.choice(candidates)


def augment_tweet(tweet, p=0.7):
    """
    Apply one randomly chosen nlpaug augmenter to a piece of text.

    With probability ``p`` a character-level augmenter is used (OcrAug,
    KeyboardAug or RandomCharAug); otherwise a word-level one (SpellingAug,
    SplitAug, SynonymAug or RandomWordAug).

    Only the augmenter that is actually selected gets constructed. Building
    all seven on every call and discarding six of them is wasted work when
    this runs once per sampled row.

    Args:
        tweet (str): The original text.
        p (float): Probability of choosing a character-level augmenter.

    Returns:
        str: The augmented text, or ``tweet`` unchanged if the augmenter
        returned nothing.
    """
    # Draw both random actions first, in the original order, so this function
    # consumes values from `random` in the same sequence as before.
    char_op = get_action("char")
    word_op = get_action("word")

    # Zero-argument factories: nothing is instantiated until one is chosen.
    char_factories = [
        lambda: nac.OcrAug(),
        lambda: nac.KeyboardAug(aug_char_p=0.2, aug_word_p=0.2, include_special_char=False),
        lambda: nac.RandomCharAug(action=char_op, aug_char_p=0.2, aug_word_p=0.1),
    ]
    word_factories = [
        lambda: naw.SpellingAug(),
        lambda: naw.SplitAug(),
        lambda: naw.SynonymAug(),
        lambda: naw.RandomWordAug(aug_p=0.2, action=word_op),
    ]

    # Character-level with probability p, word-level otherwise.
    factories = char_factories if random.random() < p else word_factories
    aug = random.choice(factories)()
    augmented_tweet = aug.augment(tweet)

    # NOTE(review): recent nlpaug releases appear to return a list of strings
    # (possibly empty for input they refuse to augment), older ones a bare
    # string - confirm against the installed version. Both shapes are handled
    # so `[0]` can neither raise on an empty list nor return a single character.
    if isinstance(augmented_tweet, str):
        return augmented_tweet
    if len(augmented_tweet) == 0:
        return tweet
    return augmented_tweet[0]

def add_noise(df, augmentation_percentage, task, random_state=None):
  """Replace the text of a random fraction of rows with an augmented version.

  The frame is modified IN PLACE and also returned, so
  ``df = add_noise(df, ...)`` and a bare ``add_noise(df, ...)`` both work.

  Args:
      df (pandas.DataFrame): Frame with a 'text' column.
      augmentation_percentage (float): Fraction of rows (0..1) to augment;
          passed to ``DataFrame.sample(frac=...)``.
      task (str): Only "sentiment_analysis" triggers augmentation; for any
          other value the frame is returned untouched.
      random_state: Optional seed forwarded to ``DataFrame.sample`` so the
          choice of rows is reproducible. The default ``None`` keeps the
          previous behaviour (pandas falls back to NumPy's global RNG).

  Returns:
      pandas.DataFrame: The same ``df`` object.
  """
  if task != "sentiment_analysis":
    # No noise recipe is defined for other tasks.
    return df

  # Choose which rows to augment: a fraction `augmentation_percentage` of them.
  augment_indices = df.sample(frac=augmentation_percentage, random_state=random_state).index

  # Overwrite each sampled row's text with its augmented version.
  for index in augment_indices:
    df.loc[index, 'text'] = augment_tweet(df.loc[index, 'text'])

  return df

## Data Preprocessing

In [194]:
import re
import string
import nltk
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('stopwords')

# Lazily built cache of NLTK's English stop-word set (see _get_stop_words).
_STOP_WORDS = None


def _get_stop_words():
    """Return the English stop-word set, reading it from NLTK only on first use."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        _STOP_WORDS = set(stopwords.words('english'))
    return _STOP_WORDS


def clean_text(text):
    """Normalise a review: lowercase, strip punctuation and digits, drop stop words.

    The stop-word set is fetched through a cache instead of being rebuilt from
    the NLTK corpus on every call; this function is applied once per review.

    Args:
        text (str): Raw text.

    Returns:
        str: The remaining tokens joined by single spaces.
    """
    # Convert all text to lowercase
    text = text.lower()

    # Remove punctuation (characters are deleted, not replaced by a space)
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Remove numbers
    text = re.sub(r'\d+', '', text)

    # Remove stopwords
    stop_words = _get_stop_words()
    tokens = nltk.word_tokenize(text)
    tokens = [token for token in tokens if token not in stop_words]
    text = ' '.join(tokens)

    # Remove extra whitespaces
    text = re.sub(' +', ' ', text)

    return text

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [195]:
# Seed the RNGs so the run is repeatable. `random` drives the noise fraction
# drawn below; NumPy's global RNG is what `DataFrame.sample` falls back to when
# it is called without `random_state`, as add_noise() does.
random.seed(42)
np.random.seed(42)
random_noise = random.uniform(0.05, 0.15)
print(random_noise)

# Load the data for Error Analysis
df_error = pd.read_csv('/content/gdrive/MyDrive/NLP Project/data/IMDB Dataset.csv')
df_error = df_error.rename(columns={'review':'text'})
df_error = df_error[['text', 'sentiment']]

# How much of the dataset to use
data_size = 0.2
# Subsample BEFORE cleaning: the seeded sample picks rows by position, so the
# selected rows are the same either way, but clean_text now runs only on the
# rows that are kept instead of on the whole dataset.
df_error = df_error.sample(frac=data_size, random_state=42)
df_error = df_error.assign(text=df_error['text'].apply(clean_text))

# Convert the sentiment labels into numerical values
sentiment_map = {'positive': 0, 'negative': 1}
df_error['sentiment'] = df_error['sentiment'].replace(sentiment_map)

# Find and delete any rows whose text is empty after cleaning
empty_rows = df_error[df_error['text'].apply(lambda x: isinstance(x, str) and len(x.strip()) == 0)]
df_error = df_error.drop(empty_rows.index)

# Add augmentation noise to a random fraction (`random_noise`) of the reviews
df_error = add_noise(df_error, augmentation_percentage=random_noise, task="sentiment_analysis")

# Separate the dataset into two subsets based on the sentiment labels
positive_reviews = df_error[df_error['sentiment'] == sentiment_map['positive']]
negative_reviews = df_error[df_error['sentiment'] == sentiment_map['negative']]


# Divide each subset into training, validation, and test sets with a 70/20/10 ratio
# (30% is held out, then 33% of that hold-out becomes the test set)
train_pos, val_pos_test_pos = train_test_split(positive_reviews, test_size=0.3, random_state=42)
val_pos, test_pos = train_test_split(val_pos_test_pos, test_size=0.33, random_state=42)

train_neg, val_neg_test_neg = train_test_split(negative_reviews, test_size=0.3, random_state=42)
val_neg, test_neg = train_test_split(val_neg_test_neg, test_size=0.33, random_state=42)

# Merge the per-class test splits and shuffle them
test_set = pd.concat([test_pos, test_neg], ignore_index=True)
test_set = test_set.sample(frac=1, random_state=42)

# Arrays consumed by every evaluation cell below
texts = test_set.text.values
labels = test_set.sentiment.values

0.11394267984578837


### Model loading

In [196]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def _load_finetuned(model_cls, base_checkpoint, path, **from_pretrained_kwargs):
    """Build a model from a pretrained checkpoint and load fine-tuned weights into it.

    Args:
        model_cls: A transformers model class exposing ``from_pretrained``.
        base_checkpoint (str): Name of the pretrained checkpoint that defines
            the architecture.
        path (str): Directory containing ``model_parameters.pth``.
        **from_pretrained_kwargs: Extra arguments for ``from_pretrained``
            (e.g. ``num_labels``).

    Returns:
        The model, placed on ``device``, with the fine-tuned state dict loaded.
    """
    model = model_cls.from_pretrained(base_checkpoint, **from_pretrained_kwargs).to(device)
    # map_location remaps the stored tensors onto `device`, so a checkpoint
    # saved on a GPU still loads when this notebook falls back to CPU.
    state_dict = torch.load(path+'/model_parameters.pth', map_location=device)
    # load_state_dict is strict by default: a key mismatch raises here.
    model.load_state_dict(state_dict)
    return model


# BERT
path = '/content/gdrive/MyDrive/NLP Project/models/BERT_SA_clean100'
bert_clean = _load_finetuned(BertForSequenceClassification, 'bert-base-uncased', path, num_labels=2)

path = '/content/gdrive/MyDrive/NLP Project/models/BERT_SA_noisy10_100'
bert_noisy = _load_finetuned(BertForSequenceClassification, 'bert-base-uncased', path, num_labels=2)


# ELECTRA
path = '/content/gdrive/MyDrive/NLP Project/models/ELECTRA_SA_CLEAN100'
electra_clean = _load_finetuned(ElectraForSequenceClassification, 'google/electra-base-discriminator', path, num_labels=2)

path = '/content/gdrive/MyDrive/NLP Project/models/ELECTRA_SA_NOISY10'
electra_noisy = _load_finetuned(ElectraForSequenceClassification, 'google/electra-base-discriminator', path, num_labels=2)


# T5
path = '/content/gdrive/MyDrive/NLP Project/models/T5_SA_clean20'
t5_clean = _load_finetuned(T5ForConditionalGeneration, 't5-small', path)

path = '/content/gdrive/MyDrive/NLP Project/models/T5_SA_noise10'
t5_noisy = _load_finetuned(T5ForConditionalGeneration, 't5-small', path)


# XLNET
path = '/content/gdrive/MyDrive/NLP Project/models/XLNet_SA_clean100'
xlnet_clean = _load_finetuned(XLNetForSequenceClassification, "xlnet-base-cased", path, num_labels=2)

path = '/content/gdrive/MyDrive/NLP Project/models/XLNet_SA_Noisy15'
xlnet_noisy = _load_finetuned(XLNetForSequenceClassification, "xlnet-base-cased", path, num_labels=2)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

<All keys matched successfully>

### BERT

In [197]:
from helpers import tokenize_and_format, flat_accuracy

# Tokenize the test reviews with the helper shipped in helpers.py.
input_ids, attention_masks = tokenize_and_format(texts)

# Concatenate the helper's outputs along dim 0, turn the labels into a tensor,
# and move all three onto the device the models live on.
input_ids_test = torch.cat(input_ids, dim=0).to(device)
attention_masks_test = torch.cat(attention_masks, dim=0).to(device)
labels_test = torch.tensor(labels).to(device)

#### BERT CLEAN


In [198]:
# Evaluation mode, no gradient tracking: run the whole test set through BERT (clean).
bert_clean.eval()
with torch.no_grad():
    outputs = bert_clean(input_ids=input_ids_test, attention_mask=attention_masks_test)
    logits = outputs.logits

# Class probabilities, then the most likely class index per review
probs = torch.softmax(logits, dim=1)
predicted_label = torch.argmax(probs, dim=1).tolist()

# Class index -> name (0: Positive, 1: Negative)
sentiment_classes = ['Positive', 'Negative']
pred_sent_bert_clean = [sentiment_classes[label] for label in predicted_label]

# Collect the text of every review whose prediction disagrees with its gold label.
wrong_texts_bert_clean = [
    texts[i]
    for i in range(len(pred_sent_bert_clean))
    if pred_sent_bert_clean[i] != sentiment_classes[labels_test[i]]
]

print("Number of examples that were predicted wrongly by BERT Clean in test set:", len(wrong_texts_bert_clean))


Number of examples that were predicted wrongly by BERT Clean in test set: 137


#### BERT NOISY

In [199]:
# Evaluation mode, no gradient tracking: run the whole test set through BERT (noisy).
bert_noisy.eval()
with torch.no_grad():
    outputs = bert_noisy(input_ids=input_ids_test, attention_mask=attention_masks_test)
    logits = outputs.logits

# Class probabilities, then the most likely class index per review
probs = torch.softmax(logits, dim=1)
predicted_label = torch.argmax(probs, dim=1).tolist()

# Class index -> name (0: Positive, 1: Negative)
sentiment_classes = ['Positive', 'Negative']
pred_sent_bert_noisy = [sentiment_classes[label] for label in predicted_label]

# Collect the text of every review whose prediction matches its gold label.
correct_texts_bert_noisy = [
    texts[i]
    for i in range(len(pred_sent_bert_noisy))
    if pred_sent_bert_noisy[i] == sentiment_classes[labels_test[i]]
]

print("Number of examples that were predicted correctly by BERT 10 in test set:", len(correct_texts_bert_noisy))


Number of examples that were predicted correctly by BERT 10 in test set: 884


In [208]:
# Reviews the clean BERT model got wrong AND the noisy BERT model got right.
# A set makes each membership test O(1) instead of a scan of the whole list.
correct_by_noisy = set(correct_texts_bert_noisy)
wrong_revs_bert_clean = [review for review in wrong_texts_bert_clean if review in correct_by_noisy]
# Print the results
print("Reviews that BERT clean got wrong:")
# The loop variable must not be called `string`: that would rebind the stdlib
# `string` module that clean_text() uses, breaking any later call to it.
for cnt, review_text in enumerate(wrong_revs_bert_clean, start=1):
    print(f"Review {cnt}: {review_text}")

Reviews that BERT clean got wrong:
Review 1: havent read eoIrigiynal short story know literary points went wrong im going go path herebr br cti8me ago learnt stephen king movies simply horror films perhaps couple exceptions one started well enough im going complain acting although fred gwynne usual wonderful also forgive total lack parenting HsTkills necessary make story m3ov4e forwardbr br one consistent point couldnt help get andnpoyAed came pretty close end movie least characters partook activity dumb stupidity moments refer thus tiny zombie running around house suspect bed br br get close bed blindly raising duvet cover exposing pretty much whole body whatever damage teeny undead cannibal might inflict br br b move lUittlje _aw0ay bed cpeqer completely open end position slightly increased safety least see mini terror coming g3ivin#g little reaction timebr br know lets go feel like offering slaughter today blehbr br fun enough film though Bsca(ry
Review 2: ed wood movie angora love 

EXAMPLE - 31630

I think that this movie was reasonbaly good. It's kinda weird that now the Olsen twins are 13 and have boyfriends and all. I enjoyed them alot when they were little kids on Full House. Anyway, the casting was good and the movie was somewhat funny. I kind of got mixed up between all the switching places and their names. It's just kind of an older version of It Takes Two.

Noisy - I think that thi8 m0vie was reasonbaly good. It ' s kinda weird that now the Olsen twins are 13 and have boyfriends and all. 1 enjoyed them alot when they weke little kids on Full House. Anyway, the casting was good and the movie was somewhat funny. I kind of got mixed up 6etween all the switching places and their names. It ' s just kind of an older version of 1t Takes Twu.

In [236]:
# Illustrate the noise function on one raw review. augment_tweet picks its
# augmenter at random, so the displayed result changes from run to run.
review = "I think that this movie was reasonbaly good. It's kinda weird that now the Olsen twins are 13 and have boyfriends and all. I enjoyed them alot when they were little kids on Full House. Anyway, the casting was good and the movie was somewhat funny. I kind of got mixed up between all the switching places and their names. It's just kind of an older version of It Takes Two."
review = augment_tweet(review)
review

"I think that thi8 m0vie was reasonbaly good. It ' s kinda weird that now the Olsen twins are 13 and have boyfriends and all. 1 enjoyed them alot when they weke little kids on Full House. Anyway, the casting was good and the movie was somewhat funny. I kind of got mixed up 6etween all the switching places and their names. It ' s just kind of an older version of 1t Takes Twu."

## ELECTRA

In [206]:
from helpers import tokenize_and_format, flat_accuracy

# Same helper call as in the BERT section.
# NOTE(review): helpers.py is not visible here - confirm the tokenizer it uses
# is also the right one for the ELECTRA models.
input_ids, attention_masks = tokenize_and_format(texts)

# Concatenate along dim 0 and move onto the models' device in one step each.
input_ids_test, attention_masks_test = (
    torch.cat(parts, dim=0).to(device) for parts in (input_ids, attention_masks)
)
labels_test = torch.tensor(labels).to(device)

### CLEAN

In [210]:
# Evaluation mode, no gradient tracking: run the whole test set through ELECTRA (clean).
electra_clean.eval()
with torch.no_grad():
    outputs = electra_clean(input_ids=input_ids_test, attention_mask=attention_masks_test)
    logits = outputs.logits

# Class probabilities, then the most likely class index per review
probs = torch.softmax(logits, dim=1)
predicted_label = torch.argmax(probs, dim=1).tolist()

# Class index -> name (0: Positive, 1: Negative)
sentiment_classes = ['Positive', 'Negative']
pred_sent_electra_clean = [sentiment_classes[label] for label in predicted_label]

# Collect the text of every review whose prediction disagrees with its gold label.
wrong_texts_electra_clean = [
    texts[i]
    for i in range(len(pred_sent_electra_clean))
    if pred_sent_electra_clean[i] != sentiment_classes[labels_test[i]]
]

print("Number of examples that were predicted wrongly by ELECTRA CLEAN in test set:", len(wrong_texts_electra_clean))


Number of examples that were predicted wrongly by ELECTRA CLEAN in test set: 120


### NOISY

In [211]:
# Evaluation mode, no gradient tracking: run the whole test set through ELECTRA (noisy).
electra_noisy.eval()
with torch.no_grad():
    outputs = electra_noisy(input_ids=input_ids_test, attention_mask=attention_masks_test)
    logits = outputs.logits

# Class probabilities, then the most likely class index per review
probs = torch.softmax(logits, dim=1)
predicted_label = torch.argmax(probs, dim=1).tolist()

# Class index -> name (0: Positive, 1: Negative)
sentiment_classes = ['Positive', 'Negative']
pred_sent_electra_noisy = [sentiment_classes[label] for label in predicted_label]

# Collect the text of every review whose prediction matches its gold label.
correct_texts_electra_noisy = [
    texts[i]
    for i in range(len(pred_sent_electra_noisy))
    if pred_sent_electra_noisy[i] == sentiment_classes[labels_test[i]]
]

print("Number of examples that were predicted correctly by ELECTRA 10 in test set:", len(correct_texts_electra_noisy))


Number of examples that were predicted correctly by ELECTRA 10 in test set: 886


In [212]:
# Reviews the clean ELECTRA model got wrong AND the noisy ELECTRA model got right.
# A set makes each membership test O(1) instead of a scan of the whole list.
correct_by_noisy = set(correct_texts_electra_noisy)
wrong_revs_electra_clean = [review for review in wrong_texts_electra_clean if review in correct_by_noisy]
# Print the results
print("Reviews that ELECTRA clean got wrong:")
# The loop variable must not be called `string`: that would rebind the stdlib
# `string` module that clean_text() uses, breaking any later call to it.
for cnt, review_text in enumerate(wrong_revs_electra_clean, start=1):
    print(f"Review {cnt}: {review_text}")

Reviews that ELECTRA clean got wrong:
Review 1: viscontis first feature ossessione adaptation james cains postman always rings twice im familiar book film versions big fan cains double indemnity much fan billy wilders film version fact two novellas seem like must similar involve illicit love affair ravenous wife complains morally weak man husband worthless mean giovanna woman italian version played well clara calamai evil incarnate like wife double indemnity seems spoiled husband great performance juan de landa bit cruel strikes like least uncompromising hes older unattractive shes rather fickle gino shows young muscular man takes five minutes get bed sweats wants forever shes stuck husband break first meet apparently although intentionally vague plan murder husband successful move back womans home town run bar husband owned gino unenthusiastic idea wants giovanna one thing certainly doesnt want sit around one place rest life relationship quickly crumbles ossessione complex film comple

EXAMPLE - 34312

This version of Anna Christie is in German. Greta Garbo again plays Anna Christie, but all of the other characters have different actors from the English version. Both were filmed back to back because Garbo had such a following in Germany. Garbo herself supposedly favored her Anna Christie in this version over the English version. It's a good tale and a must-see for Garbo fans.

Noisy - This version of A nna Christie is in German. Greta Garbo again plays Anna Chris tie, but all of the other characters h ave different actors from the English versi on. Both w ere filmed back to back because Gar bo had such a following in Germany. Garbo he rself supposedly favored her A nna Chris tie in this version over the English version. It ' s a good t ale and a must - see for Garbo fans.

In [235]:
# Illustrate the noise function on one raw review. augment_tweet picks its
# augmenter at random, so the printed result changes from run to run.
review = "This version of Anna Christie is in German. Greta Garbo again plays Anna Christie, but all of the other characters have different actors from the English version. Both were filmed back to back because Garbo had such a following in Germany. Garbo herself supposedly favored her Anna Christie in this version over the English version. It's a good tale and a must-see for Garbo fans."
print(augment_tweet(review))

This version of A nna Christie is in German. Greta Garbo again plays Anna Chris tie, but all of the other characters h ave different actors from the English versi on. Both w ere filmed back to back because Gar bo had such a following in Germany. Garbo he rself supposedly favored her A nna Chris tie in this version over the English version. It ' s a good t ale and a must - see for Garbo fans.


## T5

In [213]:
# Tokenizer used by the T5 evaluation cells that follow.
# NOTE(review): this loads the 't5-base' tokenizer while both T5 models in the
# model-loading cell are built from 't5-small' - confirm it matches the
# tokenizer used when the checkpoints were fine-tuned.
tokenizer = T5Tokenizer.from_pretrained('t5-base')

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


#### Clean

In [221]:
t5_clean.eval()

# Generate a sentiment string for every review, one review at a time.
predicted_sentiments = []
for review in texts:
  prompt = "sentiment: " + review
  inputs = tokenizer.encode(prompt, return_tensors="pt").to(device)

  # Generation only - no gradients needed
  with torch.no_grad():
      outputs = t5_clean.generate(inputs, max_length=2)

  # Turn the generated ids back into text
  predicted_sentiment = tokenizer.decode(outputs[0], skip_special_tokens=True)
  predicted_sentiments.append(predicted_sentiment)

# Integer label -> the word the generated text is compared against
sentiment_map = {0:'positive', 1:'negative'}

# Collect the text of every review whose generated word differs from its label.
wrong_texts_t5_clean = [
    text
    for text, guess, gold in zip(texts, predicted_sentiments, labels)
    if guess != sentiment_map[gold]
]

print("Number of examples that were predicted wrongly by T5 clean in test set:", len(wrong_texts_t5_clean))



Number of examples that were predicted wrongly by T5 clean in test set: 389


In [222]:
t5_noisy.eval()

# Generate a sentiment string for every review, one review at a time.
predicted_sentiments = []
for review in texts:
  prompt = "sentiment: " + review
  inputs = tokenizer.encode(prompt, return_tensors="pt").to(device)

  # Generation only - no gradients needed
  with torch.no_grad():
      outputs = t5_noisy.generate(inputs, max_length=2)

  # Turn the generated ids back into text
  predicted_sentiment = tokenizer.decode(outputs[0], skip_special_tokens=True)
  predicted_sentiments.append(predicted_sentiment)

# Integer label -> the word the generated text is compared against
sentiment_map = {0:'positive', 1:'negative'}

# Collect the text of every review whose generated word equals its label.
correct_texts_t5_noisy = [
    text
    for text, guess, gold in zip(texts, predicted_sentiments, labels)
    if guess == sentiment_map[gold]
]

print("Number of examples that were predicted correctly by T5 Noisy in test set:", len(correct_texts_t5_noisy))

Number of examples that were predicted correctly by T5 Noisy in test set: 571


In [223]:
# Reviews the clean T5 model got wrong AND the noisy T5 model got right.
# A set makes each membership test O(1) instead of a scan of the whole list.
correct_by_noisy = set(correct_texts_t5_noisy)
wrong_revs_t5_clean = [review for review in wrong_texts_t5_clean if review in correct_by_noisy]
# Print the results
print("Reviews that T5 clean got wrong:")
# The loop variable must not be called `string`: that would rebind the stdlib
# `string` module that clean_text() uses, breaking any later call to it.
for cnt, review_text in enumerate(wrong_revs_t5_clean, start=1):
    print(f"Review {cnt}: {review_text}")

Reviews that T5 clean got wrong:
Review 1: princess tam tam without trappings racism way think racism united states subtle american viewer assertions ethnic identity time pay attention alwinas baker placement within shots addressed characters settings around depict savage african ask alwina shred agency throughout film dont want ruin anything end pay careful attention dichotomy eastern western culture say least offensive diction thankfully disavowed days french checkered past imperial force throughout areas depicted see chris markers les statues meurent aussi pay attention places european travelers visit africa reflect attitudes towards give film sucker baker much professional career like princes tam tam regressive certainly overshadowed efforts towards integration work freaking spy gushing sorry however film captivating performance besides telling relic bygone mentalities
Review 2: rate e br br never actually owned nintendo spiel one many timesin opinion along conkers bad fur clarence

Example - 14561

This movie got off to an interesting start. Down the road however, the story gets convoluted with a poor illustration of ancient black magic rituals. The male lead was very good , even though he gets the worst end of the stick in the climax. In comparison, this is "Boomerang" meets "Extremities".

Noisy - This movie got off to an interesting sta5t. Down the road however, the stKry gets cinvolKted with a poor illustration of ancient bKack magis riFuaIs. The male lead was verH gooe, even tUouRh he gets the worst end of the stick in the climax. In comparison, this is " Booherwng " meets " Extremities ".

In [246]:
# Illustrate the noise function on one raw review. augment_tweet picks its
# augmenter at random, so the displayed result changes from run to run.
review = "This movie got off to an interesting start. Down the road however, the story gets convoluted with a poor illustration of ancient black magic rituals. The male lead was very good , even though he gets the worst end of the stick in the climax. In comparison, this is \"Boomerang\" meets \"Extremities\"."
augment_tweet(review)

'This movie got off to an interesting sta5t. Down the road however, the stKry gets cinvolKted with a poor illustration of ancient bKack magis riFuaIs. The male lead was verH gooe, even tUouRh he gets the worst end of the stick in the climax. In comparison, this is " Booherwng " meets " Extremities ".'

## XLNET

#### Clean

In [224]:
from transformers import XLNetTokenizer

xlnet_clean.eval()

# Tokenizer for the XLNet base checkpoint (rebinds `tokenizer`, previously the T5 one)
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')

# Every review is truncated / padded to exactly 128 tokens
encode_options = dict(
    add_special_tokens=True,
    truncation=True,
    max_length=128,
    padding='max_length',
    return_tensors='pt',
)

# Class index -> name
sentiment_classes = ['positive', 'negative']

predicted_sentiments = []
for review in texts:
    inputs = tokenizer.encode_plus(review, **encode_options)
    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)

    # Forward pass without gradient tracking
    with torch.no_grad():
        outputs = xlnet_clean(input_ids=input_ids, attention_mask=attention_mask)
    logits = outputs.logits
    predicted_label = torch.argmax(logits, dim=1).item()

    predicted_sentiment = sentiment_classes[predicted_label]
    predicted_sentiments.append(predicted_sentiment)

# Integer label -> class name, for comparison with the predictions
sentiment_map = {0:'positive', 1:'negative'}

# Collect the text of every review whose prediction differs from its label.
wrong_texts_xlnet_clean = [
    text
    for text, guess, gold in zip(texts, predicted_sentiments, labels)
    if guess != sentiment_map[gold]
]

print("Number of examples that were predicted wrongly by XLNET Clean in test set:", len(wrong_texts_xlnet_clean))

Number of examples that were predicted wrongly by XLNET Clean in test set: 66


#### Noisy

In [227]:
from transformers import XLNetTokenizer

xlnet_noisy.eval()

# NOTE: `tokenizer` is not created in this cell; it is whatever an earlier
# cell bound (the XLNet-clean cell loads the 'xlnet-base-cased' tokenizer).

# Every review is truncated / padded to exactly 128 tokens
encode_options = dict(
    add_special_tokens=True,
    truncation=True,
    max_length=128,
    padding='max_length',
    return_tensors='pt',
)

# Class index -> name
sentiment_classes = ['positive', 'negative']

predicted_sentiments = []
for review in texts:
    inputs = tokenizer.encode_plus(review, **encode_options)
    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)

    # Forward pass without gradient tracking
    with torch.no_grad():
        outputs = xlnet_noisy(input_ids=input_ids, attention_mask=attention_mask)
    logits = outputs.logits
    predicted_label = torch.argmax(logits, dim=1).item()

    predicted_sentiment = sentiment_classes[predicted_label]
    predicted_sentiments.append(predicted_sentiment)

# Integer label -> class name, for comparison with the predictions
sentiment_map = {0:'positive', 1:'negative'}

# Collect the text of every review whose prediction equals its label.
correct_texts_xlnet_noisy = [
    text
    for text, guess, gold in zip(texts, predicted_sentiments, labels)
    if guess == sentiment_map[gold]
]

print("Number of examples that were predicted correctly by XLNET Noisy in test set:", len(correct_texts_xlnet_noisy))

Number of examples that were predicted correctly by XLNET Noisy in test set: 914


In [228]:
# Reviews the clean XLNet model got wrong AND the noisy XLNet model got right.
# A set makes each membership test O(1) instead of a scan of the whole list.
correct_by_noisy = set(correct_texts_xlnet_noisy)
wrong_revs_xlnet_clean = [review for review in wrong_texts_xlnet_clean if review in correct_by_noisy]
# Print the results
print("Reviews that XLNET clean got wrong:")
# The loop variable must not be called `string`: that would rebind the stdlib
# `string` module that clean_text() uses, breaking any later call to it.
for cnt, review_text in enumerate(wrong_revs_xlnet_clean, start=1):
    print(f"Review {cnt}: {review_text}")

Reviews that XLNET clean got wrong:
Review 1: cant take movie seriouslythe plot predictable trite acting often top dialog laughable adds great fun three career girls late find way big city evils temptations mothers probably warned married men alcohol premarital sex abortion etcbr br theres amanda farrell joan crawford succeed professionally whose personal life sacrificed office name doorbr br movie may believable years ago great campy fun rentbuy enjoy
Review 2: creator donnie darko brings twilight zone themed tale oddest fashion film centers middle aged young couple living paycheck paycheck one day mysterious box appears red button later day spooky gentleman shows tells choice press button receive million dollars someone dont know die disturbing provocative question suspensefully outlined trailer tv spots let known dont know see times pretentious bit melodramatic film ultimately effective good performances intriguing subject matter would unfair ruin plot twists lets say film deliver a

EXAMPLE - 11763

This is a movie about how men think women think about love. No woman describes a one-night sexual encounter and declares it a love story. Of the ten monologues I felt only three really had any kind of truth ring through them. I kept waiting for the film to get better, and it did a bit, but never better enough. This is an interesting concept, and I kept wanting it to be good, but it never succeeded. Maybe if they actually WERE love stories it would have worked.

Noisy - This is a movie abkut how men think women think about love. No woman describes a one - night sexual encounter and declares it a love story. Of the ten monologI3s I felt only three really had any kjnd of truth ring through them. I k3pt waiting for the fUlm to get better, and it did a bit, but never hetfer enough. TUis is an interesting concept, and I kept wanting it to be good, but it never succeeded. Mahbe if they actuSllj WERE lKve stories it would have worked.

In [249]:
# Illustrate the noise function on one raw review. augment_tweet picks its
# augmenter at random, so the displayed result changes from run to run.
review = "This is a movie about how men think women think about love. No woman describes a one-night sexual encounter and declares it a love story. Of the ten monologues I felt only three really had any kind of truth ring through them. I kept waiting for the film to get better, and it did a bit, but never better enough. This is an interesting concept, and I kept wanting it to be good, but it never succeeded. Maybe if they actually WERE love stories it would have worked."
augment_tweet(review)

'This is a movie abkut how men think women think about love. No woman describes a one - night sexual encounter and declares it a love story. Of the ten monologI3s I felt only three really had any kjnd of truth ring through them. I k3pt waiting for the fUlm to get better, and it did a bit, but never hetfer enough. TUis is an interesting concept, and I kept wanting it to be good, but it never succeeded. Mahbe if they actuSllj WERE lKve stories it would have worked.'