In [1]:
pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets>=2.0.0 (from evaluate)
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m25.2 MB/s[0m eta [36m0:00:00[0m
Collecting dill (from evaluate)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m17.7 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from evaluate)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [9

In [6]:
"""

    Created By: Ashutosh Mishra | Parisha Desai

    Date: 14 April 2024

    Enhancement Project [Crypto]
    This script uses pretrained models, and fine-tunes a standard model, to predict whether a review from the Hugging Face IMDB dataset has a positive or a negative sentiment. GPT-2 and RoBERTa can each be evaluated with or without adversarial attacks.
    DistilBERT is likewise fine-tuned with or without adversarial attacks.

"""
import evaluate
import numpy as np
import torch
#from transformers import create_optimizer
#import tensorflow as tf
from datasets import load_dataset, DatasetDict, Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DistilBertTokenizer, DataCollatorWithPadding, TFAutoModelForSequenceClassification , create_optimizer
#from transformers import DistilBertTokenizer, DataCollatorWithPadding, TFAutoModelForSequenceClassification
#from datasets import DatasetDict, Dataset
from transformers.keras_callbacks import KerasMetricCallback
import random

def create_pertubations(texts, tokenizer):
    """
        This function creates perturbed (adversarial) copies of the input texts by tokenizing them,
        decrementing a few token ids and decoding the result back to text
        Parameters
        ----------
        arg1 : texts
            This is the list of input texts to perturb
        arg2 : tokenizer
            This is the tokenizer used to encode the texts and decode the perturbed token ids

        Returns
        -------
        A list with 5 perturbed texts per input text; the 5 variants of one text are consecutive
        and the texts keep their input order

    """
    print("Size is " + str(len(texts)))
    moded_text = []

    inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True)

    for sentence in inputs['input_ids']:

        sentenceLength = int(len(sentence))
        #Positions 3..35 are candidates, but never index past the end of a short sequence
        lastLocation = min(35, sentenceLength - 1)

        #Create 5 modified texts
        for times in range(5):
            #Work on a copy: every variant starts from the unmodified sentence and the
            #tokenizer output is not changed in place
            temp = sentence.clone()
            if lastLocation >= 3:
                for val in range(3):
                    targetLocation = random.randint(3, lastLocation)
                    #Ids 0 and 1 are left untouched (presumably special/padding ids - confirm per tokenizer)
                    if temp[targetLocation] > 1:
                        temp[targetLocation] -= 1
            moded_text.append(tokenizer.decode(temp, skip_special_tokens=True))
    print("New Size is " + str(len(moded_text)))
    return moded_text


# Function to perform sentiment analysis with batching
def batch_predict_sentiment(texts, tokenizer, model, mode, batch_size=10):
    """
        This function predicts the sentiment class of every text, working through the texts in batches
        Parameters
        ----------
        arg1 : texts
            This is the input test data for which predictions are produced
        arg2 : tokenizer
            This is the tokenizer used to turn each batch into torch tensors
        arg3 : model
            This is the model whose output logits decide the predicted class
        arg4 : mode
            When equal to True the texts are first replaced by the output of create_pertubations, otherwise they are used as given
        arg5 : batch_size
            This is the number of texts sent to the model at once (defaults to 10)

        Returns
        -------
        A list with one predicted class index per classified text

    """
    #Decide which corpus gets classified: perturbed copies or the raw texts
    if mode == True:
        corpus = create_pertubations(texts, tokenizer)
    else:
        corpus = texts

    predicted_sentiments = []

    for offset in range(0, len(corpus), batch_size):
        chunk = corpus[offset:offset + batch_size]
        encoded = tokenizer(chunk, return_tensors="pt", padding=True, truncation=True)

        #Inference only, so gradients are not tracked
        with torch.no_grad():
            result = model(**encoded)

        #The index of the largest logit is the predicted class
        predicted_sentiments += torch.argmax(result.logits, dim=1).tolist()
        print("Predicted " + str(offset) + "elements")

    return predicted_sentiments

# Calculate accuracy
def calculate_accuracy(predicted_sentiments, actual_labels):
    """
        This function calculates the accuracy of the models as a percentage
        Parameters
        ----------
        arg1 : predicted_sentiments
            This is the predicted outputs
        arg2 : actual_labels
            This is the true values

        Returns
        -------
        The share of matching prediction/label pairs, in percent of len(actual_labels).
        Returns 0.0 when there are no labels. Pairs are formed positionally; surplus
        entries of the longer sequence are ignored.

    """
    total_predictions = len(actual_labels)    #Get total predictions
    if total_predictions == 0:
        return 0.0    #Nothing to score; avoids a division by zero

    correct_predictions = sum(1 for pred, label in zip(predicted_sentiments, actual_labels) if pred == label)    #Sum correct predictions
    return correct_predictions / total_predictions * 100    #Return accuracy in percent


def predict_gpt2(test_data, mode):

    """
        This function consumes the input test data, predicts the sentiment with a GPT-2 classifier and prints the accuracy
        Parameters
        ----------
        arg1 : test_data
            This is the input test data for which predictions will pop up; it is read through its 'text' and 'label' columns
        arg2 : mode
            When True the predictions are made on perturbed (adversarial) copies of the texts instead of the original texts

    """
    print("Loading Tokenizer and Model")
    tokenizer = AutoTokenizer.from_pretrained("mnoukhov/gpt2-imdb-sentiment-classifier")  #Load the tokenizer for gpt2 trained on IMDB Dataset
    model = AutoModelForSequenceClassification.from_pretrained("mnoukhov/gpt2-imdb-sentiment-classifier") #Load the pre-trained hugging face model for gpt2

    print("Predicting in batches")
    batch_predicted_sentiments = batch_predict_sentiment(test_data['text'], tokenizer, model, mode)   #Run the predictions in batch mode to reduce load on Memory
    actual_labels = list(test_data['label'])        #Define True Labels

    #With adversarial attacks several consecutive predictions belong to each text,
    #so every label is repeated to stay aligned with the predictions
    if actual_labels and len(batch_predicted_sentiments) % len(actual_labels) == 0:
        variants_per_text = len(batch_predicted_sentiments) // len(actual_labels)
        if variants_per_text > 1:
            actual_labels = [label for label in actual_labels for _ in range(variants_per_text)]

    print("Calculating Accuracies")
    accuracy = calculate_accuracy(batch_predicted_sentiments, actual_labels)    #Calculate the Accuracy achieved
    print(f"The accuracy achieved for predictions is {accuracy:.2f}%")

def predict_roberta(test_data, mode):

    """
        This function consumes the input test data, predicts the sentiment with a RoBERTa classifier and prints the accuracy
        Parameters
        ----------
        arg1 : test_data
            This is the input test data for which predictions will pop up; it is read through its 'text' and 'label' columns
        arg2 : mode
            When True the predictions are made on perturbed (adversarial) copies of the texts instead of the original texts

    """
    print("Loading Tokenizer and Model")
    tokenizer = AutoTokenizer.from_pretrained("abhishek/autonlp-imdb-roberta-base-3662644")  #Load the tokenizer for roberta trained on IMDB Dataset
    model = AutoModelForSequenceClassification.from_pretrained("abhishek/autonlp-imdb-roberta-base-3662644") #Load the pre-trained hugging face model for roberta

    print("Predicting in batches")
    batch_predicted_sentiments = batch_predict_sentiment(test_data['text'], tokenizer, model, mode)   #Run the predictions in batch mode to reduce load on Memory
    actual_labels = list(test_data['label'])        #Define True Labels

    #With adversarial attacks several consecutive predictions belong to each text,
    #so every label is repeated to stay aligned with the predictions
    if actual_labels and len(batch_predicted_sentiments) % len(actual_labels) == 0:
        variants_per_text = len(batch_predicted_sentiments) // len(actual_labels)
        if variants_per_text > 1:
            actual_labels = [label for label in actual_labels for _ in range(variants_per_text)]

    print("Calculating Accuracies")
    accuracy = calculate_accuracy(batch_predicted_sentiments, actual_labels)    #Calculate the Accuracy achieved
    print(f"The accuracy achieved for predictions is {accuracy:.2f}%")

def create_pertubations_finetuned(data, tokenizer):
    """
        This function builds a perturbed (adversarial) dataset out of labelled rows
        Parameters
        ----------
        arg1 : data
            This is an iterable of rows, each read through its 'text' and 'label' fields
        arg2 : tokenizer
            This is the tokenizer used to encode the texts and decode the perturbed token ids

        Returns
        -------
        A Dataset with 'text' and 'label' columns holding 5 perturbed texts per input row,
        each carrying the label of the row it was derived from

    """
    moded_text = []

    texts = []
    labels = []

    for row in data:
        texts.append(row['text'])
        labels.extend([row['label']] * 5)      #One label per perturbed copy

    inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True)

    for sentence in inputs['input_ids']:
        #Positions 3..35 are candidates, but never index past the end of a short sequence
        lastLocation = min(35, int(len(sentence)) - 1)

        for times in range(5):
            #Work on a copy: every variant starts from the unmodified sentence
            temp = sentence.clone()
            if lastLocation >= 3:
                for val in range(3):
                    targetLocation = random.randint(3, lastLocation)
                    #Ids 0 and 1 are left untouched (presumably special/padding ids - confirm per tokenizer)
                    if temp[targetLocation] > 1:
                        temp[targetLocation] -= 1
            moded_text.append(tokenizer.decode(temp, skip_special_tokens=True))

    data_dict = {"text": moded_text, "label": labels}

    dataset = Dataset.from_dict(data_dict)
    return dataset


def finetuned_predict_distilbert(imdb, mode):
    """
        This function fine-tunes distilbert-base-uncased on a sample of the given data and prints the accuracy reached on a test sample
        Parameters
        ----------
        arg1 : imdb
            This is the dataset collection; its "train" and "test" splits are sampled and read through the 'text' and 'label' columns
        arg2 : mode
            When True the train, validation and test samples are replaced by perturbed (adversarial) copies before fine-tuning

    """
    print("Loading Tokenizer and Model")

    tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")     #Load the tokenizer for distilbert
    model = TFAutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)      #Load the pre-trained hugging face model for distilbert

    #Train and validation samples come from disjoint slices of the same shuffle,
    #so validation rows are never seen during training
    shuffled_train = imdb["train"].shuffle(seed=42)
    small_train_dataset = shuffled_train.select(range(3000))
    small_val_dataset = shuffled_train.select(range(3000, 3300))
    small_test_dataset = imdb["test"].shuffle(seed=42).select(range(300))

    if mode == True:
        moded_train_dataset = create_pertubations_finetuned(small_train_dataset, tokenizer)
        moded_val_dataset = create_pertubations_finetuned(small_val_dataset, tokenizer)
        moded_test_dataset = create_pertubations_finetuned(small_test_dataset, tokenizer)

        small_dataset_dict = DatasetDict({
                    'train': moded_train_dataset,
                    'validation': moded_val_dataset,
                    'test': moded_test_dataset
                })
    else:
        small_dataset_dict = DatasetDict({
                    'train': small_train_dataset,
                    'validation': small_val_dataset,
                    'test': small_test_dataset
                })

    def preprocess_function(examples):
        return tokenizer(examples["text"], truncation=True)             #Using a function to map the whole dataset across train, test and validation

    tokenized_imdb = small_dataset_dict.map(preprocess_function, batched=True)        #Mapping the data with the tokenizer
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")    #Using data collator to pad each batch
    accuracy_metric = evaluate.load("accuracy")    #Using accuracy for computing metrics

    def compute_metrics(eval_pred):
        predictions, labels = eval_pred
        predictions = np.argmax(predictions, axis=1)
        return accuracy_metric.compute(predictions=predictions, references=labels)

    #Defining important hyper parameters; the same values drive the schedule, the datasets and the training
    batch_size = 16
    num_epochs = 5
    batches_per_epoch = len(tokenized_imdb["train"]) // batch_size
    total_train_steps = int(batches_per_epoch * num_epochs)
    optimizer, schedule = create_optimizer(init_lr=2e-5, num_warmup_steps=0, num_train_steps=total_train_steps)

    #Converting to tensor-flow sets for input requirement
    tf_train_set = model.prepare_tf_dataset(
        tokenized_imdb["train"],
        shuffle=True,
        batch_size=batch_size,
        collate_fn=data_collator,
    )

    tf_validation_set = model.prepare_tf_dataset(
        tokenized_imdb["validation"],
        shuffle=False,
        batch_size=batch_size,
        collate_fn=data_collator,
    )

    tf_test_set = model.prepare_tf_dataset(
        tokenized_imdb["test"],
        shuffle=False,
        batch_size=batch_size,
        collate_fn=data_collator,
    )

    #Compiling model based on previous additions
    model.compile(optimizer=optimizer)

    #Using Keras Metrics for Call Backs and training the model for num_epochs epochs
    metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_validation_set)
    model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=num_epochs, callbacks=[metric_callback], verbose = True)

    #predicting the results
    results = model.predict(tf_test_set)

    #Fetching max values for correct labels
    predicted_labels = np.argmax(results['logits'], axis=1)

    #Get the accuracy as a percentage so that it matches the "%" in the message
    test_labels = np.asarray(small_dataset_dict['test']['label'])
    test_accuracy = np.mean(predicted_labels == test_labels) * 100
    print(f"The accuracy achieved for predictions is {test_accuracy:.2f}%")


def main():

    """
        Entry point of the script. A text menu is shown in a loop; the user picks a model, then picks
        whether to run it with or without adversarial attack, and leaves the loop with q or Q

    """
    invalid_input_message = "\n!!!!!!!!!!!!!!!Invalid Input!!!!!!!!!!!!!!!\n Please check the input choice.\n"
    options_header = '\nPlease select one of the following options : -'
    input_prompt = '\nPlease enter your input --> '

    #Menu key -> name of the function announced in the menu texts
    function_names = {'1': 'predict_gpt2', '2': 'predict_roberta', '3': 'finetuned_predict_distilbert'}

    def ask_attack_mode(function_name):
        #Second level menu: False for the plain run, True for the adversarial run, None for anything else
        print(options_header)
        print('\t\t\t\tFor using the ' + function_name + ' function please type 1')
        print('\t\t\t\tFor using the ' + function_name + ' with adversarial attack function please type 2')
        return {'1': False, '2': True}.get(input(input_prompt))

    def run_selection(selection, attack):
        #Load the data and hand it to the function that belongs to the selected menu key
        imdb = load_dataset("imdb")
        if selection == '3':
            finetuned_predict_distilbert(imdb, attack)
            return
        small_test_dataset = imdb["test"].shuffle(seed=42).select(range(200))
        if selection == '1':
            predict_gpt2(small_test_dataset, attack)
        else:
            predict_roberta(small_test_dataset, attack)

    #Welcome banner
    print('\t\t\t\t\t\t!! Welcome !!')
    print('#####################################################################################################################')
    print('This python program will use pretrained models and predict if a comment from test dataset is positive or negative ')
    print('#####################################################################################################################\n')

    instance_count = 1      #Number of the menu round currently shown

    while True:
        print('\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tInstance --> ' + str(instance_count))
        print(options_header)

        for key, function_name in function_names.items():
            print('\t\t\tFor using the ' + function_name + ' function please type ' + key)
        print('\t\t\tFor quitting the program please type q or Q')

        choice = input(input_prompt)

        if choice in function_names:
            attack = ask_attack_mode(function_names[choice])
            if attack is None:
                print(invalid_input_message)
            else:
                run_selection(choice, attack)
        elif choice in ('q', 'Q'):
            print("Thankyou for trying out the program.\n You have tried out this program for " + str(instance_count) + " time/s. \nHave a nice day!!\n")
            break
        else:
            print(invalid_input_message)

        instance_count += 1




# Start the interactive menu only when this code runs as the main module (not when it is imported)
if __name__ == '__main__':
    main()


						!! Welcome !!
#####################################################################################################################
This python program will use pretrained models and predict if a comment from test dataset is positive or negative 
#####################################################################################################################

																		Instance --> 1

Please select one of the following options : -
			For using the predict_gpt2 function please type 1
			For using the predict_roberta function please type 2
			For using the finetuned_predict_distilbert function please type 3
			For quitting the program please type q or Q

Please enter your input --> 1

Please select one of the following options : -
				For using the predict_gpt2 function please type 1
				For using the predict_gpt2 with adversarial attack function please type 2

Please enter your input --> 1
Loading Tokenizer and Model
Predicting in batches
Predicted 0elements
Predicted 

tokenizer_config.json:   0%|          | 0.00/344 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/898 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1])
512
tensor([    0,   713,    16,     6,   396,    10,  2980,     6,     5,   144,
        14598,  1569,    38,   348,   655,   450,     4, 29945,     6,   114,
            5,  6644,     9,    42,  1569,    32,   655,  2967,     6,    51,
          581,   342,  1669,   101,  2488,  1653,  5460,    66,     9,    10,
          633,     4, 18512,    22, 20907,    12,   673,   113,  3422,   328,
        28305,   162,     6,    47,   351,    75,  9917,    24,   328,     2,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1, 

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should 

KeyError: "Column train not in the dataset. Current columns in the dataset: ['text', 'label']"