In [1]:
import matplotlib.pyplot as plt   # Import the matplotlib.pyplot library for data visualization
import seaborn as sns              # Import the seaborn library for data visualization
import pandas as pd                # Import the pandas library for data manipulation and analysis

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix  # Import scikit-learn metrics

!pip install transformers        # Install the transformers package
!pip install -U torchtext==0.6.0 # Install a specific version of the torchtext package
!pip install pytorch-pretrained-bert pytorch-nlp # Install the pytorch-pretrained-bert and pytorch-nlp packages

import torch                      # Import PyTorch library

from transformers import RobertaTokenizer, RobertaModel, AdamW, get_linear_schedule_with_warmup  # Import the necessary classes from the transformers package

import warnings                   # Import the warnings library to handle warning messages
warnings.filterwarnings('ignore') # Ignore warning messages

import logging                    # Import the logging library to handle log messages
logging.getLogger("transformers.tokenization_utils_base").setLevel(logging.ERROR) # Ignore log messages related to tokenization_utils_base

from google.colab import drive    # Import the drive module from the google.colab library and mount the Google Drive
drive.mount('/content/drive')


Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp310-cp310-macosx_10_9_x86_64.whl (197 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m197.6/197.6 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp310-cp310-macosx_10_11_x86_64.whl (4.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.0/4.0 MB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting filelock
  Downloading filelock-3.12.0-py3

  from .autonotebook import tqdm as notebook_tqdm


ModuleNotFoundError: No module named 'google.colab'

In [None]:
!unzip "/content/drive/MyDrive/Colab Notebooks/sarcasm.zip" -d "/content"

Archive:  /content/drive/MyDrive/Colab Notebooks/sarcasm.zip
replace /content/__MACOSX/._test? [y]es, [n]o, [A]ll, [N]one, [r]ename: N


In [None]:
data_path = 'content'   # Set the path to the input data directory
output_path = 'content' # Set the path to the output data directory

In [None]:
def get_data(train_data,test_data):
  train_df = pd.read_csv(train_data,index_col=0)  #training data set
  #train_df = train_df.drop(columns=['rephrase','sarcasm','irony','understatement','overstatement', 'rhetorical_question','satire',])
  test_df = pd.read_csv(test_data) # getting the csv file for the test data set  
  return train_df, test_df # getting the train and test data set 

train_df, test_df = get_data('/content/drive/MyDrive/Colab Notebooks/training_data.csv','../content/test/task_A_En_test.csv')


In [None]:
print("Sarcastic", len(train_df[train_df["sarcastic"]==1]))   # Print the number of sarcastic samples in the training data
print("Not Sarcastic", len(train_df[train_df["sarcastic"]==0])) # Print the number of non-sarcastic samples in the training data

Sarcastic 10478
Not Sarcastic 9508


In [None]:
def clean_data(df, column):

  # Import necessary libraries
  import re
  from nltk.corpus import stopwords
  import nltk
  import nltk.corpus
  nltk.download('stopwords')
  from nltk.corpus import stopwords
  stop = stopwords.words('english')

  # Get the text data from the specified column
  b = list(df[column])
  corpus = []

  # Clean the text data
  for i in range(len(b)):
      review = re.sub(r'http\S+', ' ', str(b[i]))  # Remove URLs
      review = re.sub("\d*\.\d+","",review)         # Remove floating-point numbers 
      review = re.sub(r'@\S+', ' ', review)         # Remove Twitter handles
      
      TAG_RE = re.compile(r'<[^>]+>')
      review = TAG_RE.sub('', review)               # Remove HTML tags

      review = re.sub('\[[^]]*\]', ' ', review)     # Remove square brackets and any text within them
      
      review = review.lower()                      # Convert all text to lowercase
      review = review.split()                      
      
      review = ' '.join(review)                     # Join the cleaned tokens into a single string
        
      corpus.append(review)
  
  # Assign the cleaned text data to a new column "clean"
  df = df.assign(clean = corpus)
  
  # Returns the cleaned dataset
  return df

# Clean the "tweet" column of the "train_df" DataFrame using the "clean_data" function
train_df = clean_data(train_df, "tweet")

# Print the cleaned DataFrame
train_df


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,tweet,sarcastic,clean
0,the biggest only problem thing i got from coll...,1,the biggest only problem thing i got from coll...
1,the absolutely only thing i got fired from the...,1,the absolutely only thing i got fired from the...
2,perhaps the second only nice thing i got out f...,1,perhaps the second only nice thing i got out f...
3,i love it when college professors randomly dra...,1,i love it when college professors randomly dra...
4,i really love it funny when professors constan...,1,i really love it funny when professors constan...
...,...,...,...
13551,['8-9ft man found in ancient indian burial mou...,0,['8-9ft man found in ancient indian burial mou...
13552,"[""Second Scottish independence referendum 'on ...",0,"[""second scottish independence referendum 'on ..."
13553,"['Pinoy Cyborg by James Simmons', 'Mag-ingat s...",0,"['pinoy cyborg by james simmons', 'mag-ingat s..."
13554,"['The logic here is flawless!', ""No it isn't, ...",0,"['the logic here is flawless!', ""no it isn't, ..."


In [None]:
# Save preprocessed data, cropped to max length of the model.
train_df['clean'] = train_df['clean'].apply(lambda x: " ".join(x.split()[:512]))
train_df.to_csv("prep_news.csv")

In [None]:
# Set random seed and set device to GPU.
torch.manual_seed(17)

if torch.cuda.is_available():
    device = torch.device('cuda:0')
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
else:
    device = torch.device('cpu')

print(device)

cuda:0


In [None]:
# Initialize tokenizer.
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

In [None]:
# Import necessary modules
from torchtext.data import Field, TabularDataset, BucketIterator, Iterator

# Set the maximum sequence length and batch size
MAX_SEQ_LEN = 128
BATCH_SIZE = 16

# Get the indices of the pad and unknown tokens from the tokenizer
PAD_INDEX = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
UNK_INDEX = tokenizer.convert_tokens_to_ids(tokenizer.unk_token)

# Define the fields for the tabular dataset
label_field = Field(sequential=False, use_vocab=False, batch_first=True) #target variable, the sarcastic labe
text_field = Field(use_vocab=False, 
                   tokenize=tokenizer.encode, 
                   include_lengths=False, 
                   batch_first=True,
                   fix_length=MAX_SEQ_LEN, 
                   pad_token=PAD_INDEX, 
                   unk_token=UNK_INDEX) #The text_field is used for the input text data

fields = {'clean' : ('clean', text_field), 'sarcastic' : ('sarcastic', label_field)}

# Load the tabular dataset from the csv file
train_data, valid_data, test_data = TabularDataset(path="prep_news.csv", 
                                                   format='CSV', 
                                                   fields=fields, 
                                                   skip_header=False).split(split_ratio=[0.70, 0.2, 0.1], 
                                                                            stratified=True, 
                                                                            strata_field='sarcastic')

# Create iterators for the train, validation, and test sets
train_iter, valid_iter = BucketIterator.splits((train_data, valid_data),
                                               batch_size=BATCH_SIZE,
                                               device=device,
                                               shuffle=True,
                                               sort_key=lambda x: len(x.clean), 
                                               sort=True, 
                                               sort_within_batch=False)

test_iter = Iterator(test_data, batch_size=BATCH_SIZE, device=device, train=False, shuffle=False, sort=False)


In [None]:
# Save model checkpoint
def save_checkpoint(path, model, valid_loss):
    # Save model's state dictionary and valid_loss value to file at specified path
    torch.save({'model_state_dict': model.state_dict(),
                  'valid_loss': valid_loss}, path)

# Load model checkpoint
def load_checkpoint(path, model):    
    # Load state dictionary from file at specified path to model
    state_dict = torch.load(path, map_location=device)
    model.load_state_dict(state_dict['model_state_dict'])
    
    # Return valid_loss value
    return state_dict['valid_loss']



# Save training metrics
def save_metrics(path, train_loss_list, valid_loss_list, global_steps_list):   
    # Save training loss list, validation loss list, and global steps list to file at specified path
    state_dict = {'train_loss_list': train_loss_list,
                  'valid_loss_list': valid_loss_list,
                  'global_steps_list': global_steps_list}
    
    torch.save(state_dict, path)

# Load training metrics
def load_metrics(path):    
    # Load training loss list, validation loss list, and global steps list from file at specified path
    state_dict = torch.load(path, map_location=device)
    return state_dict['train_loss_list'], state_dict['valid_loss_list'], state_dict['global_steps_list']


In [None]:
class ROBERTAClassifier(torch.nn.Module):
    def __init__(self, dropout_rate=0.2):
        super(ROBERTAClassifier, self).__init__()
        
        # Load pre-trained RoBERTa model
        self.roberta = RobertaModel.from_pretrained('roberta-base',return_dict=False)
        
        # Define dropout layer, linear layers, and layer normalization
        self.d1 = torch.nn.Dropout(dropout_rate)
        self.l1 = torch.nn.Linear(768, 64)
        self.bn1 = torch.nn.LayerNorm(64)
        self.d2 = torch.nn.Dropout(dropout_rate)
        self.l2 = torch.nn.Linear(64, 2)
        
    def forward(self, input_ids, attention_mask):
        # Pass input through RoBERTa model, only keep output of last layer (not the embeddings)
        _, x = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        
        # Apply dropout, linear, normalization, and activation functions
        x = self.d1(x)
        x = self.l1(x)
        x = self.bn1(x)
        x = torch.nn.Tanh()(x)
        
        x = self.d2(x)
        x = self.l2(x)
        
        # Return output of final linear layer
        return x  

In [None]:
def pretrain(model, 
             optimizer, 
             train_iter, 
             valid_iter, 
             scheduler = None,
             valid_period = len(train_iter),
             num_epochs = 5,
             return_dict=False):
    
    # Freeze the RobertaModel parameters during pretraining
    for param in model.roberta.parameters():
        param.requires_grad = False
    
    # Set the model to train mode
    model.train()
    
    # Initialize train_loss, valid_loss, and global_step variables
    train_loss = 0.0
    valid_loss = 0.0   
    global_step = 0  
    
    # Iterate through each epoch
    for epoch in range(num_epochs):
        
        # Iterate through each batch of the training data
        for (source, target), _ in train_iter:
            # Create a binary tensor mask to indicate where padding is present in the input
            mask = (source != PAD_INDEX).type(torch.uint8)
            
            # Perform a forward pass of the model
            y_pred = model(input_ids=source,  
                           attention_mask=mask)
            
            # Calculate the loss
            loss = torch.nn.CrossEntropyLoss()(y_pred, target)
   
            # Perform backpropagation and optimization
            loss.backward()
            optimizer.step()    
            scheduler.step()  # Update the learning rate if a scheduler is provided  
            optimizer.zero_grad()
            
            # Update the training loss and global step count
            train_loss += loss.item()
            global_step += 1
            
            # Perform validation periodically during training
            if global_step % valid_period == 0:
                # Set the model to eval mode
                model.eval()
                
                # Calculate the validation loss
                with torch.no_grad():                    
                    for (source, target), _ in valid_iter:
                        mask = (source != PAD_INDEX).type(torch.uint8)
                        
                        y_pred = model(input_ids=source, 
                                       attention_mask=mask)
                        
                        loss = torch.nn.CrossEntropyLoss()(y_pred, target)
                        
                        valid_loss += loss.item()

                train_loss = train_loss / valid_period
                valid_loss = valid_loss / len(valid_iter)
                
                # Set the model back to train mode
                model.train()

                # Print training and validation loss
                print('Epoch [{}/{}], global step [{}/{}], PT Loss: {:.4f}, Val Loss: {:.4f}'
                      .format(epoch+1, num_epochs, global_step, num_epochs*len(train_iter),
                              train_loss, valid_loss))
                
                # Reset train_loss and valid_loss for the next period
                train_loss = 0.0                
                valid_loss = 0.0
    
    # Unfreeze the RobertaModel parameters after pretraining is done
    for param in model.roberta.parameters():
        param.requires_grad = True
        
    # Print a message indicating that pretraining is done
    print('Pre-training done!')


In [None]:
# Define function to train model
def train(model,
          optimizer,
          train_iter,
          valid_iter,
          scheduler = None,
          num_epochs = 5,
          valid_period = len(train_iter),
          output_path = output_path,
          return_dict=False):
    
    # Initialize loss values and lists to store them
    train_loss = 0.0
    valid_loss = 0.0
    train_loss_list = []
    valid_loss_list = []
    
    # Set best validation loss as infinity
    best_valid_loss = float('Inf')
    
    # Initialize global step count and list to store them
    global_step = 0
    global_steps_list = []
    
    # Set the model to training mode
    model.train()
    
    # Loop through specified number of epochs
    for epoch in range(num_epochs):
        
        # Loop through training data batches
        for (source, target), _ in train_iter:
            
            # Create mask to pad variable length sequences
            mask = (source != PAD_INDEX).type(torch.uint8)

            # Feed input data into the model to obtain predictions
            y_pred = model(input_ids=source,  
                           attention_mask=mask)
            
            # Calculate loss between predictions and targets
            loss = torch.nn.CrossEntropyLoss()(y_pred, target)
            
            # Compute gradients and update model parameters
            loss.backward()
            optimizer.step()    
            scheduler.step()               
            optimizer.zero_grad()
            
            # Add batch loss to running total
            train_loss += loss.item()
            
            # Increment global step count
            global_step += 1
            
            # If it's time to validate, calculate validation loss
            if global_step % valid_period == 0:
                # Set model to evaluation mode
                model.eval()
                
                # Disable gradient calculation and loop through validation data batches
                with torch.no_grad():                    
                    for (source, target), _ in valid_iter:
                        mask = (source != PAD_INDEX).type(torch.uint8)

                        y_pred = model(input_ids=source, 
                                       attention_mask=mask)
                        
                        loss = torch.nn.CrossEntropyLoss()(y_pred, target)
                        
                        valid_loss += loss.item()

                # Calculate average training and validation losses
                train_loss = train_loss / valid_period
                valid_loss = valid_loss / len(valid_iter)
                
                # Store the losses and global step count at this point
                train_loss_list.append(train_loss)
                valid_loss_list.append(valid_loss)
                global_steps_list.append(global_step)

                # Print losses and save checkpoint if validation loss improves
                print('Epoch [{}/{}], global step [{}/{}], Train Loss: {:.4f}, Valid Loss: {:.4f}'
                      .format(epoch+1, num_epochs, global_step, num_epochs*len(train_iter),
                              train_loss, valid_loss))
                
                if best_valid_loss > valid_loss:
                    best_valid_loss = valid_loss
                    save_checkpoint('model.pkl', model, best_valid_loss)
                    save_metrics('metric.pkl', train_loss_list, valid_loss_list, global_steps_list)
                        
                train_loss = 0.0                
                valid_loss = 0.0
                model.train()
    
    # Save the final training metrics and print completion message
    save_metrics('metric.pkl', train_loss_list, valid_loss_list, global_steps_list)
    print('Training done!')


In [None]:
#Training
NUM_EPOCHS = 4
steps_per_epoch = len(train_iter)  # Calculate the number of steps per epoch

model = ROBERTAClassifier(0.3)  # Initialize a ROBERTA classifier model with a dropout rate
model = model.to(device)  # Move the model to the specified device (e.g. GPU)

optimizer = AdamW(model.parameters(), lr=1e-4)  # Initialize the AdamW optimizer with a learning rate of 0.0001
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps=steps_per_epoch*1, 
                                            num_training_steps=steps_per_epoch*NUM_EPOCHS)  # Initialize a linear learning rate scheduler with a warmup period of 1 epoch and a total number of training steps equal to the number of steps per epoch times the number of epochs

print("======================= Start pretraining ==============================")  # Print a message to indicate the start of pretraining

pretrain(model=model,  # Start pretraining the model with the specified settings
         train_iter=train_iter,
         valid_iter=valid_iter,
         optimizer=optimizer,
         scheduler=scheduler,
         num_epochs=NUM_EPOCHS)

NUM_EPOCHS = 12  # Set the number of epochs for training 
print("======================= Start training =================================")  # Print a message to indicate the start of training
optimizer = AdamW(model.parameters(), lr=1e-5)  # Re-initialize the optimizer with a lower learning rate for training
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps=steps_per_epoch*2, 
                                            num_training_steps=steps_per_epoch*NUM_EPOCHS)  # Re-initialize the scheduler with a longer warmup period and a total number of training steps equal to the number of steps per epoch times the number of epochs

train(model=model,  # Start training the model with the specified settings
      train_iter=train_iter, 
      valid_iter=valid_iter, 
      optimizer=optimizer, 
      scheduler=scheduler, 
      num_epochs=NUM_EPOCHS)


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch [1/4], global step [875/3500], PT Loss: 0.7124, Val Loss: 0.6925
Epoch [2/4], global step [1750/3500], PT Loss: 0.7011, Val Loss: 0.6929
Epoch [3/4], global step [2625/3500], PT Loss: 0.6959, Val Loss: 0.6913
Epoch [4/4], global step [3500/3500], PT Loss: 0.6938, Val Loss: 0.6923
Pre-training done!
Epoch [1/12], global step [875/10500], Train Loss: 0.6727, Valid Loss: 0.6573
Epoch [2/12], global step [1750/10500], Train Loss: 0.6290, Valid Loss: 0.6675
Epoch [3/12], global step [2625/10500], Train Loss: 0.5718, Valid Loss: 0.5986
Epoch [4/12], global step [3500/10500], Train Loss: 0.4986, Valid Loss: 0.5894
Epoch [5/12], global step [4375/10500], Train Loss: 0.4195, Valid Loss: 0.6497
Epoch [6/12], global step [5250/10500], Train Loss: 0.3519, Valid Loss: 0.6976
Epoch [7/12], global step [6125/10500], Train Loss: 0.2935, Valid Loss: 0.6585
Epoch [8/12], global step [7000/10500], Train Loss: 0.2441, Valid Loss: 0.7310
Epoch [9/12], global step [7875/10500], Train Loss: 0.2051, Val

Loss Curve

In [None]:
def evaluate(model, test_loader):
    y_pred = []
    y_true = []

    model.eval() # Put the model in evaluation mode
    with torch.no_grad(): # Disable gradient computation to save memory
        for (source, target), _ in test_loader: # Iterate over the test data
                mask = (source != PAD_INDEX).type(torch.uint8) # Create a mask for the padding tokens
                
                output = model(source, attention_mask=mask) # Forward pass through the model to get predictions

                y_pred.extend(torch.argmax(output, axis=-1).tolist()) # Append the predicted labels to y_pred list
                y_true.extend(target.tolist()) # Append the true labels to y_true list
    
    print('Classification Report:')
    print(classification_report(y_true, y_pred, digits=4)) # Print classification report of the predictions
    print(y_pred)
    
    return y_pred # Return the predicted labels as a list

In [None]:
# Create an instance of the ROBERTAClassifier model and move it to the specified device (e.g. GPU)
model = ROBERTAClassifier()
model = model.to(device)

# Load the saved parameters of the model from a checkpoint file
load_checkpoint('model.pkl', model)

# Call the 'evaluate' function to make predictions on the test data and get the predicted labels

y_pred = evaluate(model, test_iter)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Classification Report:
              precision    recall  f1-score   support

           0     0.6837    0.7229    0.7028      1902
           1     0.7348    0.6966    0.7152      2096

    accuracy                         0.7091      3998
   macro avg     0.7093    0.7097    0.7090      3998
weighted avg     0.7105    0.7091    0.7093      3998

[1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0

In [None]:
test_df

Unnamed: 0,text,sarcastic
0,"Size on the the Toulouse team, That pack is mo...",0
1,Pinball!,0
2,So the Scottish Government want people to get ...,1
3,villainous pro tip : change the device name on...,0
4,I would date any of these men 🥺,0
...,...,...
1395,I’ve just seen this and felt it deserved a Ret...,0
1396,Omg how an earth is that a pen !!! 🤡,0
1397,Bringing Kanye and drake to a tl near you,0
1398,"I love it when women are referred to as ""girl ...",1


In [None]:
df1  =  test_df   # assign test_df to a new variable df1
df1 = df1.assign(ramdom_label=[1 for i in range(len(df1["text"]))])  # create a new column named "ramdom_label" in df1 and assign the value 1 to all its rows
df1.to_csv("abc.csv")   # save df1 to a CSV file named "abc.csv" in the current working directory
df1   # return df1


Unnamed: 0,text,sarcastic,ramdom_label
0,"Size on the the Toulouse team, That pack is mo...",0,1
1,Pinball!,0,1
2,So the Scottish Government want people to get ...,1,1
3,villainous pro tip : change the device name on...,0,1
4,I would date any of these men 🥺,0,1
...,...,...,...
1395,I’ve just seen this and felt it deserved a Ret...,0,1
1396,Omg how an earth is that a pen !!! 🤡,0,1
1397,Bringing Kanye and drake to a tl near you,0,1
1398,"I love it when women are referred to as ""girl ...",1,1


In [None]:
# Define fields mapping
fields = {'text' : ('text', text_field), 'ramdom_label' : ('ramdom_label', label_field)}

# Load the preprocessed test data from the CSV file using TabularDataset
test_data = TabularDataset(path="abc.csv", format='CSV', fields=fields)

# Create an iterator to load batches of data from the test dataset
# Set train=False to indicate this is not used for training
# Set shuffle and sort to False to keep the original order of data
test_iter = Iterator(test_data, batch_size=BATCH_SIZE, device=device, train=False, shuffle=False, sort=False)


In [None]:
# Initialize a ROBERTA classifier model
model = ROBERTAClassifier()

# Send the model to the device (CPU or GPU)
model = model.to(device)

# Load the saved checkpoint into the model
load_checkpoint('model.pkl', model)

# Evaluate the model on the test dataset using the evaluate function
# actual test data 
y_pred = evaluate(model, test_iter)


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Classification Report:
              precision    recall  f1-score   support

           0     0.0000    0.0000    0.0000         0
           1     1.0000    0.1614    0.2780      1400

    accuracy                         0.1614      1400
   macro avg     0.5000    0.0807    0.1390      1400
weighted avg     1.0000    0.1614    0.2780      1400

[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1

In [None]:
df1 = df1.assign(task_a_en=y_pred)
df1

Unnamed: 0,text,sarcastic,ramdom_label,task_a_en
0,"Size on the the Toulouse team, That pack is mo...",0,1,1
1,Pinball!,0,1,0
2,So the Scottish Government want people to get ...,1,1,0
3,villainous pro tip : change the device name on...,0,1,0
4,I would date any of these men 🥺,0,1,0
...,...,...,...,...
1395,I’ve just seen this and felt it deserved a Ret...,0,1,0
1396,Omg how an earth is that a pen !!! 🤡,0,1,1
1397,Bringing Kanye and drake to a tl near you,0,1,0
1398,"I love it when women are referred to as ""girl ...",1,1,1


In [None]:
test_data = test_df

In [None]:
test_data = test_data.assign(pred=y_pred)
test_data

Unnamed: 0,text,sarcastic,pred
0,"Size on the the Toulouse team, That pack is mo...",0,1
1,Pinball!,0,0
2,So the Scottish Government want people to get ...,1,0
3,villainous pro tip : change the device name on...,0,0
4,I would date any of these men 🥺,0,0
...,...,...,...
1395,I’ve just seen this and felt it deserved a Ret...,0,0
1396,Omg how an earth is that a pen !!! 🤡,0,1
1397,Bringing Kanye and drake to a tl near you,0,0
1398,"I love it when women are referred to as ""girl ...",1,1


In [None]:
print(classification_report(test_data["sarcastic"], test_data["pred"]))

              precision    recall  f1-score   support

           0       0.91      0.89      0.90      1200
           1       0.41      0.46      0.43       200

    accuracy                           0.83      1400
   macro avg       0.66      0.67      0.66      1400
weighted avg       0.84      0.83      0.83      1400



In [None]:
from sklearn.metrics import f1_score, precision_score, recall_score
f1_sarcastic = f1_score(test_data["sarcastic"], test_data["pred"],average = "binary", pos_label = 1)
print('The final F1 score: ', f1_sarcastic) # returns the f score

The final F1 score:  0.431924882629108


In [None]:
result = test_data[['text','pred']]
result.to_csv('results')