In [1]:
import numpy as np # Importing numpy for numerical operations and linear algebra
import pandas as pd # Importing pandas for data processing and reading CSV files
import os
# Suppressing UserWarning category warnings to avoid cluttering the notebook output
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

# Additional imports for various functionalities

import torch # Importing PyTorch for building neural network models
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments # Importing components from the transformers library for BERT model
from sklearn.model_selection import train_test_split # For splitting the dataset into training and validation sets

# Importing ELMo related modules for embeddings
#from allennlp.modules.elmo import Elmo, batch_to_ids
from torch.utils.data import Dataset, DataLoader # For creating custom data handling classes and loading data
from sklearn.metrics import accuracy_score, classification_report # For evaluating model performance
from torch import nn, optim # For neural network components and optimizer
from sklearn.metrics import precision_recall_fscore_support # For detailed model evaluation metrics

from transformers import (
    AutoModel, AutoConfig,
    AutoTokenizer, logging,AutoModelForSequenceClassification
)

from tqdm.notebook import tqdm
tqdm.pandas()

def get_last_4_hidden_output(text, tokenizer, model):
    """Embed one text as the final-layer hidden state of its first token.

    NOTE: despite the function name, only the *last* hidden layer is used
    (not a combination of the last four). The vector at sequence position 0
    (the [CLS] token for BERT-style tokenizers) is returned.

    Args:
        text: The string to embed.
        tokenizer: Tokenizer exposing ``batch_encode_plus`` and returning
            ``input_ids`` and ``attention_mask`` tensors.
        model: Callable accepting ``input_ids`` / ``attention_mask`` keyword
            arguments whose output exposes ``'hidden_states'``, i.e. it must
            be configured to return hidden states.

    Returns:
        list[float]: The position-0 vector of the last hidden layer.
    """
    # Every text is padded/truncated to exactly 512 tokens.
    features = tokenizer.batch_encode_plus(
        [text],
        add_special_tokens=True,
        padding='max_length',
        max_length=512,
        truncation=True,
        return_tensors='pt',
        return_attention_mask=True
    )

    input_ids = features['input_ids']
    attention_mask = features['attention_mask']

    # Keep inputs on the same device as the model when it advertises one.
    device = getattr(model, 'device', None)
    if device is not None:
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)

    # Pure inference: skip autograd bookkeeping so no graph is built per row.
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

    # Only the final layer is needed, so index it directly instead of
    # stacking every layer into one large tensor first.
    last_hidden_state = outputs['hidden_states'][-1]
    first_token_vector = last_hidden_state[:, 0]

    # Batch size is 1, so return the single row as a plain Python list.
    return first_token_vector[0].cpu().tolist()

# Base models whose tokenizers are used for feature extraction.
checkpoints = [
    'bert-base-uncased',
    'ProsusAI/finbert',
    'google/electra-base-discriminator',
    'SALT-NLP/FLANG-ELECTRA',
    'SALT-NLP/FLANG-BERT',
]

# One CSV per augmentation strategy.
augmentationed_files = [
    'bert-base-uncased.csv',
    'Synonym-wordnet.csv',
    'roberta-base.csv',
    'flang-bert.csv',
]

# Running counter shown in the per-checkpoint progress message (starts at 1).
flag = 1

# Fine-tuned checkpoints (named "<base>_<augmentation>") that are processed;
# any other combination is reported as not found.
available = [
    f"avinasht/{model_name}"
    for model_name in (
        'finbert_flang-bert',
        'finbert_Synonym-wordnet',
        'bert-base-uncased_flang-bert',
        'bert-base-uncased_Synonym-wordnet',
        'finbert_roberta-base',
        'finbert_bert-base-uncased',
        'bert-base-uncased_roberta-base',
        'bert-base-uncased_bert-base-uncased',
    )
]

# Extract sentence embeddings for every available (base model, augmentation)
# pair and write them, together with the labels, to one CSV per pair.

# Textual sentiment labels -> integer class ids (loop-invariant, so built once).
label_dict = {'negative': 0, 'neutral': 1, 'positive': 2}

for base_checkpoint in checkpoints:
    # Iterate over each augmentation file
    for augmentation_file in augmentationed_files:
        # Hub repo id of the fine-tuned model: "avinasht/<base name>_<augmentation name>"
        finetuned_checkpoint = f"{base_checkpoint.split('/')[-1]}_{augmentation_file.split('.')[0]}"
        finetuned_checkpoint = f"avinasht/{finetuned_checkpoint}"

        if finetuned_checkpoint not in available:
            print('CheckPoint not found..',finetuned_checkpoint)
            continue

        data = pd.read_csv(f"/kaggle/input/augmented-financialphrasalbank/{augmentation_file}")
        data.columns = ['index','Sentiment','Text']

        # Group by 'Text' and aggregate labels into a list
        grouped_df = data.groupby('Text')['Sentiment'].agg(list).reset_index()

        # Texts that occur more than once carry multiple labels; drop every
        # row of such texts. Filter on 'Text' itself: the CSV's 'index' column
        # is not guaranteed to match the DataFrame's positional index, so
        # comparing the two could drop the wrong rows.
        multiple_labels_df = grouped_df[grouped_df['Sentiment'].apply(lambda x: len(x) > 1)]
        has_multiple_labels = data['Text'].isin(multiple_labels_df['Text'].values)
        data = data[~has_multiple_labels].reset_index(drop=True)

        # Convert sentiment labels from textual to numerical format
        data['Sentiment'] = data['Sentiment'].replace(label_dict)

        # The tokenizer comes from the base model, the weights from the
        # fine-tuned checkpoint. `finetuned_checkpoint` already contains the
        # "avinasht/" namespace, so it must not be prefixed a second time.
        tokenizer = AutoTokenizer.from_pretrained(base_checkpoint) #bert-base-uncased
        config = AutoConfig.from_pretrained(finetuned_checkpoint) #avinasht/BERT_FPB_finetuned
        config.update({'output_hidden_states':True})

        # https://www.kaggle.com/code/avitri73/fpb-bert-finetuned/edit
        model = AutoModelForSequenceClassification.from_pretrained(finetuned_checkpoint,config=config)
        model.eval()  # inference only: make sure dropout is disabled
        N = model.config.hidden_size

        # One embedding (list of N floats) per row, expanded into N numeric
        # columns named 0..N-1. Building the feature frame in one go avoids
        # inserting N columns into `data` one at a time.
        embeddings = data['Text'].progress_apply(lambda x:get_last_4_hidden_output(x,tokenizer,model))
        embedding_frame = pd.DataFrame(embeddings.tolist(), index=data.index, columns=range(N))
        data = pd.concat([data, embedding_frame], axis=1)

        # The output path contains the "avinasht/" directory component, which
        # has to exist before pandas can write into it.
        output_path = f'{finetuned_checkpoint}.csv'
        os.makedirs(os.path.dirname(output_path), exist_ok=True)
        data.to_csv(output_path,index=False)

        print(f"********************* {finetuned_checkpoint} completed#{flag} ******************")
        flag = flag + 1

In [None]:
# Run notes (plain text, kept as comments so this cell stays valid Python):
# Error with bert-base-uncased.csv, finetuned on SALT-NLP/FLANG-ELECTRA
# Error with Synonym-wordnet.csv, finetuned on SALT-NLP/FLANG-ELECTRA
# Error with roberta-base.csv, finetuned on SALT-NLP/FLANG-ELECTRA
# Error with flang-bert.csv, finetuned on SALT-NLP/FLANG-ELECTRA Completed