In [None]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

import torch # Importing PyTorch for building neural network models
from transformers import AutoTokenizer,AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split

from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score, classification_report
from torch import nn, optim
from sklearn.metrics import precision_recall_fscore_support


In [None]:
!huggingface-cli login --token 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

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
# `names=` is given without `header=`, so the first line of the file is read as
# data and the two columns are labelled here.
data = pd.read_csv(
    'all-data.csv',
    names=['Sentiment', 'Text'],
    encoding='unicode_escape',
)
data.head()

Unnamed: 0,Sentiment,Text
0,neutral,"According to Gran , the company has no plans t..."
1,neutral,Technopolis plans to develop in stages an area...
2,negative,The international electronic industry company ...
3,positive,With the new production plant the company woul...
4,positive,According to the company 's updated strategy f...


In [None]:
# Number of rows per sentiment label (most frequent first)
data.loc[:, 'Sentiment'].value_counts()

Unnamed: 0_level_0,count
Sentiment,Unnamed: 1_level_1
neutral,2879
positive,1363
negative,604


In [None]:
# (number of rows, number of columns) of the loaded frame
data.shape

(4846, 2)

In [None]:
# Model identifiers of the two checkpoints referenced in this notebook; they are
# passed to `from_pretrained` in the cells below.
Bert_checkpoint = "bert-base-uncased"
Roberta_checkpoint = "soleimanian/financial-roberta-large-sentiment"

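# Run the cells below twice
- First with `Bert_checkpoint`
- Then with `Roberta_checkpoint`

Before each run, set the checkpoint passed to `AutoTokenizer.from_pretrained` and `AutoModelForSequenceClassification.from_pretrained` (and the `output_dir`) accordingly.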

In [None]:
# Map the textual sentiment labels to integer class ids for the classifier.
label_dict = {'negative': 0, 'neutral': 1, 'positive': 2}

# Guarded so that re-running this cell is a no-op once the column is integer-encoded.
if not pd.api.types.is_integer_dtype(data['Sentiment']):
    # Fail loudly on labels outside `label_dict` (typos, stray whitespace, NaN)
    # instead of letting unmapped strings reach the training loop.
    unknown_labels = set(data['Sentiment'].unique()) - set(label_dict)
    if unknown_labels:
        raise ValueError(f"Unexpected sentiment labels: {unknown_labels}")
    data['Sentiment'] = data['Sentiment'].map(label_dict).astype('int64')

# 80% train / 20% test, then 10% of the training portion is held out for validation.
# NOTE(review): the classes are imbalanced (see the value counts above); consider
# passing `stratify=` to both calls so every split keeps the same class ratios.
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)
train_data, val_data = train_test_split(train_data, test_size=0.1, random_state=42)

# Persist the held-out test split so it can be reused outside this notebook.
test_data.to_csv('test.csv', index=False)

# Tokenizer for the checkpoint being fine-tuned (must match the model's checkpoint).
tokenizer = AutoTokenizer.from_pretrained(Roberta_checkpoint)

# Tokenization helper shared by the train / validation / test splits
def preprocess_for_bert(data):
    """Tokenize the 'Text' column of `data` with the module-level `tokenizer`.

    Args:
        data: DataFrame with a 'Text' column of strings.

    Returns:
        The tokenizer's output for all texts, as PyTorch tensors
        (`return_tensors="pt"`), padded to a common length and truncated to at
        most 512 tokens.
    """
    texts = data['Text'].tolist()
    return tokenizer(
        texts,
        max_length=512,
        truncation=True,
        padding=True,
        return_tensors="pt",
    )

# Tokenize each split (train, then test, then validation)
train_encoded, test_encoded, val_encoded = (
    preprocess_for_bert(split_frame)
    for split_frame in (train_data, test_data, val_data)
)

# Metric callback handed to the Trainer
def compute_metrics(pred):
    """Compute accuracy and weighted precision / recall / F1 for one evaluation pass.

    Args:
        pred: object exposing `label_ids` (the true class ids) and `predictions`
            (an array whose argmax over the last axis is taken as the predicted class).

    Returns:
        dict with the keys 'accuracy', 'f1', 'precision' and 'recall'.
    """
    # `accuracy_score` and `precision_recall_fscore_support` come from the
    # notebook's import cell; they are not re-imported on every call.
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    # zero_division=0 makes the score for a class that is never predicted an
    # explicit 0 (the value the default produces) without raising a warning.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='weighted', zero_division=0
    )
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

# Dataset wrapper that pairs tokenizer encodings with their labels
class SentimentDataset(torch.utils.data.Dataset):
    """Map-style dataset yielding one encoded example plus its label per index.

    Args:
        encodings: mapping from field name to an indexable container holding one
            entry per example (tensors or nested lists).
        labels: indexable sequence of labels, aligned with `encodings`.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    @staticmethod
    def _to_tensor(value):
        """Return an independent tensor copy of `value`."""
        # `torch.tensor(existing_tensor)` emits a copy-construct UserWarning on
        # every call; `clone().detach()` is the recommended equivalent.
        if isinstance(value, torch.Tensor):
            return value.clone().detach()
        return torch.tensor(value)

    def __getitem__(self, idx):
        """Return a dict with every encoding field at `idx` plus a 'labels' entry."""
        item = {key: self._to_tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self._to_tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples, taken from the length of `labels`."""
        return len(self.labels)

config.json:   0%|          | 0.00/936 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

In [None]:
# Wrap each split's encodings and integer labels in a SentimentDataset
train_dataset, val_dataset, test_dataset = (
    SentimentDataset(encoded, frame['Sentiment'].values)
    for encoded, frame in (
        (train_encoded, train_data),
        (val_encoded, val_data),
        (test_encoded, test_data),
    )
)

# Load the selected checkpoint with a 3-way sequence-classification head
# (negative / neutral / positive).
# The id <-> label mapping from `label_dict` is written into the model config so
# that the saved / pushed model reports the label names used in this notebook
# rather than generic or checkpoint-inherited ones.
model = AutoModelForSequenceClassification.from_pretrained(
    Roberta_checkpoint,
    num_labels=len(label_dict),
    id2label={class_id: name for name, class_id in label_dict.items()},
    label2id=label_dict,
)



# Hyper-parameters and bookkeeping options for the fine-tuning run
training_args = TrainingArguments(
    # --- output locations and reporting ---
    output_dir='RoBERTa_FPB_finetuned_v2',   # checkpoints and predictions are written here
    logging_dir='./logs',                    # log directory
    report_to="none",                        # no external experiment tracker
    # --- optimisation schedule ---
    num_train_epochs=4,                      # passes over the training set
    warmup_steps=500,                        # learning-rate warmup steps
    weight_decay=0.01,                       # weight-decay regularisation strength
    # --- batching ---
    per_device_train_batch_size=16,          # per-device batch size while training
    per_device_eval_batch_size=64,           # per-device batch size while evaluating
    # --- logging / evaluation cadence ---
    logging_steps=10,                        # log every 10 steps
    evaluation_strategy="epoch",             # evaluate at the end of each epoch
)


# Wire the model, training arguments, datasets and metric callback into a Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Run fine-tuning
trainer.train()

# Final metrics on the Trainer's eval_dataset (the validation split)
evaluation_results = trainer.evaluate()


pytorch_model.bin:   0%|          | 0.00/1.42G [00:00<?, ?B/s]



Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.3263,0.34273,0.871134,0.872511,0.879687,0.871134
2,0.377,0.430537,0.845361,0.845824,0.847012,0.845361
3,0.2243,0.514903,0.876289,0.875614,0.876886,0.876289
4,0.1564,0.508032,0.865979,0.865824,0.866331,0.865979


In [None]:
# Upload the trained model to the Hugging Face Hub; the commit message is the
# library default, spelled out explicitly.
trainer.push_to_hub(commit_message="End of training")

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

training_args.bin:   0%|          | 0.00/5.05k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Elanthamiljeeva/RoBERTa_FPB_finetuned_v2/commit/4e0e6f20d566e20cb6a7330c9c7db1cf093aca40', commit_message='End of training', commit_description='', oid='4e0e6f20d566e20cb6a7330c9c7db1cf093aca40', pr_url=None, pr_revision=None, pr_num=None)