In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
import torch
from torch.utils.data import Dataset, DataLoader

In [2]:
df = pd.read_csv('../data/financial-sentiment-analysis.csv', encoding='ISO-8859-1')
df.head(5)

Unnamed: 0,Sentiment,Headline
0,neutral,"According to Gran , the company has no plans t..."
1,neutral,Technopolis plans to develop in stages an area...
2,negative,The international electronic industry company ...
3,positive,With the new production plant the company woul...
4,positive,According to the company 's updated strategy f...


In [3]:
df['Sentiment'].value_counts()

neutral     2879
positive    1363
negative     604
Name: Sentiment, dtype: int64

In [4]:
sentiment_mapping = {'positive': 0, 'negative': 1, 'neutral': 2}
df['Sentiment'] = df['Sentiment'].map(sentiment_mapping)

In [5]:
df.head(5)

Unnamed: 0,Sentiment,Headline
0,2,"According to Gran , the company has no plans t..."
1,2,Technopolis plans to develop in stages an area...
2,1,The international electronic industry company ...
3,0,With the new production plant the company woul...
4,0,According to the company 's updated strategy f...


In [6]:
train_df, val_df = train_test_split(df, test_size=0.1, random_state=42)

In [8]:
# If you're using a Hugging Face model (e.g., BertForSequenceClassification), dropout is already integrated into the architecture by default

bert_model = BertForSequenceClassification.from_pretrained('bert-base-cased', num_labels=3)
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
sample_data = ["I am eating","I am playing "]
bert_tokenizer(sample_data, padding=True, truncation=True, max_length=128)

{'input_ids': [[101, 146, 1821, 5497, 102], [101, 146, 1821, 1773, 102]], 'token_type_ids': [[0, 0, 0, 0, 0], [0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1], [1, 1, 1, 1, 1]]}

In [10]:
class sentimentDataset(Dataset):
    def __init__(self, df, tokenizer, max_length=128):
        self.df = df
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        label = row['Sentiment']
        headline = row['Headline']

        inputs = self.tokenizer.encode_plus(
            headline,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            return_tensors='pt',
            truncation=True
        )

        return {
            'input_ids': inputs['input_ids'].flatten(),
            #The attention_mask is a tensor that tells the model which tokens are actual 
            #input tokens and which ones are padding tokens. 1 = input token, 0 = padding token
            'attention_mask': inputs['attention_mask'].flatten(), 
            'labels': torch.tensor(label, dtype=torch.long)
        }

In [11]:
train_dataset = sentimentDataset(train_df, bert_tokenizer)
val_dataset = sentimentDataset(val_df, bert_tokenizer)

In [12]:
train_dataset[7]

{'input_ids': tensor([  101,  1109,  2395,  1144,  1126,  6538,  5964,  1104,  1407,   117,
         26963,  5897,   117,  1543,  1122,  1103,  2026,  3245,  5605, 12713,
          1383,  1107,  1103,  1362,   119,   102,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,   

In [13]:
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)

In [14]:
from tqdm import tqdm 
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Device setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
bert_model.to(device)

# Optimizer and loss
optimizer = AdamW(bert_model.parameters(), lr=2e-5)

# Training loop
EPOCHS = 3
for epoch in range(EPOCHS):
    bert_model.train()
    total_loss = 0

    # Wrap train_loader with tqdm for the progress bar
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}")

    for batch in progress_bar:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()

        # Forward pass
        outputs = bert_model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss #In Hugging Face's transformers library, models like BertForSequenceClassification compute the loss internally using torch.nn.CrossEntropyLoss, as the Hugging Face model includes the loss function as part of its forward pass logic when labels are provided.
        total_loss += loss.item()

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        # Update progress bar description
        progress_bar.set_postfix(loss=loss.item())

    print(f"Epoch {epoch+1}, Average Loss: {total_loss/len(train_loader)}")

    print(f"Calculating Validation Loss for {epoch+1}")
    bert_model.eval()  # Set the model to evaluation mode
    total_val_loss = 0
    with torch.no_grad():  # Disable gradient computation during validation
        for val_batch in val_loader:
            val_input_ids = val_batch['input_ids'].to(device)
            val_attention_mask = val_batch['attention_mask'].to(device)
            val_labels = val_batch['labels'].to(device)

            # Forward pass
            val_outputs = bert_model(input_ids=val_input_ids, attention_mask=val_attention_mask, labels=val_labels)
            # val_logits = val_outputs.logits
            val_loss = val_outputs.loss
            total_val_loss += val_loss.item()

    print(f"Epoch {epoch+1}, Average Validation Loss: {total_val_loss / len(val_loader)}")

    if (epoch + 1) % 3 == 0:
        bert_model.save_pretrained(f"../models") # Full Hugging Face model (weights + config). model.save = Any Python object (model weights, dicts)
        bert_tokenizer.save_pretrained(f"../models")



  attn_output = torch.nn.functional.scaled_dot_product_attention(
Epoch 1: 100%|██████████| 273/273 [02:00<00:00,  2.26it/s, loss=0.171]


Epoch 1, Average Loss: 0.5401069040754776
Calculating Validation Loss for 1
Epoch 1, Average Validation Loss: 0.3629640879169587


Epoch 2: 100%|██████████| 273/273 [02:01<00:00,  2.25it/s, loss=0.364] 


Epoch 2, Average Loss: 0.27647430880929963
Calculating Validation Loss for 2
Epoch 2, Average Validation Loss: 0.35744289380888783


Epoch 3: 100%|██████████| 273/273 [02:01<00:00,  2.25it/s, loss=0.0256]


Epoch 3, Average Loss: 0.1479362286115577
Calculating Validation Loss for 3
Epoch 3, Average Validation Loss: 0.4186501173422702


In [None]:
bert_model.eval()
predictions, true_labels = [], []

# Disable gradient calculation for evaluation
with torch.no_grad():
    for batch in val_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Forward pass
        outputs = bert_model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits

        # Get predictions and move to CPU
        preds = torch.argmax(logits, dim=1).detach().cpu().numpy()
        predictions.extend(preds)
        true_labels.extend(labels.cpu().numpy())

# Calculate metrics
accuracy = accuracy_score(true_labels, predictions)
precision = precision_score(true_labels, predictions, average='weighted')  # Weighted average for multi-class
recall = recall_score(true_labels, predictions, average='weighted')
f1 = f1_score(true_labels, predictions, average='weighted')

# Print results
print(f"Validation Accuracy: {accuracy}")
print(f"Validation Precision: {precision}")
print(f"Validation Recall: {recall}")
print(f"Validation F1 Score: {f1}")

# Detailed classification report
print("\nClassification Report:")
print(classification_report(true_labels, predictions, target_names=["positive", "negative", "neutral"]))

# support = instances


Validation Accuracy: 0.8639175257731959
Validation Precision: 0.8663562452727861
Validation Recall: 0.8639175257731959
Validation F1 Score: 0.8646045145985588

Classification Report:
              precision    recall  f1-score   support

    positive       0.80      0.86      0.83       146
    negative       0.86      0.86      0.86        58
     neutral       0.90      0.87      0.89       281

    accuracy                           0.86       485
   macro avg       0.85      0.86      0.86       485
weighted avg       0.87      0.86      0.86       485



In [24]:
# Load saved model
bert_model.to(device)
bert_model.eval()

# Example prediction
headline = "China's financial sector grows after government initiatives"
inputs = bert_tokenizer(headline, return_tensors="pt", padding=True, truncation=True, max_length=128)
inputs = {key: val.to(device) for key, val in inputs.items()}

outputs = bert_model(**inputs)
logits = outputs.logits
prediction = torch.argmax(logits, dim=1).item()
sentiment_re_mapping = {0 : 'positive', 1 : 'negative', 2 : 'neutral'}
print("Sentiment:", sentiment_re_mapping[prediction])

Sentiment: neutral
