# Predictive Modeling of Asset Returns: Fine-Tuning_SBERT_colab

**Author:** Cem Akkus  
**Institution:** Ludwig-Maximilians-Universität München  
**Date:** 14.07.2024

__Important note: This code was fully executed on Google Colab__

## Table of Contents
1. [Pre-Processing](#pre)  
    1.1. [Modules & Seeds](#module)  
    1.2. [Data Import](#loading)  
    1.3. [Data Transformations](#data-transform)    
2. [Fine-Tuning](#fine-tuning)  
    2.1. [Definitions](#definitions)  
    2.2. [Fine-Tuning Process](#fine-tuning-process)  
3. [Post-Processing](#post)  
    3.1. [Embeddings Generation](#emb-gen)  
    3.2. [Embeddings Averaging over Dates](#emb-avg)  

## 1. Pre-Processing
<a name="pre"></a>

### 1.1. Modules & Seeds
<a name="module"></a>

In [None]:
import time
import datetime
import os

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import transformers
from transformers import AutoModel, BertTokenizerFast, AdamW
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer
from sklearn.utils.class_weight import compute_class_weight
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from sentence_transformers import SentenceTransformer, util

from google.colab import drive

# Select the compute device: use the GPU when one is available, otherwise fall
# back to the CPU so that later `.to(device)` calls also work on CPU-only runtimes.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# `random` is not among the notebook's top-level imports, so bring it into
# scope here; without it the `random.seed` call below raises NameError.
import random

seed = 0

# Seed NumPy's global random number generator
np.random.seed(seed)

# Seed Python's built-in random module
random.seed(seed)

# Seed PyTorch's default random number generator
torch.manual_seed(seed)

# Seed all CUDA devices as well when a GPU is present
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)

### 1.2. Data Import
<a name="loading"></a>

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive
drive.mount('/content/drive')

In [None]:
# List the contents of the Drive notebook folder to check which files are visible
drive_path = '/content/drive/My Drive/Colab Notebooks/'
file_list = os.listdir(drive_path)
print(file_list)

In [None]:
# Path to the headlines CSV file of the company being processed
file_path = '/content/drive/My Drive/Colab Notebooks/headlines_adapted_SAP_DE.csv'  # Adjust the path for every company

# Read the CSV file into a DataFrame; later cells use its 'title', '1d_return'
# and 'datetime' columns
dataset = pd.read_csv(file_path)

### 1.3. Data Transformations
<a name="data-transform"></a>

In [None]:
# Derive a binary direction label from the 1-day return: strictly positive -> 'Up',
# everything else -> 'Down' (this includes zero returns and NaN, for which `> 0` is False)
dataset['1d_return_movement'] = np.where(dataset['1d_return'] > 0, 'Up', 'Down')

In [None]:
# Count how often each label ('Up' / 'Down') occurs in the '1d_return_movement' column
movement_counts = dataset['1d_return_movement'].value_counts()

# Print the counts
print(movement_counts)

In [None]:
# Load the pre-trained 'all-MiniLM-L6-v2' sentence-transformers model
sbert = SentenceTransformer('all-MiniLM-L6-v2')

In [None]:
# First split: keep 70% of the headlines for training; the remaining 30% are
# held back and divided into validation and test sets in a later cell.
train_text, temp_text, train_labels, temp_labels = train_test_split(
    dataset['title'],
    dataset['1d_return_movement'],
    test_size=0.3,
    random_state=1,
)

In [None]:
# Encode the string labels as integer class indices.
# LabelEncoder sorts the class names, so 'Down' -> 0 and 'Up' -> 1.
# MultiLabelBinarizer is deliberately not used here: it expects a collection of
# labels per sample and would therefore treat every label string as a set of
# characters instead of as one class.
label_encoder = LabelEncoder()
train_labels = label_encoder.fit_transform(train_labels)
temp_labels = label_encoder.transform(temp_labels)

In [None]:
# Second split: divide the held-back samples evenly into validation and test sets
val_text, test_text, val_labels, test_labels = train_test_split(
    temp_text,
    temp_labels,
    test_size=0.5,
    random_state=1,
)

In [None]:
# Turn the three pandas Series of headlines into plain Python lists
train_text, val_text, test_text = (
    split.tolist() for split in (train_text, val_text, test_text)
)

In [None]:
# Encode the headlines of every split with SBERT; the results are returned as tensors
train_text_embeddings, val_text_embeddings, test_text_embeddings = (
    sbert.encode(texts, convert_to_tensor=True)
    for texts in (train_text, val_text, test_text)
)

In [None]:
# number of samples per mini-batch
batch_size = 32

# training set: pair the sentence embeddings with their labels and draw the
# samples in random order
train_data = TensorDataset(train_text_embeddings, torch.tensor(train_labels))
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, batch_size=batch_size, sampler=train_sampler)

# validation set: same pairing, but the samples are visited in their stored order
val_data = TensorDataset(val_text_embeddings, torch.tensor(val_labels))
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, batch_size=batch_size, sampler=val_sampler)

In [None]:
# Number of distinct classes present in the training labels
num_classes = len(np.unique(train_labels))

## 2. Fine-Tuning
<a name="fine-tuning"></a>

### 2.1 Definitions
<a name="definitions"></a>

In [None]:
class SBERT_Arch(nn.Module):
    """Feed-forward classification head operating on sentence embeddings.

    The sentence encoder is stored on the module as ``self.sbert`` but is not
    called in ``forward``; ``forward`` expects embeddings that were computed
    beforehand.
    """

    def __init__(self, bert, num_classes=2, embedding_dim=384):
        """Build the classification head.

        Args:
            bert: Sentence encoder to keep a reference to (stored as ``self.sbert``).
            num_classes: Number of output classes. Defaults to 2.
            embedding_dim: Size of the input embeddings. Defaults to 384
                (presumably the embedding size of the SBERT model in use).
        """
        super().__init__()

        # keep the encoder that was passed in, not a same-named global
        self.sbert = bert

        # dropout layer
        self.dropout = nn.Dropout(0.1)

        # relu activation function
        self.relu = nn.ReLU()

        # dense layer 1
        self.fc1 = nn.Linear(embedding_dim, embedding_dim)

        # dense layer 2 (output layer)
        self.fc2 = nn.Linear(embedding_dim, num_classes)

        # log-softmax over the class dimension; produces log-probabilities
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, embeddings):
        """Map a batch of sentence embeddings to per-class log-probabilities.

        Args:
            embeddings: Tensor whose last dimension equals ``embedding_dim``.

        Returns:
            Tensor of log-probabilities with one column per class.
        """
        x = self.fc1(embeddings)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fc2(x)
        return self.softmax(x)

    def save_weights(self, path):
        """Write this module's ``state_dict`` to ``path``."""
        torch.save(self.state_dict(), path)

    @classmethod
    def load_weights(cls, path, sbert, map_location=None, num_classes=2, embedding_dim=384):
        """Create a model around ``sbert`` and restore weights saved at ``path``.

        Args:
            path: File previously written by ``save_weights``.
            sbert: Sentence encoder to pass to the constructor.
            map_location: Forwarded to ``torch.load``; lets a checkpoint written
                on one device be loaded on another. ``None`` keeps the default.
            num_classes: Output size the checkpoint was trained with.
            embedding_dim: Embedding size the checkpoint was trained with.

        Returns:
            The model with the stored weights loaded.
        """
        model = cls(sbert, num_classes=num_classes, embedding_dim=embedding_dim)
        model.load_state_dict(torch.load(path, map_location=map_location))
        return model

In [None]:
# Wrap the pre-trained SBERT in the classifier architecture and move the
# resulting model to the selected device
model = SBERT_Arch(sbert).to(device)

In [None]:
# define the optimizer over all parameters registered on the model
# NOTE(review): `AdamW` here is the implementation imported from `transformers`;
# presumably deprecated in newer releases in favour of `torch.optim.AdamW` —
# confirm against the installed version.
optimizer = AdamW(model.parameters(),
                  lr = 1e-5)          # learning rate

In [None]:
# "balanced" class weights from scikit-learn, one per class found in the training labels
class_weights = compute_class_weight(
    class_weight="balanced",
    classes=np.unique(train_labels),
    y=train_labels,
)

print("Class Weights:", class_weights)

In [None]:
# class weights as a float tensor on the training device
weights = torch.tensor(class_weights, dtype=torch.float).to(device)

# class-weighted negative log-likelihood loss
cross_entropy = nn.NLLLoss(weight=weights)

# number of passes over the training data
epochs = 50

In [None]:
# Run one training epoch
def train():
    """Train ``model`` for one pass over ``train_dataloader``.

    Relies on the module-level ``model``, ``train_dataloader``, ``device``,
    ``cross_entropy`` and ``optimizer``.

    Returns:
        tuple: ``(avg_loss, total_preds)`` - the mean batch loss of the epoch
        and a NumPy array holding the model outputs of all batches,
        concatenated along the first axis.
    """
    # switch to training mode
    model.train()

    n_batches = len(train_dataloader)
    running_loss = 0
    batch_outputs = []

    for step, batch in enumerate(train_dataloader):

        # progress message on every 50th batch, except the very first one
        if step != 0 and step % 50 == 0:
            print('  Batch {:>5,}  of  {:>5,}.'.format(step, n_batches))

        # move both tensors of the batch to the target device
        embeddings, labels = [tensor.to(device) for tensor in batch]

        # reset gradients left over from the previous step
        model.zero_grad()

        # forward pass and loss
        outputs = model(embeddings)
        loss = cross_entropy(outputs, labels)
        running_loss += loss.item()

        # backward pass
        loss.backward()

        # cap the gradient norm at 1.0 to guard against exploding gradients
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # parameter update
        optimizer.step()

        # keep the outputs as a NumPy array on the CPU
        batch_outputs.append(outputs.detach().cpu().numpy())

    # mean loss over all batches of the epoch
    avg_loss = running_loss / n_batches

    # stack the per-batch outputs into one array
    total_preds = np.concatenate(batch_outputs, axis=0)

    return avg_loss, total_preds


In [None]:
def format_time(elapsed):
    """Format a duration given in seconds as an ``HH:MM:SS`` string.

    The duration is rounded to whole seconds before formatting.
    """
    whole_seconds = int(round(elapsed))
    broken_down = time.gmtime(whole_seconds)
    return time.strftime("%H:%M:%S", broken_down)

# reference timestamp for measuring elapsed time
t0 = time.time()

In [None]:
# Function for evaluating the model
def evaluate():
    """Evaluate ``model`` on ``val_dataloader`` without updating any weights.

    Relies on the module-level ``model``, ``val_dataloader``, ``device`` and
    ``cross_entropy``. It does not depend on any timing helper, so it can be
    run regardless of which other cells have been executed.

    Returns:
        tuple: ``(avg_loss, total_preds)`` - the mean batch loss over the
        validation set and a NumPy array holding the model outputs of all
        batches, concatenated along the first axis.
    """
    print("\nEvaluating...")

    # deactivate dropout layers
    model.eval()

    n_batches = len(val_dataloader)
    total_loss = 0

    # empty list to save the model predictions
    total_preds = []

    # no gradients are needed for validation
    with torch.no_grad():
        for step, batch in enumerate(val_dataloader):

            # progress update every 50 batches
            if step % 50 == 0 and step != 0:
                print('  Batch {:>5,}  of  {:>5,}.'.format(step, n_batches))

            # push the batch to the target device and unpack it
            embeddings, labels = [t.to(device) for t in batch]

            # model predictions and validation loss for this batch
            preds = model(embeddings)
            loss = cross_entropy(preds, labels)
            total_loss += loss.item()

            total_preds.append(preds.detach().cpu().numpy())

    # mean validation loss of the epoch
    avg_loss = total_loss / n_batches

    # stack the per-batch predictions into one array
    total_preds = np.concatenate(total_preds, axis=0)

    return avg_loss, total_preds


### 2.2 Fine-Tuning Process
<a name="fine-tuning-process"></a>

In [None]:
# Train for `epochs` epochs and keep the weights of the epoch with the lowest
# validation loss. The checkpoint is written to the Drive location that the
# post-processing section loads from, so the load there finds the best model.
best_weights_path = '/content/drive/My Drive/Colab Notebooks/master_saved_weights_SAP_0410.pt'

# set initial loss to infinite
best_valid_loss = float('inf')

# empty lists to store training and validation loss of each epoch
train_losses = []
valid_losses = []

# for each epoch
for epoch in range(epochs):

    print('\n Epoch {:} / {:}'.format(epoch + 1, epochs))

    # train model
    train_loss, _ = train()

    # evaluate model
    valid_loss, _ = evaluate()

    # save the best model seen so far
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), best_weights_path)

    # append training and validation loss
    train_losses.append(train_loss)
    valid_losses.append(valid_loss)

    print(f'\nTraining Loss: {train_loss:.3f}')
    print(f'Validation Loss: {valid_loss:.3f}')

In [None]:
# Saving the weights
#path = '/content/drive/My Drive/Colab Notebooks/master_saved_weights_SAP_0410.pt'
#torch.save(model.state_dict(), path)

## 3. Post-Processing
<a name="post"></a>

### 3.1 Embeddings Generation
<a name="emb-gen"></a>

In [None]:
# Load the saved checkpoint from Drive and restore it into the model.
# `map_location` keeps the load working when the checkpoint was written on a
# different device than the one currently in use.
model_path = '/content/drive/My Drive/Colab Notebooks/master_saved_weights_SAP_0410.pt'
saved_weights = torch.load(model_path, map_location=device)
model.load_state_dict(saved_weights)

In [None]:
# Put the SBERT encoder held by the model into evaluation mode
model.sbert.eval()

In [None]:
# Collect all headlines of the dataset as a plain Python list
titles = dataset['title'].tolist()

In [None]:
# Encode the titles to get embeddings
# NOTE(review): training feeds pre-computed embeddings through the dense layers
# only (`SBERT_Arch.forward` never calls `self.sbert`), so the encoder presumably
# receives no gradient and these embeddings equal the pre-trained ones —
# confirm that this is intended.
finetuned_embeddings = sbert.encode(titles, convert_to_tensor=True)

In [None]:
# Convert the embeddings tensor into nested Python lists
# (presumably one inner list per title — confirm against the encoder output shape)
finetuned_embeddings_list = finetuned_embeddings.tolist()

### 3.2 Embeddings Averaging over Dates
<a name="emb-avg"></a>

In [None]:
# Store the embeddings in a new DataFrame column, one entry per row
dataset['finetuned_embeddings'] = finetuned_embeddings_list

In [None]:
# Ensure that 'datetime' is a datetime type for proper grouping
# NOTE(review): the later grouping uses the full timestamp; if the column carries a
# time-of-day component, rows of the same calendar date would fall into
# different groups — confirm that the values are dates only.
dataset['datetime'] = pd.to_datetime(dataset['datetime'])

# Function to average lists of embeddings
def average_embeddings(embedding_lists):
    """Return the element-wise mean of several equally long embedding vectors.

    Args:
        embedding_lists: Iterable of embedding vectors, e.g. a pandas Series
            whose elements are lists of floats, or a plain list of lists.

    Returns:
        list: The mean vector as a plain Python list.
    """
    # Materialise the vectors as a list first: a pandas Series of lists
    # converts to a 1-D object array, on which the mean over axis 0 does not
    # average element-wise. A list of lists becomes a 2-D numeric array with
    # one row per vector.
    embedding_array = np.array(list(embedding_lists), dtype=float)
    # Mean over the rows, i.e. one value per embedding dimension
    mean_embeddings = np.mean(embedding_array, axis=0)
    return mean_embeddings.tolist()

# One aggregated embedding per distinct 'datetime' value
averaged_embeddings = (
    dataset.groupby('datetime')['finetuned_embeddings']
    .agg(average_embeddings)
    .reset_index()
)

# Name the columns after their content
averaged_embeddings.columns = ['datetime', 'finetuned_embeddings_date']

# Attach the aggregated embedding of its 'datetime' group to every original row
dataset = dataset.merge(averaged_embeddings, on='datetime', how='left')

In [None]:
# Lists are not hashable, so store their string representation to allow de-duplication
dataset['finetuned_embeddings_date'] = dataset['finetuned_embeddings_date'].apply(str)

# Keep one row per distinct ('datetime', averaged embedding) combination
finetuning_output_SAP = dataset[['datetime', 'finetuned_embeddings_date']].drop_duplicates()

# Index the result by 'datetime' and label that index 'date'
finetuning_output_SAP = finetuning_output_SAP.set_index('datetime')
finetuning_output_SAP.index.name = 'date'

# Display the resulting DataFrame
finetuning_output_SAP

In [None]:
# Save the DataFrame (indexed by 'date') to a CSV file on Google Drive
finetuning_output_SAP.to_csv('/content/drive/My Drive/Colab Notebooks/finetuning_output_SAP.csv')