# Load Libraries

In [None]:
import pandas as pd
import torch
import numpy as np
from transformers import BertTokenizer, BertModel,RobertaTokenizer, RobertaModel,DistilBertModel, DistilBertTokenizer, DistilBertForSequenceClassification, RobertaTokenizer, AutoTokenizer, AutoModel

In [None]:
# Read the in-sample and out-of-sample datasets from their CSV files
insample_df, outsample_df = [
    pd.read_csv(csv_path)
    for csv_path in ('insample_df.csv', 'outsample_df.csv')
]

In [None]:
# Preview the first two rows of the in-sample frame to check the loaded columns
insample_df.head(2)

Unnamed: 0,companyname,Date From,Date To,date,Weekly Compound Return,Past Return Direction,Future Return Direction,market_cap,headline,eventtype
0,Omnicom Group Inc.,2005-01-08,2005-01-14,2005-01-14,0.018572,Up,Down,16145234.55,iVillage Inc. (NASDAQ:IVIL) acquired Healtholo...,M&A Transaction Announcements
1,Omnicom Group Inc.,2005-01-08,2005-01-14,2005-01-14,0.018572,Up,Down,16145234.55,[No_Headline],[No_Event]


# Preprocess

In [None]:
# Step 2: Encode the target variable

# Encode the 'Up' and 'Down' labels
label_mapping = {'Up': 1, 'Down': 0}


def _encode_direction(df, column='Future Return Direction'):
    """Return a new frame whose ``column`` holds the codes from ``label_mapping``.

    Rows whose label is not a key of ``label_mapping`` (e.g. 'No_Change') are
    dropped, as are rows whose encoded target is NaN.

    Args:
        df: DataFrame containing ``column``.
        column: Name of the target column to filter and encode.

    Returns:
        A DataFrame independent of ``df`` with ``column`` encoded.
    """
    if pd.api.types.is_numeric_dtype(df[column]):
        # The column was already encoded by an earlier run of this cell.
        # Filtering on the string labels again would discard every row, so
        # only the NaN clean-up is repeated.
        return df.dropna(subset=[column])

    # .copy() detaches the filtered rows from the original frame so the
    # assignment below does not write into a slice (SettingWithCopyWarning).
    kept = df[df[column].isin(list(label_mapping))].copy()
    kept[column] = kept[column].map(label_mapping)

    # Drop rows with NaN values in the target variable
    return kept.dropna(subset=[column])


insample_df = _encode_direction(insample_df)
outsample_df = _encode_direction(outsample_df)


# BERT

In [None]:
# Tokenizer and encoder are both loaded from the same pre-trained BERT checkpoint
checkpoint = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(checkpoint)
model = BertModel.from_pretrained(checkpoint)

# Function to generate embeddings in batches
def get_average_embedding_batch(text_list, batch_size=16, max_length=512):
    """Embed each text as the mean of its last-layer token vectors.

    Uses whichever module-level ``tokenizer`` and ``model`` are bound when the
    function is called.

    Args:
        text_list: List of strings to embed.
        batch_size: Number of texts tokenized and encoded per forward pass.
        max_length: Token limit handed to the tokenizer for truncation.

    Returns:
        A list with one 1-D NumPy array per input text, in input order
        (an empty list when ``text_list`` is empty).
    """
    embeddings = []
    for start in range(0, len(text_list), batch_size):
        batch = text_list[start:start + batch_size]
        inputs = tokenizer(batch, return_tensors='pt', padding=True,
                           truncation=True, max_length=max_length)
        with torch.no_grad():
            outputs = model(**inputs)
        hidden = outputs.last_hidden_state
        # Shorter texts are padded up to the longest text in the batch. A plain
        # mean over the token axis would include those padding positions and
        # make an embedding depend on its batch neighbours, so the attention
        # mask is used to average over real tokens only.
        mask = inputs['attention_mask'].unsqueeze(-1).to(hidden.dtype)
        token_counts = mask.sum(dim=1).clamp(min=1)  # guard against divide-by-zero
        batch_embeddings = (hidden * mask).sum(dim=1) / token_counts
        embeddings.extend(batch_embeddings.cpu().numpy())
    return embeddings

# Embed the headlines of each split and store the vectors in an 'embedding' column
for frame in (insample_df, outsample_df):
    headlines = frame['headline'].tolist()
    frame['embedding'] = get_average_embedding_batch(headlines)

In [None]:
# Persist both splits together as a single pickled (insample, outsample) tuple
pickle_path = 'embedding-BERT-AllCompany-NEW.pkl'
with open(pickle_path, 'wb') as handle:
    pd.to_pickle((insample_df, outsample_df), handle)

print("Dataframes with embeddings have been saved to 'embedding-BERT-AllCompany-NEW.pkl'")

Dataframes with embeddings have been saved to 'embedding-BERT-AllCompany-NEW.pkl'


# RoBERTa

In [None]:
# Tokenizer and encoder are both loaded from the same pre-trained RoBERTa checkpoint
checkpoint = 'roberta-base'
tokenizer = RobertaTokenizer.from_pretrained(checkpoint)
model = RobertaModel.from_pretrained(checkpoint)

# Function to generate embeddings in batches
def get_average_embedding_batch(text_list, batch_size=16, max_length=512):
    """Embed each text as the mean of its last-layer token vectors.

    Uses whichever module-level ``tokenizer`` and ``model`` are bound when the
    function is called.

    Args:
        text_list: List of strings to embed.
        batch_size: Number of texts tokenized and encoded per forward pass.
        max_length: Token limit handed to the tokenizer for truncation.

    Returns:
        A list with one 1-D NumPy array per input text, in input order
        (an empty list when ``text_list`` is empty).
    """
    embeddings = []
    for start in range(0, len(text_list), batch_size):
        batch = text_list[start:start + batch_size]
        inputs = tokenizer(batch, return_tensors='pt', padding=True,
                           truncation=True, max_length=max_length)
        with torch.no_grad():
            outputs = model(**inputs)
        hidden = outputs.last_hidden_state
        # Shorter texts are padded up to the longest text in the batch. A plain
        # mean over the token axis would include those padding positions and
        # make an embedding depend on its batch neighbours, so the attention
        # mask is used to average over real tokens only.
        mask = inputs['attention_mask'].unsqueeze(-1).to(hidden.dtype)
        token_counts = mask.sum(dim=1).clamp(min=1)  # guard against divide-by-zero
        batch_embeddings = (hidden * mask).sum(dim=1) / token_counts
        embeddings.extend(batch_embeddings.cpu().numpy())
    return embeddings

# Embed the headlines of each split and store the vectors in an 'embedding' column
for frame in (insample_df, outsample_df):
    headlines = frame['headline'].tolist()
    frame['embedding'] = get_average_embedding_batch(headlines)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Persist both splits together as a single pickled (insample, outsample) tuple
pickle_path = 'embedding-RoBERTa-AllCompany-NEW.pkl'
with open(pickle_path, 'wb') as handle:
    pd.to_pickle((insample_df, outsample_df), handle)

print("Dataframes with embeddings have been saved to 'embedding-RoBERTa-AllCompany-NEW.pkl'")

Dataframes with embeddings have been saved to 'embedding-RoBERTa-AllCompany-NEW.pkl'


# DistilBERT

In [None]:
# Tokenizer and encoder are both loaded from the same pre-trained DistilBERT checkpoint
checkpoint = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizer.from_pretrained(checkpoint)
model = DistilBertModel.from_pretrained(checkpoint)

# Function to generate embeddings in batches
def get_average_embedding_batch(text_list, batch_size=16, max_length=512):
    """Embed each text as the mean of its last-layer token vectors.

    Uses whichever module-level ``tokenizer`` and ``model`` are bound when the
    function is called.

    Args:
        text_list: List of strings to embed.
        batch_size: Number of texts tokenized and encoded per forward pass.
        max_length: Token limit handed to the tokenizer for truncation.

    Returns:
        A list with one 1-D NumPy array per input text, in input order
        (an empty list when ``text_list`` is empty).
    """
    embeddings = []
    for start in range(0, len(text_list), batch_size):
        batch = text_list[start:start + batch_size]
        inputs = tokenizer(batch, return_tensors='pt', padding=True,
                           truncation=True, max_length=max_length)
        with torch.no_grad():
            outputs = model(**inputs)
        hidden = outputs.last_hidden_state
        # Shorter texts are padded up to the longest text in the batch. A plain
        # mean over the token axis would include those padding positions and
        # make an embedding depend on its batch neighbours, so the attention
        # mask is used to average over real tokens only.
        mask = inputs['attention_mask'].unsqueeze(-1).to(hidden.dtype)
        token_counts = mask.sum(dim=1).clamp(min=1)  # guard against divide-by-zero
        batch_embeddings = (hidden * mask).sum(dim=1) / token_counts
        embeddings.extend(batch_embeddings.cpu().numpy())
    return embeddings

# Embed the headlines of each split and store the vectors in an 'embedding' column
for frame in (insample_df, outsample_df):
    headlines = frame['headline'].tolist()
    frame['embedding'] = get_average_embedding_batch(headlines)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [None]:
# Persist both splits together as a single pickled (insample, outsample) tuple
pickle_path = 'embedding-DistilBERT-AllCompany-NEW.pkl'
with open(pickle_path, 'wb') as handle:
    pd.to_pickle((insample_df, outsample_df), handle)

print("Dataframes with embeddings have been saved to 'embedding-DistilBERT-AllCompany-NEW.pkl'")

Dataframes with embeddings have been saved to 'embedding-DistilBERT-AllCompany-NEW.pkl'


# DistilRoBERTa

In [None]:
# Tokenizer and encoder are both loaded from the same pre-trained DistilRoBERTa checkpoint
checkpoint = 'distilroberta-base'
tokenizer = RobertaTokenizer.from_pretrained(checkpoint)
model = RobertaModel.from_pretrained(checkpoint)

# Function to generate embeddings in batches
def get_average_embedding_batch(text_list, batch_size=16, max_length=512):
    """Embed each text as the mean of its last-layer token vectors.

    Uses whichever module-level ``tokenizer`` and ``model`` are bound when the
    function is called.

    Args:
        text_list: List of strings to embed.
        batch_size: Number of texts tokenized and encoded per forward pass.
        max_length: Token limit handed to the tokenizer for truncation.

    Returns:
        A list with one 1-D NumPy array per input text, in input order
        (an empty list when ``text_list`` is empty).
    """
    embeddings = []
    for start in range(0, len(text_list), batch_size):
        batch = text_list[start:start + batch_size]
        inputs = tokenizer(batch, return_tensors='pt', padding=True,
                           truncation=True, max_length=max_length)
        with torch.no_grad():
            outputs = model(**inputs)
        hidden = outputs.last_hidden_state
        # Shorter texts are padded up to the longest text in the batch. A plain
        # mean over the token axis would include those padding positions and
        # make an embedding depend on its batch neighbours, so the attention
        # mask is used to average over real tokens only.
        mask = inputs['attention_mask'].unsqueeze(-1).to(hidden.dtype)
        token_counts = mask.sum(dim=1).clamp(min=1)  # guard against divide-by-zero
        batch_embeddings = (hidden * mask).sum(dim=1) / token_counts
        embeddings.extend(batch_embeddings.cpu().numpy())
    return embeddings

# Embed the headlines of each split and store the vectors in an 'embedding' column
for frame in (insample_df, outsample_df):
    headlines = frame['headline'].tolist()
    frame['embedding'] = get_average_embedding_batch(headlines)

model.safetensors:   0%|          | 0.00/331M [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [None]:
# Persist both splits together as a single pickled (insample, outsample) tuple
pickle_path = 'embedding-DistilRoBERTa-AllCompany-NEW.pkl'
with open(pickle_path, 'wb') as handle:
    pd.to_pickle((insample_df, outsample_df), handle)

print("Dataframes with embeddings have been saved to 'embedding-DistilRoBERTa-AllCompany-NEW.pkl'")

Dataframes with embeddings have been saved to 'embedding-DistilRoBERTa-AllCompany-NEW.pkl'


# FinBERT

In [None]:
# Tokenizer (slow implementation) and encoder both come from the FinBERT checkpoint
checkpoint = 'yiyanghkust/finbert-tone'
tokenizer = AutoTokenizer.from_pretrained(checkpoint, use_fast=False)
model = AutoModel.from_pretrained(checkpoint)

# Function to generate embeddings in batches
def get_average_embedding_batch(text_list, batch_size=16, max_length=512):
    """Embed each text as the mean of its last-layer token vectors.

    Uses whichever module-level ``tokenizer`` and ``model`` are bound when the
    function is called.

    Args:
        text_list: List of strings to embed.
        batch_size: Number of texts tokenized and encoded per forward pass.
        max_length: Token limit handed to the tokenizer for truncation. It is
            passed explicitly because this tokenizer reports no predefined
            maximum length, in which case ``truncation=True`` alone does not
            truncate.

    Returns:
        A list with one 1-D NumPy array per input text, in input order
        (an empty list when ``text_list`` is empty).
    """
    embeddings = []
    for start in range(0, len(text_list), batch_size):
        batch = text_list[start:start + batch_size]
        inputs = tokenizer(batch, return_tensors='pt', padding=True,
                           truncation=True, max_length=max_length)
        with torch.no_grad():
            outputs = model(**inputs, return_dict=True)
        # Per-token vectors from the last layer (this is not the CLS vector)
        last_hidden_states = outputs.last_hidden_state
        # Shorter texts are padded up to the longest text in the batch. A plain
        # mean over the token axis would include those padding positions and
        # make an embedding depend on its batch neighbours, so the attention
        # mask is used to average over real tokens only.
        mask = inputs['attention_mask'].unsqueeze(-1).to(last_hidden_states.dtype)
        token_counts = mask.sum(dim=1).clamp(min=1)  # guard against divide-by-zero
        avg_pooled_embeddings = (last_hidden_states * mask).sum(dim=1) / token_counts
        embeddings.extend(avg_pooled_embeddings.cpu().numpy())
    return embeddings

# Relies on insample_df and outsample_df from the earlier cells, each with a 'headline' column

# Embed the headlines of each split and store the vectors in an 'embedding' column
for frame in (insample_df, outsample_df):
    headlines = frame['headline'].tolist()
    frame['embedding'] = get_average_embedding_batch(headlines)

  return self.fget.__get__(instance, owner)()
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [None]:
# Persist both splits together as a single pickled (insample, outsample) tuple
pickle_path = 'embedding-FinBERT-AllCompany-NEW.pkl'
with open(pickle_path, 'wb') as handle:
    pd.to_pickle((insample_df, outsample_df), handle)

print("Dataframes with embeddings have been saved to 'embedding-FinBERT-AllCompany-NEW.pkl'")

Dataframes with embeddings have been saved to 'embedding-FinBERT-AllCompany-NEW.pkl'
