In [1]:
%env TOKENIZERS_PARALLELISM=false
%env WANDB_DISABLED=true
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from transformers import (
    AutoTokenizer, 
    AutoModel, 
    AdamW, 
    AutoConfig, 
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
    AutoModelForSequenceClassification
)

from datasets import Dataset

import torch
import torch.nn as nn
from tqdm.auto import tqdm
from torch.utils.data import DataLoader

env: TOKENIZERS_PARALLELISM=false
env: WANDB_DISABLED=true


In [2]:
class args:
    model = 'ProsusAI/finbert'

In [3]:

df = pd.read_csv('/kaggle/input/news-sentiment-analysis/news.csv')

In [4]:
df.rename(columns={'sentiment': 'labels', 'news': 'messages'}, inplace=True)

# Set date as index
df['date'] = pd.to_datetime(df['date'])  # Convert to datetime if necessary
df.set_index('date', inplace=True)

In [5]:
le = LabelEncoder()
df['labels'] = le.fit_transform(df['labels'])
df['labels'].value_counts()

labels
0    217443
1    210039
Name: count, dtype: int64

In [6]:
sentences_lengths = np.array(list(map(len, df['messages'])))
np.max(sentences_lengths)

1586

In [7]:
MAX_LEN = int(np.ceil(np.percentile(sentences_lengths, 90)))
MAX_LEN

177

Now let's split the given data into train, test and validation

We will be using the train data to train the mode, the validation data to determiine the performance of the model and the test data to check how the model performs on unseen data

We will stratify on the `labels` so that the data remains balanced for train, test and validation data

In [8]:
# X, y = df['messages'].values, df['labels'].values

# # train : test = 0.9 : 0.1
# xtrain, xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, stratify=y)

# # train : valid = 0.8 : 0.2
# xtrain, xvalid, ytrain, yvalid = train_test_split(xtrain, ytrain, test_size=0.2, stratify=ytrain)

# # train : valid : test = 0.72 : 0.18 : 0.10 (stratified on 'labels')

In [9]:
#df= df.sample(frac=0.001)

In [10]:
df = df.sort_index()

total_rows = len(df)
train_end = int(total_rows * 0.72)
valid_end = train_end + int(total_rows * 0.18)

train = df.iloc[:train_end]
valid = df.iloc[train_end:valid_end]
test = df.iloc[valid_end:]

In [11]:
# Convert to NumPy arrays
X_train = train.drop(columns=['labels']).to_numpy()  # Features for training
y_train = train['labels'].to_numpy()                # Target labels for training

X_valid = valid.drop(columns=['labels']).to_numpy()  # Features for validation
y_valid = valid['labels'].to_numpy()                 # Target labels for validation

X_test = test.drop(columns=['labels']).to_numpy()    # Features for testing
y_test = test['labels'].to_numpy()

# Dataset

In [12]:
X_train = [str(x) for x in X_train]
X_valid = [str(x) for x in X_valid]
X_test = [str(x) for x in X_test]

In [13]:
train_dataset_raw = Dataset.from_dict({'text':X_train, 'labels':y_train})
valid_dataset_raw = Dataset.from_dict({'text':X_valid, 'labels':y_valid})
pred_dataset_raw = Dataset.from_dict({'text': X_test})
pred_dataset_raw

Dataset({
    features: ['text'],
    num_rows: 42749
})

In [14]:
def tokenize_fn(examples):
    return tokenizer(examples['text'], truncation=True)

In [15]:
tokenizer = AutoTokenizer.from_pretrained(args.model)

tokenizer_config.json:   0%|          | 0.00/252 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/758 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



In [16]:
train_dataset_raw

Dataset({
    features: ['text', 'labels'],
    num_rows: 307787
})

In [17]:
train_dataset = train_dataset_raw.map(tokenize_fn, batched=True)
valid_dataset = valid_dataset_raw.map(tokenize_fn, batched=True)

data_collator = DataCollatorWithPadding(tokenizer)

Map:   0%|          | 0/307787 [00:00<?, ? examples/s]

Map:   0%|          | 0/76946 [00:00<?, ? examples/s]

In [18]:
from torch.nn.init import xavier_uniform_
model = AutoModelForSequenceClassification.from_pretrained(args.model,num_labels=2,ignore_mismatched_sizes=True)


# Reinitialize the classification layer
xavier_uniform_(model.classifier.weight)
model.classifier.bias.data.zero_()

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ProsusAI/finbert and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([2, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([2]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tensor([0., 0.])

In [19]:

import optuna
import torch
from transformers import Trainer, TrainingArguments, AdamW
from sklearn.metrics import balanced_accuracy_score, roc_auc_score, f1_score
from transformers import get_scheduler

def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    balanced_acc = balanced_accuracy_score(labels, predictions)
    auc = roc_auc_score(labels, predictions, multi_class='ovr')  # Adjust for multi-class
    f1 = f1_score(labels, predictions, average='weighted')
    return {
        "balanced_accuracy": balanced_acc,
        "auc": auc,
        "f1": f1
        }

In [20]:
from transformers import Trainer, TrainingArguments
import optuna
from optuna.pruners import MedianPruner
from optuna.samplers import TPESampler
import numpy as np
from sklearn.metrics import balanced_accuracy_score, roc_auc_score, f1_score
import torch
from transformers.integrations import is_optuna_available
import joblib


# Define a custom compute_metrics function that returns multiple metrics
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    
    # Convert predictions to probabilities for AUC (for binary classification)
    probs = torch.nn.functional.softmax(torch.tensor(pred.predictions), dim=-1).numpy()
    
    # Calculate metrics
    balanced_acc = balanced_accuracy_score(labels, preds)
    
    # Use the probability of the positive class
    auc = roc_auc_score(labels, probs[:, 1])
    f1 = f1_score(labels, preds)
    
    return {
        'balanced_accuracy': balanced_acc,
        'auc': auc,
        'f1': f1
    }

# Define the objective function for Optuna
def objective(trial):
    # Define hyperparameters to optimize
    batch_size = trial.suggest_categorical("batch_size", [8, 16, 32])
    learning_rate = trial.suggest_float("learning_rate", 1e-6, 1e-4, log=True)
    weight_decay = trial.suggest_float("weight_decay", 0.005, 0.02)
    warmup_ratio = trial.suggest_float("warmup_ratio", 0.05, 0.2)
    num_epochs = trial.suggest_int("num_epochs", 2, 8)
    
    # Optimizer selection
    optimizer_name = trial.suggest_categorical("optimizer_name", [ "adamw_hf","adafactor", "sgd"])
    
    # Define training arguments
    training_args = TrainingArguments(
        output_dir=f'/kaggle/Finbert_Trial_{trial.number}/',
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size*2,
        num_train_epochs=num_epochs,
        learning_rate=learning_rate,
        weight_decay=weight_decay,
        warmup_ratio=warmup_ratio,
        do_eval=True,
        do_train=True,
        evaluation_strategy='epoch',
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model='eval_f1',  # Change this to your desired metric
        greater_is_better=True,
        optim=optimizer_name.lower(),
        # ReduceLROnPlateau-like functionality
        lr_scheduler_type="reduce_lr_on_plateau",
        logging_dir=f'/kaggle/Finbert_Logs_{trial.number}/',
        report_to="none",  # Disable wandb, tensorboard etc. during hyperparameter search
    )
    
    # Initialize the Trainer
    trainer = Trainer(
        model=model,  # Make sure this is already defined in your global scope
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=valid_dataset,
        data_collator=data_collator,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )
    
    # Train the model
    trainer.train()
    
    # Evaluate on validation set
    eval_result = trainer.evaluate()
    try:
        joblib.dump(study, 'finbert_optuna_result.pkl')
    except Exception as e:
        print("There is a tiny error with saving. Moving on... ", e)
        print(param)

    return eval_result["eval_f1"]


from stopit import threading_timeoutable as timeoutable
 
@timeoutable()
def start():
    ### import warnings 
    import warnings 
    import optuna
 
    warnings.filterwarnings("ignore")
 
    optuna.logging.set_verbosity(optuna.logging.ERROR)
    global study

    USE_PREVIOUS_TRIALS=True
    
    import os
    if os.path.isfile('/kaggle/input/finbert-hyper-tuning/finbert_optuna_result.pkl') and USE_PREVIOUS_TRIALS==True:
        print("Found file !")
        study = joblib.load('/kaggle/input/finbert-hyper-tuning/finbert_optuna_result.pkl')
        print(study.best_trial.value)
    else:
        print("STARTING NEW TRAINING")
        sampler = optuna.samplers.TPESampler(multivariate=True, warn_independent_sampling=False)
        pruner = MedianPruner(n_startup_trials=5, n_warmup_steps=5)

        study = optuna.create_study(direction='maximize', load_if_exists=True,sampler=sampler,pruner=pruner)
 
    study.optimize(objective, n_trials=1_000_000, gc_after_trial=True,
                  show_progress_bar=True, n_jobs=1, )
    joblib.dump(study, 'finbert_optuna_result.pkl')


# # Create the Optuna study
# sampler = TPESampler(seed=42)  # For reproducibility
# pruner = MedianPruner(n_startup_trials=5, n_warmup_steps=5)

# study = optuna.create_study(
#     study_name="finbert_optimization",
#     direction="maximize",  # We want to maximize our metrics
#     sampler=sampler,
#     pruner=pruner
# )

# # Run the optimization
# n_trials = 8
# study.optimize(objective, n_trials=n_trials)

 
try:
    start(timeout=42_500) #42_500
except Exception as e:
    print("Hoba, oshibka !: ", e)


Found file !
0.5029949857704297


  0%|          | 0/1000000 [00:00<?, ?it/s]

Epoch,Training Loss,Validation Loss,Balanced Accuracy,Auc,F1
1,0.0768,0.084672,0.96476,0.995883,0.965989
2,0.0712,0.075228,0.968349,0.997125,0.969134
3,0.0548,0.100365,0.949726,0.996072,0.953847
4,0.052,0.081325,0.969052,0.997203,0.970643
5,0.0501,0.076791,0.971293,0.99747,0.972372
6,0.0433,0.123533,0.953802,0.996652,0.957482


Epoch,Training Loss,Validation Loss,Balanced Accuracy,Auc,F1
1,0.0595,0.205791,0.947691,0.995693,0.951839
2,0.059,0.161554,0.971231,0.997894,0.973016
3,0.0478,0.136152,0.973191,0.997836,0.974475


In [21]:
print("DONE")

DONE


In [22]:
study = joblib.load('finbert_optuna_result.pkl')
print("Best trial:")
trial = study.best_trial
print(f"  Value: {trial.value}")
print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

Best trial:
  Value: 0.5029949857704297
  Params: 
    batch_size: 32
    learning_rate: 2.8445060587748435e-06
    weight_decay: 0.019476579288667823
    warmup_ratio: 0.13883693882206333
    num_epochs: 3
    optimizer_name: sgd


In [23]:
# # Print the results

# study = joblib.load('finbert_optuna_result.pkl')
# print("Best trial:")
# trial = study.best_trial
# print(f"  Value: {trial.value}")
# print("  Params: ")
# for key, value in trial.params.items():
#     print(f"    {key}: {value}")

# # Train the final model with the best parameters
# best_batch_size = trial.params["batch_size"]
# best_lr = trial.params["learning_rate"]
# best_weight_decay = trial.params["weight_decay"]
# best_warmup_ratio = trial.params["warmup_ratio"]
# best_num_epochs = trial.params["num_epochs"]
# best_optimizer = trial.params["optimizer"]

# final_training_args = TrainingArguments(
#     output_dir='./Finbert_Final/',
#     per_device_train_batch_size=best_batch_size,
#     per_device_eval_batch_size=best_batch_size*2,
#     num_train_epochs=best_num_epochs,
#     learning_rate=best_lr,
#     weight_decay=best_weight_decay,
#     warmup_ratio=best_warmup_ratio,
#     do_eval=True,
#     do_train=True,
#     evaluation_strategy='epoch',
#     save_strategy="epoch",
#     load_best_model_at_end=True,
#     metric_for_best_model='f1',
#     optim=best_optimizer.lower(),
#     lr_scheduler_type="reduce_lr_on_plateau",
# )

# final_trainer = Trainer(
#     model=model,
#     args=final_training_args,
#     train_dataset=train_dataset,
#     eval_dataset=valid_dataset,
#     data_collator=data_collator,
#     tokenizer=tokenizer,
#     compute_metrics=compute_metrics,
# )

# # Train the final model
# print("Training final model with best parameters...")
# final_trainer.train()

# # Evaluate the final model
# final_metrics = final_trainer.evaluate()
# print(f"Final model metrics: {final_metrics}")

In [24]:
# train_args = TrainingArguments(
#     './Finbert Trained/',
#     per_device_train_batch_size=16,
#     per_device_eval_batch_size=2*16,
#     num_train_epochs=5,
#     learning_rate=1e-5,
#     weight_decay=0.01,
#     warmup_ratio=0.1,    
#     do_eval=True,
#     do_train=True,
#     do_predict=True,
#     eval_strategy='epoch',
#     save_strategy="no",
# )

In [25]:
# trainer = Trainer(
#     model,
#     train_args,
#     train_dataset=train_dataset,
#     eval_dataset=valid_dataset,
#     data_collator=data_collator,
#     tokenizer=tokenizer,
#     compute_metrics=compute_metrics,
# )

In [26]:
# trainer.train()

In [27]:
# trainer.save_model('finbert_finetuned.bin')

In [28]:
# pred_dataset = pred_dataset_raw.map(tokenize_fn, batched=False)

In [29]:
# len(pred_dataset[0]['input_ids'])

In [30]:
# output = trainer.predict(
#     test_dataset=pred_dataset,
# )

In [31]:
# output.predictions

In [32]:
# preds = [np.argmax(x) for x in output.predictions]

In [33]:

# from sklearn.metrics import confusion_matrix, accuracy_score
# import seaborn as sns
# import matplotlib.pyplot as plt

# cm = confusion_matrix(y_test, preds)

# # Define class labels
# labels = [0, 1]

# # Create a heatmap
# plt.figure(figsize=(8, 6))
# sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
#             xticklabels=labels, yticklabels=labels)

# # Add labels and title
# plt.xlabel('Predicted Labels')
# plt.ylabel('True Labels')
# plt.title('Confusion Matrix Heatmap')

# # Display the plot
# plt.show()

In [34]:
# accuracy_score(y_test, preds)

In [35]:
# from sklearn.metrics import classification_report
# print(classification_report(y_test, preds))

In [36]:
# news = pd.read_csv('/kaggle/input/sentiment-analysis-for-financial-news/all-data.csv',names= ['Sentiment','Sentence'],encoding='ISO-8859-1')

In [37]:
# news

In [38]:
# x_test, Y_test = news['Sentence'].values, news['Sentiment'].values

In [39]:
# le = LabelEncoder()
# Y_test = le.fit_transform(Y_test)

In [40]:
# np.unique(Y_test)

In [41]:
# test_dataset_raw = Dataset.from_dict({'text': x_test})
# test_dataset_raw

In [42]:
# test_dataset = test_dataset_raw.map(tokenize_fn, batched=False)

In [43]:
# output = trainer.predict(
#     test_dataset=test_dataset,
# )

In [44]:
# preds = [np.argmax(x) for x in output.predictions]

In [45]:
# output.predictions

In [46]:
# np.unique(preds)

In [47]:
# def transform(x):
#     if x == 2:
#         return 1
#     elif x == 1:
#         return 1
#     else:  # x == 0
#         return 0
        
# vectorized_transform = np.vectorize(transform)
# Y_test = vectorized_transform(Y_test)

In [48]:
# np.unique(Y_test)

In [49]:
# cm = confusion_matrix(Y_test, preds)

# # Define class labels
# labels = [0, 1]

# # Create a heatmap
# plt.figure(figsize=(8, 6))
# sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
#             xticklabels=labels, yticklabels=labels)

# # Add labels and title
# plt.xlabel('Predicted Labels')
# plt.ylabel('True Labels')
# plt.title('Confusion Matrix Heatmap')

# # Display the plot
# plt.show()

In [50]:
# accuracy_score(Y_test, preds)

In [51]:
# print(classification_report(Y_test, preds))