In [None]:
# Imports
import os
import warnings
from io import StringIO
import pandas as pd
import gcsfs
from google.cloud import storage, bigquery
from datasets import Dataset, load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    ProgressCallback
)
import torch
import evaluate
import inspect
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, f1_score, accuracy_score
import matplotlib.pyplot as plt

In [None]:
# Point the Google client libraries at the service-account key file.
# NOTE(review): absolute, machine-specific path — consider making it configurable.
os.environ.update(GOOGLE_APPLICATION_CREDENTIALS="/mnt/disks/data/diss_bucket_key.json")
# Turn off all warnings
warnings.simplefilter('ignore')

In [None]:
# Detect number of GPUs
gpus = torch.cuda.device_count()
print(f"Number of GPUs available: {gpus}")
if gpus > 0:
    # Name of the first visible device.
    print(torch.cuda.get_device_name(0))
else:
    # Querying device 0 raises when no CUDA device is present; report it
    # instead of aborting the notebook in this diagnostic cell.
    print("No CUDA device detected")

In [None]:
# === Disable Weights & Biases to avoid API prompts ===
# Both variables are set in a single call; values are unchanged.
os.environ.update({
    "WANDB_DISABLED": "true",
    "WANDB_MODE": "dryrun",
})

In [None]:
# === Configuration ===
# BigQuery location of the labelled filing chunks (training and evaluation tables).
PROJECT_ID = "bamboo-mercury-462915-f0"
BQ_DATASET = "edgar_sentiment"
BQ_TABLE = "filing_scores"
BQ_TABLE_EVAL = "filing_scores_evaluation"
# NOTE(review): REGION is not referenced by any code visible in this notebook.
REGION = "europe-west2"
# Plain string literal: the previous f-string had no placeholders.
# NOTE(review): this value is passed to TrainingArguments(output_dir=...); a gs:// URI is
# presumably treated as a local path there rather than uploaded to GCS — confirm.
OUTPUT_DIR = "gs://diss_market_data/finbert-finetune-output"
# Pretrained checkpoint to fine-tune and the number of sentiment classes.
MODEL_NAME = "yiyanghkust/finbert-tone"
NUM_LABELS = 3

# 🔁 Optional: Try stronger models if F1 plateaus
# MODEL_NAME = "ProsusAI/finbert"  # financial-news specific FinBERT
# MODEL_NAME = "microsoft/deberta-v3-base"  # general-purpose DeBERTa-v3 base (not fine-tuned on financial news)

In [None]:
# === Environment Variables for HPC ===
# All process-level settings are applied in one call; keys and values are unchanged.
# The WANDB_* entries repeat the values set in the Weights & Biases cell.
os.environ.update({
    "GOOGLE_CLOUD_PROJECT": PROJECT_ID,
    "TOKENIZERS_PARALLELISM": "false",
    "WANDB_DISABLED": "true",
    "WANDB_MODE": "dryrun",
    "NCCL_DEBUG": "INFO",
    "NCCL_P2P_DISABLE": "1",
})

In [None]:
# === Extract data from BigQuery for training ===
# Builds the training frame from three per-label sub-queries joined with UNION ALL:
#   - 'Positive' rows are selected without a LIMIT,
#   - 'Negative' and 'Neutral' rows are each capped at 4500.
# NOTE(review): the LIMIT clauses have no ORDER BY, so which rows are returned is
# presumably not stable between runs — confirm if reproducibility matters.
# NOTE(review): the client is created without an explicit project; presumably it is
# resolved from the credentials / environment — confirm.
bq_client = bigquery.Client()
query = f"""(SELECT chunk_text, sentiment_score, sentiment_label FROM `{PROJECT_ID}.{BQ_DATASET}.{BQ_TABLE}`
        where sentiment_label IN ('Positive'))
        UNION ALL
        (SELECT chunk_text, sentiment_score, sentiment_label FROM `{PROJECT_ID}.{BQ_DATASET}.{BQ_TABLE}`
        where sentiment_label IN ('Negative') LIMIT 4500)
        UNION ALL
        (SELECT chunk_text, sentiment_score, sentiment_label FROM `{PROJECT_ID}.{BQ_DATASET}.{BQ_TABLE}`
        where sentiment_label IN ('Neutral') LIMIT 4500)
        """
# Run the query and materialise the result as a pandas DataFrame.
df_filings = bq_client.query(query).to_dataframe()
print(f"Retrieved {len(df_filings)} rows from filings training table {BQ_DATASET}.{BQ_TABLE}")

In [None]:
# === Extract data from BigQuery for evaluation ===
# Selects every row of the evaluation table whose label is one of the three classes.
# `chunk_text` is aliased to `text` in SQL, so this frame already uses the column
# name expected later when the Hugging Face dataset is built.
# Reuses `bq_client` created in the training-extraction cell; `query` is rebound here.
query = f"""SELECT chunk_text text, sentiment_label FROM `{PROJECT_ID}.{BQ_DATASET}.{BQ_TABLE_EVAL}`
        where sentiment_label IN ('Positive', 'Negative', 'Neutral')
        """
df_eval = bq_client.query(query).to_dataframe()
print(f"Retrieved {len(df_eval)} rows from evaluation table {BQ_DATASET}.{BQ_TABLE_EVAL}")

In [None]:
# Gemini Labeled Filings in chunks
# Integer class ids used throughout the notebook: Neutral=0, Positive=1, Negative=2.
label_map = {"Negative": 2, "Neutral": 0, "Positive": 1}
df_filings['label'] = df_filings['sentiment_label'].map(label_map)

# Guarded so the cell is safe to re-run: once `sentiment_label` has been dropped
# from df_eval, mapping it again would raise KeyError. The drop rebinds df_eval
# instead of mutating it in place.
if 'sentiment_label' in df_eval.columns:
    df_eval['label'] = df_eval['sentiment_label'].map(label_map)
    df_eval = df_eval.drop(columns=['sentiment_label'])

# `.map` yields NaN for any label missing from label_map; fail here rather than
# letting NaN labels reach training.
assert df_filings['label'].notna().all(), "df_filings contains sentiment labels missing from label_map"
assert df_eval['label'].notna().all(), "df_eval contains sentiment labels missing from label_map"

In [None]:
# # /financial-tweets-sentiment
# ds_tweets = load_dataset("TimKoornstra/financial-tweets-sentiment")
# df_tweets = pd.DataFrame(ds_tweets['train'])

In [None]:
# financial_phrasebank
# NOTE(review): absolute, machine-specific path — consider deriving it from a configurable data directory.
_phrasebank_csv = "/mnt/disks/data/project_data/Financial_Phrasebank_50__Agree.csv"
df_phrasebank = pd.read_csv(_phrasebank_csv)

In [None]:
# Changing Labels to match -> {"Negative": 2, "Neutral": 0, "Positive": 1}
label_map = {"negative": 2, "neutral": 0, "positive": 1}

# Re-run guard: after the first execution `label` already holds the integer ids.
# Copying it into `sentiment` and mapping again would turn every label into NaN,
# so the conversion only runs while `label` still holds unmapped values.
if not df_phrasebank['label'].isin(list(label_map.values())).all():
    df_phrasebank['sentiment'] = df_phrasebank['label']  # keep the original string label
    df_phrasebank['label'] = df_phrasebank['sentiment'].map(label_map)

# `.map` yields NaN for strings missing from label_map (e.g. different casing);
# surface that here instead of passing NaN labels on to training.
if df_phrasebank['label'].isna().any():
    raise ValueError("df_phrasebank contains sentiment values that are not keys of label_map")

In [None]:
# Bring both sources to the shared (text, label) schema before they are combined.
df1 = df_filings.loc[:, ['chunk_text', 'label']].rename(columns={'chunk_text': 'text'})
# df2 = df_tweets[['tweet', 'sentiment']].rename(columns={'tweet': 'text', 'sentiment': 'label'})
df3 = df_phrasebank.loc[:, ['sentence', 'label']].rename(columns={'sentence': 'text'})

In [None]:
# Stack the filings and phrasebank examples into one training frame.
# ignore_index=True gives the result a fresh 0..n-1 index; without it the row
# labels of df1 and df3 are kept and therefore repeat in the combined frame.
df_final = pd.concat([df1, df3], ignore_index=True)

In [None]:
# Sanity check: display the column indexes of the training and evaluation frames.
df_final.columns, df_eval.columns

In [None]:
# === Convert to Hugging Face Dataset ===
# Only the text and label columns are carried over from each frame.
_model_columns = ['text', 'label']
train_dataset, eval_dataset = (
    Dataset.from_pandas(frame[_model_columns]) for frame in (df_final, df_eval)
)

In [None]:
# === Tokenization ===
# Load the tokenizer for the checkpoint named by MODEL_NAME; it is used by
# tokenize_batch below and later handed to the Trainer and saved with the model.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize_batch(examples):
    """Tokenize a batch of examples for sequence classification.

    Args:
        examples: Mapping with a 'text' entry holding the batch of input strings
            (the form `Dataset.map(..., batched=True)` passes in).

    Returns:
        The tokenizer output for the batch, truncated to at most 512 tokens and
        left unpadded.
    """
    # No padding here: `padding='longest'` inside `Dataset.map` pads every example
    # to the longest sequence of its map chunk, which stores and computes on
    # needless pad tokens. The Trainer in this notebook is given the tokenizer, so
    # its default collator pads each training/eval batch dynamically instead.
    return tokenizer(
        examples['text'],
        truncation=True,
        max_length=512
    )

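# 🔁 Optional: Try stronger models if F1 plateaus.
# NOTE: change MODEL_NAME in the Configuration cell; reassigning it here has no effect on the tokenizer already loaded above.
# MODEL_NAME = "ProsusAI/finbert"  # financial-news specific FinBERT
# MODEL_NAME = "mrm8488/deberta-v3-small-finetuned-financial-news-sentiment-analysis"  # smaller DeBERTa finetuned on financial news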

In [None]:
# Tokenize both splits with the same batched mapping function.
tokenized_train, tokenized_eval = (
    split.map(tokenize_batch, batched=True) for split in (train_dataset, eval_dataset)
)

In [None]:
# Filter to essential columns
def keep_features(ds):
    """Return `ds` with every column except the model inputs and label removed.

    Args:
        ds: Object exposing `column_names` and `remove_columns(list)`.

    Returns:
        Whatever `ds.remove_columns` returns for the list of surplus columns.
    """
    wanted = ('input_ids', 'attention_mask', 'label')
    surplus = list(filter(lambda column: column not in wanted, ds.column_names))
    return ds.remove_columns(surplus)
# Reduce both tokenized splits to the columns the model consumes.
train_ds = keep_features(tokenized_train)
eval_ds = keep_features(tokenized_eval)

In [None]:
# === Model Setup ===
# Load the MODEL_NAME checkpoint with a NUM_LABELS-way classification head.
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=NUM_LABELS)

# === Configure TrainingArguments ===
# Desired settings. Keys the installed transformers version does not accept are
# filtered out further down, so this dict may list more than is finally used.
base_args = {
    'output_dir': OUTPUT_DIR,
    'num_train_epochs': 5,
    'per_device_train_batch_size': 4,
    'per_device_eval_batch_size': 8,
    'eval_steps': 100,
    'save_steps': 500,
    'logging_steps': 100,
    'learning_rate': 1e-5,
    'warmup_steps': 200,
    'load_best_model_at_end': True,
    'metric_for_best_model': 'eval_loss',
    'save_total_limit': 2,
    'fp16': True,
    'dataloader_num_workers': 4,
    'report_to': 'none',
    'label_smoothing_factor': 0.1,
    'gradient_accumulation_steps': 12,  # to maintain effective batch size

}

# Dynamically add supported strategies
sig = inspect.signature(TrainingArguments)
# The evaluation-strategy argument is called `eval_strategy` in newer transformers
# releases and `evaluation_strategy` in older ones. Probe for both (new name first)
# so step-wise evaluation and best-model loading are not silently dropped on a
# version that only knows the new name.
eval_strategy_key = next(
    (name for name in ('eval_strategy', 'evaluation_strategy') if name in sig.parameters),
    None,
)
if eval_strategy_key is not None and 'save_strategy' in sig.parameters:
    base_args[eval_strategy_key] = 'steps'
    base_args['save_strategy'] = 'steps'
else:
    # Neither strategy argument can be set, so drop the best-model options that
    # depend on periodic evaluation and checkpointing.
    base_args.pop('load_best_model_at_end', None)
    base_args.pop('metric_for_best_model', None)

# Add progress bar settings if supported
if 'disable_tqdm' in sig.parameters:
    base_args['disable_tqdm'] = False
if 'progress_bar_refresh_rate' in sig.parameters:
    base_args['progress_bar_refresh_rate'] = 20

# Initialize TrainingArguments
# Pass only the keys that the installed TrainingArguments signature accepts.
valid_args = {k: v for k, v in base_args.items() if k in sig.parameters}
training_args = TrainingArguments(**valid_args)

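# 🔁 Optional: Try stronger models if F1 plateaus.
# NOTE: change MODEL_NAME in the Configuration cell; reassigning it here has no effect on the model and tokenizer already loaded above.
# MODEL_NAME = "ProsusAI/finbert"  # financial-news specific FinBERT
# MODEL_NAME = "mrm8488/deberta-v3-small-finetuned-financial-news-sentiment-analysis"  # smaller DeBERTa finetuned on financial news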

In [None]:
# === Metrics ===
# Load the accuracy and F1 metric objects once, up front, so compute_metrics
# does not reload them on every evaluation.
accuracy_metric, f1_metric = (evaluate.load(metric_id) for metric_id in ('accuracy', 'f1'))

def compute_metrics(eval_pred):
    """Compute accuracy and weighted F1 for one evaluation pass.

    Args:
        eval_pred: Pair `(logits, labels)`; the predicted class is the argmax of
            `logits` along the last axis.

    Returns:
        Dict with keys 'accuracy' and 'f1'.
    """
    scores, references = eval_pred
    predicted = scores.argmax(axis=-1)

    accuracy_result = accuracy_metric.compute(predictions=predicted, references=references)
    # F1 is averaged over classes weighted by support.
    f1_result = f1_metric.compute(predictions=predicted, references=references, average='weighted')

    return dict(accuracy=accuracy_result['accuracy'], f1=f1_result['f1'])

In [None]:
# === Trainer ===
# Newer transformers releases take the tokenizer through `processing_class` and
# deprecate the `tokenizer` keyword. Choose the keyword from the installed
# signature, in the same way the TrainingArguments options are probed.
trainer_params = inspect.signature(Trainer).parameters
tokenizer_kwarg = 'processing_class' if 'processing_class' in trainer_params else 'tokenizer'

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    compute_metrics=compute_metrics,
    callbacks=[ProgressCallback()],
    **{tokenizer_kwarg: tokenizer},
)

# === Start training ===
trainer.train()

In [None]:
# === Evaluate ===
# Evaluate on the eval dataset given to the Trainer; the returned metrics are
# shown as this cell's output.
trainer.evaluate()

In [None]:
# === Save Final Model ===
# NOTE(review): this rebinds OUTPUT_DIR from the configuration value to a local
# directory; the GCS upload cell reads the rebound value. The directory name says
# "deberta" while MODEL_NAME in the configuration is a FinBERT checkpoint — confirm naming.
OUTPUT_DIR = "./deberta_filings"
# Write the model first, then the tokenizer, into the same directory.
for _save in (trainer.save_model, tokenizer.save_pretrained):
    _save(OUTPUT_DIR)
print(f"Model and checkpoints saved to: {OUTPUT_DIR}")

In [None]:
# === Evaluate ===
results = trainer.evaluate()

# Print nicely
print("\n📊 Evaluation Metrics:")
for metric_name in results:
    print(f"{metric_name}: {results[metric_name]:.4f}")

# Run predictions on the evaluation dataset
predictions = trainer.predict(eval_ds)
# Reference labels as returned by predict, and the argmax class per example.
true_labels = predictions.label_ids
pred_labels = predictions.predictions.argmax(axis=-1)

In [None]:
# === Compute metrics ===
# Class ids and their display names, in id order (0=Neutral, 1=Positive, 2=Negative),
# matching the label maps used when the labels were encoded.
class_ids = [0, 1, 2]
class_names = ["Neutral", "Positive", "Negative"]

accuracy = accuracy_score(true_labels, pred_labels)
f1 = f1_score(true_labels, pred_labels, average='weighted')

# Compute confusion matrix
# `labels=` pins the matrix to all three classes. Without it the matrix only covers
# classes that occur in the data, which would not line up with the three display
# names if a class were absent from both the references and the predictions.
cm = confusion_matrix(true_labels, pred_labels, labels=class_ids)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_names)
disp.plot(cmap='Blues')

# === Add accuracy and F1 to the plot ===
# NOTE(review): the title and file name say "Deberta" while MODEL_NAME in the
# configuration is a FinBERT checkpoint — confirm the intended naming.
plt.title("Confusion Matrix: Deberta Evaluation - Filings")
plt.text(
    2.1, -0.5,  # x, y position (tweak as needed)
    f"Accuracy: {accuracy:.2%}\nF1 Score: {f1:.2%}",
    fontsize=10,
    ha='right',
    va='top',
    bbox=dict(facecolor='white', edgecolor='black', boxstyle='round,pad=0.5')
)

plt.tight_layout()
plt.savefig("confusion_matrix_eval_filings_deberta.png")
print("[SUCCESS] Confusion matrix with metrics saved as 'confusion_matrix_eval_filings_deberta.png'")

# === Detailed classification report ===
from sklearn.metrics import classification_report
print("\n📋 Classification Report:")
# Same pinning as above, so the three target names always match the reported rows.
print(classification_report(true_labels, pred_labels, labels=class_ids, target_names=class_names))

In [None]:
def upload_directory_to_gcs(local_dir, bucket_name, destination_dir):
    """Upload every file under `local_dir` to a Google Cloud Storage bucket.

    The directory tree is walked recursively and each file is uploaded to
    `destination_dir/<path relative to local_dir>`, with one log line per file.

    Args:
        local_dir: Local directory to upload.
        bucket_name: Name of the target bucket.
        destination_dir: Object-name prefix inside the bucket.

    Raises:
        FileNotFoundError: If `local_dir` is not an existing directory. Without
            this check a wrong path would upload nothing and still look successful.
    """
    import posixpath  # local import: object names are always '/'-separated

    if not os.path.isdir(local_dir):
        raise FileNotFoundError(f"Local directory not found: {local_dir}")

    client = storage.Client()
    bucket = client.bucket(bucket_name)

    for root, _, files in os.walk(local_dir):
        for file in files:
            local_path = os.path.join(root, file)
            relative_path = os.path.relpath(local_path, local_dir)
            # Build the object name with forward slashes regardless of the local
            # OS separator, so nested files keep their folder structure in GCS.
            gcs_path = posixpath.join(destination_dir, relative_path.replace(os.sep, "/"))

            blob = bucket.blob(gcs_path)
            blob.upload_from_filename(local_path)
            print(f"[UPLOAD] {local_path} → gs://{bucket_name}/{gcs_path}")

# === Upload Model Directory to GCS ===
# Target bucket and object prefix for the saved model directory.
GCS_BUCKET = "diss_market_data"
GCS_MODEL_PATH = "deberta_filings"
upload_directory_to_gcs(
    local_dir=OUTPUT_DIR,
    bucket_name=GCS_BUCKET,
    destination_dir=GCS_MODEL_PATH,
)
print(f"[DONE] Model uploaded to gs://{GCS_BUCKET}/{GCS_MODEL_PATH}")