In [1]:
from pathlib import Path
from typing import List

import numpy as np
import pandas as pd
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

In [None]:
# =============================================================================
# 1. Paths & basic settings
# =============================================================================
# Inference settings: which FinBERT checkpoint to use and how many texts are
# pushed through the model per forward pass (tune to available GPU/CPU memory).
MODEL_NAME = "ProsusAI/finbert"
BATCH_SIZE = 32

# Input panel (must contain a "text" column) and the destination of the
# sentiment-augmented panel.
PANEL_INPUT = Path(r"C:\1_Projekte\quantum_cognition\data\1-2_trancsript_panel\transcript_panel.csv")
PANEL_OUTPUT = Path(r"C:\1_Projekte\quantum_cognition\data\1-3_panel_with_sentiment\transcript_panel_finbert.csv")  # TODO: adjust

# Prefer the GPU when one is available, otherwise fall back to the CPU.
# To force CPU execution, assign torch.device("cpu") here instead.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {DEVICE}")


Using device: cpu


In [3]:
# =============================================================================
# 2. Load panel
# =============================================================================
df = pd.read_csv(PANEL_INPUT)

# Fail early when the column that is fed to the model is missing.
if "text" not in df:
    raise KeyError("Spalte 'text' fehlt im Panel-DataFrame.")

# Missing values become empty strings; every remaining entry is cast to str.
text_column = df["text"]
df["text"] = text_column.fillna("").astype(str)

print(f"Panel geladen: {df.shape[0]} Zeilen, {df.shape[1]} Spalten.")
df.head()


Panel geladen: 28157 Zeilen, 6 Spalten.


Unnamed: 0,folder_relative,file_name,segment_index,start_time,end_time,text
0,ABI.BR,earnings_conference_call_ABI.BR_2024_Q3_202410...,0,00:00.000,00:08.680,"Welcome to AB InBev's third quarter, 2024 earn..."
1,ABI.BR,earnings_conference_call_ABI.BR_2024_Q3_202410...,1,00:08.680,00:13.920,Hosting the call today from AB InBev are Mr. M...
2,ABI.BR,earnings_conference_call_ABI.BR_2024_Q3_202410...,2,00:13.920,00:16.880,"and Mr. Fernando Tenenbaum, Chief Financial Of..."
3,ABI.BR,earnings_conference_call_ABI.BR_2024_Q3_202410...,3,00:16.880,00:22.200,To access the slides accompanying today's call...
4,ABI.BR,earnings_conference_call_ABI.BR_2024_Q3_202410...,4,00:22.200,00:29.480,www.ab-inbev.com and click on the Investors ta...


In [4]:
# =============================================================================
# 3. Load FinBERT
# =============================================================================
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Load the classifier, move it to the selected device and switch it to
# evaluation mode in a single chain (both calls return the module itself).
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME).to(DEVICE).eval()

# Label mappings taken from the model configuration, printed for inspection.
id2label, label2id = model.config.id2label, model.config.label2id
print("FinBERT Labels:", id2label)


tokenizer_config.json:   0%|          | 0.00/252 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config.json:   0%|          | 0.00/758 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

FinBERT Labels: {0: 'positive', 1: 'negative', 2: 'neutral'}


In [5]:
# =============================================================================
# 4. Batched inference: texts -> probabilities & label
# =============================================================================
def finbert_predict_proba(
    texts: List[str],
    batch_size: int = 32,
    max_length: int = 128,
) -> np.ndarray:
    """Compute FinBERT softmax probabilities for a list of texts.

    Uses the module-level ``tokenizer``, ``model`` and ``DEVICE`` objects.

    Args:
        texts: List of text strings. May be empty.
        batch_size: Number of texts per forward pass (time/memory trade-off).
            Must be a positive integer.
        max_length: Maximum token length per text; longer inputs are
            truncated (BERT limit: 512).

    Returns:
        2D array of shape (n_samples, n_labels) with softmax probabilities.
        For an empty ``texts`` list, a float32 array of shape
        (0, model.config.num_labels) is returned.

    Raises:
        ValueError: If ``batch_size`` is not a positive integer.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}.")

    n = len(texts)
    if n == 0:
        # np.vstack([]) raises on an empty list, so return a correctly shaped
        # empty result that downstream argmax / column slicing can handle.
        return np.empty((0, model.config.num_labels), dtype=np.float32)

    probs_list: List[np.ndarray] = []

    for start in range(0, n, batch_size):
        # Slicing clamps at the end of the list, so the last batch may be shorter.
        batch_texts = texts[start:start + batch_size]

        # Pad to the longest text in the batch and truncate to max_length.
        enc = tokenizer(
            batch_texts,
            padding=True,
            truncation=True,
            max_length=max_length,
            return_tensors="pt",
        ).to(DEVICE)

        # Inference only: no gradients are needed.
        with torch.no_grad():
            batch_logits = model(**enc).logits
            batch_probs = torch.softmax(batch_logits, dim=-1)

        # Move results to host memory before converting to NumPy.
        probs_list.append(batch_probs.cpu().numpy())

    return np.vstack(probs_list)


In [6]:
# =============================================================================
# 5. Apply FinBERT to the panel
# =============================================================================
texts = list(df["text"])

# One row of class probabilities per text segment.
probs = finbert_predict_proba(
    texts=texts,
    batch_size=BATCH_SIZE,
    max_length=128,  # increase if the segments are very long
)

# Hard label: position of the most probable class, mapped to its name
# (e.g. "positive", "negative", "neutral").
label_indices = np.argmax(probs, axis=1)
labels = [id2label[int(idx)] for idx in label_indices]
df["finbert_label"] = labels

# One probability column per label; float32 saves memory.
for label_id, label_name in id2label.items():
    df[f"finbert_prob_{label_name.lower()}"] = probs[:, int(label_id)].astype("float32")

print("Beispielauszug:")
df.loc[
    :,
    ["text", "finbert_label", *(c for c in df.columns if c.startswith("finbert_prob_"))],
].head()


Beispielauszug:


Unnamed: 0,text,finbert_label,finbert_prob_positive,finbert_prob_negative,finbert_prob_neutral
0,"Welcome to AB InBev's third quarter, 2024 earn...",neutral,0.06914,0.012696,0.918164
1,Hosting the call today from AB InBev are Mr. M...,neutral,0.057243,0.012598,0.930159
2,"and Mr. Fernando Tenenbaum, Chief Financial Of...",neutral,0.036067,0.01954,0.944393
3,To access the slides accompanying today's call...,neutral,0.043265,0.01498,0.941754
4,www.ab-inbev.com and click on the Investors ta...,neutral,0.02885,0.022394,0.948756


In [7]:
# =============================================================================
# 6. Export
# =============================================================================
# DataFrame.to_csv does not create missing directories; create the target
# folder up front so a long inference run is not lost to a missing path.
PANEL_OUTPUT.parent.mkdir(parents=True, exist_ok=True)

df.to_csv(PANEL_OUTPUT, index=False, encoding="utf-8")
print(f"Fertig. {df.shape[0]} Zeilen exportiert nach:\n{PANEL_OUTPUT}")


Fertig. 28157 Zeilen exportiert nach:
C:\1_Projekte\quantum_cognition\data\1-3_panel_with_sentiment\transcript_panel_finbert.csv
