# Beispiel Sentiment Analysis

In [17]:
# PyTorch installieren
#!pip install torch

In [18]:
# Helper code to find the working directory and list the available input files
import os

# Show where the notebook runs, then walk the complete Kaggle input tree.
print("Aktuelles Arbeitsverzeichnis:", os.getcwd())

input_dir = '/kaggle/input/'

# One combined print per visited directory: path, sub-folders, files, separator.
for root, dirs, files in os.walk(input_dir):
    print(
        f"Verzeichnis: {root}",
        f"Unterordner: {dirs}",
        f"Dateien: {files}",
        '---',
        sep='\n',
    )

Aktuelles Arbeitsverzeichnis: /kaggle/working
Verzeichnis: /kaggle/input/
Unterordner: ['llm-text', 'llm-test-text']
Dateien: []
---
Verzeichnis: /kaggle/input/llm-text
Unterordner: ['TEXT']
Dateien: []
---
Verzeichnis: /kaggle/input/llm-text/TEXT
Unterordner: ['EZB']
Dateien: []
---
Verzeichnis: /kaggle/input/llm-text/TEXT/EZB
Unterordner: ['17_October_2024', '17_April_2025', '14_December_2023', '11_April_2024', '12_December_2024', '21_July_2022', '08_September_2022', '14_September_2023', '12_September_2024', '07_March_2024', '27_October_2022', '16_March_2023', '06_March_2025', '26_October_2023', '27_July_2023', '25_January_2024', '02_February_2023', '30_January_2025', '18_July_2024', '05_June_2025', '09_June_2022', '15_December_2022', '15_June_2023', '04_May_2023', '06_June_2024']
Dateien: []
---
Verzeichnis: /kaggle/input/llm-text/TEXT/EZB/17_October_2024
Unterordner: []
Dateien: ['3_ECONOMIC_ACTIVITY.txt', '5_PRESS_CONFERENCE.txt', '2_INFLATION.txt', '0_FULL.txt', '4_RISK_ASSESSMEN

# Individual models

In [19]:
#finbert single
from transformers import pipeline
import os
import pandas as pd
from datetime import datetime

# OPTION: toggle progress output
SHOW_PROGRESS = True  # set to True to print one line per processed folder

# Load the model; top_k=None requests a score for every label (see transformers pipeline docs)
sentiment_analyzer = pipeline("text-classification", model="ProsusAI/finbert", top_k=None)

# Root folder that holds one sub-folder per date
input_folder = "/kaggle/input/llm-text/TEXT/EZB"

# Collect all date folders, sorted by name for a reproducible processing order
date_folders = sorted([f for f in os.listdir(input_folder) if os.path.isdir(os.path.join(input_folder, f))])

results = []

for date_folder in date_folders:
    conclusion_file = os.path.join(input_folder, date_folder, "0_FULL.txt")

    if os.path.exists(conclusion_file):
        if SHOW_PROGRESS:
            print(f"Verarbeite: {date_folder}")

        # Read the file
        with open(conclusion_file, 'r', encoding='utf-8') as file:
            text = file.read()

        # Split the text into 512-character windows that start every 400
        # characters; windows with 50 or fewer non-blank characters are skipped
        chunks = [text[i:i+512] for i in range(0, len(text), 400) if len(text[i:i+512].strip()) > 50]

        # Score every chunk: optimism = P(positive) - P(negative)
        optimism_scores = []
        for chunk in chunks:
            result = sentiment_analyzer(chunk)

            neg_prob = next(r['score'] for r in result[0] if r['label'] == 'negative')
            pos_prob = next(r['score'] for r in result[0] if r['label'] == 'positive')

            optimism_score = pos_prob - neg_prob
            optimism_scores.append(optimism_score)

        # Overall result: mean over all chunks. An empty or very short file
        # yields no chunks; fall back to 0 instead of dividing by zero.
        if optimism_scores:
            overall_optimism = sum(optimism_scores) / len(optimism_scores)
        else:
            overall_optimism = 0

        # Build the DataFrame row: date, one column per chunk, overall score
        row = {'Date': date_folder}
        for idx, score in enumerate(optimism_scores, 1):
            row[f'Chunk_{idx}'] = round(score, 3)
        row['Overall_Score'] = round(overall_optimism, 3)
        results.append(row)

# Build the DataFrame; documents with fewer chunks get 0 in the missing chunk columns
df = pd.DataFrame(results).fillna(0)


# Order the columns: Date, Chunk_1..Chunk_n (numerically), Overall_Score
chunk_cols = [col for col in df.columns if col.startswith('Chunk_')]
chunk_cols = sorted(chunk_cols, key=lambda x: int(x.split('_')[1]))
column_order = ['Date'] + chunk_cols + ['Overall_Score']
df = df[column_order]

# Zeitliche Sortierung
def parse_date(date_str):
    """Convert a folder name such as ``17_October_2024`` into a ``datetime``.

    Args:
        date_str: Name of the form ``<day>_<full month name>_<year>``.

    Returns:
        The parsed ``datetime``. ``datetime.max`` is returned when the name
        does not consist of exactly three ``_``-separated parts or cannot be
        parsed, so such entries sort after every real date.
    """
    try:
        parts = date_str.split('_')
        if len(parts) != 3:
            return datetime.max
        day, month, year = parts
        return datetime.strptime(f"{day} {month} {year}", "%d %B %Y")
    except (AttributeError, TypeError, ValueError):
        # Only genuine parsing problems (non-string input, invalid day/month/
        # year text) map to the sentinel; a bare ``except`` would also swallow
        # KeyboardInterrupt and SystemExit.
        return datetime.max

# Sort the rows chronologically through a temporary helper column
df['Parsed_Date'] = df['Date'].apply(parse_date)
df_sorted = df.sort_values(by='Parsed_Date')
df_sorted = df_sorted.drop(columns='Parsed_Date')

# Print the complete table without the index column
print(df_sorted.to_string(index=False))

Device set to use cuda:0


Verarbeite: 02_February_2023
Verarbeite: 04_May_2023
Verarbeite: 05_June_2025
Verarbeite: 06_June_2024
Verarbeite: 06_March_2025
Verarbeite: 07_March_2024
Verarbeite: 08_September_2022
Verarbeite: 09_June_2022
Verarbeite: 11_April_2024
Verarbeite: 12_December_2024
Verarbeite: 12_September_2024
Verarbeite: 14_December_2023
Verarbeite: 14_September_2023
Verarbeite: 15_December_2022
Verarbeite: 15_June_2023
Verarbeite: 16_March_2023
Verarbeite: 17_April_2025
Verarbeite: 17_October_2024
Verarbeite: 18_July_2024
Verarbeite: 21_July_2022
Verarbeite: 25_January_2024
Verarbeite: 26_October_2023
Verarbeite: 27_July_2023
Verarbeite: 27_October_2022
Verarbeite: 30_January_2025
             Date  Chunk_1  Chunk_2  Chunk_3  Chunk_4  Chunk_5  Chunk_6  Chunk_7  Chunk_8  Chunk_9  Chunk_10  Chunk_11  Chunk_12  Chunk_13  Chunk_14  Chunk_15  Chunk_16  Chunk_17  Chunk_18  Chunk_19  Chunk_20  Chunk_21  Chunk_22  Chunk_23  Chunk_24  Chunk_25  Chunk_26  Chunk_27  Chunk_28  Chunk_29  Chunk_30  Chunk_31  Chunk

In [20]:
#finbert single
from transformers import pipeline
import os
import pandas as pd
from datetime import datetime

# OPTION: toggle progress output
SHOW_PROGRESS = True  # set to True to print one line per processed folder

# Load the model
sentiment_analyzer = pipeline("text-classification", model="ProsusAI/finbert", top_k=None)

# Root folder that holds one sub-folder per date
input_folder = "/kaggle/input/llm-text/TEXT/EZB"

# Collect all date folders in name order
date_folders = sorted(
    f for f in os.listdir(input_folder)
    if os.path.isdir(os.path.join(input_folder, f))
)

results = []

for date_folder in date_folders:
    conclusion_file = os.path.join(input_folder, date_folder, "0_FULL.txt")

    # Folders without the full-text file are skipped
    if not os.path.exists(conclusion_file):
        continue

    if SHOW_PROGRESS:
        print(f"Verarbeite: {date_folder}")

    # Read the file
    with open(conclusion_file, 'r', encoding='utf-8') as file:
        text = file.read()

    # 512-character windows starting every 400 characters; windows with
    # 50 or fewer non-blank characters are dropped
    chunks = [text[i:i+512] for i in range(0, len(text), 400) if len(text[i:i+512].strip()) > 50]

    # Score each chunk and keep only scores outside the neutral band
    # (-0.005 .. +0.005)
    optimism_scores = []
    for chunk in chunks:
        result = sentiment_analyzer(chunk)

        neg_prob = next(r['score'] for r in result[0] if r['label'] == 'negative')
        pos_prob = next(r['score'] for r in result[0] if r['label'] == 'positive')
        optimism_score = pos_prob - neg_prob

        if abs(optimism_score) > 0.005:
            optimism_scores.append(optimism_score)

    # Mean of the retained scores; 0 when every chunk was neutral
    overall_optimism = sum(optimism_scores) / len(optimism_scores) if optimism_scores else 0

    # Row layout: date, retained chunk scores (renumbered from 1), overall
    # score, and the number of chunks that survived the filter
    row = {'Date': date_folder}
    row.update({f'Chunk_{idx}': round(score, 3) for idx, score in enumerate(optimism_scores, 1)})
    row['Overall_Score'] = round(overall_optimism, 3)
    row['Valid_Chunks'] = len(optimism_scores)
    results.append(row)

# Build the DataFrame; missing chunk columns are filled with 0
df = pd.DataFrame(results).fillna(0)

# Order the columns: Date, Chunk_1..Chunk_n (numerically), then the summary columns
chunk_cols = sorted(
    (col for col in df.columns if col.startswith('Chunk_')),
    key=lambda x: int(x.split('_')[1]),
)
column_order = ['Date', *chunk_cols, 'Overall_Score', 'Valid_Chunks']
df = df[column_order]

# Zeitliche Sortierung
def parse_date(date_str):
    """Convert a folder name such as ``17_October_2024`` into a ``datetime``.

    Args:
        date_str: Name of the form ``<day>_<full month name>_<year>``.

    Returns:
        The parsed ``datetime``. ``datetime.max`` is returned when the name
        does not consist of exactly three ``_``-separated parts or cannot be
        parsed, so such entries sort after every real date.
    """
    try:
        parts = date_str.split('_')
        if len(parts) != 3:
            return datetime.max
        day, month, year = parts
        return datetime.strptime(f"{day} {month} {year}", "%d %B %Y")
    except (AttributeError, TypeError, ValueError):
        # Only genuine parsing problems (non-string input, invalid day/month/
        # year text) map to the sentinel; a bare ``except`` would also swallow
        # KeyboardInterrupt and SystemExit.
        return datetime.max

# Sort the rows chronologically through a temporary helper column
df['Parsed_Date'] = df['Date'].apply(parse_date)
df_sorted = df.sort_values(by='Parsed_Date')
df_sorted = df_sorted.drop(columns='Parsed_Date')

# Print the complete table without the index column
print(df_sorted.to_string(index=False))


Device set to use cuda:0


Verarbeite: 02_February_2023
Verarbeite: 04_May_2023
Verarbeite: 05_June_2025
Verarbeite: 06_June_2024
Verarbeite: 06_March_2025
Verarbeite: 07_March_2024
Verarbeite: 08_September_2022
Verarbeite: 09_June_2022
Verarbeite: 11_April_2024
Verarbeite: 12_December_2024
Verarbeite: 12_September_2024
Verarbeite: 14_December_2023
Verarbeite: 14_September_2023
Verarbeite: 15_December_2022
Verarbeite: 15_June_2023
Verarbeite: 16_March_2023
Verarbeite: 17_April_2025
Verarbeite: 17_October_2024
Verarbeite: 18_July_2024
Verarbeite: 21_July_2022
Verarbeite: 25_January_2024
Verarbeite: 26_October_2023
Verarbeite: 27_July_2023
Verarbeite: 27_October_2022
Verarbeite: 30_January_2025
             Date  Chunk_1  Chunk_2  Chunk_3  Chunk_4  Chunk_5  Chunk_6  Chunk_7  Chunk_8  Chunk_9  Chunk_10  Chunk_11  Chunk_12  Chunk_13  Chunk_14  Chunk_15  Chunk_16  Chunk_17  Chunk_18  Chunk_19  Chunk_20  Chunk_21  Chunk_22  Chunk_23  Chunk_24  Chunk_25  Chunk_26  Chunk_27  Chunk_28  Chunk_29  Chunk_30  Chunk_31  Chunk

In [21]:
#fin-roberta single

from transformers import pipeline
import os
import pandas as pd
from datetime import datetime

# OPTION: toggle progress output
SHOW_PROGRESS = True

# Load the model; top_k=None requests a score for every label (see transformers pipeline docs)
sentiment_analyzer = pipeline("text-classification", model="soleimanian/financial-roberta-large-sentiment", top_k=None)

# Root folder that holds one sub-folder per date
input_folder = "/kaggle/input/llm-text/TEXT/EZB/"

# os.listdir returns entries in arbitrary order; sort for a reproducible run
date_folders = sorted(f for f in os.listdir(input_folder) if os.path.isdir(os.path.join(input_folder, f)))

results = []

for date_folder in date_folders:
    conclusion_file = os.path.join(input_folder, date_folder, "0_FULL.txt")

    if os.path.exists(conclusion_file):
        if SHOW_PROGRESS:
            print(f"Verarbeite: {date_folder}")

        # Read the file
        with open(conclusion_file, 'r', encoding='utf-8') as file:
            text = file.read()

        # Split the text into 512-character windows that start every 400
        # characters; windows with 50 or fewer non-blank characters are skipped
        chunks = [text[i:i+512] for i in range(0, len(text), 400) if len(text[i:i+512].strip()) > 50]

        # Score every chunk: optimism = P(positive) - P(negative)
        optimism_scores = []
        for chunk in chunks:
            result = sentiment_analyzer(chunk)

            # result[0] holds the per-label scores for the single input text
            labels = {r['label']: r['score'] for r in result[0]}

            # A label missing from the output counts as probability 0
            neg_prob = labels.get('negative', 0)
            pos_prob = labels.get('positive', 0)

            optimism_score = pos_prob - neg_prob
            optimism_scores.append(optimism_score)

        # Overall result: mean over all chunks. An empty or very short file
        # yields no chunks; fall back to 0 instead of dividing by zero.
        if optimism_scores:
            overall_optimism = sum(optimism_scores) / len(optimism_scores)
        else:
            overall_optimism = 0

        # Build the DataFrame row: date, one column per chunk, overall score
        row = {'Date': date_folder}
        for idx, score in enumerate(optimism_scores, 1):
            row[f'Chunk_{idx}'] = round(score, 3)
        row['Overall_Score'] = round(overall_optimism, 3)
        results.append(row)

# Build the DataFrame; documents with fewer chunks get 0 in the missing chunk columns
df = pd.DataFrame(results).fillna(0)

# CHRONOLOGISCHE SORTIERUNG
def parse_date(date_str):
    """Convert a folder name such as ``17_October_2024`` into a ``datetime``.

    Args:
        date_str: Name of the form ``<day>_<full month name>_<year>``.

    Returns:
        The parsed ``datetime``. ``datetime.max`` is returned when the name
        does not consist of exactly three ``_``-separated parts or cannot be
        parsed, so such entries sort after every real date.
    """
    try:
        parts = date_str.split('_')
        if len(parts) != 3:
            return datetime.max
        day, month, year = parts
        return datetime.strptime(f"{day} {month} {year}", "%d %B %Y")
    except (AttributeError, TypeError, ValueError):
        # Only genuine parsing problems (non-string input, invalid day/month/
        # year text) map to the sentinel; a bare ``except`` would also swallow
        # KeyboardInterrupt and SystemExit.
        return datetime.max

# Chronological order through a temporary helper column
df['Parsed_Date'] = df['Date'].apply(parse_date)
df_sorted = df.sort_values(by='Parsed_Date')
df_sorted = df_sorted.drop(columns='Parsed_Date')

# Column layout: Date, Chunk_1..Chunk_n (numerically), Overall_Score
chunk_cols = sorted(
    (col for col in df_sorted.columns if col.startswith('Chunk_')),
    key=lambda x: int(x.split('_')[1]),
)
column_order = ['Date', *chunk_cols, 'Overall_Score']
df_final = df_sorted[column_order]

print(df_final.to_string(index=False))


Device set to use cuda:0


Verarbeite: 17_October_2024
Verarbeite: 17_April_2025
Verarbeite: 14_December_2023
Verarbeite: 11_April_2024
Verarbeite: 12_December_2024
Verarbeite: 21_July_2022
Verarbeite: 08_September_2022
Verarbeite: 14_September_2023
Verarbeite: 12_September_2024
Verarbeite: 07_March_2024
Verarbeite: 27_October_2022
Verarbeite: 16_March_2023
Verarbeite: 06_March_2025
Verarbeite: 26_October_2023
Verarbeite: 27_July_2023
Verarbeite: 25_January_2024
Verarbeite: 02_February_2023
Verarbeite: 30_January_2025
Verarbeite: 18_July_2024
Verarbeite: 05_June_2025
Verarbeite: 09_June_2022
Verarbeite: 15_December_2022
Verarbeite: 15_June_2023
Verarbeite: 04_May_2023
Verarbeite: 06_June_2024
             Date  Chunk_1  Chunk_2  Chunk_3  Chunk_4  Chunk_5  Chunk_6  Chunk_7  Chunk_8  Chunk_9  Chunk_10  Chunk_11  Chunk_12  Chunk_13  Chunk_14  Chunk_15  Chunk_16  Chunk_17  Chunk_18  Chunk_19  Chunk_20  Chunk_21  Chunk_22  Chunk_23  Chunk_24  Chunk_25  Chunk_26  Chunk_27  Chunk_28  Chunk_29  Chunk_30  Chunk_31  Chunk

In [22]:
#nicht optimierte Sätze!!!!-financial-roberta

from transformers import pipeline
import os
import pandas as pd
from datetime import datetime
import re

# OPTION: toggle progress output
SHOW_PROGRESS = True

# Load the model
sentiment_analyzer = pipeline("text-classification", model="soleimanian/financial-roberta-large-sentiment", top_k=None)

# Root folder that holds one sub-folder per date
input_folder ="/kaggle/input/llm-text/TEXT/EZB/"

# os.listdir returns entries in arbitrary order; sort so every run processes
# the folders in the same order
date_folders = sorted(f for f in os.listdir(input_folder) if os.path.isdir(os.path.join(input_folder, f)))

results = []

def simple_sentence_split(text):
    """Split ``text`` into sentences with a plain regex (no NLTK).

    The text is cut at every run of ``.``, ``!`` or ``?``. Each fragment is
    stripped of surrounding whitespace, and only fragments longer than 20
    characters are returned.
    """
    kept = []
    for fragment in re.split(r'[.!?]+', text):
        fragment = fragment.strip()
        if len(fragment) > 20:
            kept.append(fragment)
    return kept

for date_folder in date_folders:
    full_file = os.path.join(input_folder, date_folder, "0_FULL.txt")

    # Folders without the full-text file are skipped
    if not os.path.exists(full_file):
        continue

    if SHOW_PROGRESS:
        print(f"Verarbeite: {date_folder}")

    # Read the file
    with open(full_file, 'r', encoding='utf-8') as file:
        text = file.read()

    # --- Sentence-based analysis ---
    # NOTE(review): sentences go to the model unshortened; confirm none can
    # exceed the model's input limit.
    sentences = simple_sentence_split(text)
    optimism_scores_sentences = []
    for sentence in sentences:
        if len(sentence.strip()) <= 10:
            continue
        result = sentiment_analyzer(sentence)
        labels = {r['label']: r['score'] for r in result[0]}
        # optimism = P(positive) - P(negative); a missing label counts as 0
        optimism_scores_sentences.append(labels.get('positive', 0) - labels.get('negative', 0))
    if optimism_scores_sentences:
        overall_optimism_sentences = sum(optimism_scores_sentences) / len(optimism_scores_sentences)
    else:
        overall_optimism_sentences = 0

    # --- Chunk-based analysis ---
    # 512-character windows starting every 400 characters; windows with 50 or
    # fewer non-blank characters are dropped
    chunks = [text[i:i+512] for i in range(0, len(text), 400) if len(text[i:i+512].strip()) > 50]
    optimism_scores_chunks = []
    for chunk in chunks:
        result = sentiment_analyzer(chunk)
        labels = {r['label']: r['score'] for r in result[0]}
        optimism_scores_chunks.append(labels.get('positive', 0) - labels.get('negative', 0))
    if optimism_scores_chunks:
        overall_optimism_chunks = sum(optimism_scores_chunks) / len(optimism_scores_chunks)
    else:
        overall_optimism_chunks = 0

    # Store one summary row per date
    results.append({
        'Date': date_folder,
        'Optimism_Score_Sentences': round(overall_optimism_sentences, 3),
        'Sentence_Count': len(optimism_scores_sentences),
        'Optimism_Score_Chunks': round(overall_optimism_chunks, 3),
        'Chunk_Count': len(optimism_scores_chunks),
    })

# Build the DataFrame
df = pd.DataFrame(results)

# CHRONOLOGISCHE SORTIERUNG
def parse_date(date_str):
    """Convert a folder name such as ``17_October_2024`` into a ``datetime``.

    Args:
        date_str: Name of the form ``<day>_<full month name>_<year>``.

    Returns:
        The parsed ``datetime``. ``datetime.max`` is returned when the name
        does not consist of exactly three ``_``-separated parts or cannot be
        parsed, so such entries sort after every real date.
    """
    try:
        parts = date_str.split('_')
        if len(parts) != 3:
            return datetime.max
        day, month, year = parts
        return datetime.strptime(f"{day} {month} {year}", "%d %B %Y")
    except (AttributeError, TypeError, ValueError):
        # Only genuine parsing problems (non-string input, invalid day/month/
        # year text) map to the sentinel; a bare ``except`` would also swallow
        # KeyboardInterrupt and SystemExit.
        return datetime.max

# Chronological order through a temporary helper column
df['Parsed_Date'] = df['Date'].apply(parse_date)
df_sorted = df.sort_values(by='Parsed_Date')
df_sorted = df_sorted.drop(columns='Parsed_Date')

# Write the sorted table to an Excel file in the working directory
excel_path = 'ezb_optimism_scores_compare_sentences_chunks_financial_roberta.xlsx'
df_sorted.to_excel(excel_path, index=False)

print(df_sorted.to_string(index=False))
print(f"Ergebnisse wurden in '{excel_path}' gespeichert.")

# Simple download link for the written file
from IPython.display import FileLink
display(FileLink(excel_path))


Device set to use cuda:0


Verarbeite: 17_October_2024
Verarbeite: 17_April_2025
Verarbeite: 14_December_2023
Verarbeite: 11_April_2024
Verarbeite: 12_December_2024
Verarbeite: 21_July_2022
Verarbeite: 08_September_2022
Verarbeite: 14_September_2023
Verarbeite: 12_September_2024
Verarbeite: 07_March_2024
Verarbeite: 27_October_2022
Verarbeite: 16_March_2023
Verarbeite: 06_March_2025
Verarbeite: 26_October_2023
Verarbeite: 27_July_2023
Verarbeite: 25_January_2024
Verarbeite: 02_February_2023
Verarbeite: 30_January_2025
Verarbeite: 18_July_2024
Verarbeite: 05_June_2025
Verarbeite: 09_June_2022
Verarbeite: 15_December_2022
Verarbeite: 15_June_2023
Verarbeite: 04_May_2023
Verarbeite: 06_June_2024
             Date  Optimism_Score_Sentences  Sentence_Count  Optimism_Score_Chunks  Chunk_Count
     09_June_2022                     0.089             138                 -0.069           46
     21_July_2022                     0.125             109                  0.231           39
08_September_2022                    

In [23]:
#nicht optimierte Sätze!!!-finbert

from transformers import pipeline
import os
import pandas as pd
from datetime import datetime
from IPython.display import FileLink
import re

# OPTION: toggle progress output
SHOW_PROGRESS = True

# Load the model
sentiment_analyzer = pipeline("text-classification", model="ProsusAI/finbert", top_k=None)

# Root folder that holds one sub-folder per date
input_folder = "/kaggle/input/llm-text/TEXT/EZB/"

# os.listdir returns entries in arbitrary order; sort so every run processes
# the folders in the same order
date_folders = sorted(f for f in os.listdir(input_folder) if os.path.isdir(os.path.join(input_folder, f)))

results = []

def simple_sentence_split(text):
    """Split ``text`` into sentences with a plain regex.

    The text is cut at every run of ``.``, ``!`` or ``?``. Each fragment is
    stripped of surrounding whitespace, and only fragments longer than 20
    characters are returned.
    """
    kept = []
    for fragment in re.split(r'[.!?]+', text):
        fragment = fragment.strip()
        if len(fragment) > 20:
            kept.append(fragment)
    return kept

for date_folder in date_folders:
    full_file = os.path.join(input_folder, date_folder, "0_FULL.txt")

    # Folders without the full-text file are skipped
    if not os.path.exists(full_file):
        continue

    if SHOW_PROGRESS:
        print(f"Verarbeite: {date_folder}")

    # Read the file
    with open(full_file, 'r', encoding='utf-8') as file:
        text = file.read()

    # --- Sentence-based analysis ---
    # NOTE(review): sentences go to the model unshortened; confirm none can
    # exceed the model's input limit.
    sentences = simple_sentence_split(text)
    optimism_scores_sentences = []
    for sentence in sentences:
        if len(sentence.strip()) <= 10:
            continue
        result = sentiment_analyzer(sentence)
        labels = {r['label']: r['score'] for r in result[0]}
        # optimism = P(positive) - P(negative); a missing label counts as 0
        optimism_scores_sentences.append(labels.get('positive', 0) - labels.get('negative', 0))
    if optimism_scores_sentences:
        overall_optimism_sentences = sum(optimism_scores_sentences) / len(optimism_scores_sentences)
    else:
        overall_optimism_sentences = 0

    # --- Chunk-based analysis ---
    # 512-character windows starting every 400 characters; windows with 50 or
    # fewer non-blank characters are dropped
    chunks = [text[i:i+512] for i in range(0, len(text), 400) if len(text[i:i+512].strip()) > 50]
    optimism_scores_chunks = []
    for chunk in chunks:
        result = sentiment_analyzer(chunk)
        labels = {r['label']: r['score'] for r in result[0]}
        optimism_scores_chunks.append(labels.get('positive', 0) - labels.get('negative', 0))
    if optimism_scores_chunks:
        overall_optimism_chunks = sum(optimism_scores_chunks) / len(optimism_scores_chunks)
    else:
        overall_optimism_chunks = 0

    # Store one summary row per date
    results.append({
        'Date': date_folder,
        'Optimism_Score_Sentences': round(overall_optimism_sentences, 3),
        'Sentence_Count': len(optimism_scores_sentences),
        'Optimism_Score_Chunks': round(overall_optimism_chunks, 3),
        'Chunk_Count': len(optimism_scores_chunks),
    })

# Build the DataFrame
df = pd.DataFrame(results)

# CHRONOLOGISCHE SORTIERUNG
def parse_date(date_str):
    """Convert a folder name such as ``17_October_2024`` into a ``datetime``.

    Args:
        date_str: Name of the form ``<day>_<full month name>_<year>``.

    Returns:
        The parsed ``datetime``. ``datetime.max`` is returned when the name
        does not consist of exactly three ``_``-separated parts or cannot be
        parsed, so such entries sort after every real date.
    """
    try:
        parts = date_str.split('_')
        if len(parts) != 3:
            return datetime.max
        day, month, year = parts
        return datetime.strptime(f"{day} {month} {year}", "%d %B %Y")
    except (AttributeError, TypeError, ValueError):
        # Only genuine parsing problems (non-string input, invalid day/month/
        # year text) map to the sentinel; a bare ``except`` would also swallow
        # KeyboardInterrupt and SystemExit.
        return datetime.max

# Chronological order through a temporary helper column
df['Parsed_Date'] = df['Date'].apply(parse_date)
df_sorted = df.sort_values(by='Parsed_Date')
df_sorted = df_sorted.drop(columns='Parsed_Date')

# Write the sorted table to an Excel file in the working directory
excel_path = 'ezb_optimism_scores_compare_sentences_chunks_finbert.xlsx'
df_sorted.to_excel(excel_path, index=False)

print(df_sorted.to_string(index=False))
print(f"Ergebnisse wurden in '{excel_path}' gespeichert.")

# Simple download link for the written file
display(FileLink(excel_path))


Device set to use cuda:0


Verarbeite: 17_October_2024
Verarbeite: 17_April_2025
Verarbeite: 14_December_2023
Verarbeite: 11_April_2024
Verarbeite: 12_December_2024
Verarbeite: 21_July_2022
Verarbeite: 08_September_2022
Verarbeite: 14_September_2023
Verarbeite: 12_September_2024
Verarbeite: 07_March_2024
Verarbeite: 27_October_2022
Verarbeite: 16_March_2023
Verarbeite: 06_March_2025
Verarbeite: 26_October_2023
Verarbeite: 27_July_2023
Verarbeite: 25_January_2024
Verarbeite: 02_February_2023
Verarbeite: 30_January_2025
Verarbeite: 18_July_2024
Verarbeite: 05_June_2025
Verarbeite: 09_June_2022
Verarbeite: 15_December_2022
Verarbeite: 15_June_2023
Verarbeite: 04_May_2023
Verarbeite: 06_June_2024
             Date  Optimism_Score_Sentences  Sentence_Count  Optimism_Score_Chunks  Chunk_Count
     09_June_2022                     0.208             138                  0.119           46
     21_July_2022                     0.268             109                  0.245           39
08_September_2022                    

In [24]:
#optimierte Sätze-finbert!!!


from transformers import pipeline
import os
import pandas as pd
from datetime import datetime
from IPython.display import FileLink
import nltk
from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktParameters

# Download the NLTK Punkt data (one-off).
# NOTE(review): improved_sentence_split below builds its tokenizer from
# hand-written PunktParameters, so this download looks unused — confirm
# before removing.
nltk.download('punkt')

# OPTION: toggle progress output
SHOW_PROGRESS = True

# Load the model
sentiment_analyzer = pipeline("text-classification", model="ProsusAI/finbert", top_k=None)

# Root folder that holds one sub-folder per date
input_folder = "/kaggle/input/llm-text/TEXT/EZB/"

# os.listdir returns entries in arbitrary order; sort so every run processes
# the folders in the same order
date_folders = sorted(f for f in os.listdir(input_folder) if os.path.isdir(os.path.join(input_folder, f)))

results = []

def improved_sentence_split(text):
    """Split ``text`` into sentences with an NLTK Punkt tokenizer.

    The tokenizer is built from a hand-written abbreviation list so that a
    period after one of these tokens is not treated as a sentence end. Each
    resulting sentence is stripped, and only sentences longer than 20
    characters are returned.

    NOTE(review): the list also contains ordinary words ('euro', 'may', 'no',
    'ecb', ...); presumably a sentence ending in one of them is then not
    split from the next one — confirm this is intended.
    """
    params = PunktParameters()
    params.abbrev_types = {
        # Titles
        'mr', 'mrs', 'ms', 'dr', 'prof',
        # Latin abbreviations
        'e.g', 'i.e', 'etc', 'vs', 'cf',
        # Finance abbreviations
        'ecb', 'eu', 'euro', 'gdp', 'cpi', 'ppp',
        # Countries
        'u.s', 'u.k', 'u.s.a', 'e.u',
        # Months
        'jan', 'feb', 'mar', 'apr', 'may', 'jun',
        'jul', 'aug', 'sep', 'oct', 'nov', 'dec',
        # Companies
        'inc', 'ltd', 'corp', 'co', 'llc',
        # General abbreviations
        'no', 'nos', 'vol', 'p', 'pp', 'fig',
        # Contact abbreviations
        'tel', 'fax', 'email', 'www',
    }

    # Tokenizer configured with the custom parameters
    splitter = PunktSentenceTokenizer(params)

    # Tokenize, then keep only stripped sentences longer than 20 characters
    kept = []
    for raw_sentence in splitter.tokenize(text):
        candidate = raw_sentence.strip()
        if len(candidate) > 20:
            kept.append(candidate)
    return kept

for date_folder in date_folders:
    full_file = os.path.join(input_folder, date_folder, "0_FULL.txt")

    # Folders without the full-text file are skipped
    if not os.path.exists(full_file):
        continue

    if SHOW_PROGRESS:
        print(f"Verarbeite: {date_folder}")

    # Read the file
    with open(full_file, 'r', encoding='utf-8') as file:
        text = file.read()

    # --- Sentence-based analysis with the Punkt-based splitter ---
    # NOTE(review): sentences go to the model unshortened; confirm none can
    # exceed the model's input limit.
    sentences = improved_sentence_split(text)
    optimism_scores_sentences = []
    for sentence in sentences:
        if len(sentence.strip()) <= 10:
            continue
        result = sentiment_analyzer(sentence)
        labels = {r['label']: r['score'] for r in result[0]}
        # optimism = P(positive) - P(negative); a missing label counts as 0
        optimism_scores_sentences.append(labels.get('positive', 0) - labels.get('negative', 0))
    if optimism_scores_sentences:
        overall_optimism_sentences = sum(optimism_scores_sentences) / len(optimism_scores_sentences)
    else:
        overall_optimism_sentences = 0

    # --- Chunk-based analysis ---
    # 512-character windows starting every 400 characters; windows with 50 or
    # fewer non-blank characters are dropped
    chunks = [text[i:i+512] for i in range(0, len(text), 400) if len(text[i:i+512].strip()) > 50]
    optimism_scores_chunks = []
    for chunk in chunks:
        result = sentiment_analyzer(chunk)
        labels = {r['label']: r['score'] for r in result[0]}
        optimism_scores_chunks.append(labels.get('positive', 0) - labels.get('negative', 0))
    if optimism_scores_chunks:
        overall_optimism_chunks = sum(optimism_scores_chunks) / len(optimism_scores_chunks)
    else:
        overall_optimism_chunks = 0

    # Store one summary row per date
    results.append({
        'Date': date_folder,
        'Optimism_Score_Sentences': round(overall_optimism_sentences, 3),
        'Sentence_Count': len(optimism_scores_sentences),
        'Optimism_Score_Chunks': round(overall_optimism_chunks, 3),
        'Chunk_Count': len(optimism_scores_chunks),
    })

# Build the DataFrame
df = pd.DataFrame(results)

# CHRONOLOGISCHE SORTIERUNG
def parse_date(date_str):
    """Convert a folder name such as ``17_October_2024`` into a ``datetime``.

    Args:
        date_str: Name of the form ``<day>_<full month name>_<year>``.

    Returns:
        The parsed ``datetime``. ``datetime.max`` is returned when the name
        does not consist of exactly three ``_``-separated parts or cannot be
        parsed, so such entries sort after every real date.
    """
    try:
        parts = date_str.split('_')
        if len(parts) != 3:
            return datetime.max
        day, month, year = parts
        return datetime.strptime(f"{day} {month} {year}", "%d %B %Y")
    except (AttributeError, TypeError, ValueError):
        # Only genuine parsing problems (non-string input, invalid day/month/
        # year text) map to the sentinel; a bare ``except`` would also swallow
        # KeyboardInterrupt and SystemExit.
        return datetime.max

# Chronological order through a temporary helper column
df['Parsed_Date'] = df['Date'].apply(parse_date)
df_sorted = df.sort_values(by='Parsed_Date')
df_sorted = df_sorted.drop(columns='Parsed_Date')

# Write the sorted table to an Excel file in the working directory
excel_path = 'ezb_optimism_scores_compare_sentences_chunks_finbert_improved.xlsx'
df_sorted.to_excel(excel_path, index=False)

print(df_sorted.to_string(index=False))
print(f"Ergebnisse wurden in '{excel_path}' gespeichert.")

# Simple download link for the written file
display(FileLink(excel_path))


[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Device set to use cuda:0


Verarbeite: 17_October_2024
Verarbeite: 17_April_2025
Verarbeite: 14_December_2023
Verarbeite: 11_April_2024
Verarbeite: 12_December_2024
Verarbeite: 21_July_2022
Verarbeite: 08_September_2022
Verarbeite: 14_September_2023
Verarbeite: 12_September_2024
Verarbeite: 07_March_2024
Verarbeite: 27_October_2022
Verarbeite: 16_March_2023
Verarbeite: 06_March_2025
Verarbeite: 26_October_2023
Verarbeite: 27_July_2023
Verarbeite: 25_January_2024
Verarbeite: 02_February_2023
Verarbeite: 30_January_2025
Verarbeite: 18_July_2024
Verarbeite: 05_June_2025
Verarbeite: 09_June_2022
Verarbeite: 15_December_2022
Verarbeite: 15_June_2023
Verarbeite: 04_May_2023
Verarbeite: 06_June_2024
             Date  Optimism_Score_Sentences  Sentence_Count  Optimism_Score_Chunks  Chunk_Count
     09_June_2022                     0.196             123                  0.119           46
     21_July_2022                     0.260             104                  0.245           39
08_September_2022                    

In [25]:
#optimierte Sätze-financial-roberta
from transformers import pipeline
import os
import pandas as pd
from datetime import datetime
from IPython.display import FileLink
import nltk
from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktParameters

# Download the NLTK Punkt data (one-off).
# NOTE(review): improved_sentence_split below builds its tokenizer from
# hand-written PunktParameters, so this download looks unused — confirm
# before removing.
nltk.download('punkt')

# OPTION: toggle progress output
SHOW_PROGRESS = True

# Load the model
sentiment_analyzer = pipeline("text-classification", model="soleimanian/financial-roberta-large-sentiment", top_k=None)

# Root folder that holds one sub-folder per date
input_folder = "/kaggle/input/llm-text/TEXT/EZB/"

# os.listdir returns entries in arbitrary order; sort so every run processes
# the folders in the same order
date_folders = sorted(f for f in os.listdir(input_folder) if os.path.isdir(os.path.join(input_folder, f)))

results = []

def improved_sentence_split(text):
    """Split ``text`` into sentences with an NLTK Punkt tokenizer.

    The tokenizer is built from a hand-written abbreviation list so that a
    period after one of these tokens is not treated as a sentence end. Each
    resulting sentence is stripped, and only sentences longer than 20
    characters are returned.

    NOTE(review): the list also contains ordinary words ('euro', 'may', 'no',
    'ecb', ...); presumably a sentence ending in one of them is then not
    split from the next one — confirm this is intended.
    """
    params = PunktParameters()
    params.abbrev_types = {
        # Titles
        'mr', 'mrs', 'ms', 'dr', 'prof',
        # Latin abbreviations
        'e.g', 'i.e', 'etc', 'vs', 'cf',
        # Finance abbreviations
        'ecb', 'eu', 'euro', 'gdp', 'cpi', 'ppp',
        # Countries
        'u.s', 'u.k', 'u.s.a', 'e.u',
        # Months
        'jan', 'feb', 'mar', 'apr', 'may', 'jun',
        'jul', 'aug', 'sep', 'oct', 'nov', 'dec',
        # Companies
        'inc', 'ltd', 'corp', 'co', 'llc',
        # General abbreviations
        'no', 'nos', 'vol', 'p', 'pp', 'fig',
        # Contact abbreviations
        'tel', 'fax', 'email', 'www',
    }

    # Tokenizer configured with the custom parameters
    splitter = PunktSentenceTokenizer(params)

    # Tokenize, then keep only stripped sentences longer than 20 characters
    kept = []
    for raw_sentence in splitter.tokenize(text):
        candidate = raw_sentence.strip()
        if len(candidate) > 20:
            kept.append(candidate)
    return kept

for date_folder in date_folders:
    full_file = os.path.join(input_folder, date_folder, "0_FULL.txt")

    # Folders without the full-text file are skipped
    if not os.path.exists(full_file):
        continue

    if SHOW_PROGRESS:
        print(f"Verarbeite: {date_folder}")

    # Read the file
    with open(full_file, 'r', encoding='utf-8') as file:
        text = file.read()

    # --- Sentence-based analysis with the Punkt-based splitter ---
    # NOTE(review): sentences go to the model unshortened; confirm none can
    # exceed the model's input limit.
    sentences = improved_sentence_split(text)
    optimism_scores_sentences = []
    for sentence in sentences:
        if len(sentence.strip()) <= 10:
            continue
        result = sentiment_analyzer(sentence)
        labels = {r['label']: r['score'] for r in result[0]}
        # optimism = P(positive) - P(negative); a missing label counts as 0
        optimism_scores_sentences.append(labels.get('positive', 0) - labels.get('negative', 0))
    if optimism_scores_sentences:
        overall_optimism_sentences = sum(optimism_scores_sentences) / len(optimism_scores_sentences)
    else:
        overall_optimism_sentences = 0

    # --- Chunk-based analysis ---
    # 512-character windows starting every 400 characters; windows with 50 or
    # fewer non-blank characters are dropped
    chunks = [text[i:i+512] for i in range(0, len(text), 400) if len(text[i:i+512].strip()) > 50]
    optimism_scores_chunks = []
    for chunk in chunks:
        result = sentiment_analyzer(chunk)
        labels = {r['label']: r['score'] for r in result[0]}
        optimism_scores_chunks.append(labels.get('positive', 0) - labels.get('negative', 0))
    if optimism_scores_chunks:
        overall_optimism_chunks = sum(optimism_scores_chunks) / len(optimism_scores_chunks)
    else:
        overall_optimism_chunks = 0

    # Store one summary row per date
    results.append({
        'Date': date_folder,
        'Optimism_Score_Sentences': round(overall_optimism_sentences, 3),
        'Sentence_Count': len(optimism_scores_sentences),
        'Optimism_Score_Chunks': round(overall_optimism_chunks, 3),
        'Chunk_Count': len(optimism_scores_chunks),
    })

# Build the DataFrame
df = pd.DataFrame(results)

# CHRONOLOGISCHE SORTIERUNG
def parse_date(date_str):
    """Convert a folder name such as ``'09_June_2022'`` into a ``datetime``.

    Args:
        date_str: Name in the form ``<day>_<full month name>_<year>``.

    Returns:
        The parsed ``datetime``. ``datetime.max`` is returned when the name
        does not have exactly three ``_``-separated parts, is not a valid
        date, or is not a string, so such entries sort to the end.
    """
    try:
        # Unpacking raises ValueError when there are not exactly three parts
        day, month, year = date_str.split('_')
        return datetime.strptime(f"{day} {month} {year}", "%d %B %Y")
    except (ValueError, AttributeError):
        # ValueError: wrong part count or invalid date; AttributeError: non-string input.
        # Anything else (e.g. KeyboardInterrupt) is no longer swallowed.
        return datetime.max

# Order the rows chronologically through a temporary 'Parsed_Date' column
# (the helper column is dropped from df_sorted but stays on df)
df['Parsed_Date'] = df['Date'].apply(parse_date)
df_sorted = df.sort_values(by='Parsed_Date').drop(columns='Parsed_Date')

# Write the sorted table to an Excel workbook
excel_path = 'ezb_optimism_scores_compare_sentences_chunks_financial_roberta_improved.xlsx'
df_sorted.to_excel(excel_path, index=False)

# Echo the table and where it was written
print(df_sorted.to_string(index=False))
print(f"Ergebnisse wurden in '{excel_path}' gespeichert.")

# Simple download link
display(FileLink(excel_path))


[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Device set to use cuda:0


Verarbeite: 17_October_2024
Verarbeite: 17_April_2025
Verarbeite: 14_December_2023
Verarbeite: 11_April_2024
Verarbeite: 12_December_2024
Verarbeite: 21_July_2022
Verarbeite: 08_September_2022
Verarbeite: 14_September_2023
Verarbeite: 12_September_2024
Verarbeite: 07_March_2024
Verarbeite: 27_October_2022
Verarbeite: 16_March_2023
Verarbeite: 06_March_2025
Verarbeite: 26_October_2023
Verarbeite: 27_July_2023
Verarbeite: 25_January_2024
Verarbeite: 02_February_2023
Verarbeite: 30_January_2025
Verarbeite: 18_July_2024
Verarbeite: 05_June_2025
Verarbeite: 09_June_2022
Verarbeite: 15_December_2022
Verarbeite: 15_June_2023
Verarbeite: 04_May_2023
Verarbeite: 06_June_2024
             Date  Optimism_Score_Sentences  Sentence_Count  Optimism_Score_Chunks  Chunk_Count
     09_June_2022                     0.036             123                 -0.069           46
     21_July_2022                     0.094             104                  0.231           39
08_September_2022                    

# All models compared

In [26]:
#complete Evaluation LLM Models
from transformers import pipeline
import os
import pandas as pd
from datetime import datetime
from IPython.display import FileLink
import nltk
from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktParameters

# Fetch the NLTK Punkt resources
nltk.download('punkt')

# Toggle for the per-folder progress messages
SHOW_PROGRESS = True

# The two sentiment pipelines that are compared
finbert_analyzer = pipeline(
    "text-classification",
    model="ProsusAI/finbert",
    top_k=None,
)
roberta_analyzer = pipeline(
    "text-classification",
    model="soleimanian/financial-roberta-large-sentiment",
    top_k=None,
)

# Root directory whose sub-folders are processed
input_folder = "/kaggle/input/llm-test-text/TEXT_TEST/"

# Keep only the directory entries that are folders themselves
date_folders = list(
    filter(
        lambda entry: os.path.isdir(os.path.join(input_folder, entry)),
        os.listdir(input_folder),
    )
)

# Receives one result row per processed folder
results = []

def improved_sentence_split(text):
    """Split ``text`` into sentences using Punkt with a hand-written abbreviation list.

    Args:
        text: The raw document text.

    Returns:
        list[str]: Whitespace-stripped sentences; anything of 20 characters
        or fewer after stripping is discarded.
    """
    # NOTE(review): several entries are ordinary words that can end a sentence
    # ('gdp', 'euro', 'may', 'no', ...); presumably Punkt will not break after
    # them - confirm this is intended.
    abbreviations = {
        # Titles
        'mr', 'mrs', 'ms', 'dr', 'prof',
        # Latin abbreviations
        'e.g', 'i.e', 'etc', 'vs', 'cf',
        # Financial abbreviations
        'ecb', 'eu', 'euro', 'gdp', 'cpi', 'ppp',
        # Countries
        'u.s', 'u.k', 'u.s.a', 'e.u',
        # Months
        'jan', 'feb', 'mar', 'apr', 'may', 'jun',
        'jul', 'aug', 'sep', 'oct', 'nov', 'dec',
        # Companies
        'inc', 'ltd', 'corp', 'co', 'llc',
        # General abbreviations
        'no', 'nos', 'vol', 'p', 'pp', 'fig',
        # Contact abbreviations
        'tel', 'fax', 'email', 'www',
    }

    params = PunktParameters()
    params.abbrev_types = abbreviations
    splitter = PunktSentenceTokenizer(params)

    # Strip first, then apply the minimum-length filter to the stripped text
    stripped = (candidate.strip() for candidate in splitter.tokenize(text))
    return [sentence for sentence in stripped if len(sentence) > 20]

def analyze_with_model(analyzer, text_input):
    """Score ``text_input`` as positive probability minus negative probability.

    Args:
        analyzer: Callable that, for one text, returns a sequence whose first
            element is a list of ``{'label': ..., 'score': ...}`` dicts.
        text_input: The text to classify.

    Returns:
        ``score('positive') - score('negative')``. A label that is absent
        from the output contributes 0.
    """
    result = analyzer(text_input)
    # Lower-case the labels so that models emitting 'Positive'/'NEGATIVE' are
    # scored correctly instead of silently falling back to 0.
    labels = {str(r['label']).lower(): r['score'] for r in result[0]}
    neg_prob = labels.get('negative', 0)
    pos_prob = labels.get('positive', 0)
    return pos_prob - neg_prob

for date_folder in date_folders:
    full_file = os.path.join(input_folder, date_folder, "0_FULL.txt")

    # Folders without a full-text file are skipped
    if not os.path.exists(full_file):
        continue

    if SHOW_PROGRESS:
        print(f"Processing: {date_folder}")

    with open(full_file, 'r', encoding='utf-8') as file:
        text = file.read()

    # Two views of the document: sentences, and 512-character windows that
    # start every 400 characters (windows with <= 50 stripped chars are dropped)
    sentences = improved_sentence_split(text)
    chunks = [
        window
        for window in (text[start:start + 512] for start in range(0, len(text), 400))
        if len(window.strip()) > 50
    ]

    # FinBERT, sentence level
    finbert_sentence_scores = [
        analyze_with_model(finbert_analyzer, sentence)
        for sentence in sentences
        if len(sentence.strip()) > 10
    ]
    finbert_sentences_avg = (
        sum(finbert_sentence_scores) / len(finbert_sentence_scores)
        if finbert_sentence_scores
        else 0
    )

    # FinBERT, chunk level
    finbert_chunk_scores = [analyze_with_model(finbert_analyzer, chunk) for chunk in chunks]
    finbert_chunks_avg = (
        sum(finbert_chunk_scores) / len(finbert_chunk_scores)
        if finbert_chunk_scores
        else 0
    )

    # Financial-RoBERTa, sentence level
    roberta_sentence_scores = [
        analyze_with_model(roberta_analyzer, sentence)
        for sentence in sentences
        if len(sentence.strip()) > 10
    ]
    roberta_sentences_avg = (
        sum(roberta_sentence_scores) / len(roberta_sentence_scores)
        if roberta_sentence_scores
        else 0
    )

    # Financial-RoBERTa, chunk level
    roberta_chunk_scores = [analyze_with_model(roberta_analyzer, chunk) for chunk in chunks]
    roberta_chunks_avg = (
        sum(roberta_chunk_scores) / len(roberta_chunk_scores)
        if roberta_chunk_scores
        else 0
    )

    # One result row per document; counts are taken from the FinBERT lists
    row = {
        'Date': date_folder,
        'FinBERT_Sentences': round(finbert_sentences_avg, 3),
        'FinBERT_Chunks': round(finbert_chunks_avg, 3),
        'RoBERTa_Sentences': round(roberta_sentences_avg, 3),
        'RoBERTa_Chunks': round(roberta_chunks_avg, 3),
        'Sentence_Count': len(finbert_sentence_scores),
        'Chunk_Count': len(finbert_chunk_scores)
    }
    results.append(row)

# Turn the collected rows into a DataFrame
df = pd.DataFrame(results)

# CHRONOLOGICAL SORTING
def parse_date(date_str):
    """Convert a folder name such as ``'09_June_2022'`` into a ``datetime``.

    Args:
        date_str: Name in the form ``<day>_<full month name>_<year>``.

    Returns:
        The parsed ``datetime``. ``datetime.max`` is returned when the name
        does not have exactly three ``_``-separated parts, is not a valid
        date, or is not a string, so such entries sort to the end.
    """
    try:
        # Unpacking raises ValueError when there are not exactly three parts
        day, month, year = date_str.split('_')
        return datetime.strptime(f"{day} {month} {year}", "%d %B %Y")
    except (ValueError, AttributeError):
        # ValueError: wrong part count or invalid date; AttributeError: non-string input.
        # Anything else (e.g. KeyboardInterrupt) is no longer swallowed.
        return datetime.max

# Order the rows chronologically through a temporary 'Parsed_Date' column
# (the helper column is dropped from df_sorted but stays on df)
df['Parsed_Date'] = df['Date'].apply(parse_date)
df_sorted = df.sort_values('Parsed_Date').drop(columns=['Parsed_Date'])

# Additional analysis: Consistency check
# (absolute gap between the sentence-level and chunk-level averages per document)
df_sorted['FinBERT_Difference'] = abs(df_sorted['FinBERT_Sentences'] - df_sorted['FinBERT_Chunks'])
df_sorted['RoBERTa_Difference'] = abs(df_sorted['RoBERTa_Sentences'] - df_sorted['RoBERTa_Chunks'])

# ADD AVERAGE ROW
# Mean of every numeric column, labelled 'Average' in the Date column
average_row = df_sorted.select_dtypes(include=['number']).mean()
average_row['Date'] = 'Average'
df_with_average = pd.concat([df_sorted, average_row.to_frame().T], ignore_index=True)

print("=== COMPLETE MODEL COMPARISON ===")
print(df_with_average.to_string(index=False))
print(f"FinBERT average difference sentences/chunks: {df_sorted['FinBERT_Difference'].mean():.3f}")
print(f"RoBERTa average difference sentences/chunks: {df_sorted['RoBERTa_Difference'].mean():.3f}")

# ===========================================
# SAVE TO EXCEL WITH MULTIPLE SHEETS
# ===========================================

# Generate LaTeX table content WITH AVERAGE ROW
# One text column (Date) followed by three right-aligned numeric columns
latex_table = df_with_average[['Date', 'FinBERT_Sentences', 'RoBERTa_Sentences', 'Sentence_Count']].to_latex(
    index=False, 
    float_format="%.3f",
    caption="ECB Sentiment Analysis Results Comparison",
    label="tab:sentiment_comparison",
    column_format="lrrr"
)

# Generate summary report content
# The date range is read from the first/last row of the chronologically sorted
# frame; min()/max() on the 'DD_Month_YYYY' strings would compare them
# alphabetically and report the wrong dates.
report = f"""ECB SENTIMENT ANALYSIS REPORT
============================

Dataset Overview:
- Total Documents Analyzed: {len(df_sorted)}
- Date Range: {df_sorted['Date'].iloc[0]} to {df_sorted['Date'].iloc[-1]}
- Average Sentences per Document: {df_sorted['Sentence_Count'].mean():.1f}
- Average Chunks per Document: {df_sorted['Chunk_Count'].mean():.1f}

Model Performance Comparison:
- FinBERT Average Sentiment (Sentences): {df_sorted['FinBERT_Sentences'].mean():.3f}
- RoBERTa Average Sentiment (Sentences): {df_sorted['RoBERTa_Sentences'].mean():.3f}
- Correlation between FinBERT and RoBERTa: {df_sorted['FinBERT_Sentences'].corr(df_sorted['RoBERTa_Sentences']):.3f}

Sentiment Range Analysis:
- FinBERT Min/Max: {df_sorted['FinBERT_Sentences'].min():.3f} / {df_sorted['FinBERT_Sentences'].max():.3f}
- RoBERTa Min/Max: {df_sorted['RoBERTa_Sentences'].min():.3f} / {df_sorted['RoBERTa_Sentences'].max():.3f}

Consistency Analysis (Sentences vs Chunks):
- FinBERT Consistency Score: {1 - df_sorted['FinBERT_Difference'].mean():.3f}
- RoBERTa Consistency Score: {1 - df_sorted['RoBERTa_Difference'].mean():.3f}

Most Positive Sentiment:
- Date: {df_sorted.loc[df_sorted['FinBERT_Sentences'].idxmax(), 'Date']}
- FinBERT Score: {df_sorted['FinBERT_Sentences'].max():.3f}

Most Negative Sentiment:
- Date: {df_sorted.loc[df_sorted['FinBERT_Sentences'].idxmin(), 'Date']}
- FinBERT Score: {df_sorted['FinBERT_Sentences'].min():.3f}"""

# Save Excel with multiple sheets
excel_path = 'ezb_optimism_scores_ALL_MODELS_COMPARISON.xlsx'

with pd.ExcelWriter(excel_path, engine='openpyxl') as writer:
    # Main data sheet with average row
    df_with_average.to_excel(writer, sheet_name='Data', index=False)
    
    # LaTeX table sheet (the whole table is stored as one text cell)
    latex_df = pd.DataFrame([latex_table], columns=['LaTeX_Table'])
    latex_df.to_excel(writer, sheet_name='LaTeX_Table', index=False, header=False)
    
    # Summary report sheet (the whole report is stored as one text cell)
    report_df = pd.DataFrame([report], columns=['Summary_Report'])
    report_df.to_excel(writer, sheet_name='Summary_Report', index=False, header=False)

print(f"\n✅ Complete file saved: {excel_path}")
print("Sheets: Data (with Average row), LaTeX_Table (with Average row), Summary_Report")

# Download link
display(FileLink(excel_path))


[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Device set to use cuda:0
Device set to use cuda:0


Processing: 17_October_2024
Processing: 17_April_2025
Processing: 12_December_2024
Processing: 12_September_2024
Processing: 06_March_2025
Processing: 30_January_2025
Processing: 05_June_2025
=== COMPLETE MODEL COMPARISON ===
             Date FinBERT_Sentences FinBERT_Chunks RoBERTa_Sentences RoBERTa_Chunks Sentence_Count Chunk_Count FinBERT_Difference RoBERTa_Difference
12_September_2024            -0.048         -0.387            -0.042         -0.055            117          41              0.339              0.013
  17_October_2024             0.037         -0.283             -0.08         -0.108             98          34               0.32              0.028
 12_December_2024             0.092         -0.103             0.058          0.133            116          40              0.195              0.075
  30_January_2025              0.15          0.059             0.002          -0.02             95          35              0.091              0.022
    06_March_2025            

# Interpretation

In [27]:
#complete Evaluation LLM Models
from transformers import pipeline
import os
import pandas as pd
from datetime import datetime
from IPython.display import FileLink
import nltk
from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktParameters

# Fetch the NLTK Punkt resources
nltk.download('punkt')

# ============================================================================
# CONFIGURATION
# ============================================================================
# Output workbook names used by the export step of this cell
CONFIG = dict(
    main_excel_filename='ecb_sentiment_analysis.xlsx',
    detailed_excel_filename='sentiment_analysis_details.xlsx',
)

# Toggle for the per-folder progress messages
SHOW_PROGRESS = True

# The two sentiment pipelines that are compared
finbert_analyzer = pipeline(
    "text-classification",
    model="ProsusAI/finbert",
    top_k=None,
)
roberta_analyzer = pipeline(
    "text-classification",
    model="soleimanian/financial-roberta-large-sentiment",
    top_k=None,
)

# Root directory whose sub-folders are processed
input_folder = "/kaggle/input/llm-text/TEXT/EZB/"

# Keep only the directory entries that are folders themselves
date_folders = list(
    filter(
        lambda entry: os.path.isdir(os.path.join(input_folder, entry)),
        os.listdir(input_folder),
    )
)

# Receives one result row per processed folder
results = []

def improved_sentence_split(text):
    """Split ``text`` into sentences using Punkt with a hand-written abbreviation list.

    Args:
        text: The raw document text.

    Returns:
        list[str]: Whitespace-stripped sentences; anything of 20 characters
        or fewer after stripping is discarded.
    """
    # NOTE(review): several entries are ordinary words that can end a sentence
    # ('gdp', 'euro', 'may', 'no', ...); presumably Punkt will not break after
    # them - confirm this is intended.
    abbreviations = {
        # Titles
        'mr', 'mrs', 'ms', 'dr', 'prof',
        # Latin abbreviations
        'e.g', 'i.e', 'etc', 'vs', 'cf',
        # Financial abbreviations
        'ecb', 'eu', 'euro', 'gdp', 'cpi', 'ppp',
        # Countries
        'u.s', 'u.k', 'u.s.a', 'e.u',
        # Months
        'jan', 'feb', 'mar', 'apr', 'may', 'jun',
        'jul', 'aug', 'sep', 'oct', 'nov', 'dec',
        # Companies
        'inc', 'ltd', 'corp', 'co', 'llc',
        # General abbreviations
        'no', 'nos', 'vol', 'p', 'pp', 'fig',
        # Contact abbreviations
        'tel', 'fax', 'email', 'www',
    }

    params = PunktParameters()
    params.abbrev_types = abbreviations
    splitter = PunktSentenceTokenizer(params)

    # Strip first, then apply the minimum-length filter to the stripped text
    stripped = (candidate.strip() for candidate in splitter.tokenize(text))
    return [sentence for sentence in stripped if len(sentence) > 20]

def analyze_with_model(analyzer, text_input):
    """Score ``text_input`` as positive probability minus negative probability.

    Args:
        analyzer: Callable that, for one text, returns a sequence whose first
            element is a list of ``{'label': ..., 'score': ...}`` dicts.
        text_input: The text to classify.

    Returns:
        ``score('positive') - score('negative')``. A label that is absent
        from the output contributes 0.
    """
    result = analyzer(text_input)
    # Lower-case the labels so that models emitting 'Positive'/'NEGATIVE' are
    # scored correctly instead of silently falling back to 0.
    labels = {str(r['label']).lower(): r['score'] for r in result[0]}
    neg_prob = labels.get('negative', 0)
    pos_prob = labels.get('positive', 0)
    return pos_prob - neg_prob

for date_folder in date_folders:
    full_file = os.path.join(input_folder, date_folder, "0_FULL.txt")

    # Folders without a full-text file are skipped
    if not os.path.exists(full_file):
        continue

    if SHOW_PROGRESS:
        print(f"Processing: {date_folder}")

    with open(full_file, 'r', encoding='utf-8') as file:
        text = file.read()

    # Two views of the document: sentences, and 512-character windows that
    # start every 400 characters (windows with <= 50 stripped chars are dropped)
    sentences = improved_sentence_split(text)
    chunks = [
        window
        for window in (text[start:start + 512] for start in range(0, len(text), 400))
        if len(window.strip()) > 50
    ]

    # FinBERT, sentence level
    finbert_sentence_scores = [
        analyze_with_model(finbert_analyzer, sentence)
        for sentence in sentences
        if len(sentence.strip()) > 10
    ]
    finbert_sentences_avg = (
        sum(finbert_sentence_scores) / len(finbert_sentence_scores)
        if finbert_sentence_scores
        else 0
    )

    # FinBERT, chunk level
    finbert_chunk_scores = [analyze_with_model(finbert_analyzer, chunk) for chunk in chunks]
    finbert_chunks_avg = (
        sum(finbert_chunk_scores) / len(finbert_chunk_scores)
        if finbert_chunk_scores
        else 0
    )

    # Financial-RoBERTa, sentence level
    roberta_sentence_scores = [
        analyze_with_model(roberta_analyzer, sentence)
        for sentence in sentences
        if len(sentence.strip()) > 10
    ]
    roberta_sentences_avg = (
        sum(roberta_sentence_scores) / len(roberta_sentence_scores)
        if roberta_sentence_scores
        else 0
    )

    # Financial-RoBERTa, chunk level
    roberta_chunk_scores = [analyze_with_model(roberta_analyzer, chunk) for chunk in chunks]
    roberta_chunks_avg = (
        sum(roberta_chunk_scores) / len(roberta_chunk_scores)
        if roberta_chunk_scores
        else 0
    )

    # One result row per document; counts are taken from the FinBERT lists
    row = {
        'Date': date_folder,
        'FinBERT_Sentences': round(finbert_sentences_avg, 3),
        'FinBERT_Chunks': round(finbert_chunks_avg, 3),
        'RoBERTa_Sentences': round(roberta_sentences_avg, 3),
        'RoBERTa_Chunks': round(roberta_chunks_avg, 3),
        'Sentence_Count': len(finbert_sentence_scores),
        'Chunk_Count': len(finbert_chunk_scores)
    }
    results.append(row)

# Turn the collected rows into a DataFrame
df = pd.DataFrame(results)

# CHRONOLOGICAL SORTING
def parse_date(date_str):
    """Convert a folder name such as ``'09_June_2022'`` into a ``datetime``.

    Args:
        date_str: Name in the form ``<day>_<full month name>_<year>``.

    Returns:
        The parsed ``datetime``. ``datetime.max`` is returned when the name
        does not have exactly three ``_``-separated parts, is not a valid
        date, or is not a string, so such entries sort to the end.
    """
    try:
        # Unpacking raises ValueError when there are not exactly three parts
        day, month, year = date_str.split('_')
        return datetime.strptime(f"{day} {month} {year}", "%d %B %Y")
    except (ValueError, AttributeError):
        # ValueError: wrong part count or invalid date; AttributeError: non-string input.
        # Anything else (e.g. KeyboardInterrupt) is no longer swallowed.
        return datetime.max

# Order the rows chronologically through a temporary 'Parsed_Date' column
# (the helper column is dropped from df_sorted but stays on df)
df['Parsed_Date'] = df['Date'].apply(parse_date)
df_sorted = df.sort_values('Parsed_Date').drop(columns=['Parsed_Date'])

# Additional analysis: Consistency check
# (absolute gap between the sentence-level and chunk-level averages per document)
df_sorted['FinBERT_Difference'] = abs(df_sorted['FinBERT_Sentences'] - df_sorted['FinBERT_Chunks'])
df_sorted['RoBERTa_Difference'] = abs(df_sorted['RoBERTa_Sentences'] - df_sorted['RoBERTa_Chunks'])

# ADD AVERAGE ROW
# Mean of every numeric column, labelled 'Average' in the Date column
average_row = df_sorted.select_dtypes(include=['number']).mean()
average_row['Date'] = 'Average'
df_with_average = pd.concat([df_sorted, average_row.to_frame().T], ignore_index=True)

print("=== COMPLETE MODEL COMPARISON ===")
print(df_with_average.to_string(index=False))
print(f"FinBERT average difference sentences/chunks: {df_sorted['FinBERT_Difference'].mean():.3f}")
print(f"RoBERTa average difference sentences/chunks: {df_sorted['RoBERTa_Difference'].mean():.3f}")

# ===========================================
# SAVE TO EXCEL WITH MULTIPLE SHEETS
# ===========================================

# Save main data to separate excel file
df_with_average.to_excel(CONFIG['main_excel_filename'], index=False)

# Generate LaTeX table content WITH AVERAGE ROW
# One text column (Date) followed by three right-aligned numeric columns
latex_table = df_with_average[['Date', 'FinBERT_Sentences', 'RoBERTa_Sentences', 'Sentence_Count']].to_latex(
    index=False, 
    float_format="%.3f",
    caption="ECB Sentiment Analysis Results Comparison",
    label="tab:sentiment_comparison",
    column_format="lrrr"
)

# Generate summary report content
# The date range is read from the first/last row of the chronologically sorted
# frame; min()/max() on the 'DD_Month_YYYY' strings would compare them
# alphabetically and report the wrong dates.
report = f"""ECB SENTIMENT ANALYSIS REPORT
============================

Dataset Overview:
- Total Documents Analyzed: {len(df_sorted)}
- Date Range: {df_sorted['Date'].iloc[0]} to {df_sorted['Date'].iloc[-1]}
- Average Sentences per Document: {df_sorted['Sentence_Count'].mean():.1f}
- Average Chunks per Document: {df_sorted['Chunk_Count'].mean():.1f}

Model Performance Comparison:
- FinBERT Average Sentiment (Sentences): {df_sorted['FinBERT_Sentences'].mean():.3f}
- RoBERTa Average Sentiment (Sentences): {df_sorted['RoBERTa_Sentences'].mean():.3f}
- Correlation between FinBERT and RoBERTa: {df_sorted['FinBERT_Sentences'].corr(df_sorted['RoBERTa_Sentences']):.3f}

Sentiment Range Analysis:
- FinBERT Min/Max: {df_sorted['FinBERT_Sentences'].min():.3f} / {df_sorted['FinBERT_Sentences'].max():.3f}
- RoBERTa Min/Max: {df_sorted['RoBERTa_Sentences'].min():.3f} / {df_sorted['RoBERTa_Sentences'].max():.3f}

Consistency Analysis (Sentences vs Chunks):
- FinBERT Consistency Score: {1 - df_sorted['FinBERT_Difference'].mean():.3f}
- RoBERTa Consistency Score: {1 - df_sorted['RoBERTa_Difference'].mean():.3f}

Most Positive Sentiment:
- Date: {df_sorted.loc[df_sorted['FinBERT_Sentences'].idxmax(), 'Date']}
- FinBERT Score: {df_sorted['FinBERT_Sentences'].max():.3f}

Most Negative Sentiment:
- Date: {df_sorted.loc[df_sorted['FinBERT_Sentences'].idxmin(), 'Date']}
- FinBERT Score: {df_sorted['FinBERT_Sentences'].min():.3f}"""

# Save Excel with multiple sheets
with pd.ExcelWriter(CONFIG['detailed_excel_filename'], engine='openpyxl') as writer:
    # LaTeX table sheet (the whole table is stored as one text cell)
    latex_df = pd.DataFrame([latex_table], columns=['LaTeX_Table'])
    latex_df.to_excel(writer, sheet_name='LaTeX_Table', index=False, header=False)
    
    # Summary report sheet (the whole report is stored as one text cell)
    report_df = pd.DataFrame([report], columns=['Summary_Report'])
    report_df.to_excel(writer, sheet_name='Summary_Report', index=False, header=False)

print(f"\n✅ Main data saved: {CONFIG['main_excel_filename']}")
print(f"✅ Detailed analysis saved: {CONFIG['detailed_excel_filename']}")
print("Sheets: LaTeX_Table, Summary_Report")

# Download link
display(FileLink(CONFIG['main_excel_filename']))
display(FileLink(CONFIG['detailed_excel_filename']))


[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Device set to use cuda:0
Device set to use cuda:0


Processing: 17_October_2024
Processing: 17_April_2025
Processing: 14_December_2023
Processing: 11_April_2024
Processing: 12_December_2024
Processing: 21_July_2022
Processing: 08_September_2022
Processing: 14_September_2023
Processing: 12_September_2024
Processing: 07_March_2024
Processing: 27_October_2022
Processing: 16_March_2023
Processing: 06_March_2025
Processing: 26_October_2023
Processing: 27_July_2023
Processing: 25_January_2024
Processing: 02_February_2023
Processing: 30_January_2025
Processing: 18_July_2024
Processing: 05_June_2025
Processing: 09_June_2022
Processing: 15_December_2022
Processing: 15_June_2023
Processing: 04_May_2023
Processing: 06_June_2024
=== COMPLETE MODEL COMPARISON ===
             Date FinBERT_Sentences FinBERT_Chunks RoBERTa_Sentences RoBERTa_Chunks Sentence_Count Chunk_Count FinBERT_Difference RoBERTa_Difference
     09_June_2022             0.196          0.119             0.036         -0.069            123          46              0.077             

In [28]:
#NEU TRY

#complete Evaluation LLM Models
from transformers import pipeline
import os
import pandas as pd
from datetime import datetime
from IPython.display import FileLink
import nltk
from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktParameters

# Fetch the NLTK Punkt resources
nltk.download('punkt')

# ============================================================================
# CONFIGURATION
# ============================================================================
# Output workbook names used by the export step of this cell
CONFIG = dict(
    main_excel_filename='ecb_sentiment_analysis_sum.xlsx',
    detailed_excel_filename='sentiment_analysis_details_sum.xlsx',
)

# Toggle for the per-folder progress messages
SHOW_PROGRESS = True

# The two sentiment pipelines that are compared
finbert_analyzer = pipeline(
    "text-classification",
    model="ProsusAI/finbert",
    top_k=None,
)
roberta_analyzer = pipeline(
    "text-classification",
    model="soleimanian/financial-roberta-large-sentiment",
    top_k=None,
)

# Root directory whose sub-folders are processed
input_folder = "/kaggle/input/llm-text/TEXT/EZB/"

# Keep only the directory entries that are folders themselves
date_folders = list(
    filter(
        lambda entry: os.path.isdir(os.path.join(input_folder, entry)),
        os.listdir(input_folder),
    )
)

# Receives one result row per processed folder
results = []

def improved_sentence_split(text):
    """Split ``text`` into sentences using Punkt with a hand-written abbreviation list.

    Args:
        text: The raw document text.

    Returns:
        list[str]: Whitespace-stripped sentences; anything of 20 characters
        or fewer after stripping is discarded.
    """
    # NOTE(review): several entries are ordinary words that can end a sentence
    # ('gdp', 'euro', 'may', 'no', ...); presumably Punkt will not break after
    # them - confirm this is intended.
    abbreviations = {
        # Titles
        'mr', 'mrs', 'ms', 'dr', 'prof',
        # Latin abbreviations
        'e.g', 'i.e', 'etc', 'vs', 'cf',
        # Financial abbreviations
        'ecb', 'eu', 'euro', 'gdp', 'cpi', 'ppp',
        # Countries
        'u.s', 'u.k', 'u.s.a', 'e.u',
        # Months
        'jan', 'feb', 'mar', 'apr', 'may', 'jun',
        'jul', 'aug', 'sep', 'oct', 'nov', 'dec',
        # Companies
        'inc', 'ltd', 'corp', 'co', 'llc',
        # General abbreviations
        'no', 'nos', 'vol', 'p', 'pp', 'fig',
        # Contact abbreviations
        'tel', 'fax', 'email', 'www',
    }

    params = PunktParameters()
    params.abbrev_types = abbreviations
    splitter = PunktSentenceTokenizer(params)

    # Strip first, then apply the minimum-length filter to the stripped text
    stripped = (candidate.strip() for candidate in splitter.tokenize(text))
    return [sentence for sentence in stripped if len(sentence) > 20]

def analyze_with_model(analyzer, text_input):
    """Score ``text_input`` as positive probability minus negative probability.

    Args:
        analyzer: Callable that, for one text, returns a sequence whose first
            element is a list of ``{'label': ..., 'score': ...}`` dicts.
        text_input: The text to classify.

    Returns:
        ``score('positive') - score('negative')``.

    Raises:
        ValueError: If the output has no 'positive' or no 'negative' label
            (compared case-insensitively).
    """
    result = analyzer(text_input)
    # Lower-case the labels so 'Positive'/'NEGATIVE' style outputs are accepted
    labels = {str(r['label']).lower(): r['score'] for r in result[0]}
    missing = [name for name in ('negative', 'positive') if name not in labels]
    if missing:
        # A bare next() would leak a message-less StopIteration here
        raise ValueError(
            f"Model output lacks label(s) {missing}; available labels: {sorted(labels)}"
        )
    return labels['positive'] - labels['negative']



for date_folder in date_folders:
    full_file = os.path.join(input_folder, date_folder, "0_FULL.txt")

    # Folders without a full-text file are skipped
    if not os.path.exists(full_file):
        continue

    if SHOW_PROGRESS:
        print(f"Processing: {date_folder}")

    with open(full_file, 'r', encoding='utf-8') as file:
        text = file.read()

    # Two views of the document: sentences, and 512-character windows that
    # start every 400 characters (windows with <= 50 stripped chars are dropped)
    sentences = improved_sentence_split(text)
    chunks = [
        window
        for window in (text[start:start + 512] for start in range(0, len(text), 400))
        if len(window.strip()) > 50
    ]

    # FinBERT, sentence level (sum of an empty list is 0, so no extra guard is needed)
    finbert_sentence_scores = [
        analyze_with_model(finbert_analyzer, sentence)
        for sentence in sentences
        if len(sentence.strip()) > 10
    ]
    finbert_sentences_sum = sum(finbert_sentence_scores)
    finbert_sentences_avg = (
        finbert_sentences_sum / len(finbert_sentence_scores)
        if finbert_sentence_scores
        else 0
    )

    # FinBERT, chunk level
    finbert_chunk_scores = [analyze_with_model(finbert_analyzer, chunk) for chunk in chunks]
    finbert_chunks_sum = sum(finbert_chunk_scores)
    finbert_chunks_avg = (
        finbert_chunks_sum / len(finbert_chunk_scores)
        if finbert_chunk_scores
        else 0
    )

    # Financial-RoBERTa, sentence level
    roberta_sentence_scores = [
        analyze_with_model(roberta_analyzer, sentence)
        for sentence in sentences
        if len(sentence.strip()) > 10
    ]
    roberta_sentences_sum = sum(roberta_sentence_scores)
    roberta_sentences_avg = (
        roberta_sentences_sum / len(roberta_sentence_scores)
        if roberta_sentence_scores
        else 0
    )

    # Financial-RoBERTa, chunk level
    roberta_chunk_scores = [analyze_with_model(roberta_analyzer, chunk) for chunk in chunks]
    roberta_chunks_sum = sum(roberta_chunk_scores)
    roberta_chunks_avg = (
        roberta_chunks_sum / len(roberta_chunk_scores)
        if roberta_chunk_scores
        else 0
    )

    # One result row per document with both average and sum;
    # counts are taken from the FinBERT lists
    row = {
        'Date': date_folder,
        'FinBERT_Sentences_Avg': round(finbert_sentences_avg, 3),
        'FinBERT_Sentences_Sum': round(finbert_sentences_sum, 3),
        'FinBERT_Chunks_Avg': round(finbert_chunks_avg, 3),
        'FinBERT_Chunks_Sum': round(finbert_chunks_sum, 3),
        'RoBERTa_Sentences_Avg': round(roberta_sentences_avg, 3),
        'RoBERTa_Sentences_Sum': round(roberta_sentences_sum, 3),
        'RoBERTa_Chunks_Avg': round(roberta_chunks_avg, 3),
        'RoBERTa_Chunks_Sum': round(roberta_chunks_sum, 3),
        'Sentence_Count': len(finbert_sentence_scores),
        'Chunk_Count': len(finbert_chunk_scores)
    }
    results.append(row)

# Turn the collected rows into a DataFrame
df = pd.DataFrame(results)

# CHRONOLOGICAL SORTING
def parse_date(date_str):
    """Convert a folder name such as ``'09_June_2022'`` into a ``datetime``.

    Args:
        date_str: Name in the form ``<day>_<full month name>_<year>``.

    Returns:
        The parsed ``datetime``. ``datetime.max`` is returned when the name
        does not have exactly three ``_``-separated parts, is not a valid
        date, or is not a string, so such entries sort to the end.
    """
    try:
        # Unpacking raises ValueError when there are not exactly three parts
        day, month, year = date_str.split('_')
        return datetime.strptime(f"{day} {month} {year}", "%d %B %Y")
    except (ValueError, AttributeError):
        # ValueError: wrong part count or invalid date; AttributeError: non-string input.
        # Anything else (e.g. KeyboardInterrupt) is no longer swallowed.
        return datetime.max

# Order the rows chronologically through a temporary 'Parsed_Date' column
# (the helper column is dropped from df_sorted but stays on df)
df['Parsed_Date'] = df['Date'].apply(parse_date)
df_sorted = df.sort_values('Parsed_Date').drop(columns=['Parsed_Date'])

# Additional analysis: Consistency check (updated for new column names)
# (absolute gap between the sentence-level and chunk-level averages per document)
df_sorted['FinBERT_Difference'] = abs(df_sorted['FinBERT_Sentences_Avg'] - df_sorted['FinBERT_Chunks_Avg'])
df_sorted['RoBERTa_Difference'] = abs(df_sorted['RoBERTa_Sentences_Avg'] - df_sorted['RoBERTa_Chunks_Avg'])

# ADD AVERAGE ROW
# Mean of every numeric column, labelled 'Average' in the Date column
average_row = df_sorted.select_dtypes(include=['number']).mean()
average_row['Date'] = 'Average'
df_with_average = pd.concat([df_sorted, average_row.to_frame().T], ignore_index=True)

print("=== COMPLETE MODEL COMPARISON (AVG + SUM) ===")
print(df_with_average.to_string(index=False))
print(f"FinBERT average difference sentences/chunks: {df_sorted['FinBERT_Difference'].mean():.3f}")
print(f"RoBERTa average difference sentences/chunks: {df_sorted['RoBERTa_Difference'].mean():.3f}")

# ===========================================
# SAVE TO EXCEL WITH MULTIPLE SHEETS
# ===========================================

# Save main data to separate excel file
df_with_average.to_excel(CONFIG['main_excel_filename'], index=False)

# Generate LaTeX table content WITH AVERAGE ROW (updated for new columns)
latex_table = df_with_average[['Date', 'FinBERT_Sentences_Avg', 'FinBERT_Sentences_Sum', 'RoBERTa_Sentences_Avg', 'RoBERTa_Sentences_Sum', 'Sentence_Count']].to_latex(
    index=False, 
    float_format="%.3f",
    caption="ECB Sentiment Analysis Results Comparison (Average and Sum)",
    label="tab:sentiment_comparison",
    column_format="lrrrrr"
)

# Generate summary report content (updated for new columns)
# The date range is read from the first/last row of the chronologically sorted
# frame; min()/max() on the 'DD_Month_YYYY' strings would compare them
# alphabetically and report the wrong dates.
report = f"""ECB SENTIMENT ANALYSIS REPORT (AVG + SUM)
==========================================

Dataset Overview:
- Total Documents Analyzed: {len(df_sorted)}
- Date Range: {df_sorted['Date'].iloc[0]} to {df_sorted['Date'].iloc[-1]}
- Average Sentences per Document: {df_sorted['Sentence_Count'].mean():.1f}
- Average Chunks per Document: {df_sorted['Chunk_Count'].mean():.1f}

Model Performance Comparison (AVERAGES):
- FinBERT Average Sentiment (Sentences): {df_sorted['FinBERT_Sentences_Avg'].mean():.3f}
- RoBERTa Average Sentiment (Sentences): {df_sorted['RoBERTa_Sentences_Avg'].mean():.3f}
- Correlation between FinBERT and RoBERTa: {df_sorted['FinBERT_Sentences_Avg'].corr(df_sorted['RoBERTa_Sentences_Avg']):.3f}

Model Performance Comparison (SUMS):
- FinBERT Average Sum (Sentences): {df_sorted['FinBERT_Sentences_Sum'].mean():.3f}
- RoBERTa Average Sum (Sentences): {df_sorted['RoBERTa_Sentences_Sum'].mean():.3f}
- Correlation between FinBERT and RoBERTa Sums: {df_sorted['FinBERT_Sentences_Sum'].corr(df_sorted['RoBERTa_Sentences_Sum']):.3f}

Sentiment Range Analysis (AVERAGES):
- FinBERT Min/Max: {df_sorted['FinBERT_Sentences_Avg'].min():.3f} / {df_sorted['FinBERT_Sentences_Avg'].max():.3f}
- RoBERTa Min/Max: {df_sorted['RoBERTa_Sentences_Avg'].min():.3f} / {df_sorted['RoBERTa_Sentences_Avg'].max():.3f}

Sentiment Range Analysis (SUMS):
- FinBERT Sum Min/Max: {df_sorted['FinBERT_Sentences_Sum'].min():.3f} / {df_sorted['FinBERT_Sentences_Sum'].max():.3f}
- RoBERTa Sum Min/Max: {df_sorted['RoBERTa_Sentences_Sum'].min():.3f} / {df_sorted['RoBERTa_Sentences_Sum'].max():.3f}

Consistency Analysis (Sentences vs Chunks):
- FinBERT Consistency Score: {1 - df_sorted['FinBERT_Difference'].mean():.3f}
- RoBERTa Consistency Score: {1 - df_sorted['RoBERTa_Difference'].mean():.3f}

Most Positive Sentiment (AVG):
- Date: {df_sorted.loc[df_sorted['FinBERT_Sentences_Avg'].idxmax(), 'Date']}
- FinBERT Score: {df_sorted['FinBERT_Sentences_Avg'].max():.3f}

Most Negative Sentiment (AVG):
- Date: {df_sorted.loc[df_sorted['FinBERT_Sentences_Avg'].idxmin(), 'Date']}
- FinBERT Score: {df_sorted['FinBERT_Sentences_Avg'].min():.3f}

Highest Total Sentiment (SUM):
- Date: {df_sorted.loc[df_sorted['FinBERT_Sentences_Sum'].idxmax(), 'Date']}
- FinBERT Sum: {df_sorted['FinBERT_Sentences_Sum'].max():.3f}"""

# Save Excel with multiple sheets
with pd.ExcelWriter(CONFIG['detailed_excel_filename'], engine='openpyxl') as writer:
    # LaTeX table sheet (the whole table is stored as one text cell)
    latex_df = pd.DataFrame([latex_table], columns=['LaTeX_Table'])
    latex_df.to_excel(writer, sheet_name='LaTeX_Table', index=False, header=False)
    
    # Summary report sheet (the whole report is stored as one text cell)
    report_df = pd.DataFrame([report], columns=['Summary_Report'])
    report_df.to_excel(writer, sheet_name='Summary_Report', index=False, header=False)

print(f"\n✅ Main data saved: {CONFIG['main_excel_filename']}")
print(f"✅ Detailed analysis saved: {CONFIG['detailed_excel_filename']}")
print("Sheets: LaTeX_Table, Summary_Report")

# Download link
display(FileLink(CONFIG['main_excel_filename']))
display(FileLink(CONFIG['detailed_excel_filename']))


[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Device set to use cuda:0
Device set to use cuda:0


Processing: 17_October_2024
Processing: 17_April_2025
Processing: 14_December_2023
Processing: 11_April_2024
Processing: 12_December_2024
Processing: 21_July_2022
Processing: 08_September_2022
Processing: 14_September_2023
Processing: 12_September_2024
Processing: 07_March_2024
Processing: 27_October_2022
Processing: 16_March_2023
Processing: 06_March_2025
Processing: 26_October_2023
Processing: 27_July_2023
Processing: 25_January_2024
Processing: 02_February_2023
Processing: 30_January_2025
Processing: 18_July_2024
Processing: 05_June_2025
Processing: 09_June_2022
Processing: 15_December_2022
Processing: 15_June_2023
Processing: 04_May_2023
Processing: 06_June_2024
=== COMPLETE MODEL COMPARISON (AVG + SUM) ===
             Date FinBERT_Sentences_Avg FinBERT_Sentences_Sum FinBERT_Chunks_Avg FinBERT_Chunks_Sum RoBERTa_Sentences_Avg RoBERTa_Sentences_Sum RoBERTa_Chunks_Avg RoBERTa_Chunks_Sum Sentence_Count Chunk_Count FinBERT_Difference RoBERTa_Difference
     09_June_2022               

In [29]:
from transformers import pipeline

def test_model_labels():
    """Test function to check the exact labels used by both models"""
    
    # Load both models
    print("🔄 Loading models...")
    finbert_analyzer = pipeline("text-classification", model="ProsusAI/finbert", top_k=None)
    roberta_analyzer = pipeline("text-classification", model="soleimanian/financial-roberta-large-sentiment", top_k=None)
    
    # Test text
    sample_text = "The company's revenue increased significantly this quarter, showing strong growth."
    
    print("\n" + "="*60)
    print("📊 TESTING MODEL LABELS")
    print("="*60)
    
    # Test FinBERT
    print("\n🔍 FINBERT MODEL:")
    print("-" * 30)
    finbert_result = finbert_analyzer(sample_text)
    print(f"Raw output: {finbert_result}")
    
    finbert_labels = {r['label']: r['score'] for r in finbert_result[0]}
    print(f"Available labels: {list(finbert_labels.keys())}")
    print(f"Label-Score mapping: {finbert_labels}")
    
    # Test RoBERTa
    print("\n🔍 ROBERTA MODEL:")
    print("-" * 30)
    roberta_result = roberta_analyzer(sample_text)
    print(f"Raw output: {roberta_result}")
    
    roberta_labels = {r['label']: r['score'] for r in roberta_result[0]}
    print(f"Available labels: {list(roberta_labels.keys())}")
    print(f"Label-Score mapping: {roberta_labels}")
    
    print("\n" + "="*60)
    print("📋 SUMMARY:")
    print("="*60)
    print(f"FinBERT labels: {list(finbert_labels.keys())}")
    print(f"RoBERTa labels: {list(roberta_labels.keys())}")
    
    return finbert_labels, roberta_labels

# Corrected analyze_with_model function, based on the labels the models actually emit
def analyze_with_model_corrected(analyzer, text_input, model_type="finbert"):
    """Return the net sentiment of ``text_input``: positive score minus negative score.

    Args:
        analyzer: Callable (e.g. a text-classification pipeline created with
            ``top_k=None``) whose result for ``text_input`` is either the
            nested form ``[[{'label': ..., 'score': ...}, ...]]`` or the flat
            list ``[{'label': ..., 'score': ...}, ...]``.
        text_input: Text handed unchanged to ``analyzer``.
        model_type: Retained for backward compatibility only. Labels are
            matched case-insensitively, so the same lookup serves every model
            and this argument no longer influences the result.

    Returns:
        The 'positive' score minus the 'negative' score.

    Raises:
        ValueError: If the analyzer output contains no 'positive' or no
            'negative' label. Failing loudly avoids silently computing a
            score from a label set that does not match.
    """
    result = analyzer(text_input)

    # Unwrap the per-input nesting when present; otherwise use the list as is.
    if result and isinstance(result[0], (list, tuple)):
        entries = result[0]
    else:
        entries = result or []

    # Normalise label casing so 'Positive' / 'POSITIVE' / 'positive' all match.
    scores = {str(entry['label']).lower(): entry['score'] for entry in entries}

    missing = [name for name in ('positive', 'negative') if name not in scores]
    if missing:
        raise ValueError(
            f"Analyzer output lacks label(s) {missing}; available labels: {sorted(scores)}"
        )

    return scores['positive'] - scores['negative']

# Check the labels, then exercise the corrected scoring function on one sentence
if __name__ == "__main__":
    # Prints the raw output of both models and returns their {label: score} dicts
    finbert_labels, roberta_labels = test_model_labels()
    
    # Test the corrected function
    print("\n🧪 TESTING CORRECTED FUNCTION:")
    print("-" * 40)
    
    # Load models again for testing: test_model_labels() returns only the
    # label dicts, not the pipelines it created
    finbert_analyzer = pipeline("text-classification", model="ProsusAI/finbert", top_k=None)
    roberta_analyzer = pipeline("text-classification", model="soleimanian/financial-roberta-large-sentiment", top_k=None)
    
    test_text = "The market outlook is very positive with strong growth expected."
    
    # Each score is the 'positive' score minus the 'negative' score for test_text
    finbert_score = analyze_with_model_corrected(finbert_analyzer, test_text, "finbert")
    roberta_score = analyze_with_model_corrected(roberta_analyzer, test_text, "roberta")
    
    print(f"FinBERT sentiment score: {finbert_score:.4f}")
    print(f"RoBERTa sentiment score: {roberta_score:.4f}")


🔄 Loading models...


Device set to use cuda:0
Device set to use cuda:0



📊 TESTING MODEL LABELS

🔍 FINBERT MODEL:
------------------------------
Raw output: [[{'label': 'positive', 'score': 0.9563620090484619}, {'label': 'neutral', 'score': 0.02615462802350521}, {'label': 'negative', 'score': 0.017483336851000786}]]
Available labels: ['positive', 'neutral', 'negative']
Label-Score mapping: {'positive': 0.9563620090484619, 'neutral': 0.02615462802350521, 'negative': 0.017483336851000786}

🔍 ROBERTA MODEL:
------------------------------
Raw output: [[{'label': 'positive', 'score': 0.9982404708862305}, {'label': 'negative', 'score': 0.000943619990721345}, {'label': 'neutral', 'score': 0.0008159285644069314}]]
Available labels: ['positive', 'negative', 'neutral']
Label-Score mapping: {'positive': 0.9982404708862305, 'negative': 0.000943619990721345, 'neutral': 0.0008159285644069314}

📋 SUMMARY:
FinBERT labels: ['positive', 'neutral', 'negative']
RoBERTa labels: ['positive', 'negative', 'neutral']

🧪 TESTING CORRECTED FUNCTION:
---------------------------------

Device set to use cuda:0
Device set to use cuda:0


FinBERT sentiment score: 0.9334
RoBERTa sentiment score: 0.9975
