In [1]:
from pathlib import Path

import pandas as pd

In [2]:
# =============================================================================
# 1. Configure paths
# =============================================================================
# Shared data root. Both files live below this folder, so it is spelled out
# once and the individual paths are built from it with "/" (no mixing of
# backslashes and forward slashes inside one literal).
DATA_DIR = Path(r"C:\1_Projekte\quantum_cognition\data")

# Original input file.
INPUT_CSV = DATA_DIR / "1-4_panel_segmented" / "transcript_panel_segmentation.csv"
# New storage location for the transformed table.
OUTPUT_CSV = DATA_DIR / "1-5_panel_transformed" / "transcript_panel_clean.csv"

# Create the output folder automatically (including missing parents);
# exist_ok=True makes this a no-op when the folder is already there.
OUTPUT_CSV.parent.mkdir(parents=True, exist_ok=True)

In [3]:
# =============================================================================
# 2. Load the input data
# =============================================================================
df = pd.read_csv(INPUT_CSV)

# Status line followed by the frame's shape, each on its own line.
print("Eingangsdaten geladen:", df.shape, sep="\n")

# Last expression of the cell: shows the first rows as a preview.
df.head()


Eingangsdaten geladen:
(28157, 14)


Unnamed: 0,folder_relative,file_name,segment_index,start_time,end_time,text,finbert_label,finbert_prob_positive,finbert_prob_negative,finbert_prob_neutral,key_phrase_1,key_phrase_2,qa_dummy,key_phrase_2_match
0,ABI.BR,earnings_conference_call_ABI.BR_2024_Q3_202410...,0,00:00.000,00:08.680,"Welcome to AB InBev's third quarter, 2024 earn...",neutral,0.06914,0.012696,0.918164,1,0,0,
1,ABI.BR,earnings_conference_call_ABI.BR_2024_Q3_202410...,1,00:08.680,00:13.920,Hosting the call today from AB InBev are Mr. M...,neutral,0.057243,0.012598,0.930159,0,0,0,
2,ABI.BR,earnings_conference_call_ABI.BR_2024_Q3_202410...,2,00:13.920,00:16.880,"and Mr. Fernando Tenenbaum, Chief Financial Of...",neutral,0.036067,0.01954,0.944393,0,0,0,
3,ABI.BR,earnings_conference_call_ABI.BR_2024_Q3_202410...,3,00:16.880,00:22.200,To access the slides accompanying today's call...,neutral,0.043265,0.01498,0.941754,0,0,0,
4,ABI.BR,earnings_conference_call_ABI.BR_2024_Q3_202410...,4,00:22.200,00:29.480,www.ab-inbev.com and click on the Investors ta...,neutral,0.02885,0.022394,0.948756,0,0,0,


In [4]:
# =============================================================================
# 3. Drop columns (an empty list drops nothing)
# =============================================================================

# Names of the columns to remove from the frame.
columns_to_drop = [
    "key_phrase_1"
]

# errors="ignore" below skips names that are not present instead of raising.
# Report such names explicitly so a misspelled column does not go unnoticed;
# this is only a notice, the cell still runs through.
missing_columns = [name for name in columns_to_drop if name not in df.columns]
if missing_columns:
    print("Hinweis: Spalten nicht gefunden (übersprungen):", missing_columns)

# Execute the drop:
df = df.drop(columns=columns_to_drop, errors="ignore")

print("Form nach Löschung (falls angewandt):", df.shape)
df.head()

Form nach Löschung (falls angewandt): (28157, 13)


Unnamed: 0,folder_relative,file_name,segment_index,start_time,end_time,text,finbert_label,finbert_prob_positive,finbert_prob_negative,finbert_prob_neutral,key_phrase_2,qa_dummy,key_phrase_2_match
0,ABI.BR,earnings_conference_call_ABI.BR_2024_Q3_202410...,0,00:00.000,00:08.680,"Welcome to AB InBev's third quarter, 2024 earn...",neutral,0.06914,0.012696,0.918164,0,0,
1,ABI.BR,earnings_conference_call_ABI.BR_2024_Q3_202410...,1,00:08.680,00:13.920,Hosting the call today from AB InBev are Mr. M...,neutral,0.057243,0.012598,0.930159,0,0,
2,ABI.BR,earnings_conference_call_ABI.BR_2024_Q3_202410...,2,00:13.920,00:16.880,"and Mr. Fernando Tenenbaum, Chief Financial Of...",neutral,0.036067,0.01954,0.944393,0,0,
3,ABI.BR,earnings_conference_call_ABI.BR_2024_Q3_202410...,3,00:16.880,00:22.200,To access the slides accompanying today's call...,neutral,0.043265,0.01498,0.941754,0,0,
4,ABI.BR,earnings_conference_call_ABI.BR_2024_Q3_202410...,4,00:22.200,00:29.480,www.ab-inbev.com and click on the Investors ta...,neutral,0.02885,0.022394,0.948756,0,0,


In [5]:
# =============================================================================
# 4. Rename columns explicitly (fully spelled-out dict)
# =============================================================================

# Automatically generated template of a complete rename dictionary:
# each current column name is mapped to itself.
rename_dict = dict(zip(df.columns, df.columns))

# Display the template:
rename_dict


{'folder_relative': 'folder_relative',
 'file_name': 'file_name',
 'segment_index': 'segment_index',
 'start_time': 'start_time',
 'end_time': 'end_time',
 'text': 'text',
 'finbert_label': 'finbert_label',
 'finbert_prob_positive': 'finbert_prob_positive',
 'finbert_prob_negative': 'finbert_prob_negative',
 'finbert_prob_neutral': 'finbert_prob_neutral',
 'key_phrase_2': 'key_phrase_2',
 'qa_dummy': 'qa_dummy',
 'key_phrase_2_match': 'key_phrase_2_match'}

In [6]:
# Column rename mapping (old name -> new name).
# An entry whose value equals its key keeps that column name unchanged.
rename_dict = {
    "folder_relative": "stock",  # renamed
    "file_name": "file_name",
    "segment_index": "segment_index",
    "start_time": "start_time",
    "end_time": "end_time",
    "text": "text",
    "finbert_label": "finbert_label",
    "finbert_prob_positive": "finbert_prob_positive",
    "finbert_prob_negative": "finbert_prob_negative",
    "finbert_prob_neutral": "finbert_prob_neutral",
    "key_phrase_2": "q&a_trigger_dummy",  # renamed
    "qa_dummy": "q&a_segment_dummy",  # renamed
    "key_phrase_2_match": "matched_sequence",  # renamed
}

In [7]:
# Apply the mapping to the column labels of the frame.
df = df.rename(mapper=rename_dict, axis="columns")

# Display the resulting column names as a list:
df.columns.to_list()


['stock',
 'file_name',
 'segment_index',
 'start_time',
 'end_time',
 'text',
 'finbert_label',
 'finbert_prob_positive',
 'finbert_prob_negative',
 'finbert_prob_neutral',
 'q&a_trigger_dummy',
 'q&a_segment_dummy',
 'matched_sequence']

In [8]:
# =============================================================================
# 5. Export (always to a new destination)
# =============================================================================

# Write the frame as UTF-8 encoded CSV, without the row index.
df.to_csv(path_or_buf=OUTPUT_CSV, encoding="utf-8", index=False)

# Confirm the location the table was written to.
print(f"Transformierte Tabelle gespeichert unter:\n{OUTPUT_CSV}")


Transformierte Tabelle gespeichert unter:
C:\1_Projekte\quantum_cognition\data\1-5_panel_transformed\transcript_panel_clean.csv
