In [None]:
# Install the notebook's third-party dependencies.
# %pip (rather than !pip) installs into the environment of the running kernel,
# whereas !pip runs whatever `pip` the subshell finds first on PATH.
%pip install pandas numpy nltk matplotlib seaborn wordcloud

In [None]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords

# Fetch the NLTK stop-word corpus, then keep the English stop words in a set
# so the text-cleaning step can test membership per token.
nltk.download('stopwords')
stop_words = {*stopwords.words('english')}


In [None]:
# News dataset from Berke (FinSen); 'Time' is parsed as day/month/year.
news_url = "https://raw.githubusercontent.com/EagleAdelaide/FinSen_Dataset/refs/heads/main/data.pptx/FinSen_US_Categorized_Timestamp.csv"
df_news = pd.read_csv(news_url).assign(
    Time=lambda frame: pd.to_datetime(frame['Time'], format='%d/%m/%Y')
)

# NASDAQ price data from Andrei; 'Date' is parsed as month/day/year.
df_price = pd.read_csv("./data/HistoricalData_1750671813123.csv").assign(
    Date=lambda frame: pd.to_datetime(frame['Date'], format='%m/%d/%Y')
)


In [None]:
# Relative open-to-close price change (by Andrei), rounded to 5 decimals.
# Computed as (Close - Open) / Open so that a day closing above its open gets
# a positive value and therefore lands in the *_gain buckets below; with the
# operands the other way round, falling days would be labelled as gains.
df_price['change'] = np.round(
    (df_price['Close/Last'] - df_price['Open']) / df_price['Open'], 5
)

# Label each row from its price change. The buckets are half-open intervals
# [lower, upper) that together cover every real number.
# NOTE(review): thresholds of 7% / 13% / 20% are large for a single session,
# so most rows will probably end up 'neutral' — confirm these are intended.
conditions = [
    (df_price['change'] >= 0.20),
    ((df_price['change'] >= 0.13) & (df_price['change'] < 0.20)),
    ((df_price['change'] >= 0.07) & (df_price['change'] < 0.13)),
    ((df_price['change'] >= -0.07) & (df_price['change'] < 0.07)),
    ((df_price['change'] >= -0.13) & (df_price['change'] < -0.07)),
    ((df_price['change'] >= -0.20) & (df_price['change'] < -0.13)),
    (df_price['change'] < -0.20)
]
labels = ['strong_gain', 'significant_gain', 'moderate_gain', 'neutral', 'moderate_loss', 'significant_loss', 'strong_loss']
# np.select falls back to the integer 0 when no condition matches, which does
# not share a dtype with the string labels. An explicit string default avoids
# that; it is only reached for rows whose 'change' is NaN.
df_price['label'] = np.select(conditions, labels, default='unknown')


In [None]:
def clean_text(text, stopword_set=None):
    """Normalise a piece of text into a space-joined string of lowercase tokens.

    Every run of characters other than ASCII letters is treated as a word
    separator, the text is lowercased, and tokens that are stop words or have
    two characters or fewer are dropped.

    Args:
        text: The raw text. Non-string values are converted with ``str``;
            ``None`` and float NaN (missing values) yield an empty string.
        stopword_set: Optional collection of lowercase words to remove.
            Defaults to the module-level ``stop_words``.

    Returns:
        The remaining tokens joined by single spaces (possibly ``""``).
    """
    # Missing content must not turn into the literal token "nan" / "none".
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if stopword_set is None:
        stopword_set = stop_words

    # Replace (not delete) non-letters so that words separated only by
    # punctuation or a line break, e.g. "rallied.The", do not fuse together.
    text = re.sub(r'[^a-zA-Z]+', ' ', str(text)).lower()
    tokens = [w for w in text.split() if len(w) > 2 and w not in stopword_set]
    return " ".join(tokens)

# Clean every news article body and store the result next to the raw text.
df_news['cleaned_content'] = df_news['Content'].map(clean_text)


In [None]:
# Give both DataFrames a plain calendar-date column to join on.
df_price['date'] = df_price['Date'].dt.date
df_news['date'] = df_news['Time'].dt.date

# Left merge: every news row is kept and receives the price label of its date;
# rows whose date has no price entry get a missing label.
merged_df = df_news.merge(df_price[['date', 'label']], how='left', on='date')


In [None]:
def simplify_label(label):
    """Collapse a fine-grained price label into a three-way sentiment.

    Args:
        label: A label string such as ``'strong_gain'`` or ``'moderate_loss'``,
            or a missing value (``None`` / NaN) for rows without a label.

    Returns:
        ``'positive'`` if the label contains ``'gain'``, ``'negative'`` if it
        contains ``'loss'``, ``'neutral'`` for any other string, and ``None``
        when the label is not a string (i.e. missing).
    """
    # A missing label is a float NaN or None; a substring test on it would
    # raise TypeError, so report it as missing instead.
    if not isinstance(label, str):
        return None
    if 'gain' in label:
        return 'positive'
    if 'loss' in label:
        return 'negative'
    return 'neutral'

# Derive the coarse positive / negative / neutral label for every merged row.
merged_df['simple_label'] = merged_df['label'].map(simplify_label)

