In [1]:
%run 2.1_data_preparation_news.ipynb

from sklearn.dummy import DummyClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from scipy.sparse import hstack

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Martin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Title              object
Tag                object
Date       datetime64[ns]
Content            object
dtype: object


In [2]:
# Fehlende Werte in numerischen Spalten auffüllen
for col in ['Prev_change', 'Sma_5', 'Volatility']:
    tokenized_df[col] = tokenized_df[col].fillna(tokenized_df[col].mean())

# Text zusammenfügen
tokenized_df['text'] = tokenized_df['Cleaned Content'].apply(lambda x: ' '.join(x))

# TF-IDF-Vektorisierung
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_df=0.9, min_df=5)
X_text = vectorizer.fit_transform(tokenized_df['text'])

# Numerische Features normalisieren
numeric_features = ['Prev_change', 'Sma_5', 'Volatility']
scaler = StandardScaler()
X_numeric = scaler.fit_transform(tokenized_df[numeric_features])

# Text + Zahlen kombinieren
X_combined = hstack([X_text, X_numeric])

# Zielvariable
y = tokenized_df['Simple Label']


In [3]:
X_train, X_test, y_train, y_test = train_test_split(
    X_combined, y, test_size=0.2, random_state=42
)


In [4]:
# Dummy-Modell: immer häufigste Klasse vorhersagen
dummy = DummyClassifier(strategy="most_frequent")
dummy.fit(X_train, y_train)
y_dummy_pred = dummy.predict(X_test)

print("=== Dummy Classifier (most frequent) ===")
print(classification_report(y_test, y_dummy_pred))


=== Dummy Classifier (most frequent) ===
              precision    recall  f1-score   support

    negative       0.55      1.00      0.71      1500
     positiv       0.00      0.00      0.00      1228

    accuracy                           0.55      2728
   macro avg       0.27      0.50      0.35      2728
weighted avg       0.30      0.55      0.39      2728



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [5]:
# Dummy-Modell: zufällige Vorhersagen
dummy_random = DummyClassifier(strategy="uniform")
dummy_random.fit(X_train, y_train)
y_dummy_rand_pred = dummy_random.predict(X_test)

print("=== Dummy Classifier (random) ===")
print(classification_report(y_test, y_dummy_rand_pred))

=== Dummy Classifier (random) ===
              precision    recall  f1-score   support

    negative       0.57      0.52      0.54      1500
     positiv       0.47      0.51      0.49      1228

    accuracy                           0.52      2728
   macro avg       0.52      0.52      0.52      2728
weighted avg       0.52      0.52      0.52      2728

