In [123]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

In [124]:
import nltk
from nltk.corpus import stopwords

# Download NLTK stopwords if needed
try:
    stopwords.words("german")
except LookupError:
    nltk.download("stopwords")

german_stopwords = stopwords.words("german")

In [125]:

# 1. Load your data
train_df = pd.read_csv('../data/challenge_1/train/classification_data.csv')
val_df = pd.read_csv('../data/challenge_1/val/classification_data.csv')


In [126]:
# Features to use
text_feature = "description"
keyword_features = [
    "has_verkehr_keywords",
    "has_bildung_keywords",
    "has_umwelt_keywords",
    "has_gesundheit_keywords"
]

In [127]:
X_train = train_df[[text_feature] + keyword_features]
y_train = train_df["responsible_entity_id"]
X_val = val_df[[text_feature] + keyword_features]
y_val = val_df["responsible_entity_id"]

In [128]:
X = df[[text_feature] + keyword_features]  # Text input
y = df["responsible_entity_id"]  # What we want to predict

In [129]:
# X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [130]:
from sklearn.compose import ColumnTransformer

# Column transformer: handle text and numeric features separately
preprocessor = ColumnTransformer(
    transformers=[
        ("text", TfidfVectorizer(
            stop_words=german_stopwords,
            max_features=10000,
            max_df=10,
            ngram_range=(1,2)
        ), "description"),
        ("keywords", "passthrough", keyword_features),
    ]
)


In [131]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# 4. Create a pipeline: TF-IDF vectorizer + Decision Tree
pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("clf", DecisionTreeClassifier())  # classifier
])


In [132]:

# 5. Train the model
pipeline.fit(X_train, y_train)

In [133]:
# 6. Make predictions
y_pred = pipeline.predict(X_val)


In [134]:
# 7. Evaluate
print(classification_report(y_val, y_pred))

                                                  precision    recall  f1-score   support

BUND_BUNDESMINISTERIUM_FÜR_DIGITALES_UND_VERKEHR       0.93      0.74      0.82        19
                                      LAND_01_BM       0.00      0.00      0.00         2
                                      LAND_01_IM       0.00      0.00      0.00         2
                                      LAND_01_MW       0.00      0.00      0.00         1
                                      LAND_01_SM       0.00      0.00      0.00         4
                                      LAND_01_UM       0.00      0.00      0.00         0
                                      LAND_01_VM       0.00      0.00      0.00         2
                                      LAND_02_FM       0.00      0.00      0.00         0
                                      LAND_02_GM       0.00      0.00      0.00         0
                                      LAND_02_IM       0.00      0.00      0.00         0
         

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
