In [261]:
# Import pandas for data handling
import pandas as pd

# Import NLTK stopwords for German text preprocessing
import nltk
from nltk.corpus import stopwords

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

In [262]:
# Load the German stopword list used by the TF-IDF vectorizer.
# If the NLTK "stopwords" corpus is not installed locally, the lookup raises
# LookupError; in that case fetch the corpus once and retry.
try:
    german_stopwords = stopwords.words("german")
except LookupError:
    nltk.download("stopwords")
    german_stopwords = stopwords.words("german")

In [263]:
# Locations of the challenge-1 train and validation splits (relative to the notebook)
TRAIN_CSV = '../data/challenge_1/train/classification_data.csv'
VAL_CSV = '../data/challenge_1/val/classification_data.csv'

# Read both splits into DataFrames
train_df = pd.read_csv(TRAIN_CSV)
val_df = pd.read_csv(VAL_CSV)

In [264]:
# Name of the free-text column fed to the TF-IDF vectorizer
text_feature = "description"

# Keyword indicator columns that are passed to the classifier alongside the
# text features. The order here determines the column order seen by the model.
keyword_features = [
    "has_verkehr_keywords",
    "has_bildung_keywords",
    "has_umwelt_keywords",
    "has_gesundheit_keywords",
]

In [265]:
# Columns used as model input (text column first, then keyword indicators)
# and the column holding the label to predict.
feature_columns = [text_feature, *keyword_features]
target_column = "responsible_entity_id"

# Split each dataset into a feature frame (X) and a label series (y)
X_train, y_train = train_df[feature_columns], train_df[target_column]
X_val, y_val = val_df[feature_columns], val_df[target_column]

In [266]:
# Restrict the task to the TOP_N labels that occur most often in the
# training set; rows with any other label are dropped from both splits.
TOP_N = 5
top_classes = y_train.value_counts().nlargest(TOP_N).index

# Boolean row masks: True where the label is one of the top classes.
# The validation mask reuses the classes chosen from the training data.
train_mask = y_train.isin(top_classes)
val_mask = y_val.isin(top_classes)

X_train_filtered, y_train_filtered = X_train.loc[train_mask], y_train.loc[train_mask]
X_val_filtered, y_val_filtered = X_val.loc[val_mask], y_val.loc[val_mask]

# NOTE: to train and evaluate on every class instead (currently disabled),
# assign the unfiltered data to the *_filtered names:
#   X_train_filtered, y_train_filtered = X_train, y_train
#   X_val_filtered, y_val_filtered = X_val, y_val



In [267]:
# Text branch: TF-IDF vectorization followed by dimensionality reduction.
text_pipeline = Pipeline([
    ("tfidf", TfidfVectorizer(
        stop_words=german_stopwords,
        max_features=10000,   # cap the vocabulary at the 10,000 most frequent terms
        ngram_range=(1, 2),   # use unigrams and bigrams
        min_df=3,             # drop terms found in fewer than 3 documents
        max_df=0.9            # drop terms found in more than 90% of documents
    )),
    ("svd", TruncatedSVD(n_components=300, random_state=42))  # Reduce to 300 dimensions
])

# Combine text and keyword features into a single preprocessing step.
# The text column is referenced through `text_feature` instead of a repeated
# string literal, so the column selected for X and the column transformed
# here cannot drift apart when the configuration changes.
# It must stay a scalar column name (not a list): the vectorizer expects a
# 1-D sequence of documents, and a list selector would hand it a 2-D frame.
preprocessor = ColumnTransformer(
    transformers=[
        ("text", text_pipeline, text_feature),
        ("keywords", "passthrough", keyword_features),
    ]
)

In [268]:
# Classifier: random forest with class weights balanced against label
# frequency, a fixed seed for reproducibility, and all CPU cores in use.
forest = RandomForestClassifier(
    n_estimators=100,
    class_weight="balanced",
    random_state=42,
    n_jobs=-1,
)

# End-to-end model: feature preprocessing followed by the classifier
pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("clf", forest),
])

In [269]:
# Train preprocessing and classifier together on the top-class training subset
pipeline.fit(X=X_train_filtered, y=y_train_filtered)

In [270]:
# Predict labels for the validation rows that belong to the top classes
y_pred_filtered = pipeline.predict(X=X_val_filtered)

In [271]:
# Overall accuracy on the filtered validation set
val_accuracy = accuracy_score(y_val_filtered, y_pred_filtered)
print("⭐ Accuracy:", val_accuracy)

# Per-class precision / recall / F1. zero_division=0 reports 0 (without a
# warning) for metrics whose denominator is zero, e.g. classes never predicted.
val_report = classification_report(y_val_filtered, y_pred_filtered, zero_division=0)
print(val_report)

⭐ Accuracy: 0.84
                                                  precision    recall  f1-score   support

BUND_BUNDESMINISTERIUM_FÜR_DIGITALES_UND_VERKEHR       0.90      1.00      0.95        19
                                      LAND_02_SM       0.33      1.00      0.50         1
                                      LAND_03_BM       1.00      0.50      0.67         2
                                      LAND_11_SM       0.00      0.00      0.00         2
                                      LAND_12_MW       0.00      0.00      0.00         1

                                        accuracy                           0.84        25
                                       macro avg       0.45      0.50      0.42        25
                                    weighted avg       0.78      0.84      0.80        25

