In [661]:
import logging
import os
from datetime import datetime

# Set up run-scoped logging: one log file per execution plus console output.

# Create the log directory (no-op when it already exists)
log_dir = '../outputs/logs'
os.makedirs(log_dir, exist_ok=True)

# Unique log filename per run. '%f' yields microseconds (6 digits); dropping
# the last three characters leaves millisecond precision.
timestamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S-%f')[:-3]
log_filename = f'challenge1_{timestamp}.log'
log_path = os.path.join(log_dir, log_filename)

# Detach AND close every handler already attached to the root logger.
# Re-running this cell would otherwise leave the previous run's log file
# open for the lifetime of the kernel.
for handler in logging.root.handlers[:]:
    logging.root.removeHandler(handler)
    handler.close()

# Reconfigure logging. The file handler is opened as UTF-8 explicitly because
# the log messages contain emoji; relying on the platform default encoding
# can raise UnicodeEncodeError (e.g. cp1252 on Windows).
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s | %(levelname)s | %(message)s',
    handlers=[
        logging.FileHandler(log_path, encoding='utf-8'),
        logging.StreamHandler()  # Also echo log records to the console
    ]
)

logging.info("🚀 Logging system initialized.")


2025-06-13 17:25:08,754 | INFO | 🚀 Logging system initialized.


In [662]:
# Import pandas for data handling
import pandas as pd

# Import NLTK stopwords for German text preprocessing
import nltk
from nltk.corpus import stopwords

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

In [663]:
# Load the German stopword list for the TF-IDF vectorizer.
# If the NLTK corpus is missing locally, fetch it once and retry.
try:
    german_stopwords = stopwords.words("german")
except LookupError:
    nltk.download("stopwords")
    german_stopwords = stopwords.words("german")

In [664]:
# Read the training and validation splits from their CSV files
train_csv = '../data/challenge_1/train/classification_data.csv'
val_csv = '../data/challenge_1/val/classification_data.csv'

train_df, val_df = pd.read_csv(train_csv), pd.read_csv(val_csv)

In [665]:
# Column holding the free-text input
text_feature = "description"

# Binary keyword-indicator columns
keyword_features = [
    "has_verkehr_keywords", "has_bildung_keywords",
    "has_umwelt_keywords", "has_gesundheit_keywords",
]

# Categorical columns
categorical_features = ["state", "category"]

# Full model input, in order: text, keyword indicators, categoricals
all_features = [text_feature, *keyword_features, *categorical_features]

In [666]:
# Split each dataset into model inputs (X) and the target column (y)
target_column = "responsible_entity_id"

X_train, y_train = train_df[all_features], train_df[target_column]
X_val, y_val = val_df[all_features], val_df[target_column]

In [667]:
# Optionally limit the classification task to the TOP_N most frequent
# target classes (counted on the training labels).
TOP_N = 5

# Set to True to keep only rows whose label is one of the TOP_N classes.
# False (the default) uses all training and validation data as-is.
FILTER_TO_TOP_CLASSES = False

# Labels of the TOP_N most frequent classes in the training set
top_classes = y_train.value_counts().nlargest(TOP_N).index

if FILTER_TO_TOP_CLASSES:
    # Keep only training rows that belong to the top classes
    train_mask = y_train.isin(top_classes)
    X_train_filtered = X_train[train_mask]
    y_train_filtered = y_train[train_mask]

    # Apply the same class restriction to the validation rows
    val_mask = y_val.isin(top_classes)
    X_val_filtered = X_val[val_mask]
    y_val_filtered = y_val[val_mask]
else:
    # No filtering: the "*_filtered" names alias the full datasets so the
    # downstream cells work unchanged in both modes.
    X_train_filtered, y_train_filtered = X_train, y_train
    X_val_filtered, y_val_filtered = X_val, y_val


In [668]:
# Build the preprocessing stage: text -> TF-IDF -> SVD, keyword flags passed
# through unchanged, categorical columns one-hot encoded.
from sklearn.preprocessing import OneHotEncoder

# Text branch: TF-IDF over unigrams and bigrams, then dimensionality reduction
text_pipeline = Pipeline([
    ("tfidf", TfidfVectorizer(
        stop_words=german_stopwords,
        max_features=10000,
        ngram_range=(1, 2),
        min_df=3,
        max_df=0.9
    )),
    ("svd", TruncatedSVD(n_components=300, random_state=42))  # Reduce to 300 dimensions
])

# Combine all feature groups into a single preprocessing step.
# The text column is referenced through `text_feature` (not a hard-coded
# name) so it stays in sync with the feature definitions, like the other two
# transformers. It must remain a scalar column name rather than a list:
# ColumnTransformer then hands a 1-D column to the vectorizer.
preprocessor = ColumnTransformer(
    transformers=[
        ("text", text_pipeline, text_feature),
        ("keywords", "passthrough", keyword_features),
        # Categories unseen during fit are encoded as all-zero rows
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features)
    ]
)

In [669]:
# Classifier: random forest with balanced class weights, using all CPU cores
random_forest = RandomForestClassifier(
    n_estimators=100,
    class_weight="balanced",
    random_state=42,
    n_jobs=-1,
)

# End-to-end model: preprocessing followed by the classifier
pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("clf", random_forest),
])

In [670]:
# Train the full pipeline (preprocessing + classifier) on the training split
pipeline.fit(X=X_train_filtered, y=y_train_filtered)

In [671]:
# Persist the fitted pipeline under ../outputs/models/
import joblib
import os

model_output_dir = '../outputs/models/'
model_path = os.path.join(model_output_dir, 'challenge1_model.pkl')

# Make sure the target directory exists before writing the model file
os.makedirs(model_output_dir, exist_ok=True)
joblib.dump(value=pipeline, filename=model_path)

logging.info(f"✅ Model saved to: {model_path}")

2025-06-13 17:25:10,999 | INFO | ✅ Model saved to: ../outputs/models/challenge1_model.pkl


In [672]:
# Predict labels for the validation split with the fitted pipeline
y_pred_filtered = pipeline.predict(X=X_val_filtered)

In [673]:
# Log the overall validation accuracy
accuracy = accuracy_score(y_true=y_val_filtered, y_pred=y_pred_filtered)
logging.info(f"⭐ Accuracy: {accuracy}")

# Log the per-class report; zero_division=0 reports 0 (instead of warning)
# for metrics whose denominator is zero
report = classification_report(
    y_true=y_val_filtered,
    y_pred=y_pred_filtered,
    zero_division=0,
)
logging.info(report)

2025-06-13 17:25:11,216 | INFO | ⭐ Accuracy: 0.16778523489932887
2025-06-13 17:25:11,223 | INFO |                                                   precision    recall  f1-score   support

BUND_BUNDESMINISTERIUM_FÜR_DIGITALES_UND_VERKEHR       0.83      1.00      0.90        19
                                      LAND_01_BM       0.00      0.00      0.00         2
                                      LAND_01_IM       0.00      0.00      0.00         2
                                      LAND_01_MW       0.00      0.00      0.00         1
                                      LAND_01_SM       0.33      0.25      0.29         4
                                      LAND_01_UM       0.00      0.00      0.00         0
                                      LAND_01_VM       0.00      0.00      0.00         2
                                      LAND_02_GM       0.00      0.00      0.00         0
                                      LAND_02_MW       0.00      0.00      0.00         1
 

In [674]:
import os
import pandas as pd

# Predict on the test split and write the submission files.

# Load test data
test_df = pd.read_csv('../data/challenge_1/test/classification_data.csv')

# Select exactly the feature columns the pipeline was trained on.
# `all_features` is reused rather than re-declared here, so the test-time
# columns cannot drift from the training-time definition.
test_X = test_df[all_features]

# Make predictions using the trained pipeline
predictions = pipeline.predict(test_X)

# Create submission DataFrame: one predicted entity per issue
submission = pd.DataFrame({
    'issue_id': test_df['issue_id'],
    'responsible_entity_id': predictions
})

# Ensure the output directory exists
output_dir = '../outputs/submission/'
os.makedirs(output_dir, exist_ok=True)

# Save the submission file, tagged with this run's timestamp
submission_path = os.path.join(output_dir, f'challenge1_submission-{timestamp}.csv')
submission.to_csv(submission_path, index=False)

# Optional: save the full test data with predictions attached for inspection
debug_path = os.path.join(output_dir, f'test_with_predictions-{timestamp}.csv')
test_df["predicted_responsible_entity_id"] = predictions
test_df.to_csv(debug_path, index=False)

logging.info(f"✅ Submission saved to: {submission_path}")
logging.info(f"📄 Full test data with predictions saved to: {debug_path}")

2025-06-13 17:25:11,351 | INFO | ✅ Submission saved to: ../outputs/submission/challenge1_submission-2025-06-13_17-25-08-752.csv
2025-06-13 17:25:11,352 | INFO | 📄 Full test data with predictions saved to: ../outputs/submission/test_with_predictions.csv


In [675]:
# Final status message. This cell only runs when every preceding cell has
# finished, so reaching it is the success signal. The previous try/except
# wrapped nothing but this logging call, so its error branch could never
# observe a failure from earlier cells; it has been removed.
logging.info('🎉 Execution completed successfully.')


2025-06-13 17:25:11,453 | INFO | 🎉 Execution completed successfully.
