In [550]:
import logging
import os
from datetime import datetime

# Create logs directory
log_dir = '../outputs/logs'
os.makedirs(log_dir, exist_ok=True)

# Create unique log filename with timestamp (to ms)
timestamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S-%f')[:-3]
log_filename = f'challenge1_{timestamp}.log'
log_path = os.path.join(log_dir, log_filename)

# 🔁 Remove all handlers associated with the root logger
for handler in logging.root.handlers[:]:
    logging.root.removeHandler(handler)

# ✅ Reconfigure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s | %(levelname)s | %(message)s',
    handlers=[
        logging.FileHandler(log_path),
        logging.StreamHandler()  # Optional: also print to console
    ]
)

logging.info("🚀 Logging system initialized.")


2025-06-13 15:56:57,902 | INFO | 🚀 Logging system initialized.


In [551]:
# Import pandas for data handling
import pandas as pd

# Import NLTK stopwords for German text preprocessing
import nltk
from nltk.corpus import stopwords

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

In [552]:
# Download German stopwords if not already available
try:
    stopwords.words("german")
except LookupError:
    nltk.download("stopwords")

# Store German stopwords for use in TF-IDF vectorizer
german_stopwords = stopwords.words("german")

In [553]:
# Load training and validation datasets from CSV files
train_df = pd.read_csv('../data/challenge_1/train/classification_data.csv')
val_df = pd.read_csv('../data/challenge_1/val/classification_data.csv')

In [554]:
# Features to use
text_feature = "description"

# Define additional binary keyword indicator features
keyword_features = [
    "has_verkehr_keywords",
    "has_bildung_keywords",
    "has_umwelt_keywords",
    "has_gesundheit_keywords"
]

In [555]:
# Select features and labels from the training and validation datasets
X_train = train_df[[text_feature] + keyword_features]
y_train = train_df["responsible_entity_id"]

X_val = val_df[[text_feature] + keyword_features]
y_val = val_df["responsible_entity_id"]

In [556]:
# Limit the classification task to the top N most frequent categories
#TOP_N = 20
#top_classes = y_train.value_counts().nlargest(TOP_N).index

# Filter training data to only include top N classes
#train_mask = y_train.isin(top_classes)
#X_train_filtered = X_train[train_mask]
#y_train_filtered = y_train[train_mask]

# Filter validation data to only include same top N classes
#val_mask = y_val.isin(top_classes)
#X_val_filtered = X_val[val_mask]
#y_val_filtered = y_val[val_mask]

# Use all training and validation data as-is
X_train_filtered = X_train
y_train_filtered = y_train

X_val_filtered = X_val
y_val_filtered = y_val


In [557]:
# Create a text processing pipeline: TF-IDF vectorization + Dimensionality Reduction
text_pipeline = Pipeline([
    ("tfidf", TfidfVectorizer(
        stop_words=german_stopwords,
        max_features=10000,
        ngram_range=(1, 2),
        min_df=3,
        max_df=0.9
    )),
    ("svd", TruncatedSVD(n_components=300, random_state=42))  # Reduce to 300 dimensions
])

# Combine text and keyword features into a single preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ("text", text_pipeline, "description"),
        ("keywords", "passthrough", keyword_features),
    ]
)

In [558]:
# Create full pipeline: preprocessing + classifier
pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("clf", RandomForestClassifier(
    n_estimators=100,
    class_weight="balanced",
    random_state=42,
    n_jobs=-1
))
])

In [559]:
# Fit the model on the filtered training data
pipeline.fit(X_train_filtered, y_train_filtered)

In [560]:
# Save the trained pipeline model to /outputs/models
import joblib
import os

model_output_dir = '../outputs/models/'
os.makedirs(model_output_dir, exist_ok=True)

model_path = os.path.join(model_output_dir, 'challenge1_model.pkl')
joblib.dump(pipeline, model_path)

logging.info(f"✅ Model saved to: {model_path}")

2025-06-13 15:56:59,704 | INFO | ✅ Model saved to: ../outputs/models/challenge1_model.pkl


In [561]:
# Generate predictions on the filtered validation set
y_pred_filtered = pipeline.predict(X_val_filtered)

In [562]:
# Print overall accuracy
logging.info("⭐ Accuracy:", accuracy_score(y_val_filtered, y_pred_filtered))

# Print detailed classification report with zero_division=0 to suppress warnings
logging.info(classification_report(y_val_filtered, y_pred_filtered, zero_division=0))

--- Logging error ---
Traceback (most recent call last):
  File "/Applications/Xcode.app/Contents/Developer/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/logging/__init__.py", line 1083, in emit
    msg = self.format(record)
  File "/Applications/Xcode.app/Contents/Developer/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/logging/__init__.py", line 927, in format
    return fmt.format(record)
  File "/Applications/Xcode.app/Contents/Developer/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/logging/__init__.py", line 663, in format
    record.message = record.getMessage()
  File "/Applications/Xcode.app/Contents/Developer/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/logging/__init__.py", line 367, in getMessage
    msg = msg % self.args
TypeError: not all arguments converted during string formatting
Call stack:
  File "/Applications/Xcode.app/Contents/Developer/Library/Frameworks/Python3.framework/Versions/3.9/lib/pyth

In [563]:
import os
import pandas as pd

# Load test data
test_df = pd.read_csv('../data/challenge_1/test/classification_data.csv')

# Select the same features as used during training
text_feature = "description"
keyword_features = [
    "has_verkehr_keywords",
    "has_bildung_keywords",
    "has_umwelt_keywords",
    "has_gesundheit_keywords"
]
test_X = test_df[[text_feature] + keyword_features]

# Make predictions using the trained pipeline
predictions = pipeline.predict(test_X)

# Create submission DataFrame
submission = pd.DataFrame({
    'issue_id': test_df['issue_id'],
    'responsible_entity_id': predictions
})

# ✅ Ensure the output directory exists
output_dir = '../outputs/submission/'
os.makedirs(output_dir, exist_ok=True)

# ✅ Save submission file in the correct directory
submission_path = os.path.join(output_dir, 'challenge1_submission.csv')
submission.to_csv(submission_path, index=False)

# Optional: Save test data with predictions for inspection
debug_path = os.path.join(output_dir, 'test_with_predictions.csv')
test_df["predicted_responsible_entity_id"] = predictions
test_df.to_csv(debug_path, index=False)

logging.info(f"✅ Submission saved to: {submission_path}")
logging.info(f"📄 Full test data with predictions saved to: {debug_path}")

2025-06-13 15:56:59,926 | INFO | ✅ Submission saved to: ../outputs/submission/challenge1_submission.csv
2025-06-13 15:56:59,927 | INFO | 📄 Full test data with predictions saved to: ../outputs/submission/test_with_predictions.csv


In [564]:
# Logging is done
try:
    # your main code here
    logging.info('🎉 Execution completed successfully.')
except Exception as e:
    logging.error(f"❌ Execution failed: {e}", exc_info=True)


2025-06-13 15:56:59,972 | INFO | 🎉 Execution completed successfully.
