In [133]:
import logging
import os
from datetime import datetime

# Create logs directory
log_dir = '../outputs/logs'
os.makedirs(log_dir, exist_ok=True)

# Create unique log filename with timestamp (to ms)
timestamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S-%f')[:-3]
log_filename = f'run_{timestamp}.log'
log_path = os.path.join(log_dir, log_filename)

# 🔁 Remove all handlers associated with the root logger
for handler in logging.root.handlers[:]:
    logging.root.removeHandler(handler)

# ✅ Reconfigure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s | %(levelname)s | %(message)s',
    handlers=[
        logging.FileHandler(log_path),
        logging.StreamHandler()  # Optional: also print to console
    ]
)

logging.info("🚀 Logging system initialized.")


--- Logging error ---
Traceback (most recent call last):
  File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.9_3.9.3568.0_x64__qbz5n2kfra8p0\lib\logging\__init__.py", line 1086, in emit
    stream.write(msg + self.terminator)
UnicodeEncodeError: 'cp950' codec can't encode character '\U0001f680' in position 33: illegal multibyte sequence
Call stack:
  File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.9_3.9.3568.0_x64__qbz5n2kfra8p0\lib\runpy.py", line 197, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.9_3.9.3568.0_x64__qbz5n2kfra8p0\lib\runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "C:\Alex\TUM\hackathon\hackathon-data\HackathonChallengeOne\venv\lib\site-packages\ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "C:\Alex\TUM\hackathon\hackathon-data\HackathonChallengeOne\venv\lib\site-packages\traitlets

In [134]:
# Import pandas for data handling
import pandas as pd

# Import NLTK stopwords for German text preprocessing
import nltk
from nltk.corpus import stopwords

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

In [135]:
# Download German stopwords if not already available
try:
    stopwords.words("german")
except LookupError:
    nltk.download("stopwords")

# Store German stopwords for use in TF-IDF vectorizer
german_stopwords = stopwords.words("german")

In [136]:
# Load training and validation datasets from CSV files
train_df = pd.read_csv('../data/challenge_1/train/classification_data.csv')
val_df = pd.read_csv('../data/challenge_1/val/classification_data.csv')

# full_df = pd.concat([train_df, val_df], ignore_index=True)

In [137]:
# Features to use
text_feature = "description"

# Define additional binary keyword indicator features
keyword_features = [
    "has_verkehr_keywords",
    "has_bildung_keywords",
    "has_umwelt_keywords",
    "has_gesundheit_keywords",
]

categorical_features = ["state", "category"]

all_features = [text_feature] + keyword_features + categorical_features

In [138]:
X_train = train_df[all_features]
X_val = val_df[all_features]
y_level_train = train_df["responsible_entity_level"]
y_level_val = val_df["responsible_entity_level"]

In [139]:
X_train.head()

Unnamed: 0,description,has_verkehr_keywords,has_bildung_keywords,has_umwelt_keywords,has_gesundheit_keywords,state,category
0,Die Wartezeit in der Notaufnahme des Klinikums...,False,False,False,False,Sachsen,Gesundheit
1,Das Online-Portal der Stadt funktioniert seit ...,False,False,False,False,Niedersachsen,Digitalisierung
2,Die Baustelle an der B176 bei Frohburg dauert ...,True,False,False,False,Sachsen,Verkehr
3,Die Baustelle in der Leipziger Straße dauert s...,True,False,False,False,Thüringen,Verkehr
4,Die Grünschnittsammelstelle an der Münchener S...,True,False,True,False,Bayern,Umwelt


In [140]:
y_level_train.head()

0    Land
1    Bund
2    Land
3    Land
4    Land
Name: responsible_entity_level, dtype: object

In [141]:
from sklearn.preprocessing import OneHotEncoder

# Create a text processing pipeline: TF-IDF vectorization + Dimensionality Reduction
text_pipeline = Pipeline([
    ("tfidf", TfidfVectorizer(
        stop_words=german_stopwords,
        max_features=10000,
        ngram_range=(1, 2),
        min_df=3,
        max_df=0.9
    )),
    ("svd", TruncatedSVD(n_components=300, random_state=42))  # Reduce to 300 dimensions
])

# Combine text and keyword features into a single preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ("text", text_pipeline, "description"),
        ("keywords", "passthrough", keyword_features),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features), # onehot encoding
    ]
)

In [142]:
# Create full pipeline: preprocessing + classifier
level_pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("clf", RandomForestClassifier(
    n_estimators=100,
    class_weight="balanced",
    random_state=42,
    n_jobs=-1
))
])

In [143]:
X_train.shape

(695, 7)

In [144]:
y_level_train.shape

(695,)

In [145]:
# Fit the model on the filtered training data
level_pipeline.fit(X_train, y_level_train)

In [146]:
# Generate predictions on the filtered validation set
y_pred = level_pipeline.predict(X_val)

In [147]:
# Print overall accuracy
print("⭐ Accuracy:", accuracy_score(y_level_val, y_pred))

# Print detailed classification report with zero_division=0 to suppress warnings
print(classification_report(y_level_val, y_pred, zero_division=0))

⭐ Accuracy: 1.0
              precision    recall  f1-score   support

        Bund       1.00      1.00      1.00        19
        Land       1.00      1.00      1.00       130

    accuracy                           1.00       149
   macro avg       1.00      1.00      1.00       149
weighted avg       1.00      1.00      1.00       149



In [148]:
# Save the trained pipeline model to /outputs/models
import joblib
import os

model_output_dir = '../outputs/models/'
os.makedirs(model_output_dir, exist_ok=True)

model_path = os.path.join(model_output_dir, 'challenge1_model-alex-new.pkl')
joblib.dump(level_pipeline, model_path)

print(f"✅ Model saved to: {model_path}")

✅ Model saved to: ../outputs/models/challenge1_model-alex-new.pkl


### For BUND entity prediction:

In [149]:
bund_df = train_df[train_df["responsible_entity_level"] == "Bund"]
X_bund = bund_df[["description"]]
y_bund = bund_df["responsible_entity_id"]

In [150]:
y_bund.head()

1     BUND_BUNDESMINISTERIUM_FÜR_DIGITALES_UND_VERKEHR
5     BUND_BUNDESMINISTERIUM_FÜR_DIGITALES_UND_VERKEHR
7     BUND_BUNDESMINISTERIUM_FÜR_DIGITALES_UND_VERKEHR
8     BUND_BUNDESMINISTERIUM_FÜR_DIGITALES_UND_VERKEHR
13    BUND_BUNDESMINISTERIUM_FÜR_DIGITALES_UND_VERKEHR
Name: responsible_entity_id, dtype: object

### For LAND_CODE prediction:

In [151]:
land_df = train_df[train_df["responsible_entity_level"] == "Land"].copy()

# Extract LAND_CODE
land_df["land_code"] = land_df["responsible_entity_id"].str.extract(r"LAND_(\d+)_")

X_land_code = land_df[["description", "issue_id"]]  # or any other feature
y_land_code = land_df["land_code"]

In [152]:
y_land_code.head()

0    04
2    08
3    09
4    15
6    09
Name: land_code, dtype: object

### For MINISTRY prediction:

In [153]:
# Use same land_df
land_df["ministry"] = land_df["responsible_entity_id"].str.extract(r"LAND_\d+_(.+)")

X_ministry = land_df[["description", "category"]]
y_ministry = land_df["ministry"]

In [154]:
y_ministry.head()

0    SM
2    VM
3    VM
4    UM
6    GM
Name: ministry, dtype: object

## Train sub-models

In [155]:
bund_pipeline = Pipeline([
    ("tfidf", TfidfVectorizer(stop_words=german_stopwords)),
    ("clf", RandomForestClassifier(random_state=42, n_jobs=-1))
])

bund_pipeline.fit(X_bund["description"], y_bund)

In [156]:
bund_pipeline = Pipeline([
    ("tfidf", TfidfVectorizer(stop_words=german_stopwords)),
    ("clf", RandomForestClassifier(random_state=42, n_jobs=-1))
])

bund_pipeline.fit(X_bund["description"], y_bund)

In [157]:
from sklearn.preprocessing import OneHotEncoder

land_code_preprocessor = ColumnTransformer([
    ("text", TfidfVectorizer(stop_words=german_stopwords), "description"),
    ("cat", OneHotEncoder(handle_unknown="ignore"), ["issue_id"])
])

land_code_pipeline = Pipeline([
    ("preprocessor", land_code_preprocessor),
    ("clf", RandomForestClassifier(random_state=42, n_jobs=-1))
])

land_code_pipeline.fit(X_land_code, y_land_code)

In [158]:
ministry_preprocessor = ColumnTransformer([
    ("text", TfidfVectorizer(stop_words=german_stopwords), "description"),
    ("cat", OneHotEncoder(handle_unknown="ignore"), ["category"])
])

ministry_pipeline = Pipeline([
    ("preprocessor", ministry_preprocessor),  # reuse same preprocessor
    ("clf", RandomForestClassifier(random_state=42, n_jobs=-1))
])

ministry_pipeline.fit(X_ministry, y_ministry)

## Full prediction logic

In [159]:
def classify_entity(issue_row):

    level = level_pipeline.predict(pd.DataFrame([issue_row]))[0]
    # return "BUND_BUNDESMINISTERIUM_FÜR_DIGITALES_UND_VERKEHR"
    if level == "Bund":
        return bund_pipeline.predict([issue_row["description"]])[0]
    else:  # LAND
        land_code = land_code_pipeline.predict(pd.DataFrame([issue_row]))[0]
        ministry = ministry_pipeline.predict(pd.DataFrame([issue_row]))[0]
        return f"LAND_{land_code}_{ministry}"

In [160]:
level_pipeline.predict(pd.DataFrame([val_df.iloc[0]]))

array(['Land'], dtype=object)

# Validation + Evaluation

In [161]:

# 7. Evaluate on validation set
y_true = val_df["responsible_entity_id"]
y_pred = val_df.apply(classify_entity, axis=1)

print("Overall Accuracy:", accuracy_score(y_true, y_pred))
print("\nClassification Report:\n", classification_report(y_true, y_pred, zero_division=0))

Overall Accuracy: 0.16778523489932887

Classification Report:
                                                   precision    recall  f1-score   support

BUND_BUNDESMINISTERIUM_FÜR_DIGITALES_UND_VERKEHR       1.00      1.00      1.00        19
                                      LAND_01_BM       0.00      0.00      0.00         2
                                      LAND_01_IM       0.00      0.00      0.00         2
                                      LAND_01_MW       0.00      0.00      0.00         1
                                      LAND_01_SM       0.00      0.00      0.00         4
                                      LAND_01_UM       0.00      0.00      0.00         0
                                      LAND_01_VM       0.00      0.00      0.00         2
                                      LAND_02_GM       0.00      0.00      0.00         0
                                      LAND_02_MW       0.00      0.00      0.00         1
                                    

In [162]:

# 7. Evaluate on validation set
y_true = val_df["responsible_entity_id"]
y_pred = val_df.apply(classify_entity, axis=1)

print("Overall Accuracy:", accuracy_score(y_true, y_pred))
print("\nClassification Report:\n", classification_report(y_true, y_pred, zero_division=0))

Overall Accuracy: 0.16778523489932887

Classification Report:
                                                   precision    recall  f1-score   support

BUND_BUNDESMINISTERIUM_FÜR_DIGITALES_UND_VERKEHR       1.00      1.00      1.00        19
                                      LAND_01_BM       0.00      0.00      0.00         2
                                      LAND_01_IM       0.00      0.00      0.00         2
                                      LAND_01_MW       0.00      0.00      0.00         1
                                      LAND_01_SM       0.00      0.00      0.00         4
                                      LAND_01_UM       0.00      0.00      0.00         0
                                      LAND_01_VM       0.00      0.00      0.00         2
                                      LAND_02_GM       0.00      0.00      0.00         0
                                      LAND_02_MW       0.00      0.00      0.00         1
                                    

In [163]:
val_df["responsible_entity_id"]

0                                            LAND_01_SM
1                                            LAND_07_IM
2                                            LAND_05_UM
3                                            LAND_06_BM
4      BUND_BUNDESMINISTERIUM_FÜR_DIGITALES_UND_VERKEHR
                             ...                       
144                                          LAND_01_BM
145                                          LAND_13_UM
146                                          LAND_09_MW
147                                          LAND_05_MW
148                                          LAND_06_VM
Name: responsible_entity_id, Length: 149, dtype: object

In [164]:
combined_df = pd.concat([y_true, y_pred], axis=1)
combined_df.columns = ['true', 'pred']
combined_df

Unnamed: 0,true,pred
0,LAND_01_SM,LAND_12_SM
1,LAND_07_IM,LAND_05_IM
2,LAND_05_UM,LAND_10_IM
3,LAND_06_BM,LAND_16_BM
4,BUND_BUNDESMINISTERIUM_FÜR_DIGITALES_UND_VERKEHR,BUND_BUNDESMINISTERIUM_FÜR_DIGITALES_UND_VERKEHR
...,...,...
144,LAND_01_BM,LAND_07_BM
145,LAND_13_UM,LAND_07_MW
146,LAND_09_MW,LAND_13_FM
147,LAND_05_MW,LAND_03_MW


In [167]:
train_df[train_df['responsible_entity_id'].str.contains('01')][['responsible_entity_id', 'issue_id']]

Unnamed: 0,responsible_entity_id,issue_id
32,LAND_01_UM,ISS_20250613033656_1038
109,LAND_01_UM,ISS_20250613033706_2932
127,LAND_01_UM,ISS_20250613034213_5603
147,LAND_01_UM,ISS_20250613034046_6196
154,LAND_01_SM,ISS_20250613034103_1626
160,LAND_01_SM,ISS_20250613034241_9153
218,LAND_01_BM,ISS_20250613034048_6273
228,LAND_01_MW,ISS_20250613034234_3783
245,LAND_01_UM,ISS_20250613033841_2258
259,LAND_01_SM,ISS_20250613033936_9592


In [168]:
train_df[train_df['responsible_entity_id'].str.contains('02')][['responsible_entity_id', 'issue_id']]

Unnamed: 0,responsible_entity_id,issue_id
35,LAND_02_UM,ISS_20250613033857_6448
41,LAND_02_UM,ISS_20250613034044_1005
43,LAND_02_IM,ISS_20250613034128_1945
49,LAND_02_VM,ISS_20250613034130_5554
63,LAND_02_SM,ISS_20250613033817_9198
81,LAND_02_FM,ISS_20250613033652_6157
88,LAND_02_FM,ISS_20250613033741_9131
94,LAND_02_SM,ISS_20250613033850_9103
107,LAND_02_SM,ISS_20250613033648_2496
114,LAND_02_SM,ISS_20250613033936_2140


In [165]:
import os
import pandas as pd

# Load test data
test_df = pd.read_csv('../data/challenge_1/test/classification_data.csv')

# Select the same features as used during training
text_feature = "description"
keyword_features = [
    "has_verkehr_keywords",
    "has_bildung_keywords",
    "has_umwelt_keywords",
    "has_gesundheit_keywords"
]
test_X = test_df[all_features]

# Make predictions using the trained pipeline
predictions = pipeline.predict(test_X)

# Create submission DataFrame
submission = pd.DataFrame({
    'issue_id': test_df['issue_id'],
    'responsible_entity_id': predictions
})

# ✅ Ensure the output directory exists
output_dir = '../outputs/submission/'
os.makedirs(output_dir, exist_ok=True)

# ✅ Save submission file in the correct directory
submission_path = os.path.join(output_dir, f'challenge1_submission-{timestamp}-alex_17.csv')
submission.to_csv(submission_path, index=False, encoding='utf-8')

# Optional: Save test data with predictions for inspection
debug_path = os.path.join(output_dir, 'test_with_predictions_new.csv')
test_df["predicted_responsible_entity_id"] = predictions
test_df.to_csv(debug_path, index=False, encoding='utf-8')

logging.info(f"✅ Submission saved to: {submission_path}")
logging.info(f"📄 Full test data with predictions saved to: {debug_path}")

NameError: name 'pipeline' is not defined

In [None]:
# Logging is done
try:
    # your main code here
    logging.info('🎉 Execution completed successfully.')
except Exception as e:
    logging.error(f"❌ Execution failed: {e}", exc_info=True)
