In [50]:
import pandas as pd
import json

# Load training and validation data
train_df = pd.read_csv('../data/challenge_1/train/classification_data.csv')
val_df = pd.read_csv('../data/challenge_1/val/classification_data.csv')

# Load entity catalog
with open('../data/shared/entity_catalog.json', 'r', encoding='utf-8') as f:
    catalog = json.load(f)

# Optional: Load category descriptions
with open('../data/shared/categories.json', 'r', encoding='utf-8') as f:
    category_descriptions = json.load(f)

In [62]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Sample feature pipeline
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore'), ['category', 'state']),
    ('text', TfidfVectorizer(max_features=200), 'description'),
    ('num', 'passthrough', ['description_length', 'hour', 'month'])
])

In [63]:
train_df['level_target'] = train_df['responsible_entity_id'].apply(lambda x: x.split('_')[0])
val_df['level_target'] = val_df['responsible_entity_id'].apply(lambda x: x.split('_')[0])

In [64]:
train_df.head()

Unnamed: 0,issue_id,timestamp,category,description,municipality,district,state,age_group,gender,origin,...,day_of_week,day_of_month,week_of_year,month,quarter,is_weekend,is_business_hours,is_morning,is_afternoon,level_target
0,ISS_20250613034209_7483,2023-11-12 03:54:00,Gesundheit,Die Wartezeit in der Notaufnahme des Klinikums...,Chemnitz,Stadt,Sachsen,55-64,female,citizen,...,6,12,45,11,4,True,False,False,False,LAND
1,ISS_20250613034253_2513,2024-12-26 06:24:00,Digitalisierung,Das Online-Portal der Stadt funktioniert seit ...,Moringen,Stadt,Niedersachsen,35-44,female,citizen,...,3,26,52,12,4,False,False,True,False,BUND
2,ISS_20250613033812_4910,2023-06-04 12:10:00,Verkehr,Die Baustelle an der B176 bei Frohburg dauert ...,Frohburg,Stadt,Sachsen,25-34,male,citizen,...,6,4,22,6,2,True,True,True,True,LAND
3,ISS_20250613033659_2513,2024-09-13 11:25:00,Verkehr,Die Baustelle in der Leipziger Straße dauert s...,Artern,Stadt,Thüringen,35-44,male,citizen,...,4,13,37,9,3,False,True,True,False,LAND
4,ISS_20250613034155_1027,2024-02-20 14:58:00,Umwelt,Die Grünschnittsammelstelle an der Münchener S...,Olching,Stadt,Bayern,45-54,male,citizen,...,1,20,8,2,1,False,True,False,True,LAND


In [65]:
train_filtered = train_df[train_df['level_target'].isin(['BUND', 'LAND'])]
val_filtered = val_df[val_df['level_target'].isin(['BUND', 'LAND'])]

In [66]:
train_filtered.head()

Unnamed: 0,issue_id,timestamp,category,description,municipality,district,state,age_group,gender,origin,...,day_of_week,day_of_month,week_of_year,month,quarter,is_weekend,is_business_hours,is_morning,is_afternoon,level_target
0,ISS_20250613034209_7483,2023-11-12 03:54:00,Gesundheit,Die Wartezeit in der Notaufnahme des Klinikums...,Chemnitz,Stadt,Sachsen,55-64,female,citizen,...,6,12,45,11,4,True,False,False,False,LAND
1,ISS_20250613034253_2513,2024-12-26 06:24:00,Digitalisierung,Das Online-Portal der Stadt funktioniert seit ...,Moringen,Stadt,Niedersachsen,35-44,female,citizen,...,3,26,52,12,4,False,False,True,False,BUND
2,ISS_20250613033812_4910,2023-06-04 12:10:00,Verkehr,Die Baustelle an der B176 bei Frohburg dauert ...,Frohburg,Stadt,Sachsen,25-34,male,citizen,...,6,4,22,6,2,True,True,True,True,LAND
3,ISS_20250613033659_2513,2024-09-13 11:25:00,Verkehr,Die Baustelle in der Leipziger Straße dauert s...,Artern,Stadt,Thüringen,35-44,male,citizen,...,4,13,37,9,3,False,True,True,False,LAND
4,ISS_20250613034155_1027,2024-02-20 14:58:00,Umwelt,Die Grünschnittsammelstelle an der Münchener S...,Olching,Stadt,Bayern,45-54,male,citizen,...,1,20,8,2,1,False,True,False,True,LAND


In [67]:
from lightgbm import LGBMClassifier

# Pipeline to predict BUND vs LAND
from sklearn.pipeline import make_pipeline

X_train = train_filtered.drop(columns=['responsible_entity_id', 'responsible_entity_name', 'responsible_entity_level', 'level_target'])
y_train = train_filtered['level_target']

X_val = val_filtered.drop(columns=['responsible_entity_id', 'responsible_entity_name', 'responsible_entity_level', 'level_target'])
y_val = val_filtered['level_target']

pipeline_lvl1 = make_pipeline(
    preprocessor,
    LGBMClassifier(random_state=42)
)

pipeline_lvl1.fit(X_train, y_train)

[LightGBM] [Info] Number of positive: 605, number of negative: 90
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001616 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5277
[LightGBM] [Info] Number of data points in the train set: 695, number of used features: 223
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.870504 -> initscore=1.905419
[LightGBM] [Info] Start training from score 1.905419


In [69]:
train_bund = train_df[train_df['level_target'] == 'BUND']
train_land = train_df[train_df['level_target'] == 'LAND']

In [73]:
bund_pipeline = make_pipeline(
    preprocessor,
    LGBMClassifier(num_class=len(train_bund['responsible_entity_id'].unique()), random_state=42)
)
bund_pipeline.fit(
    train_bund.drop(columns=["issue_id"]),
    train_bund['responsible_entity_id']
)

land_pipeline = make_pipeline(
    preprocessor,
    LGBMClassifier(num_class=len(train_land['responsible_entity_id'].unique()), random_state=42)
)
land_pipeline.fit(
    train_land.drop(columns=["issue_id"]),
    train_land['responsible_entity_id']
)

[LightGBM] [Info] Number of positive: 0, number of negative: 90
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000113 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 575
[LightGBM] [Info] Number of data points in the train set: 90, number of used features: 34
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.000000 -> initscore=-34.538776
[LightGBM] [Info] Start training from score -34.538776
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001022 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4680
[LightGBM] [Info] Number of data points in the train set: 605, number of used features: 222
[LightGBM] [Info] Start training from score -4.795791
[LightGBM] [Info] Start training from score -5.712081
[LightGBM] [Info] Start training from score -5.712081
[LightGBM] [Info] Start training from score -5.712081
[LightGBM] 

In [74]:
test_df = pd.read_csv('../data/challenge_1/test/classification_data.csv')

# Step 1: predict BUND vs LAND
level_preds = pipeline_lvl1.predict(test_df)

# Step 2: use that to route to the appropriate model
final_preds = []
for i, row in test_df.iterrows():
    if level_preds[i] == 'BUND':
        final_preds.append(bund_pipeline.predict([row])[0])
    else:
        final_preds.append(land_pipeline.predict([row])[0])

submission = pd.DataFrame({
    'issue_id': test_df['issue_id'],
    'responsible_entity_id': final_preds
})



ValueError: X has 225 features, but LGBMClassifier is expecting 226 features as input.