# In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

# Load the training data (which carries the 'Occupation' target) and the
# test set that predictions are later made for.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ("atlantis_citizens_final.csv", "test_atlantis_hidden.csv")
)

# Add the same two derived columns to both frames.
for frame in (train, test):
    # 1 when the home district differs from the work district, else 0.
    districts_differ = frame['District_Name'] != frame['Work_District']
    frame['Commutes'] = districts_differ.astype(int)

    # Number of characters in the Bio_Hash string.
    hash_text = frame['Bio_Hash'].str
    frame['Hash_Len'] = hash_text.len()

# Fill missing values in these columns with the TRAINING median, so the test
# set is imputed with a statistic derived from the training data only.
for col in ['Wealth_Index', 'House_Size_sq_ft']:
    # Computed once per column and reused for both frames.
    train_median = train[col].median()

    # Assign the filled column back rather than calling
    # `train[col].fillna(..., inplace=True)`: that form operates on a
    # temporary selected from the frame, which emits a chained-assignment
    # warning on recent pandas and leaves the frame unchanged when
    # Copy-on-Write is enabled.
    train[col] = train[col].fillna(train_median)
    test[col] = test[col].fillna(train_median)

# Every object-dtype column of `train`, except the 'Occupation' target, is
# integer-encoded in place in both frames.
object_columns = train.select_dtypes(include='object').columns
cat_cols = object_columns.drop('Occupation')

# Fitted encoder per column, keyed by column name, kept so the integer codes
# can be mapped back to the original values later.
encoders = {}
for name in cat_cols:
    # Fit on the train and test values together so that a value present in
    # only one of the two frames still has a code when transformed.
    encoder = LabelEncoder().fit(pd.concat([train[name], test[name]]))
    train[name] = encoder.transform(train[name])
    test[name] = encoder.transform(test[name])
    encoders[name] = encoder

# Feature matrix: every column of `train` other than the target.
X = train.drop(columns=['Occupation'])

# Integer-encode the 'Occupation' target; `target_enc` retains the mapping
# between integer codes and the original labels.
target_enc = LabelEncoder()
y = target_enc.fit_transform(train['Occupation'])

# Hold out 20% of the rows for validation, stratified on the target so both
# parts keep the same class proportions; the seed is fixed for repeatability.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.2,
)

# Random forest with class weights balanced against class frequency; the seed
# is fixed for repeatability and all available cores are used (n_jobs=-1).
model = RandomForestClassifier(
    n_estimators=200,
    max_depth=18,
    min_samples_leaf=5,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1
)

# Fit on the training split only, then score on the held-out validation split.
model.fit(X_train, y_train)

val_pred = model.predict(X_val)

# Keep and print the score: a bare `f1_score(...)` expression in the middle
# of a cell/script is evaluated and then silently discarded.
val_macro_f1 = f1_score(y_val, val_pred, average='macro')
print(f"Validation macro-F1: {val_macro_f1:.4f}")



# Refit the same model on all training rows (train + validation splits)
# before predicting on the test set.
model.fit(X, y)

# Select the training feature columns explicitly so the test matrix has the
# same columns, in the same order, as the matrix the model was fitted on.
test_preds = model.predict(test[X.columns])

# The model predicts the integer codes produced by `target_enc`; map them
# back to the original 'Occupation' labels for the submission file.
occupation_labels = target_enc.inverse_transform(test_preds)

# If 'Citizen_ID' was an object column it was label-encoded in place along
# with the other categoricals, so restore the original identifiers; otherwise
# the column is untouched and is used as-is.
citizen_ids = test['Citizen_ID']
if 'Citizen_ID' in encoders:
    citizen_ids = encoders['Citizen_ID'].inverse_transform(citizen_ids)

submission = pd.DataFrame({
    'Citizen_ID': citizen_ids,
    'Occupation': occupation_labels,
})

submission.to_csv("submission.csv", index=False)