In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier


In [4]:
# Read the labelled training set and the hold-out test set from the working
# directory, in that order.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ("atlantis_citizens_final.csv", "test_atlantis_hidden.csv")
)


In [5]:
# Fill gaps in three numeric columns with the *training* median, for both
# frames, so the test set is imputed with statistics learned from train only.
for col in ("House_Size_sq_ft", "Wealth_Index", "Life_Expectancy"):
    # Computed once per column: filling NaNs with the median leaves the median
    # unchanged, so this is the same value the test frame would otherwise get.
    train_median = train_df[col].median()
    train_df[col] = train_df[col].fillna(train_median)
    test_df[col] = test_df[col].fillna(train_median)


In [6]:
# Encode the raw Occupation labels as integer class indices. The fitted
# encoder is kept so later cells can reuse the same label <-> index mapping.
occupation_labels = train_df["Occupation"]
label_encoder = LabelEncoder()
train_df["Occupation_Encoded"] = label_encoder.fit_transform(occupation_labels)


In [7]:
TARGET_COL = "Occupation_Encoded"

# Target vector: the integer-encoded occupation.
y = train_df[TARGET_COL]

# Feature matrix: drop both copies of the label plus Citizen_ID / Bio_Hash
# (presumably identifiers - confirm), then one-hot encode what remains.
# errors="ignore" tolerates any of these columns being absent.
X = pd.get_dummies(
    train_df.drop(
        columns=["Occupation", TARGET_COL, "Citizen_ID", "Bio_Hash"],
        errors="ignore",
    )
)

# 80/20 hold-out split, stratified on the target and seeded for repeatability.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [8]:
# Fit a class-weighted random forest on the training split. `fit` returns the
# estimator itself, so `model` is the fitted classifier.
model = RandomForestClassifier(
    n_estimators=600,
    max_depth=15,
    min_samples_leaf=3,
    class_weight="balanced",
    random_state=42,
    n_jobs=-1,
).fit(X_train, y_train)

# Score the hold-out split with macro-averaged F1.
val_preds = model.predict(X_val)
macro_f1 = f1_score(y_true=y_val, y_pred=val_preds, average="macro")

print("Validation Macro F1:", macro_f1)


Validation Macro F1: 0.5828526696827829


In [9]:
# Retrain on the full training set, predict the test set, and write the
# submission file.

# Training features: same recipe as the validation cell - drop both copies of
# the label plus Citizen_ID / Bio_Hash, then one-hot encode.
X_train_full = train_df.drop(
    columns=["Occupation", TARGET_COL, "Citizen_ID", "Bio_Hash"],
    errors="ignore"
)
X_train_full = pd.get_dummies(X_train_full)

X_test_full = test_df.drop(columns=["Citizen_ID", "Bio_Hash"], errors="ignore")
X_test_full = pd.get_dummies(X_test_full)

# join="left" makes the test matrix carry exactly the training columns:
# dummy columns absent from the test set are added filled with 0, and columns
# that only exist in the test set are dropped.
X_train_full, X_test_full = X_train_full.align(
    X_test_full, join="left", axis=1, fill_value=0
)

# Same hyperparameters as the validation model, now fitted on every labelled row.
final_model = RandomForestClassifier(
    n_estimators=600,
    max_depth=15,
    min_samples_leaf=3,
    class_weight="balanced",
    random_state=42,
    n_jobs=-1
)

final_model.fit(X_train_full, train_df[TARGET_COL])

test_preds_encoded = final_model.predict(X_test_full)
# Decode class indices back to the original Occupation labels with the fitted
# encoder. A hard-coded "+ 1" is only right when the labels happen to be the
# contiguous integers 1..K; inverse_transform is right for any label set and
# yields the same values in that case.
test_preds = label_encoder.inverse_transform(test_preds_encoded)

submission = pd.DataFrame({
    "Citizen_ID": test_df["Citizen_ID"],
    "Occupation": test_preds
})

submission.to_csv("submission.csv", index=False)
submission.head()


Unnamed: 0,Citizen_ID,Occupation
0,CIT_15383,3
1,CIT_14830,4
2,CIT_17388,5
3,CIT_17438,1
4,CIT_16735,1
