In [7]:
import os
import json
import joblib
import argparse
import numpy as np
import pandas as pd
from pathlib import Path

from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split

import xgboost as xgb

In [8]:
SEED = 42

meta = json.load(open("./dataset/meta.json"))
test = pd.read_csv("./dataset/test_processed.csv")
train = pd.read_csv("./dataset/train_processed.csv")

features = meta["features"]
num_classes = meta["n_classes"]
class_names = meta["class_names"]

xgb_features =  ["X", "Y","X+Y","X-Y","XY_rad","XYpca1","XYpca2","Hour","Month","Year","Day","Minute","StreetName_freq","AddrType_freq",
                "PdDistrict_freq","DayOfWeek_freq","GeoCluster_freq","Night","Is_weekend","Intersection","DayOfWeek_le","PdDistrict_le","AddrType_le", "GeoCluster_le"]

In [9]:
y = train["target"].values
X, X_test = train[xgb_features], test[xgb_features]
test_ids = test.index if "Id" not in test.columns else test["Id"].astype(int).values
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.15, random_state=SEED, stratify=y)

num_val_class = len(np.unique(y_val))
num_train_class = len(np.unique(y_train))
print(f"X_train shape:{X_train.shape}, X_val shape:{X_val.shape}, X_test shape:{X_test.shape}")

assert num_val_class == num_classes
assert num_val_class == num_train_class
print(f"num of classes: {num_train_class}")
print(class_names)

X_train shape:(744367, 24), X_val shape:(131359, 24), X_test shape:(884262, 24)
num of classes: 39
['ARSON', 'ASSAULT', 'BAD CHECKS', 'BRIBERY', 'BURGLARY', 'DISORDERLY CONDUCT', 'DRIVING UNDER THE INFLUENCE', 'DRUG/NARCOTIC', 'DRUNKENNESS', 'EMBEZZLEMENT', 'EXTORTION', 'FAMILY OFFENSES', 'FORGERY/COUNTERFEITING', 'FRAUD', 'GAMBLING', 'KIDNAPPING', 'LARCENY/THEFT', 'LIQUOR LAWS', 'LOITERING', 'MISSING PERSON', 'NON-CRIMINAL', 'OTHER OFFENSES', 'PORNOGRAPHY/OBSCENE MAT', 'PROSTITUTION', 'RECOVERED VEHICLE', 'ROBBERY', 'RUNAWAY', 'SECONDARY CODES', 'SEX OFFENSES FORCIBLE', 'SEX OFFENSES NON FORCIBLE', 'STOLEN PROPERTY', 'SUICIDE', 'SUSPICIOUS OCC', 'TREA', 'TRESPASS', 'VANDALISM', 'VEHICLE THEFT', 'WARRANTS', 'WEAPON LAWS']


In [12]:
def xgboost(X_train, y_train, X_val, y_val, class_names):

    xgb_params = {
        "objective": "multi:softprob",
        "num_class": len(class_names),
        "booster": "gbtree",
        "eta": 0.03,
        "max_depth": 7,
        "subsample": 0.77,
        "colsample_bytree": 0.97,
        "lambda": 0.14,
        "alpha": 6.4e-05,
        "seed": 42,
        "verbosity": 0,
        "tree_method": "hist"
    }

    # final training
    print("training XGBoost model...")
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dval = xgb.DMatrix(X_val, label=y_val)

    clf = xgb.train(
        xgb_params,
        dtrain,
        num_boost_round=2000,
        evals=[(dtrain, "train"), (dval, "eval")],
        early_stopping_rounds=100,
        verbose_eval=False,
    )

    # predictions
    val_preds = clf.predict(dval)
    val_loss =  log_loss(y_val, val_preds, labels=list(range(xgb_params["num_class"])))

    return val_loss, clf

In [13]:
best_loss_xgb, clf = xgboost(X_train, y_train, X_val, y_val, class_names)
print("XGBoost best log-loss:", best_loss_xgb)

# save
joblib.dump(clf, "./checkpoints/xgboost.pkl")
dtest = xgb.DMatrix(X_test)
test_preds = clf.predict(dtest)
submission = pd.DataFrame(test_preds, columns=class_names)
submission.insert(0, "Id", test_ids)
submission.to_csv("./submissions/xgboost_submission.csv", index=False)

training XGBoost model...
XGBoost best log-loss: 2.2013107990603293
