In [9]:
import json
from pathlib import Path

import joblib
import numpy as np
import pandas as pd

from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split

import lightgbm as lgb

In [10]:
# Notebook-wide random seed, reused for the train/validation split and the model.
SEED = 42

# Read the dataset metadata through a context manager so the file handle is
# closed as soon as parsing finishes instead of lingering in the kernel.
with open("./dataset/meta.json", encoding="utf-8") as meta_file:
    meta = json.load(meta_file)
test = pd.read_csv("./dataset/test_processed.csv")
train = pd.read_csv("./dataset/train_processed.csv")

features = meta["features"]
num_classes = meta["n_classes"]
class_names = meta["class_names"]

# Columns fed to LightGBM, in the order the model sees them.
lgb_features = [
    "X", "Y", "X+Y", "X-Y", "XY_rad", "XYpca1", "XYpca2",
    "Hour", "Month", "Year", "Day", "Minute",
    "StreetName_freq", "AddrType_freq", "PdDistrict_freq", "DayOfWeek_freq", "GeoCluster_freq",
    "Night", "Is_weekend", "Intersection",
    "DayOfWeek_le", "PdDistrict_le", "AddrType_le", "GeoCluster_le",
]

In [11]:
# Target vector plus the train/test feature matrices restricted to the LightGBM columns.
y = train["target"].values
X = train[lgb_features]
X_test = test[lgb_features]

# Use the explicit "Id" column for the submission when present; otherwise the row index.
if "Id" in test.columns:
    test_ids = test["Id"].astype(int).values
else:
    test_ids = test.index

# Stratified hold-out split (15% validation), seeded with the notebook-wide SEED.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.15,
    stratify=y,
    random_state=SEED,
)

num_train_class = np.unique(y_train).size
num_val_class = np.unique(y_val).size
print(f"X_train shape:{X_train.shape}, X_val shape:{X_val.shape}, X_test shape:{X_test.shape}")

# Both splits must contain every class declared in the metadata.
assert num_val_class == num_classes
assert num_val_class == num_train_class
print(f"num of classes: {num_train_class}")
print(class_names)

X_train shape:(744367, 24), X_val shape:(131359, 24), X_test shape:(884262, 24)
num of classes: 39
['ARSON', 'ASSAULT', 'BAD CHECKS', 'BRIBERY', 'BURGLARY', 'DISORDERLY CONDUCT', 'DRIVING UNDER THE INFLUENCE', 'DRUG/NARCOTIC', 'DRUNKENNESS', 'EMBEZZLEMENT', 'EXTORTION', 'FAMILY OFFENSES', 'FORGERY/COUNTERFEITING', 'FRAUD', 'GAMBLING', 'KIDNAPPING', 'LARCENY/THEFT', 'LIQUOR LAWS', 'LOITERING', 'MISSING PERSON', 'NON-CRIMINAL', 'OTHER OFFENSES', 'PORNOGRAPHY/OBSCENE MAT', 'PROSTITUTION', 'RECOVERED VEHICLE', 'ROBBERY', 'RUNAWAY', 'SECONDARY CODES', 'SEX OFFENSES FORCIBLE', 'SEX OFFENSES NON FORCIBLE', 'STOLEN PROPERTY', 'SUICIDE', 'SUSPICIOUS OCC', 'TREA', 'TRESPASS', 'VANDALISM', 'VEHICLE THEFT', 'WARRANTS', 'WEAPON LAWS']


In [None]:
def lightgbm(
    X_train,
    y_train,
    X_val,
    y_val,
    class_names,
    *,
    num_boost_round=2000,
    early_stopping_rounds=100,
    bagging_freq=0,
    seed=None,
):
    """Train a multiclass LightGBM booster with early stopping and score it on the validation split.

    Parameters
    ----------
    X_train, y_train
        Training features and labels passed to ``lgb.Dataset``.
    X_val, y_val
        Validation features and labels; used both for early stopping and for
        the returned log-loss. Labels are expected to be integer class ids in
        ``range(len(class_names))`` because that range is passed to
        ``log_loss`` as the label set.
    class_names
        Sequence of class names; only its length is used (as ``num_class``).
    num_boost_round : int, default 2000
        Upper bound on the number of boosting rounds.
    early_stopping_rounds : int, default 100
        Training stops once the validation metric has not improved for this
        many rounds.
    bagging_freq : int, default 0
        Forwarded to LightGBM. With the default of 0 LightGBM does not perform
        bagging, so ``bagging_fraction`` below has no effect; pass a value
        >= 1 to actually subsample rows. The default keeps the behaviour of
        the original configuration, which never set this key.
    seed : int or None, default None
        Random seed for LightGBM. ``None`` falls back to the notebook-level
        ``SEED`` constant.

    Returns
    -------
    tuple
        ``(val_loss, clf)``: the validation multi-class log-loss and the
        trained ``lgb.Booster``.
    """
    if seed is None:
        # Fall back to the notebook-wide constant defined in the config cell.
        seed = SEED

    num_class = len(class_names)

    gbm_params = {
        "objective": "multiclass",
        "num_class": num_class,
        "metric": "multi_logloss",
        "verbosity": -1,
        "learning_rate": 0.009,
        "num_leaves": 350,
        "min_data_in_leaf": 100,
        "max_bin": 500,
        "feature_fraction": 0.59,
        # NOTE(review): only takes effect when bagging_freq > 0 (see docstring).
        "bagging_fraction": 0.87,
        "bagging_freq": bagging_freq,
        "lambda_l1": 0.725,
        "lambda_l2": 0.002,
        "seed": seed,
    }

    print("training final lightgbm model...")
    train_set = lgb.Dataset(X_train, label=y_train)
    # Tie the validation set to the training set so both share one feature binning.
    val_set = lgb.Dataset(X_val, label=y_val, reference=train_set)

    clf = lgb.train(
        params=gbm_params,
        train_set=train_set,
        num_boost_round=num_boost_round,
        valid_sets=[val_set],
        callbacks=[
            lgb.early_stopping(
                stopping_rounds=early_stopping_rounds,
                first_metric_only=True,
                verbose=True,
            ),
        ],
    )

    # Score with the best iteration found by early stopping (made explicit here;
    # Booster.predict uses it by default when available).
    val_preds = clf.predict(X_val, num_iteration=clf.best_iteration)
    val_loss = log_loss(y_val, val_preds, labels=list(range(num_class)))
    return val_loss, clf


In [15]:
# Fit the booster on the training split and report its validation log-loss.
best_loss_lgb, clf = lightgbm(
    X_train=X_train,
    y_train=y_train,
    X_val=X_val,
    y_val=y_val,
    class_names=class_names,
)
print("LightGBM best log-loss:", best_loss_lgb)

training final lightgbm model...
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[268]	valid_0's multi_logloss: 2.2089
Evaluated only: multi_logloss
LightGBM best log-loss: 2.2089122342858096


In [16]:
# Predict class probabilities for the test set, then persist the model and the submission.
test_preds = clf.predict(X_test)

checkpoint_path = Path("./checkpoints/lightgbm.pkl")
submission_path = Path("./submissions/lightgbm_submission.csv")

# Create the output folders up front so a missing directory cannot discard a
# long training run at the very last step.
for out_path in (checkpoint_path, submission_path):
    out_path.parent.mkdir(parents=True, exist_ok=True)

joblib.dump(clf, checkpoint_path)

# One probability column per class, with the row identifier as the first column.
submission = pd.DataFrame(test_preds, columns=class_names)
submission.insert(0, "Id", test_ids)
submission.to_csv(submission_path, index=False)