In [14]:
import os
import json
import argparse
import numpy as np
import pandas as pd
from tqdm import tqdm
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, HistGradientBoostingClassifier

import joblib
from sklearn.metrics import log_loss

In [2]:
def _stabilize_proba(proba, eps):
    """Clip probabilities into [eps, 1 - eps], then rescale each row to sum to 1."""
    clipped = np.clip(proba, eps, 1 - eps)
    return clipped / clipped.sum(axis=1, keepdims=True)


def train_and_eval(model, X_train, y_train, X_val, y_val, X_test, eps=1e-15):
    """Fit ``model`` and score it on the validation split.

    Parameters
    ----------
    model : estimator exposing ``fit``, ``predict_proba`` and ``classes_``.
    X_train, y_train : data the model is fitted on.
    X_val, y_val : hold-out data used to compute the log-loss.
    X_test : data for which class probabilities are predicted (never scored).
    eps : clipping bound applied to every predicted probability before the
        rows are renormalized.

    Returns
    -------
    tuple ``(loss, proba_val, proba_test)``: the validation log-loss and the
    clipped, row-normalized probability matrices for ``X_val`` and ``X_test``.
    """
    model.fit(X_train, y_train)

    val_probabilities = _stabilize_proba(model.predict_proba(X_val), eps)
    # labels=model.classes_ ties the probability columns to the fitted class order
    val_loss = log_loss(y_val, val_probabilities, labels=model.classes_)

    test_probabilities = _stabilize_proba(model.predict_proba(X_test), eps)
    return val_loss, val_probabilities, test_probabilities

In [None]:
# Seed passed to every train/validation split and every model below.
SEED = 42

# Read the metadata inside a context manager so the file handle is closed
# right away instead of being left open until garbage collection.
with open("./dataset/meta.json") as meta_file:
    meta = json.load(meta_file)
test = pd.read_csv("./dataset/test_processed.csv")
train = pd.read_csv("./dataset/train_processed.csv")

# Metadata fields consumed by the cells below: feature column names,
# the expected number of classes, and the class names.
features = meta["features"]
num_classes = meta["n_classes"]
class_names = meta["class_names"]

In [5]:
# Target vector plus the full feature set for the train and test frames.
y = train["target"].values
X = train[features]
X_test = test[features]

# Stratified 85/15 hold-out split, seeded so it can be reproduced.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.15,
    stratify=y,
    random_state=SEED,
)

num_train_class = np.unique(y_train).size
num_val_class = np.unique(y_val).size
print(f"X_train shape:{X_train.shape}, X_val shape:{X_val.shape}, X_test shape:{X_test.shape}")

# Both splits must contain every class declared in the metadata.
assert num_val_class == num_classes
assert num_val_class == num_train_class
print(f"num of classes: {num_train_class}")
print(class_names)

# Per-model stores keyed by model name: validation loss,
# validation probabilities and test probabilities.
results = {}
val_store = {}
test_store = {}

X_train shape:(744367, 124), X_val shape:(131359, 124), X_test shape:(884262, 124)
num of classes: 39
['ARSON', 'ASSAULT', 'BAD CHECKS', 'BRIBERY', 'BURGLARY', 'DISORDERLY CONDUCT', 'DRIVING UNDER THE INFLUENCE', 'DRUG/NARCOTIC', 'DRUNKENNESS', 'EMBEZZLEMENT', 'EXTORTION', 'FAMILY OFFENSES', 'FORGERY/COUNTERFEITING', 'FRAUD', 'GAMBLING', 'KIDNAPPING', 'LARCENY/THEFT', 'LIQUOR LAWS', 'LOITERING', 'MISSING PERSON', 'NON-CRIMINAL', 'OTHER OFFENSES', 'PORNOGRAPHY/OBSCENE MAT', 'PROSTITUTION', 'RECOVERED VEHICLE', 'ROBBERY', 'RUNAWAY', 'SECONDARY CODES', 'SEX OFFENSES FORCIBLE', 'SEX OFFENSES NON FORCIBLE', 'STOLEN PROPERTY', 'SUICIDE', 'SUSPICIOUS OCC', 'TREA', 'TRESPASS', 'VANDALISM', 'VEHICLE THEFT', 'WARRANTS', 'WEAPON LAWS']


### Random Forest

In [7]:
# Reduced column subset used for the split below (24 columns).
rf_features = [
    "X", "Y", "X+Y", "X-Y", "XY_rad", "XYpca1", "XYpca2",
    "Hour", "Month", "Year", "Day", "Minute",
    "StreetName_freq", "AddrType_freq", "PdDistrict_freq",
    "DayOfWeek_freq", "GeoCluster_freq",
    "Night", "Is_weekend", "Intersection",
    "DayOfWeek_le", "PdDistrict_le", "AddrType_le", "GeoCluster_le",
]

y = train["target"].values
X = train[rf_features]
X_test = test[rf_features]

# Same seed, test size and stratification as the other splits in this notebook.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.15,
    stratify=y,
    random_state=SEED,
)

num_train_class = np.unique(y_train).size
num_val_class = np.unique(y_val).size
print(f"X_train shape:{X_train.shape}, X_val shape:{X_val.shape}, X_test shape:{X_test.shape}")

# Every class must be present in both the training and the validation part.
assert num_val_class == num_classes
assert num_val_class == num_train_class
print(f"num of classes: {num_train_class}")
print(class_names)

X_train shape:(744367, 24), X_val shape:(131359, 24), X_test shape:(884262, 24)
num of classes: 39
['ARSON', 'ASSAULT', 'BAD CHECKS', 'BRIBERY', 'BURGLARY', 'DISORDERLY CONDUCT', 'DRIVING UNDER THE INFLUENCE', 'DRUG/NARCOTIC', 'DRUNKENNESS', 'EMBEZZLEMENT', 'EXTORTION', 'FAMILY OFFENSES', 'FORGERY/COUNTERFEITING', 'FRAUD', 'GAMBLING', 'KIDNAPPING', 'LARCENY/THEFT', 'LIQUOR LAWS', 'LOITERING', 'MISSING PERSON', 'NON-CRIMINAL', 'OTHER OFFENSES', 'PORNOGRAPHY/OBSCENE MAT', 'PROSTITUTION', 'RECOVERED VEHICLE', 'ROBBERY', 'RUNAWAY', 'SECONDARY CODES', 'SEX OFFENSES FORCIBLE', 'SEX OFFENSES NON FORCIBLE', 'STOLEN PROPERTY', 'SUICIDE', 'SUSPICIOUS OCC', 'TREA', 'TRESPASS', 'VANDALISM', 'VEHICLE THEFT', 'WARRANTS', 'WEAPON LAWS']


In [11]:
print("training random forest...")
rf = RandomForestClassifier(
    n_estimators=1000,
    random_state=SEED,
    n_jobs=-1,
)
rf_loss, rf_val, rf_test = train_and_eval(rf, X_train, y_train, X_val, y_val, X_test)

# Record the validation loss and both probability matrices under one model key.
results.update({'RandomForest': rf_loss})
val_store.update({'RandomForest': rf_val})
test_store.update({'RandomForest': rf_test})
print("random forest validation log-loss:", rf_loss)

training random forest...
random forest validation log-loss: 3.151723650550059


### Histogram-based Gradient Boosting

In [13]:
print("training HistGradientBoostingClassifier...")
hgb = HistGradientBoostingClassifier(
    random_state=SEED,
    max_iter=500,
)
hgb_loss, hgb_val, hgb_test = train_and_eval(hgb, X_train, y_train, X_val, y_val, X_test)

# Record the validation loss and both probability matrices under one model key.
results.update({'HistGB': hgb_loss})
val_store.update({'HistGB': hgb_val})
test_store.update({'HistGB': hgb_test})
print("HistGB validation log-loss:", hgb_loss)

training HistGradientBoostingClassifier...




HistGB validation log-loss: 2.528305126608513


### AdaBoost

In [15]:
print("training adaBoost...")
adb = AdaBoostClassifier(
    random_state=SEED,
    n_estimators=200,
)
adb_loss, adb_val, adb_test = train_and_eval(adb, X_train, y_train, X_val, y_val, X_test)

# Record the validation loss and both probability matrices under one model key.
results.update({'AdaBoost': adb_loss})
val_store.update({'AdaBoost': adb_val})
test_store.update({'AdaBoost': adb_test})
print("adaBoost validation log-loss:", adb_loss)

training adaBoost...
adaBoost validation log-loss: 3.6228212586949313


### Multinomial Logistic Regression

In [17]:
# Switch back to the full feature list from the metadata for this model.
y = train["target"].values
X = train[features]
X_test = test[features]

# Same seed, test size and stratification as the other splits in this notebook.
split_parts = train_test_split(X, y, test_size=0.15, random_state=SEED, stratify=y)
X_train, X_val, y_train, y_val = split_parts

num_train_class = np.unique(y_train).size
num_val_class = np.unique(y_val).size
print(f"X_train shape:{X_train.shape}, X_val shape:{X_val.shape}, X_test shape:{X_test.shape}")

# Every class must be present in both the training and the validation part.
assert num_val_class == num_classes
assert num_val_class == num_train_class
print(f"num of classes: {num_train_class}")
print(class_names)

X_train shape:(744367, 124), X_val shape:(131359, 124), X_test shape:(884262, 124)
num of classes: 39
['ARSON', 'ASSAULT', 'BAD CHECKS', 'BRIBERY', 'BURGLARY', 'DISORDERLY CONDUCT', 'DRIVING UNDER THE INFLUENCE', 'DRUG/NARCOTIC', 'DRUNKENNESS', 'EMBEZZLEMENT', 'EXTORTION', 'FAMILY OFFENSES', 'FORGERY/COUNTERFEITING', 'FRAUD', 'GAMBLING', 'KIDNAPPING', 'LARCENY/THEFT', 'LIQUOR LAWS', 'LOITERING', 'MISSING PERSON', 'NON-CRIMINAL', 'OTHER OFFENSES', 'PORNOGRAPHY/OBSCENE MAT', 'PROSTITUTION', 'RECOVERED VEHICLE', 'ROBBERY', 'RUNAWAY', 'SECONDARY CODES', 'SEX OFFENSES FORCIBLE', 'SEX OFFENSES NON FORCIBLE', 'STOLEN PROPERTY', 'SUICIDE', 'SUSPICIOUS OCC', 'TREA', 'TRESPASS', 'VANDALISM', 'VEHICLE THEFT', 'WARRANTS', 'WEAPON LAWS']


In [18]:
print("training multi-logistic regression...")
# `multi_class` is intentionally not passed: the parameter is deprecated since
# scikit-learn 1.5 and scheduled for removal. With the "saga" solver and a
# target that has more than two classes, the default already fits a single
# multinomial (softmax) model, so the fitted model is unchanged.
lr = LogisticRegression(solver="saga", C=5.0, max_iter=300, random_state=SEED, n_jobs=-1)
lr_loss, lr_val, lr_test = train_and_eval(lr, X_train, y_train, X_val, y_val, X_test)

# Record the validation loss and both probability matrices under one model key.
results['LogisticRegression'] = lr_loss
val_store['LogisticRegression'] = lr_val
test_store['LogisticRegression'] = lr_test
print("logistic regression validation log-loss:", lr_loss)

training multi-logistic regression...




logistic regression validation log-loss: 2.4444801511664873


In [24]:
# joblib.dump does not create missing folders, so make sure the checkpoint
# directory exists before writing the fitted model.
checkpoint_path = Path("./checkpoints/logistic_regression.pkl")
checkpoint_path.parent.mkdir(parents=True, exist_ok=True)
joblib.dump(lr, checkpoint_path)

# Reload immediately as a round-trip check of the file that was just written.
# NOTE: joblib checkpoints are pickle-based; only load files from trusted sources.
lr_loaded = joblib.load(checkpoint_path)

### Summary

In [19]:
# Rank the models once by validation log-loss (lower is better); the ranking is
# reused for the report, the ensemble selection and the best-single lookup.
ordered = sorted(results.items(), key=lambda kv: kv[1])

print("model validation log-loss:")
for k, v in ordered:
    print(f"  {k:20s} : {v:.5f}")

# soft average of top 4 models
top_models = [k for k, _ in ordered[:4]]
print("top 4 models for soft-average ensemble:", top_models)

val_avg = np.zeros_like(val_store[top_models[0]])
test_avg = np.zeros_like(test_store[top_models[0]])

# Equal-weight mean of the stored probability matrices.
for m in top_models:
    val_avg += val_store[m] / len(top_models)
    test_avg += test_store[m] / len(top_models)

# Pass the label set explicitly, as train_and_eval does, so the column order
# does not depend on which classes happen to occur in y_val.
# NOTE(review): assumes every model was fitted on the same y_train labels, so
# that np.unique(y_train) matches each model's classes_ - confirm.
avg_loss = log_loss(y_val, val_avg, labels=np.unique(y_train))
print("soft-average validation log-loss:", avg_loss)

# pick the best single model (first entry of the ranking); nothing is written
# to disk in this cell
best_single = ordered[0][0]
print("best single model:", best_single, "loss:", results[best_single])



model validation log-loss:
  LogisticRegression   : 2.44448
  HistGB               : 2.52831
  RandomForest         : 3.15172
  AdaBoost             : 3.62282
top 4 models for soft-average ensemble: ['LogisticRegression', 'HistGB', 'RandomForest', 'AdaBoost']
soft-average validation log-loss: 2.392075591988043
best single model: LogisticRegression loss: 2.4444801511664873


In [22]:
# save submissions
class_cols = class_names

# best single model
submission_single = pd.DataFrame(test_store[best_single], columns=class_cols)
submission_single["Id"] = test.index if "Id" not in test.columns else test["Id"].astype(int).values
submission_single = submission_single[["Id"] + class_cols]
submission_single.to_csv(f"./submissions/{best_single}_submission.csv", index=False)

# Soft-Average
submission_avg = pd.DataFrame(test_avg, columns=class_cols)
submission_avg["Id"] = test.index if "Id" not in test.columns else test["Id"].astype(int).values
submission_avg = submission_avg[["Id"] + class_cols]
submission_avg.to_csv("./submissions/soft_average_submission.csv", index=False)