In [1]:
!pip -q install lightgbm

In [2]:
import gc, warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score

import lightgbm as lgb

In [3]:
# CREATE EXAMPLE DATA

rng = np.random.default_rng(42)

n_train = 5000
n_test  = 2000
n_families = 8

families = np.array([f"family_{i}" for i in range(n_families)])

# Simulate class imbalance a bit
class_probs = np.array([0.22, 0.18, 0.14, 0.12, 0.10, 0.09, 0.08, 0.07])

# Numeric features (pretend they came from PE metadata / byte stats etc.)
# We'll make signal by giving each family a slightly different mean shift.
n_num = 30
family_to_shift = {fam: i * 0.35 for i, fam in enumerate(families)}

# Categorical-ish feature (compiler / packer)
packers = np.array(["none", "upx", "aspack", "themida", "custom"])
packer_probs_default = [0.45, 0.25, 0.12, 0.10, 0.08]
# Packer distribution used for the families listed in `heavy_packer_families`
# (shifts probability mass towards "themida").
packer_probs_heavy = [0.20, 0.20, 0.10, 0.40, 0.10]
heavy_packer_families = ["family_6", "family_7"]


def make_example_frame(rng, n_rows):
    """Draw one synthetic sample set and return it with its true families.

    The draws from ``rng`` happen in a fixed order (family, numeric block,
    entropy, import_count, section_count, packer, packer override), so calling
    this once for train and once for test consumes the generator exactly like
    two hand-written copies of the same statements would.

    Parameters
    ----------
    rng : numpy.random.Generator
        Random generator to draw from; its state is advanced.
    n_rows : int
        Number of rows to generate.

    Returns
    -------
    frame : pandas.DataFrame
        Columns ``f_0 .. f_{n_num-1}``, ``entropy``, ``import_count``,
        ``section_count``, ``packer`` and ``id`` (``0 .. n_rows-1``).
        No label column is included.
    fam : numpy.ndarray
        The family name drawn for each row (the ground truth behind ``frame``).
    """
    fam = rng.choice(families, size=n_rows, p=class_probs)
    shift = np.array([family_to_shift[f] for f in fam])

    # Every numeric feature of a row is shifted by that row's family offset.
    x_num = rng.normal(loc=0.0, scale=1.0, size=(n_rows, n_num)) + shift[:, None]

    # A couple "hand-crafted" features that correlate with family
    entropy = np.clip(rng.normal(6.5, 1.0, size=n_rows) + shift * 0.15, 0, 8)
    import_count = np.clip(rng.normal(120, 30, size=n_rows) + shift * 4.0, 10, 400)
    section_count = np.clip(rng.normal(5, 1.5, size=n_rows) + shift * 0.2, 1, 12)

    # Make packer distribution vary by family a bit: draw from the default
    # distribution first, then redraw the rows of the "heavy" families.
    packer = rng.choice(packers, size=n_rows, p=packer_probs_default)
    heavy = np.isin(fam, heavy_packer_families)
    packer[heavy] = rng.choice(packers, size=heavy.sum(), p=packer_probs_heavy)

    frame = pd.DataFrame(x_num, columns=[f"f_{i}" for i in range(n_num)])
    frame["entropy"] = entropy
    frame["import_count"] = import_count
    frame["section_count"] = section_count
    frame["packer"] = packer
    frame["id"] = np.arange(n_rows)
    return frame, fam


# Train first, then test: the order matters because both share one generator.
train, y_train = make_example_frame(rng, n_train)
train["label"] = y_train

# Create test data with similar distribution (no labels)
# For test, just sample families to generate realistic feature distribution
test, y_hidden = make_example_frame(rng, n_test)

print("Example train:", train.shape, "Example test:", test.shape)
print(train.head())

Example train: (5000, 36) Example test: (2000, 35)
        f_0       f_1       f_2       f_3       f_4       f_5       f_6  \
0  0.651075  1.208686  2.022001  1.121680  1.471634  2.191359  1.401227   
1  0.605155 -0.075407  1.200465  1.056909 -0.388308  1.896817  0.802310   
2  1.146429  2.470608  1.578380  1.835581  1.343090  1.938771  3.989380   
3  1.739593  1.221897 -0.013509  2.389065 -1.338811  0.953635  0.779802   
4 -0.892732 -0.257952 -0.299486 -0.696660  0.428483  0.234646 -1.590650   

        f_7       f_8       f_9  ...      f_26      f_27      f_28      f_29  \
0  1.976060  3.342728  1.578141  ...  2.990597  2.535061  0.787923  1.237005   
1 -1.029071  0.648657  0.981206  ...  0.031730  0.956784 -0.971059  1.399673   
2  3.485060  2.454479  2.074706  ...  2.225836  1.690534  0.990763  1.767437   
3  1.548179  1.961662  2.526765  ...  2.477479  1.056744  2.180681  1.449741   
4 -1.651526  0.424684 -0.815286  ... -2.624167 -0.120769 -0.170119  0.824743   

    entropy  impo

In [4]:
# PREPROCESS
# Splits `train` / `test` into features, encoded target and id vectors, fills
# missing values, and converts string columns to pandas categoricals.

IDCOL = "id"
TARGET = "label"

y = train[TARGET].astype(str)
X = train.drop(columns=[TARGET]).copy()
X_test = test.copy()

train_ids = X[IDCOL].values
test_ids = X_test[IDCOL].values

X = X.drop(columns=[IDCOL])

# Test must expose exactly the training features, in the training order.
# Fail loudly on a missing column instead of predicting on misaligned data.
missing_in_test = [c for c in X.columns if c not in X_test.columns]
if missing_in_test:
    raise ValueError(f"Test data is missing feature columns: {missing_in_test}")
X_test = X_test.loc[:, X.columns].copy()

le = LabelEncoder()
y_enc = le.fit_transform(y)
n_classes = len(le.classes_)
print("\nFamilies:", list(le.classes_))

# Identify categorical columns BEFORE converting.
# Both object columns and pandas string-dtype columns count as categorical.
cat_cols = [
    c for c in X.columns
    if pd.api.types.is_object_dtype(X[c]) or pd.api.types.is_string_dtype(X[c])
]

# Fill missing values
# - numeric -> -1
# - categorical -> "__MISSING__"
for c in cat_cols:
    X[c] = X[c].fillna("__MISSING__")
    X_test[c] = X_test[c].fillna("__MISSING__")

num_cols = [c for c in X.columns if c not in cat_cols]
X[num_cols] = X[num_cols].fillna(-1)
X_test[num_cols] = X_test[num_cols].fillna(-1)

# Now convert to category.
# The category set is learned from train and reused for test, so a given
# string maps to the same integer code in both frames. Test values that never
# occur in train become NaN (missing) rather than receiving a code of their own.
for c in cat_cols:
    X[c] = X[c].astype("category")
    X_test[c] = X_test[c].astype(X[c].dtype)



Families: ['family_0', 'family_1', 'family_2', 'family_3', 'family_4', 'family_5', 'family_6', 'family_7']


In [5]:
# TRAIN (LightGBM + 5-fold CV)
# One booster per stratified fold. Each booster scores its held-out fold
# (out-of-fold predictions) and the full test set (averaged over folds).

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Class-probability buffers: one row per sample, one column per class.
oof_pred = np.zeros((len(X), n_classes), dtype=np.float32)
test_pred = np.zeros((len(X_test), n_classes), dtype=np.float32)

params = dict(
    objective="multiclass",
    num_class=n_classes,
    metric="multi_logloss",
    learning_rate=0.05,
    num_leaves=128,
    min_data_in_leaf=50,
    feature_fraction=0.85,
    bagging_fraction=0.85,
    bagging_freq=1,
    lambda_l2=1.0,
    verbosity=-1,
    seed=42,
)

print("\nStarting training...\n")

for fold, (tr_idx, va_idx) in enumerate(skf.split(X, y_enc), start=1):
    X_tr, y_tr = X.iloc[tr_idx], y_enc[tr_idx]
    X_va, y_va = X.iloc[va_idx], y_enc[va_idx]

    # Pass the detected categorical columns, or "auto" when none were found.
    dtrain = lgb.Dataset(X_tr, label=y_tr, categorical_feature=cat_cols or "auto")
    dvalid = lgb.Dataset(X_va, label=y_va, categorical_feature=cat_cols or "auto")

    # Up to 5000 rounds; stop once the validation loss has not improved for
    # 200 rounds, and log the validation loss every 200 rounds.
    model = lgb.train(
        params,
        dtrain,
        num_boost_round=5000,
        valid_sets=[dvalid],
        callbacks=[lgb.early_stopping(200), lgb.log_evaluation(200)],
    )

    # Predict with the best round found by early stopping, not the last one.
    va_proba = model.predict(X_va, num_iteration=model.best_iteration)
    oof_pred[va_idx] = va_proba

    # Each fold contributes an equal share to the averaged test probabilities.
    test_pred += model.predict(X_test, num_iteration=model.best_iteration) / skf.n_splits

    va_pred = np.argmax(va_proba, axis=1)
    acc = accuracy_score(y_va, va_pred)
    f1 = f1_score(y_va, va_pred, average="macro")
    print(f"Fold {fold}: ACC={acc:.4f}  MacroF1={f1:.4f}")

    # Release the booster and datasets before the next fold.
    del model, dtrain, dvalid
    gc.collect()

# Overall out-of-fold quality: every training row was scored by the one model
# that did not train on it.
oof_label = np.argmax(oof_pred, axis=1)
print("\n=================================")
print("FINAL OOF Accuracy:", accuracy_score(y_enc, oof_label))
print("FINAL OOF Macro F1:", f1_score(y_enc, oof_label, average="macro"))
print("=================================\n")


Starting training...

Training until validation scores don't improve for 200 rounds
[200]	valid_0's multi_logloss: 0.868944
[400]	valid_0's multi_logloss: 0.898321
Early stopping, best iteration is:
[229]	valid_0's multi_logloss: 0.866874
Fold 1: ACC=0.6290  MacroF1=0.6016
Training until validation scores don't improve for 200 rounds
[200]	valid_0's multi_logloss: 0.853689
[400]	valid_0's multi_logloss: 0.881137
Early stopping, best iteration is:
[241]	valid_0's multi_logloss: 0.852359
Fold 2: ACC=0.6190  MacroF1=0.5976
Training until validation scores don't improve for 200 rounds
[200]	valid_0's multi_logloss: 0.874658
[400]	valid_0's multi_logloss: 0.898127
Early stopping, best iteration is:
[222]	valid_0's multi_logloss: 0.872142
Fold 3: ACC=0.6130  MacroF1=0.5916
Training until validation scores don't improve for 200 rounds
[200]	valid_0's multi_logloss: 0.865422
[400]	valid_0's multi_logloss: 0.892579
Early stopping, best iteration is:
[216]	valid_0's multi_logloss: 0.864147
Fold

In [6]:
# Create Result
# Take the most probable class per test row and map the integer class index
# back to its family name with the fitted label encoder.
test_label = np.argmax(test_pred, axis=1)
test_family = le.inverse_transform(test_label)

# Two-column result: sample id and predicted family.
submission = pd.DataFrame({"id": test_ids})
submission["label"] = test_family
submission.to_csv("submission.csv", index=False)

print("Saved submission.csv")
print(submission.head())

Saved submission.csv
   id     label
0   0  family_6
1   1  family_4
2   2  family_4
3   3  family_3
4   4  family_6
