In [None]:
import h5py

IMG_H5 = "../data/image_features.h5"

# Загружаем HDF5
with h5py.File(IMG_H5, "r") as f:
    img_features = f["features"][:]   # (N, D)
    img_ids = f["ids"][:]             # (N,)
    
print("Image features shape:", img_features.shape)
print("First 5 ids:", img_ids[:5])


In [None]:
# Словарь {ItemID: индекс в h5}
id2idx = {int(i): j for j, i in enumerate(img_ids)}

# Матрица для train.csv
img_vectors = []
for item_id in df["ItemID"]:
    if item_id in id2idx:
        img_vectors.append(img_features[id2idx[item_id]])
    else:
        # если вдруг нет картинки → заполним нулями
        img_vectors.append(np.zeros(img_features.shape[1]))
        
img_vectors = np.array(img_vectors)
print("img_vectors shape:", img_vectors.shape)


In [None]:
import pandas as pd, numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score
from scipy import sparse
import lightgbm as lgb

TRAIN_CSV = "../data/ml_ozon_сounterfeit_train.csv"
TEXT_COLS = ["name_rus", "description"]
CAT_COLS = ["brand_name", "CommercialTypeName4"]   
TARGET = "resolution"
ID_COL = "id"   

N_FOLDS = 5
RANDOM_STATE = 42

df = pd.read_csv(TRAIN_CSV)

df[TARGET] = df[TARGET].astype(int)

exclude = {ID_COL, "ItemID", "SellerID", TARGET}
num_cols = [c for c in df.select_dtypes(include=[np.number]).columns if c not in exclude]
cat_cols = [c for c in CAT_COLS if c in df.columns]

print("Numeric cols:", num_cols)
print("Cat cols:", cat_cols)
print("Text cols:", [c for c in TEXT_COLS if c in df.columns])

for c in cat_cols:
    counts = df[c].fillna("##NA##").value_counts(dropna=False).to_dict()
    df[c + "_freq"] = df[c].fillna("##NA##").map(counts).astype(float)
cat_freq_cols = [c + "_freq" for c in cat_cols]

X_num = df[num_cols].copy()
imp = SimpleImputer(strategy="median")
X_num_imp = imp.fit_transform(X_num)          
scaler = StandardScaler()
X_num_scaled = scaler.fit_transform(X_num_imp)

texts = []
for i, row in df.iterrows():
    pieces = []
    for c in TEXT_COLS:
        if c in df.columns:
            pieces.append(str(row.get(c, "")) if pd.notna(row.get(c, "")) else "")
    texts.append(" ".join(pieces))
    
tf = TfidfVectorizer(max_features=80_000, ngram_range=(1,2))
X_text = tf.fit_transform(texts) 

if cat_freq_cols:
    X_catfreq = df[cat_freq_cols].fillna(0).values
else:
    X_catfreq = np.zeros((len(df), 0))

X_img_sparse = sparse.csr_matrix(img_vectors)

X_num_sparse = sparse.csr_matrix(X_num_scaled)
X_cat_sparse = sparse.csr_matrix(X_catfreq)

# склеиваем всё вместе
X = sparse.hstack([X_text, X_cat_sparse, X_num_sparse, X_img_sparse], format="csr")
print("Final feature matrix shape:", X_full.shape)

y = df[TARGET].values

print("Feature matrix shape:", X.shape)

skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=RANDOM_STATE)
oof = np.zeros(len(df))
fold_scores = []

for fold, (tr_idx, va_idx) in enumerate(skf.split(X, y), 1):
    X_tr, X_va = X[tr_idx], X[va_idx]
    y_tr, y_va = y[tr_idx], y[va_idx]

    clf = lgb.LGBMClassifier(
        n_estimators=2000,
        learning_rate=0.05,
        num_leaves=31,
        n_jobs=-1,
        random_state=RANDOM_STATE,
        class_weight='balanced'   # handle imbalance
    )
    clf.fit(
        X_tr, y_tr,
        eval_set=[(X_va, y_va)],
        eval_metric='binary_logloss'
    )

    pval = clf.predict_proba(X_va)[:,1]
    ths = np.linspace(0.01, 0.99, 99)
    best_th, best_f1 = 0.5, 0.0
    for t in ths:
        f = f1_score(y_va, (pval >= t).astype(int))
        if f > best_f1:
            best_f1 = f
            best_th = t

    oof[va_idx] = (pval >= best_th).astype(int)
    f1_fold = f1_score(y_va, oof[va_idx])
    fold_scores.append(f1_fold)
    print(f"Fold {fold} -> best_th={best_th:.3f} | val F1={f1_fold:.4f}")

print("CV folds F1:", fold_scores)
print("OOF F1:", f1_score(y, oof))


Numeric cols: ['rating_1_count', 'rating_2_count', 'rating_3_count', 'rating_4_count', 'rating_5_count', 'comments_published_count', 'photos_published_count', 'videos_published_count', 'PriceDiscounted', 'item_time_alive', 'item_count_fake_returns7', 'item_count_fake_returns30', 'item_count_fake_returns90', 'item_count_sales7', 'item_count_sales30', 'item_count_sales90', 'item_count_returns7', 'item_count_returns30', 'item_count_returns90', 'GmvTotal7', 'GmvTotal30', 'GmvTotal90', 'ExemplarAcceptedCountTotal7', 'ExemplarAcceptedCountTotal30', 'ExemplarAcceptedCountTotal90', 'OrderAcceptedCountTotal7', 'OrderAcceptedCountTotal30', 'OrderAcceptedCountTotal90', 'ExemplarReturnedCountTotal7', 'ExemplarReturnedCountTotal30', 'ExemplarReturnedCountTotal90', 'ExemplarReturnedValueTotal7', 'ExemplarReturnedValueTotal30', 'ExemplarReturnedValueTotal90', 'ItemVarietyCount', 'ItemAvailableCount', 'seller_time_alive']
Cat cols: ['brand_name', 'CommercialTypeName4']
Text cols: ['name_rus', 'descrip



Fold 1 -> best_th=0.660 | val F1=0.8150
[LightGBM] [Info] Number of positive: 10441, number of negative: 147317
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 63.912883 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2256785
[LightGBM] [Info] Number of data points in the train set: 157758, number of used features: 69755
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000




Fold 2 -> best_th=0.630 | val F1=0.8247
[LightGBM] [Info] Number of positive: 10441, number of negative: 147317
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 48.030056 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2255220
[LightGBM] [Info] Number of data points in the train set: 157758, number of used features: 69699
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000




Fold 3 -> best_th=0.700 | val F1=0.8183
[LightGBM] [Info] Number of positive: 10442, number of negative: 147317
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 47.896523 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2255345
[LightGBM] [Info] Number of data points in the train set: 157759, number of used features: 69590
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000




Fold 4 -> best_th=0.760 | val F1=0.8233
[LightGBM] [Info] Number of positive: 10442, number of negative: 147317
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 48.365392 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2250354
[LightGBM] [Info] Number of data points in the train set: 157759, number of used features: 69632
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000




Fold 5 -> best_th=0.760 | val F1=0.8236
CV folds F1: [0.8150203477617463, 0.8247041420118343, 0.8182861514919664, 0.8233236151603499, 0.823552049259188]
OOF F1: 0.820952092177077


In [2]:

# ---------- Save OOF and vectorizer/scaler for inference ----------
df_oof = df[[ID_COL]].copy() if ID_COL in df.columns else df.reset_index()[["index"]].rename(columns={"index":"id"})
df_oof["oof_pred"] = oof.astype(int)
df_oof[TARGET] = y
df_oof.to_csv("../out/oof_baseline_text_tabular.csv", index=False)
print("Saved out/oof_baseline_text_tabular.csv")


Saved out/oof_baseline_text_tabular.csv


In [3]:
# ---------- LOAD TEST ----------
TEST_CSV = "../data/ml_ozon_сounterfeit_test.csv"
test_df = pd.read_csv(TEST_CSV)

# numeric
X_num_test = test_df[num_cols].copy()
X_num_test_imp = imp.transform(X_num_test)
X_num_test_scaled = scaler.transform(X_num_test_imp)
X_num_sparse_test = sparse.csr_matrix(X_num_test_scaled)

# cat frequency encoding
X_catfreq_test = np.zeros((len(test_df), 0))
if cat_freq_cols:
    for c in cat_cols:
        counts = df[c].fillna("##NA##").value_counts(dropna=False).to_dict()
        test_df[c + "_freq"] = test_df[c].fillna("##NA##").map(counts).astype(float)
    X_catfreq_test = test_df[cat_freq_cols].fillna(0).values
X_cat_sparse_test = sparse.csr_matrix(X_catfreq_test)

# text
texts_test = []
for i, row in test_df.iterrows():
    pieces = []
    for c in TEXT_COLS:
        if c in test_df.columns:
            pieces.append(str(row.get(c, "")) if pd.notna(row.get(c, "")) else "")
    texts_test.append(" ".join(pieces))
X_text_test = tf.transform(texts_test)

# final sparse matrix
X_test = sparse.hstack([X_text_test, X_cat_sparse_test, X_num_sparse_test], format="csr")
print("Test feature matrix shape:", X_test.shape)


Test feature matrix shape: (22760, 80039)


In [None]:
# допустим, models = список моделей по фолдам
oof_probs_test = np.zeros(len(test_df))

for model in models:
    oof_probs_test += model.predict_proba(X_test)[:,1] / len(models)

# можно взять средний threshold по фолдам
threshold_test = np.mean(best_thresholds)
test_preds = (oof_probs_test >= threshold_test).astype(int)


In [5]:
# ---------- ADD/REPLACE in your script: keep previous code up to CV setup ----------
# (предполагается, что до этого у тебя уже выполнены: df, X, y, tf, imp, scaler, num_cols, cat_cols, cat_freq_cols)

from sklearn.model_selection import StratifiedKFold
import joblib  # опционально для сохранения моделей на диск
import os

# CONFIG for inference
TEST_CSV = "../data/ml_ozon_сounterfeit_test.csv"
OUT_SUBMISSION = "submission.csv"

# Prepare containers
skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=RANDOM_STATE)
oof = np.zeros(len(df))
oof_probs = np.zeros(len(df))
models = []             # <-- будем сохранять модели по фолдам
best_thresholds = []
fold_scores = []

for fold, (tr_idx, va_idx) in enumerate(skf.split(X, y), 1):
    X_tr, X_va = X[tr_idx], X[va_idx]
    y_tr, y_va = y[tr_idx], y[va_idx]

    clf = lgb.LGBMClassifier(
        n_estimators=2000,
        learning_rate=0.05,
        num_leaves=31,
        n_jobs=-1,
        random_state=RANDOM_STATE,
        class_weight='balanced'
    )
    # fit (можно добавить callbacks=..., verbose=False если нужно)
    clf.fit(X_tr, y_tr, eval_set=[(X_va, y_va)], eval_metric='binary_logloss')

    # сохранить модель в список
    models.append(clf)

    # predict proba on val
    pval = clf.predict_proba(X_va)[:,1]
    oof_probs[va_idx] = pval

    # tune threshold on this val
    ths = np.linspace(0.01, 0.99, 99)
    best_th, best_f1 = 0.5, 0.0
    for t in ths:
        f = f1_score(y_va, (pval >= t).astype(int))
        if f > best_f1:
            best_f1 = f
            best_th = t
    best_thresholds.append(best_th)

    oof[va_idx] = (pval >= best_th).astype(int)
    f1_fold = f1_score(y_va, oof[va_idx])
    fold_scores.append(f1_fold)
    print(f"Fold {fold} -> best_th={best_th:.3f} | val F1={f1_fold:.4f}")

print("CV folds F1:", fold_scores)
print("OOF F1:", f1_score(y, oof))

# (опционально) сохраним модели и трансформеры на диск для повторного использования
os.makedirs("models", exist_ok=True)
for i, m in enumerate(models):
    joblib.dump(m, f"models/lgb_fold{i+1}.pkl")
joblib.dump(tf, "models/tfidf.pkl")
joblib.dump(imp, "models/imputer.pkl")
joblib.dump(scaler, "models/scaler.pkl")
# сохраняем мэппинг частот категорий (если понадобится)
cat_counts = {c: df[c].fillna("##NA##").value_counts(dropna=False).to_dict() for c in cat_cols}
joblib.dump(cat_counts, "models/cat_counts.pkl")

# ---------- INFERENCE ON TEST ----------
test_df = pd.read_csv(TEST_CSV)

# prepare numeric part (use same num_cols, imp, scaler)
X_num_test = test_df[num_cols].copy()
X_num_test_imp = imp.transform(X_num_test)           # imputer fitted on train
X_num_test_scaled = scaler.transform(X_num_test_imp)
X_num_sparse_test = sparse.csr_matrix(X_num_test_scaled)

# prepare cat freq columns (use counts from train: cat_counts)
for c in cat_cols:
    cnts = cat_counts[c]  # from saved dict
    test_df[c + "_freq"] = test_df[c].fillna("##NA##").map(cnts).astype(float)
X_catfreq_test = test_df[[c + "_freq" for c in cat_cols]].fillna(0).values
X_cat_sparse_test = sparse.csr_matrix(X_catfreq_test)

# prepare text
texts_test = []
for i, row in test_df.iterrows():
    pieces = []
    for c in TEXT_COLS:
        if c in test_df.columns:
            pieces.append(str(row.get(c, "")) if pd.notna(row.get(c, "")) else "")
    texts_test.append(" ".join(pieces))
X_text_test = tf.transform(texts_test)   # TF-IDF fitted on train

# final test matrix
X_test = sparse.hstack([X_text_test, X_cat_sparse_test, X_num_sparse_test], format="csr")
print("Test feature matrix shape:", X_test.shape)

# predict: average probabilities from CV models
probs_test = np.zeros(len(test_df))
for m in models:
    probs_test += m.predict_proba(X_test)[:,1] / len(models)

# threshold to use: average of best_thresholds across folds
threshold_test = float(np.mean(best_thresholds))
print("Using threshold (mean over folds):", threshold_test)

preds_test = (probs_test >= threshold_test).astype(int)

# build submission; prefer column ID_COL if exists, else fallback to ItemID or index
if ID_COL in test_df.columns:
    ids_out = test_df[ID_COL].astype(int)
elif "ItemID" in test_df.columns:
    ids_out = test_df["ItemID"].astype(int)
else:
    ids_out = test_df.index.astype(int)

submission = pd.DataFrame({"id": ids_out, "prediction": preds_test})
submission.to_csv(OUT_SUBMISSION, index=False)
print("Saved submission to", OUT_SUBMISSION)


[LightGBM] [Info] Number of positive: 10442, number of negative: 147316
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 50.529773 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2253640
[LightGBM] [Info] Number of data points in the train set: 157758, number of used features: 69812
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000




Fold 1 -> best_th=0.660 | val F1=0.8150
[LightGBM] [Info] Number of positive: 10441, number of negative: 147317
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 48.866597 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2256785
[LightGBM] [Info] Number of data points in the train set: 157758, number of used features: 69755
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000




Fold 2 -> best_th=0.630 | val F1=0.8247
[LightGBM] [Info] Number of positive: 10441, number of negative: 147317
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 49.126455 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2255220
[LightGBM] [Info] Number of data points in the train set: 157758, number of used features: 69699
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000




Fold 3 -> best_th=0.700 | val F1=0.8183
[LightGBM] [Info] Number of positive: 10442, number of negative: 147317
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 48.632538 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2255345
[LightGBM] [Info] Number of data points in the train set: 157759, number of used features: 69590
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000




Fold 4 -> best_th=0.760 | val F1=0.8233
[LightGBM] [Info] Number of positive: 10442, number of negative: 147317
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 49.679432 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2250354
[LightGBM] [Info] Number of data points in the train set: 157759, number of used features: 69632
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000




Fold 5 -> best_th=0.760 | val F1=0.8236
CV folds F1: [0.8150203477617463, 0.8247041420118343, 0.8182861514919664, 0.8233236151603499, 0.823552049259188]
OOF F1: 0.820952092177077
Test feature matrix shape: (22760, 80039)




Using threshold (mean over folds): 0.702
Saved submission to submission.csv
