In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
input_file_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/datasets-avito/train-dset.parquet
/kaggle/input/datasets-avito/test-dset-small.parquet


# Preprocessing + TF-IDF (Train)

In [3]:
import os, gc, json
import numpy as np
import pandas as pd

import pyarrow as pa
import pyarrow.dataset as ds

from sklearn.feature_extraction.text import HashingVectorizer, TfidfTransformer
from sklearn.decomposition import TruncatedSVD
import joblib

In [4]:
# Featurized training batches are written to this directory as parquet part files.
OUT_DIR_TRAIN = "train_featurized_parts"
os.makedirs(OUT_DIR_TRAIN, exist_ok=True)

# Raw inputs from the read-only Kaggle dataset mount.
TRAIN_PATH = "/kaggle/input/datasets-avito/train-dset.parquet"
TEST_PATH = "/kaggle/input/datasets-avito/test-dset-small.parquet"

In [5]:
# Report the table sizes without materialising any column data: pyarrow's
# dataset API can count rows of a parquet file from its metadata, whereas the
# previous pandas read loaded the whole `query_id` column just to take its length.
print("Reading schemas only...")
# NOTE: despite the `_cols` suffix these variables hold ROW counts; the names
# are kept unchanged in case other cells refer to them.
train_cols = ds.dataset(TRAIN_PATH, format="parquet").count_rows()
test_cols = ds.dataset(TEST_PATH, format="parquet").count_rows()
print("train rows:", train_cols)
print("test rows:", test_cols)

Reading schemas only...
train rows: 7781790
test rows: 335348


In [6]:
print("Compute global price clip...")
price = pd.read_parquet(TRAIN_PATH, columns=["price"])["price"]

# Upper clip bound is the 99.9th percentile of the training prices;
# the lower bound is fixed at zero.
q = price.quantile([0.999])
PRICE_CLIP_LOW = 0.0
PRICE_CLIP_HIGH = float(q.iloc[0])
print("PRICE_CLIP_HIGH =", PRICE_CLIP_HIGH)

# Persist both bounds so later cells can reload them from disk.
clip_bounds = {"PRICE_CLIP_LOW": PRICE_CLIP_LOW, "PRICE_CLIP_HIGH": PRICE_CLIP_HIGH}
with open("global_price_clip.json", "w") as f:
    json.dump(clip_bounds, f)

Compute global price clip...
PRICE_CLIP_HIGH = 22000000.0


In [7]:
print("Create qid folds...")

# Unique query ids, shuffled with a fixed seed so the split is reproducible.
qids = (
    pd.read_parquet(TRAIN_PATH, columns=["query_id"])["query_id"]
    .drop_duplicates()
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Five near-equal folds of query ids; each fold is written to its own parquet file.
folds = np.array_split(qids.values, 5)
for fold_idx, fold_qids in enumerate(folds):
    fold_frame = pd.DataFrame({"query_id": fold_qids})
    fold_frame.to_parquet(f"qids_fold_{fold_idx}.parquet", index=False)

print("Saved folds sizes:", [len(fold_qids) for fold_qids in folds])
print("Total unique qids:", len(qids))

Create qid folds...
Saved folds sizes: [135638, 135638, 135638, 135638, 135638]
Total unique qids: 678190


In [8]:
import numpy as np
import pyarrow as pa
import pyarrow.dataset as ds
import gc, joblib
from sklearn.feature_extraction.text import HashingVectorizer
import scipy.sparse as sp

# Training table from which the document frequencies are collected.
TRAIN_PATH = "/kaggle/input/datasets-avito/train-dset.parquet"

N_FEATURES = 2**18  # width of the hashed feature space (number of hash buckets)
NGRAM_RANGE = (1, 2)  # hash both unigrams and bigrams
BATCH_SIZE_IDF = 50_000  # rows requested per scanned record batch
MAX_ROWS_IDF = None  # optional cap on rows used for the IDF pass; None scans the whole file

# Hashing vectorizer configured to emit raw, unnormalised values (norm=None)
# without sign alternation, so the IDF weighting can be applied manually later.
# The token pattern uses \w+ and therefore also keeps single-character tokens.
hv = HashingVectorizer(
    n_features=N_FEATURES,
    alternate_sign=False,
    norm=None,
    lowercase=True,
    token_pattern=r"(?u)\b\w+\b",
    ngram_range=NGRAM_RANGE
)

# Stream only the three text columns, batch by batch, instead of loading the file at once.
dataset = ds.dataset(TRAIN_PATH, format="parquet")
scanner = dataset.scanner(columns=["query_text","item_title","item_description"], batch_size=BATCH_SIZE_IDF)

# Per-bucket document-frequency accumulators: A is used for "query + title"
# documents and B for "query + description" documents in the loop that follows.
df_A = np.zeros(N_FEATURES, dtype=np.int64)
df_B = np.zeros(N_FEATURES, dtype=np.int64)
n_docs = 0  # running number of rows (documents) seen

def update_df(df_vec, X_counts):
    """Accumulate per-column document frequencies into ``df_vec`` in place.

    For every column of the sparse matrix ``X_counts``, the number of rows that
    hold a strictly positive value is added to the matching entry of ``df_vec``.
    The function mutates ``df_vec`` and returns nothing.
    """
    occurrence_mask = (X_counts > 0).astype(np.int8)
    docs_per_column = occurrence_mask.sum(axis=0)
    np.add(df_vec, np.asarray(docs_per_column).ravel(), out=df_vec)

# Single streaming pass over the training text: hash each batch, accumulate
# document frequencies for both text views, then derive smoothed IDF weights.
total = 0
for bi, batch in enumerate(scanner.to_batches()):
    frame = pa.Table.from_batches([batch]).to_pandas()

    # Missing item text becomes an empty string before concatenation.
    query = frame["query_text"].astype(str)
    item_title = frame["item_title"].fillna("").astype(str)
    item_desc = frame["item_description"].fillna("").astype(str)

    # View A: query + title, view B: query + description.
    docs_A = (query + " " + item_title).values
    docs_B = (query + " " + item_desc).values

    counts_A = hv.transform(docs_A)
    counts_B = hv.transform(docs_B)

    update_df(df_A, counts_A)
    update_df(df_B, counts_B)

    batch_rows = len(frame)
    n_docs += batch_rows
    total += batch_rows

    # Drop the per-batch objects before the next batch is loaded.
    del frame, query, item_title, item_desc, docs_A, docs_B, counts_A, counts_B
    gc.collect()

    if not bi % 10:
        print(f"IDF batch={bi}, processed_rows={total}")

    if MAX_ROWS_IDF is not None and total >= MAX_ROWS_IDF:
        print("Reached MAX_ROWS_IDF, stopping.")
        break

# Smoothed inverse document frequency: log((1 + n) / (1 + df)) + 1.
idf_A, idf_B = (
    np.log((1.0 + n_docs) / (1.0 + doc_freq)) + 1.0
    for doc_freq in (df_A, df_B)
)

# NOTE: the IDF vectors are joblib pickles even though the files end in ".npy";
# they must be read back with joblib.load, not np.load.
joblib.dump(hv, "hashing_vectorizer.joblib")
joblib.dump(idf_A.astype(np.float32), "idf_A.npy")
joblib.dump(idf_B.astype(np.float32), "idf_B.npy")

print("Saved hashing_vectorizer.joblib, idf_A.npy, idf_B.npy")
print("n_docs:", n_docs)

IDF batch=0, processed_rows=50000
IDF batch=10, processed_rows=550000
IDF batch=20, processed_rows=1048576
IDF batch=30, processed_rows=1548576
IDF batch=40, processed_rows=2048576
IDF batch=50, processed_rows=2547152
IDF batch=60, processed_rows=3047152
IDF batch=70, processed_rows=3545728
IDF batch=80, processed_rows=4045728
IDF batch=90, processed_rows=4544304
IDF batch=100, processed_rows=5044304
IDF batch=110, processed_rows=5542880
IDF batch=120, processed_rows=6042880
IDF batch=130, processed_rows=6541456
IDF batch=140, processed_rows=7041456
IDF batch=150, processed_rows=7540032
Saved hashing_vectorizer.joblib, idf_A.npy, idf_B.npy
n_docs: 7781790


In [9]:
import numpy as np
import scipy.sparse as sp

def tfidf_transform_counts(X_counts_csr, idf_vec, sublinear_tf=True, l2_norm=True):
    """Turn a sparse term-count matrix into a TF-IDF matrix.

    Parameters
    ----------
    X_counts_csr : scipy sparse matrix, shape (n_rows, n_features)
        Term counts. The input is never modified; a CSR copy is transformed.
    idf_vec : array-like, n_features values
        Per-column IDF weights (a 1-D vector or a single row/column is accepted).
    sublinear_tf : bool, default True
        Replace each stored count ``c`` with ``log1p(c)`` before weighting.
    l2_norm : bool, default True
        Scale every row to unit Euclidean length. All-zero rows stay all-zero.

    Returns
    -------
    scipy sparse matrix in CSR format with the same shape as the input.

    Raises
    ------
    ValueError
        If ``idf_vec`` does not contain exactly one weight per column.
    """
    X = X_counts_csr.tocsr(copy=True)

    idf = np.asarray(idf_vec).ravel()
    if idf.shape[0] != X.shape[1]:
        raise ValueError(
            f"inconsistent shapes: idf_vec has {idf.shape[0]} entries, "
            f"matrix has {X.shape[1]} columns"
        )

    if sublinear_tf:
        X.data = np.log1p(X.data)

    # Scale each stored value by its column's weight directly on the CSR
    # buffers. Unlike `X.multiply(dense)`, this keeps the matrix in CSR form
    # instead of round-tripping through COO.
    X.data = X.data * idf[X.indices]

    if l2_norm:
        # np.asarray(...).ravel() works whether `.sum` returns np.matrix or ndarray.
        row_norms = np.sqrt(np.asarray(X.power(2).sum(axis=1)).ravel())
        row_norms[row_norms == 0] = 1.0  # leave empty rows untouched
        # Expand the per-row factor to one entry per stored value.
        X.data = X.data * np.repeat(1.0 / row_norms, np.diff(X.indptr))

    return X

In [10]:
import numpy as np
import pyarrow as pa
import pyarrow.dataset as ds
import gc, joblib
from sklearn.decomposition import TruncatedSVD

# Reload the artifacts written by the IDF cell. They are joblib pickles (the
# ".npy" suffix notwithstanding), so only files produced by this notebook
# should be loaded here.
hv = joblib.load("hashing_vectorizer.joblib")
idf_A = joblib.load("idf_A.npy")
idf_B = joblib.load("idf_B.npy")

TRAIN_PATH = "/kaggle/input/datasets-avito/train-dset.parquet"
dataset = ds.dataset(TRAIN_PATH, format="parquet")

BATCH_SIZE_SVD_SAMPLE = 80_000  # rows requested per scanned batch while collecting text
SVD_TRAIN_ROWS = 300_000  # number of rows the SVD models are fitted on
N_COMP = 128  # SVD components kept for each text view

scanner = dataset.scanner(columns=["query_text","item_title","item_description"], batch_size=BATCH_SIZE_SVD_SAMPLE)

textsA_chunks, textsB_chunks = [], []
collected = 0

# Gather text in scan order until at least SVD_TRAIN_ROWS rows are collected.
# NOTE(review): this takes the FIRST rows of the file rather than a random
# sample — confirm that the file order does not bias the fitted components.
for batch in scanner.to_batches():
    tbl = pa.Table.from_batches([batch]).to_pandas()
    qt = tbl["query_text"].astype(str)
    title = tbl["item_title"].fillna("").astype(str)
    desc  = tbl["item_description"].fillna("").astype(str)

    # View A: query + title, view B: query + description.
    textsA_chunks.append((qt + " " + title).values)
    textsB_chunks.append((qt + " " + desc).values)

    collected += len(tbl)
    del tbl, qt, title, desc
    gc.collect()

    if collected >= SVD_TRAIN_ROWS:
        break

# The last batch may overshoot, so trim to exactly SVD_TRAIN_ROWS rows.
textsA = np.concatenate(textsA_chunks)[:SVD_TRAIN_ROWS]
textsB = np.concatenate(textsB_chunks)[:SVD_TRAIN_ROWS]
del textsA_chunks, textsB_chunks
gc.collect()

XA_counts = hv.transform(textsA)
XB_counts = hv.transform(textsB)

# `tfidf_transform_counts` is defined in a separate cell of this notebook.
XA = tfidf_transform_counts(XA_counts, idf_A, sublinear_tf=True, l2_norm=True)
XB = tfidf_transform_counts(XB_counts, idf_B, sublinear_tf=True, l2_norm=True)

# One independent SVD per text view, seeded for reproducibility.
svd_A = TruncatedSVD(n_components=N_COMP, random_state=42)
svd_B = TruncatedSVD(n_components=N_COMP, random_state=42)
svd_A.fit(XA)
svd_B.fit(XB)

joblib.dump(svd_A, "svd_A_128.joblib")
joblib.dump(svd_B, "svd_B_128.joblib")
print("Saved SVD models")

# Release the sample and its matrices; the trailing gc.collect() return value
# is what the cell displays as its output.
del textsA, textsB, XA_counts, XB_counts, XA, XB
gc.collect()

Saved SVD models


0

In [11]:
import os, json
import pyarrow as pa
import pyarrow.dataset as ds
import numpy as np
import pandas as pd
import joblib, gc

# Featurize the training table batch by batch and write one parquet part per batch.
TRAIN_PATH = "/kaggle/input/datasets-avito/train-dset.parquet"
OUT_DIR_TRAIN = "train_featurized_parts"
os.makedirs(OUT_DIR_TRAIN, exist_ok=True)

# Artifacts written by the earlier cells of this notebook. joblib.load
# unpickles its input, so only load files this notebook produced itself.
hv = joblib.load("hashing_vectorizer.joblib")
idf_A = joblib.load("idf_A.npy")
idf_B = joblib.load("idf_B.npy")
svd_A = joblib.load("svd_A_128.joblib")
svd_B = joblib.load("svd_B_128.joblib")

with open("global_price_clip.json", "r") as f:
    clip = json.load(f)
PRICE_CLIP_LOW = clip["PRICE_CLIP_LOW"]
PRICE_CLIP_HIGH = clip["PRICE_CLIP_HIGH"]

BATCH_SIZE_FEAT = 30_000  # rows requested per scanned batch

dataset = ds.dataset(TRAIN_PATH, format="parquet")
scanner = dataset.scanner(columns=[
    "query_id","item_id",
    "query_text","item_title","item_description",
    "query_cat","query_mcat","query_loc",
    "item_cat_id","item_mcat_id","item_loc",
    "price","item_query_click_conv",
    "item_contact"
], batch_size=BATCH_SIZE_FEAT)

part = 0
rows_total = 0

for bi, batch in enumerate(scanner.to_batches()):
    df = pa.Table.from_batches([batch]).to_pandas()

    df["item_title"] = df["item_title"].fillna("")
    df["item_description"] = df["item_description"].fillna("")
    df["query_mcat"] = df["query_mcat"].fillna(-1)

    # -1 is treated as the "no conversion value" sentinel: it is flagged in
    # `conv_missing` and replaced by 0 in `conv_val`.
    df["conv_missing"] = (df["item_query_click_conv"] == -1).astype(np.int8)
    df["conv_val"] = df["item_query_click_conv"].where(df["item_query_click_conv"] != -1, 0).astype(np.float32)

    df["price_clip"] = df["price"].clip(lower=PRICE_CLIP_LOW, upper=PRICE_CLIP_HIGH).astype(np.float32)
    df["price_log"]  = np.log1p(df["price_clip"]).astype(np.float32)

    # Compare ids as float64, not float32: float32 has a 24-bit significand, so
    # two different ids above 2**24 could become equal after the cast and be
    # reported as a match. Missing values still compare unequal (NaN != NaN).
    df["is_loc_match"] = (df["query_loc"].astype("float64") == df["item_loc"].astype("float64")).astype(np.int8)
    df["is_cat_match"] = (df["query_cat"].astype("float64") == df["item_cat_id"].astype("float64")).astype(np.int8)

    # View A: query + title, view B: query + description.
    textA = (df["query_text"].astype(str) + " " + df["item_title"].astype(str)).values
    textB = (df["query_text"].astype(str) + " " + df["item_description"].astype(str)).values

    XA_counts = hv.transform(textA)
    XB_counts = hv.transform(textB)

    # `tfidf_transform_counts` is defined in a separate cell of this notebook.
    XA = tfidf_transform_counts(XA_counts, idf_A, sublinear_tf=True, l2_norm=True)
    XB = tfidf_transform_counts(XB_counts, idf_B, sublinear_tf=True, l2_norm=True)

    ZA = svd_A.transform(XA).astype(np.float32)
    ZB = svd_B.transform(XB).astype(np.float32)

    ZA_df = pd.DataFrame(ZA, columns=[f"tfidfA_svd_{j}" for j in range(ZA.shape[1])])
    ZB_df = pd.DataFrame(ZB, columns=[f"tfidfB_svd_{j}" for j in range(ZB.shape[1])])

    # reset_index so the SVD frames (RangeIndex) align row-for-row on concat.
    df = pd.concat([df.reset_index(drop=True), ZA_df, ZB_df], axis=1)

    # Raw text and the un-transformed numeric columns are not kept in the output.
    df = df.drop(columns=["query_text","item_title","item_description","price","item_query_click_conv"])

    out_path = f"{OUT_DIR_TRAIN}/part_{part:03d}.parquet"
    df.to_parquet(out_path, index=False)

    rows_total += len(df)
    part += 1

    # Release every per-batch object (including the SVD frames) before the next batch.
    del df, textA, textB, XA_counts, XB_counts, XA, XB, ZA, ZB, ZA_df, ZB_df
    gc.collect()

    if bi % 10 == 0:
        print(f"FEAT batch={bi}, saved_parts={part}, rows_total={rows_total}")

print("DONE. parts:", part, "rows:", rows_total)

FEAT batch=0, saved_parts=1, rows_total=30000
FEAT batch=10, saved_parts=11, rows_total=330000
FEAT batch=20, saved_parts=21, rows_total=630000
FEAT batch=30, saved_parts=31, rows_total=930000
FEAT batch=40, saved_parts=41, rows_total=1228576
FEAT batch=50, saved_parts=51, rows_total=1528576
FEAT batch=60, saved_parts=61, rows_total=1828576
FEAT batch=70, saved_parts=71, rows_total=2127152
FEAT batch=80, saved_parts=81, rows_total=2427152
FEAT batch=90, saved_parts=91, rows_total=2727152
FEAT batch=100, saved_parts=101, rows_total=3027152
FEAT batch=110, saved_parts=111, rows_total=3325728
FEAT batch=120, saved_parts=121, rows_total=3625728
FEAT batch=130, saved_parts=131, rows_total=3925728
FEAT batch=140, saved_parts=141, rows_total=4224304
FEAT batch=150, saved_parts=151, rows_total=4524304
FEAT batch=160, saved_parts=161, rows_total=4824304
FEAT batch=170, saved_parts=171, rows_total=5124304
FEAT batch=180, saved_parts=181, rows_total=5422880
FEAT batch=190, saved_parts=191, rows_t

# Preprocessing + TF-IDF (Test)

In [12]:
import os, gc, json
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.dataset as ds
import joblib
import scipy.sparse as sp

In [13]:
# Featurized test batches are written to this directory as parquet part files.
OUT_DIR_TEST = "test_featurized_parts"
os.makedirs(OUT_DIR_TEST, exist_ok=True)

# Raw test table from the Kaggle dataset mount.
TEST_PATH = "/kaggle/input/datasets-avito/test-dset-small.parquet"

In [14]:
# Fitted text artifacts saved while processing the training data.
# joblib.load unpickles its input, so only load files this notebook produced.
hv = joblib.load("hashing_vectorizer.joblib")
svd_A = joblib.load("svd_A_128.joblib")
svd_B = joblib.load("svd_B_128.joblib")
idf_A = joblib.load("idf_A.npy")
idf_B = joblib.load("idf_B.npy")

# Price clip bounds computed on the training data, coerced to float.
with open("global_price_clip.json", "r") as f:
    clip = json.load(f)
PRICE_CLIP_LOW, PRICE_CLIP_HIGH = (
    float(clip[bound_name]) for bound_name in ("PRICE_CLIP_LOW", "PRICE_CLIP_HIGH")
)

In [15]:
def tfidf_transform_counts(X_counts_csr, idf_vec, sublinear_tf=True, l2_norm=True):
    """Turn a sparse term-count matrix into a TF-IDF matrix.

    Parameters
    ----------
    X_counts_csr : scipy sparse matrix, shape (n_rows, n_features)
        Term counts. The input is never modified; a CSR copy is transformed.
    idf_vec : array-like, n_features values
        Per-column IDF weights (a 1-D vector or a single row/column is accepted).
    sublinear_tf : bool, default True
        Replace each stored count ``c`` with ``log1p(c)`` before weighting.
    l2_norm : bool, default True
        Scale every row to unit Euclidean length. All-zero rows stay all-zero.

    Returns
    -------
    scipy sparse matrix in CSR format with the same shape as the input.

    Raises
    ------
    ValueError
        If ``idf_vec`` does not contain exactly one weight per column.
    """
    X = X_counts_csr.tocsr(copy=True)

    idf = np.asarray(idf_vec).ravel()
    if idf.shape[0] != X.shape[1]:
        raise ValueError(
            f"inconsistent shapes: idf_vec has {idf.shape[0]} entries, "
            f"matrix has {X.shape[1]} columns"
        )

    if sublinear_tf:
        X.data = np.log1p(X.data)

    # Scale each stored value by its column's weight directly on the CSR
    # buffers. Unlike `X.multiply(dense)`, this keeps the matrix in CSR form
    # instead of round-tripping through COO.
    X.data = X.data * idf[X.indices]

    if l2_norm:
        # np.asarray(...).ravel() works whether `.sum` returns np.matrix or ndarray.
        row_norms = np.sqrt(np.asarray(X.power(2).sum(axis=1)).ravel())
        row_norms[row_norms == 0] = 1.0  # leave empty rows untouched
        # Expand the per-row factor to one entry per stored value.
        X.data = X.data * np.repeat(1.0 / row_norms, np.diff(X.indptr))

    return X

In [16]:
# Featurize the test table batch by batch with the artifacts fitted on the
# training data, writing one parquet part per batch.
BATCH_SIZE = 30_000  # rows requested per scanned batch

dataset = ds.dataset(TEST_PATH, format="parquet")
scanner = dataset.scanner(columns=[
    "query_id","item_id",
    "query_text","item_title","item_description",
    "query_cat","query_mcat","query_loc",
    "item_cat_id","item_mcat_id","item_loc",
    "price","item_query_click_conv",
], batch_size=BATCH_SIZE)

part = 0
rows_total = 0

for bi, batch in enumerate(scanner.to_batches()):
    df = pa.Table.from_batches([batch]).to_pandas()

    df["item_title"] = df["item_title"].fillna("")
    df["item_description"] = df["item_description"].fillna("")
    df["query_mcat"] = df["query_mcat"].fillna(-1)

    # -1 is treated as the "no conversion value" sentinel: it is flagged in
    # `conv_missing` and replaced by 0 in `conv_val`.
    df["conv_missing"] = (df["item_query_click_conv"] == -1).astype(np.int8)
    df["conv_val"] = df["item_query_click_conv"].where(df["item_query_click_conv"] != -1, 0).astype(np.float32)

    df["price_clip"] = df["price"].clip(lower=PRICE_CLIP_LOW, upper=PRICE_CLIP_HIGH).astype(np.float32)
    df["price_log"]  = np.log1p(df["price_clip"]).astype(np.float32)

    # Compare ids as float64, not float32: float32 has a 24-bit significand, so
    # two different ids above 2**24 could become equal after the cast and be
    # reported as a match. Missing values still compare unequal (NaN != NaN).
    df["is_loc_match"] = (df["query_loc"].astype("float64") == df["item_loc"].astype("float64")).astype(np.int8)
    df["is_cat_match"] = (df["query_cat"].astype("float64") == df["item_cat_id"].astype("float64")).astype(np.int8)

    # View A: query + title, view B: query + description.
    textA = (df["query_text"].astype(str) + " " + df["item_title"].astype(str)).values
    textB = (df["query_text"].astype(str) + " " + df["item_description"].astype(str)).values

    XA_counts = hv.transform(textA)
    XB_counts = hv.transform(textB)

    XA = tfidf_transform_counts(XA_counts, idf_A, sublinear_tf=True, l2_norm=True)
    XB = tfidf_transform_counts(XB_counts, idf_B, sublinear_tf=True, l2_norm=True)

    ZA = svd_A.transform(XA).astype(np.float32)
    ZB = svd_B.transform(XB).astype(np.float32)

    ZA_df = pd.DataFrame(ZA, columns=[f"tfidfA_svd_{j}" for j in range(ZA.shape[1])])
    ZB_df = pd.DataFrame(ZB, columns=[f"tfidfB_svd_{j}" for j in range(ZB.shape[1])])
    # reset_index so the SVD frames (RangeIndex) align row-for-row on concat.
    df = pd.concat([df.reset_index(drop=True), ZA_df, ZB_df], axis=1)

    # Raw text and the un-transformed numeric columns are not kept in the output.
    df = df.drop(columns=["query_text","item_title","item_description","price","item_query_click_conv"])

    out_path = f"{OUT_DIR_TEST}/part_{part:03d}.parquet"
    df.to_parquet(out_path, index=False)

    rows_total += len(df)
    part += 1

    # Release every per-batch object before the next batch is loaded.
    del df, ZA, ZB, ZA_df, ZB_df, XA_counts, XB_counts, XA, XB, textA, textB
    gc.collect()

    if bi % 10 == 0:
        print(f"TEST batch={bi}, saved_parts={part}, rows_total={rows_total}")

print("DONE. test parts:", part, "rows:", rows_total)
print("Output dir:", OUT_DIR_TEST)

TEST batch=0, saved_parts=1, rows_total=30000
TEST batch=10, saved_parts=11, rows_total=330000
DONE. test parts: 12 rows: 335348
Output dir: test_featurized_parts


# Model

In [17]:
import glob, gc
import pandas as pd
import numpy as np
from catboost import CatBoostRanker, Pool

In [18]:
# Build the in-memory training and validation frames from the featurized parts.
PARTS_DIR = "train_featurized_parts"
parts = sorted(glob.glob(f"{PARTS_DIR}/part_*.parquet"))
print("n_parts:", len(parts))

# Fold 0 is the validation split; folds 1 and 2 together form the training split.
val_qids = set(pd.read_parquet("qids_fold_0.parquet")["query_id"].values)

train_qids = set()
for fold_file in ("qids_fold_1.parquet", "qids_fold_2.parquet"):
    train_qids |= set(pd.read_parquet(fold_file)["query_id"].values)

print("val_qids:", len(val_qids), "train_qids:", len(train_qids))

train_chunks, val_chunks = [], []

for part_idx, part_path in enumerate(parts):
    part_df = pd.read_parquet(part_path)

    # Keep only the rows whose query id belongs to the respective split.
    in_val = part_df["query_id"].isin(val_qids)
    if in_val.any():
        val_chunks.append(part_df[in_val])

    in_train = part_df["query_id"].isin(train_qids)
    if in_train.any():
        train_chunks.append(part_df[in_train])

    if part_idx % 20 == 0:
        tr_rows = sum(map(len, train_chunks))
        va_rows = sum(map(len, val_chunks))
        print(f"processed {part_idx}/{len(parts)} | tr_rows={tr_rows} va_rows={va_rows}")

    del part_df
    gc.collect()

tr = pd.concat(train_chunks, ignore_index=True)
va = pd.concat(val_chunks, ignore_index=True)
del train_chunks, val_chunks
gc.collect()

print("FINAL tr shape:", tr.shape, "va shape:", va.shape)
print("tr target mean:", tr["item_contact"].mean(), "va target mean:", va["item_contact"].mean())

n_parts: 260
val_qids: 135638 train_qids: 271276
processed 0/260 | tr_rows=12635 va_rows=5473
processed 20/260 | tr_rows=251457 va_rows=124516
processed 40/260 | tr_rows=490909 va_rows=242458
processed 60/260 | tr_rows=730689 va_rows=361931
processed 80/260 | tr_rows=970499 va_rows=482273
processed 100/260 | tr_rows=1210114 va_rows=603283
processed 120/260 | tr_rows=1446432 va_rows=722355
processed 140/260 | tr_rows=1687490 va_rows=841704
processed 160/260 | tr_rows=1927545 va_rows=961651
processed 180/260 | tr_rows=2165897 va_rows=1081899
processed 200/260 | tr_rows=2407558 va_rows=1202363
processed 220/260 | tr_rows=2645871 va_rows=1322770
processed 240/260 | tr_rows=2887513 va_rows=1443137
FINAL tr shape: (3112699, 271) va shape: (1556058, 271)
tr target mean: 0.04416553 va target mean: 0.043723308


In [19]:
# Train a CatBoost ranking model on the assembled frames, validate on `va`,
# and save the best iteration to disk.
TARGET = "item_contact"
GROUP = "query_id"
# Identifier columns and the label are excluded from the model inputs;
# every other column of `tr` is used as a feature.
drop_cols = [TARGET, "item_id", "query_id"]
features = [c for c in tr.columns if c not in drop_cols]

print("n_features:", len(features))

# Stable sort by query id so that all rows of one query are adjacent before the
# group ids are handed to the Pool (CatBoost documents that grouped data must be
# contiguous — see its group_id notes).
tr = tr.sort_values(GROUP, kind="mergesort").reset_index(drop=True)
va = va.sort_values(GROUP, kind="mergesort").reset_index(drop=True)

# No column is declared categorical, so every feature (including the id-like
# category/location columns) is passed as a numeric feature.
cat_features = []

train_pool = Pool(
    tr[features],
    label=tr[TARGET].astype(int),
    group_id=tr[GROUP].values,
    cat_features=cat_features
)
valid_pool = Pool(
    va[features],
    label=va[TARGET].astype(int),
    group_id=va[GROUP].values,
    cat_features=cat_features
)

ranker = CatBoostRanker(
    loss_function="YetiRank",
    eval_metric="NDCG:top=10",
    iterations=3000,
    learning_rate=0.05,
    depth=8,
    random_seed=42,
    verbose=100,
    task_type="GPU"
)

# use_best_model=True keeps the iteration with the best validation metric.
ranker.fit(train_pool, eval_set=valid_pool, use_best_model=True)

print("best_iteration:", ranker.get_best_iteration())
print("best_score:", ranker.get_best_score())

# "40pct": the training split is built from 2 of the 5 query-id folds.
ranker.save_model("catboost_ranker_40pct.cbm")
print("Saved model: catboost_ranker_40pct.cbm")

n_features: 268


Default metric period is 5 because PFound, NDCG is/are not implemented for GPU
Metric PFound is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Metric NDCG:top=10;type=Base is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	test: 0.7968169	best: 0.7968169 (0)	total: 515ms	remaining: 25m 44s
100:	test: 0.8135200	best: 0.8135200 (100)	total: 12.7s	remaining: 6m 5s
200:	test: 0.8142720	best: 0.8142720 (200)	total: 24.9s	remaining: 5m 46s
300:	test: 0.8149713	best: 0.8149713 (300)	total: 36.9s	remaining: 5m 31s
400:	test: 0.8155117	best: 0.8155117 (400)	total: 49s	remaining: 5m 17s
500:	test: 0.8159535	best: 0.8159593 (490)	total: 1m 1s	remaining: 5m 4s
600:	test: 0.8163110	best: 0.8163110 (600)	total: 1m 13s	remaining: 4m 52s
700:	test: 0.8165800	best: 0.8165800 (700)	total: 1m 25s	remaining: 4m 41s
800:	test: 0.8166666	best: 0.8167175 (790)	total: 1m 38s	remaining: 4m 29s
900:	test: 0.8169356	best: 0.8169356 (900)	total: 1m 50s	remaining: 4m 17s
1000:	test: 0.8170393	best: 0.8171149 (985)	total: 2m 3s	remaining: 4m 6s
1100:	test: 0.8171425	best: 0.8171739 (1090)	total: 2m 16s	remaining: 3m 54s
1200:	test: 0.8171862	best: 0.8172302 (1175)	total: 2m 29s	remaining: 3m 43s
1300:	test: 0.8172736	best: 0.81730

# Predict + Submission

In [20]:
import glob, gc
import pandas as pd
from catboost import CatBoostRanker

In [21]:
# Reload the ranker from the file written by the training cell, so prediction
# uses the persisted model rather than whatever object is still in memory.
ranker = CatBoostRanker()
ranker.load_model("catboost_ranker_40pct.cbm")

<catboost.core.CatBoostRanker at 0x7925a0517b30>

In [22]:
# Score every featurized test part with the loaded ranker and collect
# (query_id, item_id, score) rows into a single frame `pred`.
PARTS_DIR_TEST = "test_featurized_parts"
test_parts = sorted(glob.glob(f"{PARTS_DIR_TEST}/part_*.parquet"))
print("n_test_parts:", len(test_parts))
if not test_parts:
    # Fail with a clear message instead of pd.concat's "No objects to concatenate".
    raise FileNotFoundError(f"no featurized test parts found under {PARTS_DIR_TEST!r}")

# Take the feature list (names and order) from the model itself rather than
# from each part's column order, so extra or reordered columns in the parquet
# files cannot silently shift features. Computed once, outside the loop.
feature_cols = list(ranker.feature_names_)

pred_parts = []
total = 0

for i, p in enumerate(test_parts):
    df = pd.read_parquet(p)

    missing = [c for c in feature_cols if c not in df.columns]
    if missing:
        raise ValueError(f"{p} is missing {len(missing)} model feature(s), e.g. {missing[:10]}")

    df["score"] = ranker.predict(df[feature_cols])

    pred_parts.append(df[["query_id", "item_id", "score"]])
    total += len(df)

    del df
    gc.collect()

    print(f"pred {i+1}/{len(test_parts)} total_rows={total}")

pred = pd.concat(pred_parts, ignore_index=True)
del pred_parts
gc.collect()

n_test_parts: 12
pred 1/12 total_rows=30000
pred 2/12 total_rows=60000
pred 3/12 total_rows=90000
pred 4/12 total_rows=120000
pred 5/12 total_rows=150000
pred 6/12 total_rows=180000
pred 7/12 total_rows=210000
pred 8/12 total_rows=240000
pred 9/12 total_rows=270000
pred 10/12 total_rows=300000
pred 11/12 total_rows=330000
pred 12/12 total_rows=335348


0

In [23]:
# Within each query, order items from highest to lowest score, then keep only
# the id columns for the submission file.
ranked_pred = pred.sort_values(by=["query_id", "score"], ascending=[True, False])
submission_df = ranked_pred[["query_id", "item_id"]]
submission_df.to_csv("solution.csv", header=["query_id", "item_id"], index=False)

print("Saved solution.csv shape:", submission_df.shape)
print(submission_df.head(20))

Saved solution.csv shape: (335348, 2)
    query_id     item_id
4         55  7552455685
15        55   823036541
11        55  7464296355
19        55  3635185843
14        55  7555956997
22        55  7562354327
28        55  7489404885
8         55  7552064016
34        55  7549689548
27        55  7576666895
24        55  2427651981
3         55  7587733901
30        55  4600495891
31        55  7568390402
21        55  2231316338
26        55  7585975325
29        55  7377513020
6         55  3298429910
0         55  7540855789
18        55  2499344704
