In [1]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
import torch
from catboost import CatBoostClassifier

In [2]:
# Read the three competition files; all of them are tab-separated.
train, test, sample_sub = (
    pd.read_csv(tsv_path, sep="\t")
    for tsv_path in (
        "vseross/E/data/train.tsv",
        "vseross/E/data/test.tsv",
        "vseross/E/data/sample_submission.tsv",
    )
)

In [3]:
def make_text(df):
    """Build one text field per row from the three descriptive columns.

    The columns 'app_name', 'shortDescription' and 'full_description' are
    read from ``df``; missing values become empty strings, every value is
    cast to ``str``, and the three pieces are joined with ' . '.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains the three columns named above.

    Returns
    -------
    pandas.Series
        Concatenated text, indexed like ``df``.
    """
    separator = ' . '
    source_columns = ('app_name', 'shortDescription', 'full_description')

    # Clean every column the same way before gluing the pieces together.
    pieces = [df[column].fillna('').astype(str) for column in source_columns]

    text = pieces[0]
    for piece in pieces[1:]:
        text = text + separator + piece
    return text

# Add the combined text column to both the training and the test frame.
for frame in (train, test):
    frame['text'] = make_text(frame)

In [4]:
def _split_labels(raw):
    """Split a '|'-separated label string into a list; '' yields an empty list."""
    if raw != '':
        return raw.split('|')
    return []


# Missing label strings are treated the same as an empty label string.
train['labels_list'] = train['labels_str'].fillna('').apply(_split_labels)

# Encode the per-row label lists as a multi-label target matrix.
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(train['labels_list'])

# Number of distinct labels seen in the training data.
len(mlb.classes_)

45

In [5]:
from sentence_transformers import SentenceTransformer

# Use the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("Device:", device)

# Alternative checkpoint (currently disabled): "sergeyzh/BERTA"
model_name = "sergeyzh/LaBSE-ru-turbo"
model = SentenceTransformer(model_name, device=device)

Device: cuda


In [6]:
# Disabled cell: embedding extraction with an on-disk cache.
# When enabled, encode_texts() loads `cache_path` if that file exists; otherwise it
# encodes `texts` with `model.encode` and saves the array to `cache_path`.
# The two calls below target "train_embs2.npy" / "test_embs2.npy", the same file
# names that a later cell loads with np.load.
# NOTE(review): the code that produced "train_embs1.npy" / "test_embs1.npy" is not
# in this notebook — presumably the same routine run with another checkpoint; confirm.
#
# def encode_texts(texts, batch_size=64, cache_path=None):
#     """ Encode in batches; optionally cache the result on disk """
#     if cache_path and os.path.exists(cache_path):
#         print("Loading embeddings from", cache_path)
#         return np.load(cache_path)
#     embs = model.encode(texts, batch_size=batch_size, show_progress_bar=True, convert_to_numpy=True)
#     if cache_path:
#         np.save(cache_path, embs)
#     return embs

# X_all = encode_texts(train['text'].tolist(), batch_size=64, cache_path="train_embs2.npy")
# X_test = encode_texts(test['text'].tolist(), batch_size=64, cache_path="test_embs2.npy")

In [7]:
# Load the two precomputed embedding sets (train and test matrix for each).
X_all1, X_test1 = (
    np.load(npy_path) for npy_path in ('train_embs1.npy', 'test_embs1.npy')
)
X_all2, X_test2 = (
    np.load(npy_path) for npy_path in ('train_embs2.npy', 'test_embs2.npy')
)

In [8]:
# Place the two embedding sets side by side (feature axis) for train and test.
# np.concatenate is used instead of np.concat: the latter is an alias that only
# exists in NumPy >= 2.0 and raises AttributeError on NumPy 1.x.
X_all = np.concatenate([X_all1, X_all2], axis=1)
X_test = np.concatenate([X_test1, X_test2], axis=1)

In [9]:
# Hold out 10% of the rows for validation; the seed is fixed so the split is repeatable.
X_tr, X_val, y_tr, y_val = train_test_split(
    X_all,
    y,
    random_state=42,
    test_size=0.1,
)

In [18]:
# Inspect the first column of the target matrix.
# NOTE(review): the recorded output below shows values of 2, whereas `y` as built
# from MultiLabelBinarizer earlier would hold 0/1 indicators. This cell also carries
# an out-of-order execution count, so `y` was presumably reassigned in a cell that
# is no longer present — confirm on a fresh Restart & Run All.
y[:, 0]

array([2, 2, 2, ..., 2, 2, 2], shape=(53494,))

In [None]:
from sklearn.calibration import CalibratedClassifierCV

# Earlier baseline, kept for reference (disabled):
# clf = OneVsRestClassifier(
#     LogisticRegression(max_iter=2000, n_jobs=-1, solver='saga')
# )
# clf.fit(X_tr, y_tr)

# LinearSVC has no predict_proba, so each one-vs-rest model is wrapped in
# CalibratedClassifierCV to obtain per-class probabilities.
# random_state is pinned (same seed as the train/validation split) so that the
# fit is repeatable when the randomized dual solver is in use.
base_clf = LinearSVC(random_state=42)
clf = OneVsRestClassifier(CalibratedClassifierCV(base_clf))

# Fit on every training row, using only the first embedding set (X_all1);
# the test set is later scored with the matching X_test1.
clf.fit(X_all1, y)

In [None]:
# Disabled: validation-set probabilities.
# NOTE(review): `clf` is fit on X_all1, while X_val is split from the concatenated
# X_all (X_all1 + X_all2 along the feature axis), so re-enabling this line as-is
# would presumably fail on a feature-count mismatch — confirm before use.
# proba_val = clf.predict_proba(X_val)

In [None]:
# Disabled: hit-rate@k validation metric.
# For each row, the k highest-scoring class indices are taken from y_proba; the row
# counts as a hit when at least one of them is a true label (y_true_bin == 1).
# Rows without any true label are skipped when counting hits, but the denominator
# is still len(y_true_bin), so such rows lower the reported rate.
#
# def hitrate_at_k(y_true_bin, y_proba, k=3):
#     topk = np.argsort(y_proba, axis=1)[:, -k:]
#     topk_sets = [set(row) for row in topk]
#     hits = 0
#     for i in tqdm(range(len(y_true_bin))):
#         true_inds = set(np.where(y_true_bin[i] == 1)[0])
#         if len(true_inds) == 0:
#             continue
#         if len(true_inds & topk_sets[i]) > 0:
#             hits += 1
#     return hits / len(y_true_bin)

# h3 = hitrate_at_k(y_val, proba_val, k=3)

In [None]:
# Score the test set with the fitted one-vs-rest model. X_test1 is loaded from
# 'test_embs1.npy', the test-side counterpart of the X_all1 matrix clf was fit on.
proba_test = clf.predict_proba(X_test1)

In [None]:
# Probabilities from `clf`, one column per label in mlb.classes_.
# NOTE: despite the name `logreg_df`, `clf` is the calibrated LinearSVC model;
# the logistic-regression variant is commented out.
logreg_df = pd.DataFrame(proba_test.tolist(), columns=mlb.classes_)

# Probability tables read from CSV files.
# NOTE(review): presumably per-class probabilities exported by separate BERTA-based
# runs — the code that wrote these files is not in this notebook; confirm.
(
    berta_probas_df,
    berta_probas_df1,
    berta_probas_df2,
    berta_probas_df3,
    berta_probas_df4,
) = (
    pd.read_csv(csv_path)
    for csv_path in (
        'berta_probs1.csv',
        'berta_probs2.csv',
        'berta_probs3.csv',
        'berta_probs4.csv',
        'berta_probs5.csv',
    )
)

In [None]:
# Weighted blend of the probability tables. The (frame, weight) pairs are listed
# in the order in which the terms are summed.
blend_parts = [
    (berta_probas_df, 0.3),
    (logreg_df, 0.2),
    (berta_probas_df1, 0.2),
    (berta_probas_df2, 0.2),
    (berta_probas_df3, 0.1),
    # (berta_probas_df4, 0.1),  # currently left out of the blend
]

# pandas arithmetic aligns on column labels and on the row index, so a table
# with a missing column or a different row index would silently produce NaN.
# np.argsort places NaN last, so such entries would then be picked as "top"
# classes. Fail loudly instead of propagating NaN into the submission.
blend_columns = berta_probas_df.columns
blend_index = berta_probas_df.index

probas_df = None
for position, (frame, weight) in enumerate(blend_parts):
    missing_columns = blend_columns.difference(frame.columns)
    if len(missing_columns) > 0:
        raise KeyError(
            f"blend table #{position} lacks columns: {list(missing_columns)}"
        )
    if not frame.index.equals(blend_index):
        raise ValueError(
            f"blend table #{position} has a different row index than the first table"
        )
    term = frame[blend_columns] * weight
    probas_df = term if probas_df is None else probas_df + term

In [None]:
# probas_df.to_csv('all_probas.csv', index=False)

In [None]:
# Put the blended columns in logreg_df's column order (built from mlb.classes_)
# so that column positions can be mapped back to labels through mlb.classes_.
blended_in_class_order = probas_df.loc[:, logreg_df.columns]
proba_test1 = blended_in_class_order.to_numpy()

In [None]:
# Indices of the three highest-scoring classes per row, highest probability first.
ascending_order = np.argsort(proba_test1, axis=1)
top3_idx = ascending_order[:, :-4:-1]  # last three columns, reversed

In [None]:
def indices_to_labelstr(indices_row, classes=None):
    """Convert a row of class indices into a '|'-separated label string.

    Parameters
    ----------
    indices_row : iterable of int
        Positions into ``classes``, in the order the labels should appear.
    classes : sequence of str, optional
        Label names indexed by position. Defaults to ``mlb.classes_`` from the
        notebook's global label binarizer, which was the only option before.

    Returns
    -------
    str
        The selected labels joined with '|'; an empty string for an empty row.
    """
    if classes is None:
        # Backward-compatible default: use the notebook-level binarizer.
        classes = mlb.classes_
    return "|".join(classes[i] for i in indices_row)

# One '|'-joined label string per test row, built from its top-3 class indices.
test_labels_pred = [indices_to_labelstr(row) for row in top3_idx]

In [None]:
# Assemble the two-column submission: the app name from the test frame plus the
# predicted label string for the same row.
submission = pd.DataFrame({"app_name": test['app_name']})
submission["labels_str"] = test_labels_pred

# Write it as a tab-separated file without the index column.
submission.to_csv("bbbb.tsv", sep="\t", index=False)

In [None]:
# Display the submission frame as a final visual check.
submission