In [24]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import hamming_loss
from scipy import sparse

In [2]:
# Read the labelled training set and the unlabelled test set from the
# current working directory (same order as the file names below).
train, test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

In [25]:
# Quick sanity check of what was loaded: row/column counts and the train schema.
for caption, payload in (
    ("Train shape:", train.shape),
    ("Test shape:", test.shape),
    ("Train columns:", train.columns.tolist()),
):
    print(caption, payload)

Train shape: (11216, 3)
Test shape: (2805, 2)
Train columns: ['id', 'TEXT', 'LABEL']


In [26]:
# Parse the comma-separated LABEL column into one list of label names per row.
# Missing labels (NaN) become '' and therefore an empty label list.
labels_str = train['LABEL'].fillna('').astype(str)
# Strip whitespace around every label and drop empty fragments, so that
# "A,B", "A, B" and "A,,B" all yield the same two labels instead of creating
# spurious classes such as " B" or "". An empty string still maps to [].
y_raw = labels_str.apply(
    lambda s: [part.strip() for part in s.split(',') if part.strip()]
)

In [27]:
# Encode the per-row label lists as a binary indicator matrix:
# one column per distinct label, 1 where the row carries that label.
mlb = MultiLabelBinarizer()
Y = mlb.fit_transform(y_raw)

# Report the label vocabulary learned by the binarizer.
label_names = mlb.classes_
print("Количество меток:", len(label_names))
print("Примеры меток:", label_names[:10])

Количество меток: 20
Примеры меток: ['Б/у' 'Возврат' 'Вопрос клиента' 'Вопрос про гарантию' 'Доставка негатив'
 'Доставка позитив' 'Другой товар' 'Каспи' 'Неинформативный'
 'Неполная комплектация']


In [53]:
# Raw texts for train and test. Missing TEXT values are replaced with '' first:
# astype(str) alone would turn NaN into the literal token "nan", which the
# TF-IDF vectorizers would then treat as a real word. This matches the
# fillna('') already applied to the LABEL column.
X_text = train['TEXT'].fillna('').astype(str)
X_test_text = test['TEXT'].fillna('').astype(str)

# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_val, Y_train, Y_val = train_test_split(
    X_text, Y, test_size=0.2, random_state=42
)

In [99]:
# Two complementary TF-IDF views of the text, both with sublinear (log-scaled)
# term frequency:
#   * word_tfidf - word 1- to 3-grams, vocabulary capped at 40000 features
#   * char_tfidf - character 3- to 6-grams within word boundaries ('char_wb'),
#                  vocabulary capped at 20000 features
# NOTE(review): these caps allow at most 40000 + 20000 = 60000 stacked columns,
# yet the recorded feature-shape output further down shows 90000. That output
# appears to come from an earlier configuration - re-run the notebook top to
# bottom to refresh it.
word_tfidf = TfidfVectorizer(sublinear_tf=True, max_features=40000, ngram_range=(1, 3))
char_tfidf = TfidfVectorizer(
    sublinear_tf=True, max_features=20000, ngram_range=(3, 6), analyzer='char_wb'
)

In [100]:
# Word-level view: learn the vocabulary/IDF on the training split only,
# then project validation and test texts onto that same vocabulary.
W_train = word_tfidf.fit_transform(X_train)
W_val = word_tfidf.transform(X_val)
W_test = word_tfidf.transform(X_test_text)

# Character-level view: same protocol (fit on train, transform the rest).
C_train = char_tfidf.fit_transform(X_train)
C_val = char_tfidf.transform(X_val)
C_test = char_tfidf.transform(X_test_text)

# Concatenate word and char features column-wise; CSR gives efficient row access.
X_train_all = sparse.hstack((W_train, C_train)).tocsr()
X_val_all = sparse.hstack((W_val, C_val)).tocsr()
X_test_all = sparse.hstack((W_test, C_test)).tocsr()

In [101]:
# Show the (rows, columns) of each stacked feature matrix; the column count
# must be identical across the three splits.
for caption, matrix in (
    ("Train features:", X_train_all),
    ("Val features:", X_val_all),
    ("Test features:", X_test_all),
):
    print(caption, matrix.shape)

Train features: (8972, 90000)
Val features: (2244, 90000)
Test features: (2805, 90000)


In [102]:
# Linear model trained with SGD on the logistic loss (i.e. a logistic-regression
# style objective) with L2 regularisation of strength alpha.
# max_iter=5 is a hard cap of five passes over the data: it keeps training
# fast, but scikit-learn may emit a ConvergenceWarning if the stopping
# tolerance is not reached within that budget.
base_clf = SGDClassifier(
    random_state=42,
    max_iter=5,
    alpha=1e-4,
    penalty='l2',
    loss='log_loss',
)
# One independent binary classifier per label, fitted in parallel on all cores.
clf = OneVsRestClassifier(estimator=base_clf, n_jobs=-1)

In [103]:
# Fit the one-vs-rest model on the training split (features from the
# train-fitted TF-IDF vectorizers, targets from the label indicator matrix).
print("Обучаем модель...")
clf.fit(X=X_train_all, y=Y_train)
print("Готово ✅")

Обучаем модель...
Готово ✅


In [104]:
# Per-label probabilities on the validation split: one row per sample,
# one column per label. These feed the threshold search below.
P_val = clf.predict_proba(X=X_val_all)
print("Подбираем пороги для каждой метки...")

Подбираем пороги для каждой метки...


In [105]:
# Candidate decision thresholds: 19 evenly spaced values 0.05, 0.10, ..., 0.95.
grid = np.linspace(start=0.05, stop=0.95, num=19)
# One threshold slot per label column of the validation probabilities.
best_thresholds = np.zeros(shape=P_val.shape[1], dtype=float)

In [106]:
# Per-label threshold search: for each label pick the grid value that minimises
# the fraction of misclassified validation rows for that label.
for label_idx, (truth, scores) in enumerate(zip(Y_val.T, P_val.T)):
    # Error rate at every candidate threshold, in grid order.
    error_rates = [
        np.mean((scores >= cut).astype(int) != truth) for cut in grid
    ]
    # argmin returns the first minimum, i.e. the lowest threshold among ties.
    winner = int(np.argmin(error_rates))
    # Fall back to 0.5 unless some threshold achieves an error rate below 1.0.
    best_thresholds[label_idx] = grid[winner] if error_rates[winner] < 1.0 else 0.5

# Apply the per-label thresholds (broadcast across rows) and score the result.
# Caveat: the thresholds were chosen on this same validation split, so the
# reported Hamming loss is an optimistic estimate.
Y_val_pred_opt = np.greater_equal(P_val, best_thresholds).astype(int)
hl_opt = hamming_loss(Y_val, Y_val_pred_opt)
print(f"Hamming Loss (optimized): {hl_opt:.5f}")

Hamming Loss (optimized): 0.03509


In [107]:
# Rebuild the features on the complete training set (train + validation rows).
# Note: fit_transform refits the SAME vectorizer objects in place, so after this
# cell word_tfidf / char_tfidf carry the full-data vocabulary and no longer
# match the matrices built earlier from the training split.
print("Переобучаем на всём train...")
W_full = word_tfidf.fit_transform(X_text)
C_full = char_tfidf.fit_transform(X_text)
X_full = sparse.hstack((W_full, C_full)).tocsr()

Переобучаем на всём train...


In [108]:
# Final model: same base estimator configuration, trained on every labelled row.
clf_full = OneVsRestClassifier(estimator=base_clf, n_jobs=-1)
clf_full.fit(X=X_full, y=Y)
print("Обучение завершено ✅")

Обучение завершено ✅


In [109]:
# Project the test texts with the vectorizers as refitted on the full train set,
# then stack word and char features exactly as was done for training.
W_test_full = word_tfidf.transform(X_test_text)
C_test_full = char_tfidf.transform(X_test_text)
X_test_full = sparse.hstack((W_test_full, C_test_full)).tocsr()

# Probabilities from the full model, binarised with the per-label thresholds
# that were tuned on the validation split.
P_test = clf_full.predict_proba(X_test_full)
Y_test_pred = np.greater_equal(P_test, best_thresholds).astype(int)

# Map each indicator row back to a tuple of label names.
pred_labels = mlb.inverse_transform(Y_test_pred)

In [110]:
# Serialise each row's predicted labels as a comma-separated string;
# joining an empty tuple already yields '', so rows without labels stay blank.
label_strings = [','.join(lbls) for lbls in pred_labels]
submission = pd.DataFrame({'id': test['id'], 'LABEL': label_strings})

# Write the submission file (no index column) and preview the first rows.
submission.to_csv('submission-clf8.csv', index=False)
print("Первые строки:")
print(submission.head())

Первые строки:
      id                         LABEL
0   7366                      Продавец
1  10959                              
2  13290  Товар или упаковка поврежден
3   9625                  Другой товар
4    487                              
