In [46]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler
from sklearn.multioutput import MultiOutputClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import hamming_loss

In [47]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [48]:
print("Train columns:", train.columns.tolist())
print("Test columns:", test.columns.tolist())


Train columns: ['id', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '40', '41', '42', '43', '44', '45', '46', '47', '48', '49', '50', '51', '52', '53', '54', '55', '56', '57', '58', '59', '60', '61', '62', '63', '64', '65', '66', '67', '68', '69', '70', '71', '72', '73', '74', '75', '76', '77', '78', '79', '80', '81', '82', '83', '84', '85', '86', '87', '88', '89', '90', '91', '92', '93', '94', '95', '96', '97', '98', '99', '100', '101', '102', '103', '104', '105', '106', '107', '108', '109', '110', '111', '112', '113', '114', '115', '116', '117', '118', '119', '120', '121', '122', '123', '124', '125', '126', '127', '128', '129', '130', '131', '132', '133', '134', '135', '136', '137', '138', '139', '140', '141', '142', '143', '144', '145', '146', '147'

In [49]:
drop_cols = [col for col in ['Unnamed: 0', 'id', 'label'] if col in train.columns]
X = train.drop(columns=drop_cols)
y_raw = train['label'].apply(lambda x: x.split(','))

In [50]:
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(y_raw)

In [51]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


In [52]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(test.drop(columns=[col for col in ['Unnamed: 0', 'id'] if col in test.columns]))

In [53]:
lgbm = LGBMClassifier(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=-1,
    num_leaves=64,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

model = MultiOutputClassifier(lgbm)
model.fit(X_train_scaled, y_train)

[LightGBM] [Info] Number of positive: 289, number of negative: 8603
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.047657 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 25473
[LightGBM] [Info] Number of data points in the train set: 8892, number of used features: 961
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.032501 -> initscore=-3.393440
[LightGBM] [Info] Start training from score -3.393440
[LightGBM] [Info] Number of positive: 1722, number of negative: 7170
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.044966 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 25473
[LightGBM] [Info] Number of data points in the train set: 8892, number of used features: 961
[LightGBM] [Info] [bi

In [54]:
y_pred = model.predict(X_val_scaled)
loss = hamming_loss(y_val, y_pred)
print(f"Hamming loss на валидации: {loss:.5f}")



Hamming loss на валидации: 0.05959


In [55]:
test_pred = model.predict(X_test_scaled)
pred_labels = mlb.inverse_transform(test_pred)



In [56]:
if 'id' in test.columns:
    ids = test['id']
elif 'Unnamed: 0' in test.columns:
    ids = test['Unnamed: 0']
else:
    ids = range(len(test))

submission = pd.DataFrame({
    'id': ids,
    'label': [','.join(labels) if len(labels) > 0 else '' for labels in pred_labels]
})

submission = submission.drop_duplicates(subset='id', keep='first')

submission.to_csv('submission-moc-lgbm1.csv', index=False)