## Необходимые импорты

In [None]:
import os
import zipfile
import gzip
import shutil
import gdown

import pandas as pd
import numpy as np
from scipy import sparse

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import QuantileTransformer
from sklearn.feature_extraction import FeatureHasher
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import roc_auc_score

import lightgbm as lgb

## Загрузка данных

In [None]:
import os

if not(os.path.exists('vk-predict-cmc-hackathon.zip')):
    ! pip install kaggle
    ! kaggle competitions download -c vk-predict-cmc-hackathonv

In [None]:
archive_path = 'vk-predict-cmc-hackathon.zip'
target_file = 'train.csv'

# Unpack the competition archive into the working directory unless the
# training table is already there.
if not os.path.exists(target_file):
    if not os.path.exists(archive_path):
        raise FileNotFoundError(f'Не найден архив: {archive_path}')
    with zipfile.ZipFile(archive_path, 'r') as z:
        z.extractall('.')
    # Verify the expected file really came out of the archive before
    # reporting success, so a wrong/incomplete archive fails here with a
    # clear message instead of later inside read_csv.
    if not os.path.exists(target_file):
        raise FileNotFoundError(
            f'Файл {target_file} не найден после распаковки {archive_path}'
        )
    print(f'Файл {target_file} успешно извлечён из {archive_path}')
else:
    print(f'Файл {target_file} уже существует — распаковка не нужна')

In [None]:
FILE_ID = '15Aw_wF1JeiShnZ2vSccLWLnuJdRVMRep'
GZ_NAME = 'text_data.tsv.gz'
TSV_NAME = 'text_data.tsv'

# Fetch the gzipped text table from Google Drive and decompress it, unless the
# decompressed file is already present.
if not os.path.exists(TSV_NAME):
    url = f'https://drive.google.com/uc?export=download&id={FILE_ID}'
    downloaded = gdown.download(url, GZ_NAME, quiet=False)
    if downloaded is None:
        # NOTE(review): some gdown versions signal a failed download by
        # returning None instead of raising; fail loudly in that case.
        raise RuntimeError(f'Не удалось скачать {GZ_NAME}')

    # Decompress into a temporary name and rename only once the copy has
    # finished: the guard above tests mere existence, so a truncated
    # TSV_NAME left by an interrupted run would otherwise be accepted as
    # complete on the next execution.
    tmp_name = TSV_NAME + '.part'
    with gzip.open(GZ_NAME, 'rb') as f_in, open(tmp_name, 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)
    os.replace(tmp_name, TSV_NAME)
    print(f'{TSV_NAME} успешно получен из {GZ_NAME}')
else:
    print(f'{TSV_NAME} уже существует, пропускаем загрузку и распаковку')

## Загрузка с downcast

In [None]:
# Column -> dtype maps used to downcast while parsing: the 1367 columns
# feature_0 .. feature_1366 are read as float32 and the label as int8.
dtype_train = dict.fromkeys((f'feature_{idx}' for idx in range(1367)), np.float32)
dtype_train.update(target=np.int8)

# The test map reuses the training map and additionally pins 'index' to int32.
dtype_test = dict(dtype_train, index=np.int32)

data_train = pd.read_csv('train.csv', dtype=dtype_train)
data_test = pd.read_csv('test.csv', dtype=dtype_test)

# Tab-separated text table; 'word_count' is kept as a raw string and parsed
# later.
text_data = pd.read_csv(
    'text_data.tsv',
    sep='\t',
    dtype={'index': np.int32, 'word_count': str},
)

## Numeric features → median → normal_quantile → CSR

In [None]:
# Every column whose name starts with 'feature_' is treated as numeric input.
num_cols = [col for col in data_train.columns if col.startswith('feature_')]

# Median imputation followed by a quantile mapping to a normal distribution.
imp = SimpleImputer(strategy='median')
qt = QuantileTransformer(output_distribution='normal', n_quantiles=500, random_state=42)

# Both transformers are fitted on the training rows only ...
Xn_tr = sparse.csr_matrix(
    qt.fit_transform(imp.fit_transform(data_train[num_cols])).astype(np.float32)
)

# ... and then applied unchanged to the test rows.
Xn_te = sparse.csr_matrix(
    qt.transform(imp.transform(data_test[num_cols])).astype(np.float32)
)

## Text features → FeatureHasher → SVD → CSR

### Функция обработки текста в словари

In [None]:
def gen_dicts(series):
    """Lazily convert whitespace-separated "token count" strings into dicts.

    Each item is split on whitespace and consumed as consecutive
    ``token count`` pairs, producing ``{token: int(count)}``. A trailing
    token without a count is ignored, and when a token repeats the last
    count wins.

    Args:
        series: Iterable of strings. Items that are not strings (for example
            ``None`` or a float NaN standing for a missing cell) are treated
            as empty text.

    Yields:
        dict: One ``{token: count}`` mapping per input item, in input order.

    Raises:
        ValueError: If a count cannot be parsed by ``int``.
    """
    for s in series:
        if not isinstance(s, str):
            # Missing values have no .split(); emit an empty mapping so the
            # output stays aligned one-to-one with the input rows.
            yield {}
            continue
        parts = s.split()
        # Even positions are tokens, odd positions are their counts; zip
        # stops at the shorter slice, dropping an unpaired trailing token.
        yield {tok: int(cnt) for tok, cnt in zip(parts[::2], parts[1::2])}

### Основная предобработка

In [None]:
# Missing texts become empty strings so they can be split like any other row.
text_data['word_count'] = text_data['word_count'].fillna('')

# Text table keyed by 'index', used to line the texts up with train/test rows.
word_counts_by_index = text_data.set_index('index')

hasher = FeatureHasher(n_features=2**14, input_type='dict')
svd = TruncatedSVD(n_components=50, random_state=42)

# Train: rows without a text entry get '' and therefore an empty dict.
train_words = word_counts_by_index.reindex(data_train['index'], fill_value='')['word_count']
Xh_tr = hasher.transform(gen_dicts(train_words))
# The SVD is fitted on the hashed training texts only.
Xsv_tr = sparse.csr_matrix(svd.fit_transform(Xh_tr).astype(np.float32))

# Test: same alignment and hashing, projected with the already fitted SVD.
test_words = word_counts_by_index.reindex(data_test['index'], fill_value='')['word_count']
Xh_te = hasher.transform(gen_dicts(test_words))
Xsv_te = sparse.csr_matrix(svd.transform(Xh_te).astype(np.float32))

## Создание итогового датасета из CSR

In [None]:
# Place the numeric block and the text-SVD block side by side (CSR output).
X_train = sparse.hstack((Xn_tr, Xsv_tr), format='csr')
X_test = sparse.hstack((Xn_te, Xsv_te), format='csr')

# Labels as a plain NumPy array.
y_train = data_train['target'].to_numpy()

## LightGBM с CV

In [None]:
# LightGBM settings: binary objective evaluated with AUC, trained on the GPU
# device with row/feature subsampling and L1/L2 regularisation.
params = dict(
    objective='binary',
    metric='auc',
    boosting_type='gbdt',
    device='gpu',
    learning_rate=0.01,
    num_leaves=128,
    min_data_in_leaf=30,
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=1,
    lambda_l1=1.0,
    lambda_l2=1.0,
    is_unbalance=True,
    verbosity=1,
)

# free_raw_data=False keeps the raw matrix attached to the Dataset, which is
# reused for the final fit after cross-validation.
dtrain = lgb.Dataset(X_train, label=y_train, free_raw_data=False)

# Early stopping with a patience of 30 rounds; progress logged every 50 rounds.
cv_callbacks = [
    lgb.early_stopping(stopping_rounds=30),
    lgb.log_evaluation(period=50),
]

# 5-fold stratified, shuffled cross-validation with up to 2000 boosting rounds.
cv_res = lgb.cv(
    params,
    dtrain,
    num_boost_round=2000,
    nfold=5,
    stratified=True,
    shuffle=True,
    seed=42,
    callbacks=cv_callbacks,
)

### Получаем лучший iter

In [None]:
# First key of the CV history that ends with '-mean' (the averaged metric).
mean_key = next(filter(lambda name: name.endswith('-mean'), cv_res))

# The number of recorded rounds is used as the round count for the final fit.
best_iter = len(cv_res[mean_key])

print(f"Best CV rounds: {best_iter}, CV AUC: {cv_res[mean_key][-1]:.5f}")

## Финальное обучение и проверка на train

In [None]:
# Refit on the full training set for the round count chosen by CV.
model = lgb.train(params=params, train_set=dtrain, num_boost_round=best_iter)

# In-sample score: computed on the same rows the model was trained on.
train_preds = model.predict(X_train)
train_auc = roc_auc_score(y_train, train_preds)
print("Train ROC‑AUC:", train_auc)

## Предсказания на тесте

In [None]:
# Score the test rows with the final model.
preds = model.predict(X_test)

# Submission table: the test 'index' column plus the predicted 'score'.
submission = data_test[['index']].assign(score=preds)
submission.to_csv('submission.csv', index=False)

print("Предсказания для test готовы в файле submission.csv")