In [1]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, roc_auc_score
# from catboost import CatBoostClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from pytorch_tabnet.tab_model import TabNetClassifier

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [1]:
# data1 = pd.read_parquet('train_transactions_contest/part_048_970384_to_987313.parquet')

app_id - Идентификатор заявки. Заявки пронумерованы так, что более поздним заявкам соответствует более поздняя дата
amnt - Нормированная сумма транзакции. 0.0 - соответствует пропускам
currency - Идентификатор валюты транзакции
operation_kind - Идентификатор типа транзакции
card_type - Уникальный идентификатор типа карты
operation_type - Идентификатор типа операции по пластиковой карте
operation_type_group - Идентификатор группы карточных операций, например, дебетовая карта или кредитная карта
ecommerce_flag - Признак электронной коммерции
payment_system - Идентификатор типа платежной системы
income_flag - Признак списания/внесения денежных средств на карту
mcc - Уникальный идентификатор типа торговой точки
country - Идентификатор страны транзакции
city - Идентификатор города транзакции
mcc_category - Идентификатор категории магазина транзакции
day_of_week - День недели, когда транзакция была совершена
hour - Час, когда транзакция была совершена
days_before - Количество дней до даты выдачи кредита
weekofyear - Номер недели в году, когда транзакция была совершена
hour_diff - Количество часов с момента прошлой транзакции для данного клиента
transaction_number - Порядковый номер транзакции клиента

In [492]:
pd.set_option('display.max_rows', 300)

In [119]:
num_features = ['amnt', 'day_of_week', 'hour', 'days_before', 'weekofyear', 'hour_diff']
cat_features = ['currency', 'operation_kind', 'card_type', 'operation_type', 'operation_type_group', 
                'ecommerce_flag', 'payment_system', 'income_flag', 'mcc', 'country', 'city', 'mcc_category']

def get_mode(series):
    return series.mode().iloc[0] if not series.mode().empty else None

def aggregate_features(data):
    n_transactions = data.groupby(['app_id']).count()['transaction_number'].to_frame()
    n_transactions.columns = ['n_transactions']
    num_stat = data.groupby('app_id')[num_features].agg('describe')
    num_stat.columns = num_stat.columns.map(' '.join)
    cat_agg = data.groupby('app_id')[cat_features].agg(['nunique', get_mode])
    cat_agg.columns = cat_agg.columns.map(' '.join)
    cat_counts = []
    for feature in cat_features:
        cat_count = data.groupby(['app_id', feature])['transaction_number'].count().unstack(fill_value=0)
        cat_count.columns = [' '.join([feature, str(col)]) for col in cat_count.columns]
        cat_counts.append(cat_count)
    cat_counts = pd.concat(cat_counts, axis=1)
    amnt_type = data.groupby(['app_id', 'income_flag'])['amnt'].sum().unstack()
    amnt_type.columns = [' '.join(['amnt type', str(col)]) for col in amnt_type.columns]
    
    return pd.concat([n_transactions, num_stat, cat_agg, cat_counts, amnt_type], axis=1)     

In [120]:
def process_file(dir, filename):
    data = pd.read_parquet(os.path.join(dir, filename))
    main_features = aggregate_features(data)
    last_data = data.groupby(['app_id']).apply(lambda x: x.tail(round(0.2 *(len(x)))))
    last_features = aggregate_features(last_data.reset_index(drop=True))
    abs_in_last = list(set(main_features.columns) - set(last_features.columns))
    last_features[abs_in_last] = 0
    last_features.columns = 'last period ' + last_features.columns

    last_values = data.groupby(['app_id']).agg('last').reset_index(drop=True)
    last_values.columns = 'last ' + last_values.columns

    fin_data = pd.concat([main_features.reset_index(drop=True), last_features.reset_index(drop=True), last_values], axis=1)
    fin_data['app_id'] = main_features.index
    return fin_data

In [None]:
dirname = 'train_transactions_contest'
all_data = []
for filename in tqdm(os.listdir(dirname)[35:]):
    all_data.append(process_file(dirname, filename))
train_data = pd.concat(all_data)

In [None]:
# def process_file(dir, filename, func):
#     data = pd.read_parquet(os.path.join(dir, filename)).reset_index(drop=True)
#     cumsum = data.groupby(['app_id'])['hour_diff'].apply(lambda x: x[::-1].cumsum()[::-1]).reset_index()
#     data['cum_sum'] = cumsum['hour_diff']
#     data['month_bucket'] = data['cum_sum'] // (24 * 30)
#     data = data[data['month_bucket'] <= 2]
#     last_features = func(data)
#     last_features.columns = 'last period ' + last_features.columns
#     return last_features

In [None]:
# all_data = []
# for filename in tqdm(os.listdir(dirname)[:25]):
#     all_data.append(process_file(dirname, filename, aggregate_features))
# last_data = pd.concat(all_data)

100%|██████████| 25/25 [1:45:12<00:00, 252.49s/it]


In [3]:
target = pd.read_csv('train_target.csv')
train_data = train_data.merge(target, on='app_id', how='left')

In [4]:
train_data = train_data.drop(['Unnamed: 0', 'Unnamed: 0.1'], axis=1)

In [5]:
train_0 = train_data[train_data['flag'] == 0]
train_1 = train_data[train_data['flag'] == 1]
sample0 = train_0.sample(n=23000).reset_index()
sample1 = train_1.sample(n=23000).reset_index()
train_sample = pd.concat([sample0, sample1])
train_sample = train_sample.sample(frac=1).reset_index(drop=True)
train_sample = train_sample.drop(['index'], axis=1)

In [6]:
train_size = round(0.8 * len(train_sample))
train = train_sample[:train_size]
test = train_sample[train_size:]

In [11]:
train.to_csv('train_final.csv')
test.to_csv('test_final.csv')

In [5]:
y = train_data['flag']
X = train_data.drop('flag', axis=1)
X = X.fillna(0)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
lgb_model = lgb.LGBMClassifier(verbose=-1, random_state=42)
# lgb_model.fit(X_train, y_train)
cv_scores = cross_val_score(lgb_model, X_train, y_train, scoring='f1_weighted', cv=6)
print(sum(cv_scores) / len(cv_scores))

found 0 physical cores < 1
  File "c:\Users\zayda\AppData\Local\Programs\Python\Python312\Lib\site-packages\joblib\externals\loky\backend\context.py", line 282, in _count_physical_cores
    raise ValueError(f"found {cpu_count_physical} physical cores < 1")


MemoryError: Unable to allocate 1.11 GiB for an array with shape (232, 642540) and data type float64

In [53]:
cat_model = CatBoostClassifier(verbose=0, random_state=42)
cv_scores = cross_val_score(cat_model, X_train, y_train, scoring='f1_weighted', cv=6)
print(sum(cv_scores) / len(cv_scores))

0.6959344361515267


In [72]:
importances = list(zip(X.columns, lgb_model.feature_importances_.tolist()))
importances.sort(key = lambda x: x[1], reverse=True)

In [74]:
[imp[0] for imp in importances if imp[1] == 0]

['amnt count',
 'day_of_week count',
 'day_of_week min',
 'hour count',
 'days_before count',
 'weekofyear count',
 'hour_diff count',
 'hour_diff min',
 'currency get_mode',
 'operation_type_group nunique',
 'ecommerce_flag get_mode',
 'income_flag nunique',
 'income_flag get_mode',
 'country get_mode',
 'city nunique',
 'currency 4',
 'currency 5',
 'currency 6',
 'currency 7',
 'currency 8',
 'currency 9',
 'currency 10',
 'currency 11',
 'operation_kind 7',
 'card_type 7',
 'card_type 8',
 'card_type 9',
 'card_type 10',
 'card_type 11',
 'card_type 12',
 'card_type 13',
 'card_type 14',
 'card_type 16',
 'card_type 19',
 'card_type 21',
 'card_type 22',
 'card_type 27',
 'card_type 29',
 'card_type 31',
 'card_type 34',
 'card_type 35',
 'card_type 43',
 'card_type 45',
 'card_type 47',
 'card_type 49',
 'card_type 50',
 'card_type 51',
 'card_type 56',
 'card_type 57',
 'card_type 61',
 'card_type 62',
 'card_type 66',
 'card_type 69',
 'card_type 71',
 'card_type 72',
 'card_typ

In [63]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.fit_transform(X_test)

In [65]:
tabnet = TabNetClassifier()
tabnet.fit(
        X_train=X_train_scaled,
        y_train=y_train.values,
        eval_set=[(X_train_scaled, y_train.values), (X_test_scaled, y_test.values)],
        eval_name=["train", "valid"],
        eval_metric=["auc"],
        max_epochs=50,
        patience=10,
        batch_size=512,
        num_workers=2,
    )
preds = tabnet.predict(X_test)
print(f1_score(preds, y_test))



epoch 0  | loss: 0.75623 | train_auc: 0.5092  | valid_auc: 0.51774 |  0:00:35s
epoch 1  | loss: 0.69973 | train_auc: 0.47947 | valid_auc: 0.4664  |  0:01:08s
epoch 2  | loss: 0.69429 | train_auc: 0.5501  | valid_auc: 0.55057 |  0:01:40s
epoch 3  | loss: 0.69246 | train_auc: 0.57811 | valid_auc: 0.59673 |  0:02:13s
epoch 4  | loss: 0.68827 | train_auc: 0.61101 | valid_auc: 0.62872 |  0:02:45s
epoch 5  | loss: 0.68181 | train_auc: 0.62946 | valid_auc: 0.63758 |  0:03:20s
epoch 6  | loss: 0.67533 | train_auc: 0.62278 | valid_auc: 0.63899 |  0:03:56s
epoch 7  | loss: 0.67291 | train_auc: 0.63962 | valid_auc: 0.65707 |  0:04:31s
epoch 8  | loss: 0.66645 | train_auc: 0.65176 | valid_auc: 0.66074 |  0:05:03s
epoch 9  | loss: 0.66176 | train_auc: 0.65619 | valid_auc: 0.64729 |  0:05:37s
epoch 10 | loss: 0.65868 | train_auc: 0.65476 | valid_auc: 0.65604 |  0:06:10s
epoch 11 | loss: 0.65353 | train_auc: 0.67349 | valid_auc: 0.68073 |  0:06:44s
epoch 12 | loss: 0.6512  | train_auc: 0.68161 | vali



KeyError: 0

In [None]:
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

In [None]:
all_kappa = []
for i, (train_ind, val_ind) in enumerate(skf.split(X.values, y)):
    X_train, X_val = X.iloc[train_ind, :], X.iloc[val_ind, :]
    y_train, y_val = y.iloc[train_ind], y.iloc[val_ind]
    tabnet = TabNetClassifier()
    tabnet.fit(
            X_train=X_train,
            y_train=y_train,
            eval_set=[(X_train, y_train), (X_val, y_val)],
            eval_name=["train", "valid"],
            eval_metric=["f1_score"],
            max_epochs=30,
            patience=3,
            batch_size=1024,
            num_workers=2,
        )
    val_preds = tabnet.predict(X_val)
    kappa = f1_score(y_val, np.round(val_preds), weights='quadratic')
    all_kappa.append(kappa)
    print(f'Fold {i}. Kappa: {kappa}')
print(f'Cross-val kappa: {sum(all_kappa) / len(all_kappa)}')