In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.metrics import accuracy_score, f1_score

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

import warnings
warnings.filterwarnings('ignore')

In [2]:
!pip install catboost lightgbm xgboost



In [3]:
# Seed constant for reproducible runs (presumably meant for the models trained below).
RANDOM_STATE = 42

In [4]:
# Load the train events (first CSV column becomes the row index) and parse `ts` as datetime.
df_train = pd.read_csv('train (1).csv', index_col=0).assign(
    ts=lambda frame: pd.to_datetime(frame['ts'])
)
df_train.shape

(37518, 3)

In [5]:
# Load the test events (first CSV column becomes the row index) and parse `ts` as datetime.
df_test = pd.read_csv('test (1).csv', index_col=0).assign(
    ts=lambda frame: pd.to_datetime(frame['ts'])
)
df_test.shape

(7125, 3)

In [6]:
# Count rows that repeat an earlier row in every column, first for train, then for test.
print('Number of duplicated rows:')
for frame in (df_train, df_test):
    print(int(frame.duplicated().sum()))

Number of duplicated rows:
2346
505


In [61]:
# Remove exact duplicate rows from each split (used by the exploratory cells that follow).
# NOTE(review): both frames are rebuilt from df_train / df_test again in a later cell.
df_train_cleaned = df_train.drop_duplicates()
df_test_cleaned = df_test.drop_duplicates()

In [8]:
# Print the earliest and latest timestamp of each split to compare their time ranges.
for title, frame in (
    ("-----Train min() and max()-----", df_train_cleaned),
    ("-----Test min() and max()-----", df_test_cleaned),
):
    print(title)
    print(frame['ts'].min())
    print(frame['ts'].max())

-----Train min() and max()-----
2022-07-29 09:08:54
2022-12-31 20:39:31
-----Test min() and max()-----
2023-01-03 08:21:00
2023-02-24 19:44:09


In [9]:
# Compare the sets of gate ids seen in each split.
train_gates = set(df_train_cleaned['gate_id'])
test_gates = set(df_test_cleaned['gate_id'])

missing_1 = test_gates - train_gates
print('Gate(s) that in test but not in train:', missing_1)

missing_2 = train_gates - test_gates
print('Gate(s) that in train but not in test:', missing_2)

Gate(s) that in test but not in train: {2}
Gate(s) that in train but not in test: {0, 16}


In [10]:
# Users that occur rarely in train: at most 4 events each
counts = df_train_cleaned.groupby('user_id').size()
counts.loc[counts.le(4)]

user_id
4     1
44    4
51    2
dtype: int64

In [11]:
# Sorted user ids present in train; the recorded output shows the id range is not contiguous.
np.sort(df_train_cleaned['user_id'].unique())

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 14, 15, 17, 18,
       19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
       36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52,
       53, 54, 55, 56, 57])

In [None]:
# Exploratory check: each user's share of events per gate, joined back onto the train rows.
df = df_train_cleaned.copy()

# Gate ids seen in either split, computed here so the cell does not rely on a variable
# that is only defined further down the notebook.
known_gates = sorted(set(df_train_cleaned['gate_id']) | set(df_test_cleaned['gate_id']))

user_gate_counts = df.groupby(['user_id', 'gate_id']).size().rename("count")
# transform('sum') keeps the (user_id, gate_id) index, so the division needs no index surgery
# and does not depend on how a given pandas version labels groupby.apply results.
user_gate_share = user_gate_counts / user_gate_counts.groupby(level='user_id').transform('sum')
# Named *_table so the cell does not rebind the name used by the user_gate_probs() function.
user_gate_probs_table = user_gate_share.unstack(fill_value=0)

# Gates a user set never visited still get an explicit all-zero column.
for g in known_gates:
    if g not in user_gate_probs_table.columns:
        user_gate_probs_table[g] = float(0)

user_gate_probs_table.columns = [f"user_gate_prob_{c}" for c in user_gate_probs_table.columns]
df = df.merge(user_gate_probs_table, on='user_id', how='left')
df.head()

Unnamed: 0_level_0,user_gate_prob_-1,user_gate_prob_0,user_gate_prob_1,user_gate_prob_3,user_gate_prob_4,user_gate_prob_5,user_gate_prob_6,user_gate_prob_7,user_gate_prob_8,user_gate_prob_9,user_gate_prob_10,user_gate_prob_11,user_gate_prob_12,user_gate_prob_13,user_gate_prob_14,user_gate_prob_15,user_gate_prob_16,user_gate_prob_2
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
0,0.0,0.0,0.0,0.195101,0.201858,0.10473,0.09375,0.101351,0.0,0.05152,0.133446,0.118243,0.0,0.0,0.0,0.0,0.0,0.0
1,0.000854,0.0,0.0,0.041845,0.244236,0.22374,0.023911,0.084543,0.0,0.091375,0.122118,0.12468,0.001708,0.0,0.0,0.040991,0.0,0.0
2,0.0,0.0,0.0,0.051282,0.307692,0.25641,0.0,0.102564,0.0,0.0,0.153846,0.128205,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.09396,0.246085,0.178971,0.020134,0.104027,0.0,0.104027,0.128635,0.124161,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.4,0.4,0.0,0.1,0.0,0.0,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.000539,0.0,0.0,0.151858,0.141088,0.084006,0.097469,0.078621,0.0,0.045773,0.172321,0.087237,0.031772,0.109316,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.333333,0.3125,0.0,0.0,0.020833,0.0,0.0,0.166667,0.166667,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.222222,0.296296,0.074074,0.0,0.037037,0.0,0.074074,0.148148,0.148148,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.093596,0.209852,0.166502,0.036453,0.093596,0.0,0.00197,0.196059,0.20197,0.0,0.0,0.0,0.0,0.0,0.0


In [289]:
# Rebuild the de-duplicated frames and add the calendar date of every event.
# assign() returns a new frame, so nothing is written into a slice derived from df_train / df_test
# (the column assignment used before relied on warnings being silenced).
df_train_cleaned = df_train.drop_duplicates().assign(date=lambda frame: frame['ts'].dt.date)
df_test_cleaned = df_test.drop_duplicates().assign(date=lambda frame: frame['ts'].dt.date)

In [290]:
# Shift gate ids so that no id is negative (the data contains -1); non-negative ids are needed
# for the numeric transition codes built later.
all_gates = sorted(set(df_train_cleaned['gate_id']) | set(df_test_cleaned['gate_id']))
min_gate = min(all_gates)
# Shift only when a negative id exists. abs(min_gate) would also move ids whose minimum is
# already positive; for the current data (minimum -1) both give a shift of 1.
shift = max(0, -min_gate)

# assign() builds new frames instead of writing into the existing ones.
df_train_cleaned = df_train_cleaned.assign(gate_id=df_train_cleaned['gate_id'] + shift)
df_test_cleaned = df_test_cleaned.assign(gate_id=df_test_cleaned['gate_id'] + shift)

shifted_all_gates = sorted(set(df_train_cleaned['gate_id']) | set(df_test_cleaned['gate_id']))

In [291]:
def add_daily_top_gates(df, user_col, n):
    """Add the n most and n least used gates per (user, day) as columns.

    For every (user_col, 'date') group the gates are ranked by how many events they
    have that day. Rank i of the descending order becomes `top{i+1}_gate_daily`, rank i
    of the ascending order becomes `anti_top{i+1}_gate_daily`.

    Parameters
    ----------
    df : DataFrame with columns `user_col`, 'date' and 'gate_id'.
    user_col : name of the column identifying the user.
    n : number of top / anti-top ranks to add.

    Returns
    -------
    A new DataFrame (result of left merges, so it carries a fresh default index) with
    2 * n extra columns, ordered top1, anti_top1, top2, anti_top2, ...

    Notes
    -----
    Days on which a user touched fewer than i + 1 distinct gates have no rank-i gate;
    those gaps are filled with the median of the column.
    NOTE(review): the median of gate ids need not be an existing gate id.
    """
    keys = [user_col, 'date']

    # Number of events per gate within each (user, day)
    counts = df.groupby(keys + ['gate_id']).size().reset_index(name='count')

    # Orderings for the most used and the least used gates
    sorted_desc = counts.sort_values(keys + ['count'], ascending=[True, True, False])
    sorted_asc = counts.sort_values(keys + ['count'], ascending=[True, True, True])

    # Position of each row inside its (user, day) group. Filtering on this rank picks the
    # same rows as groupby.nth(i) but keeps the key columns on every pandas version.
    rank_desc = sorted_desc.groupby(keys).cumcount()
    rank_asc = sorted_asc.groupby(keys).cumcount()

    new_cols = []
    for i in range(n):
        top_col = f'top{i+1}_gate_daily'
        anti_col = f'anti_top{i+1}_gate_daily'

        top_i = sorted_desc.loc[rank_desc == i, keys + ['gate_id']].rename(columns={'gate_id': top_col})
        anti_top_i = sorted_asc.loc[rank_asc == i, keys + ['gate_id']].rename(columns={'gate_id': anti_col})

        # Merge the per-day values back onto every event of that (user, day)
        df = df.merge(top_i, on=keys, how='left')
        df = df.merge(anti_top_i, on=keys, how='left')
        new_cols += [top_col, anti_col]

    for col in new_cols:
        df[col] = df[col].fillna(df[col].median())

    return df

# Add the three top / anti-top daily gates to both splits; users are grouped by `user_id`
# in train and by `user_word` in test.
# NOTE(review): the cell rebinds its own inputs, so running it twice merges the features a second time.
df_train_cleaned = add_daily_top_gates(df_train_cleaned, 'user_id', 3)
df_test_cleaned = add_daily_top_gates(df_test_cleaned, 'user_word', 3)

In [292]:
# def gate_pass_counts_daily(df, user_col, shifted_all_gates):
#     daily_counts = df.groupby([user_col, 'date', 'gate_id']).size().unstack(fill_value=0)
    
#     # add the missing columns
#     for g in shifted_all_gates:
#         if g not in daily_counts.columns:
#             daily_counts[g] = 0
    
#     # sort the columns
#     daily_counts = daily_counts[shifted_all_gates]
#     daily_counts.columns = [f'gate_{c}_daily_count' for c in daily_counts.columns]
#     daily_counts = daily_counts.reset_index()
#     df = df.merge(daily_counts, on=[user_col, 'date'], how='left')
#     return df


# df_train_cleaned = gate_pass_counts_daily(df_train_cleaned, 'user_id', shifted_all_gates)
# df_test_cleaned = gate_pass_counts_daily(df_test_cleaned, 'user_word', shifted_all_gates)

In [293]:
def unique_gates_per_day(df, user_col):
    """Attach `unique_gates_per_day`: the number of distinct gates per (user, date).

    The count is computed per (`user_col`, 'date') group and left-merged back, so every
    event of the same user and day carries the same value. Returns the merged frame.
    """
    keys = [user_col, 'date']
    per_day = (
        df.groupby(keys)['gate_id']
        .nunique()
        .rename('unique_gates_per_day')
        .reset_index()
    )
    return df.merge(per_day, on=keys, how='left')

# Add the distinct-gates-per-day count to both splits (grouped by `user_id` / `user_word`).
df_train_cleaned = unique_gates_per_day(df_train_cleaned, 'user_id')
df_test_cleaned = unique_gates_per_day(df_test_cleaned, 'user_word')

In [294]:
# Stack train and test so the time-of-day features are derived identically for both.
# ignore_index=True: both inputs come out of merges with default integer indexes, so
# without it the combined frame would contain repeated index labels.
df_all = pd.concat([df_train_cleaned, df_test_cleaned], axis=0, ignore_index=True)

df_all['hour'] = df_all['ts'].dt.hour
df_all['time'] = df_all['ts'].dt.time

# Day-part indicators; Series.between is inclusive on both ends.
df_all['morning'] = df_all['hour'].between(6,11).astype(int)
df_all['afternoon'] = df_all['hour'].between(12,17).astype(int)
df_all['evening'] = df_all['hour'].between(18,23).astype(int)
df_all['night'] = df_all['hour'].between(0,5).astype(int)


In [295]:
# Split into train, validation and test frames.
# Rows without a `user_word` are treated as train rows; everything else is test.
train_idx = df_all['user_word'].isna()
X = df_all[train_idx]
X_test = df_all[~train_idx]

# Time-based hold-out: train rows after the cut-off date form the validation set.
validation_index = X['ts'] > '2022-11-30'
val_share = sum(validation_index) / validation_index.shape[0] * 100
print(f"Size of validation: {round(val_share, 4)} %")

X_train, X_val = X[~validation_index].copy(), X[validation_index].copy()

for label, frame in (('X_train:', X_train), ('X_val:', X_val), ('X_test:', X_test)):
    print(label, frame.shape)

Size of validation: 18.6142 %
X_train: (28625, 18)
X_val: (6547, 18)
X_test: (6620, 18)


In [296]:
def user_gate_probs(df, user_col, shifted_all_gates):
    """Attach each user's empirical gate distribution as `user_gate_prob_<gate>` columns.

    For every value of `user_col` the share of that user's events at each gate is
    computed from `df` itself and left-merged back, so all events of one user carry
    the same probability vector.

    Parameters
    ----------
    df : DataFrame with columns `user_col` and 'gate_id'.
    user_col : name of the column identifying the user.
    shifted_all_gates : iterable of gate ids that must each get a column; gates that
        do not occur in `df` get an all-zero column appended after the observed ones.

    Returns
    -------
    A new DataFrame with the probability columns added.
    """
    pair_counts = df.groupby([user_col, 'gate_id']).size()
    # transform('sum') keeps the (user, gate) index, so the normalisation does not depend
    # on how a given pandas version labels the result of groupby.apply.
    user_totals = pair_counts.groupby(level=user_col).transform('sum')
    probs = (pair_counts / user_totals).unstack(fill_value=0)

    for g in shifted_all_gates:
        if g not in probs.columns:
            probs[g] = float(0)

    probs.columns = [f"user_gate_prob_{c}" for c in probs.columns]
    # `probs` is indexed by user_col; merge matches that index level against df's column.
    return df.merge(probs, on=user_col, how='left')

# Each frame gets gate probabilities computed from its own rows only
# (grouped by `user_id` for train / validation and by `user_word` for test).
X_train = user_gate_probs(X_train, 'user_id', shifted_all_gates)
X_val = user_gate_probs(X_val, 'user_id', shifted_all_gates)
X_test = user_gate_probs(X_test, 'user_word', shifted_all_gates)

In [297]:
def delta_seconds(df, user_col, min_gap_sec=3):
    """Add `delta_sec` (seconds since the user's previous event that day) and drop rapid repeats.

    Parameters
    ----------
    df : DataFrame with columns `user_col`, 'date' and 'ts' (datetime) and a unique index.
    user_col : name of the column identifying the user.
    min_gap_sec : events that follow the previous one by this many seconds or fewer are
        dropped. Defaults to 3, the value that used to be hard-coded.

    Returns
    -------
    A new DataFrame in the original row order containing the first event of every
    (user, day) with `delta_sec` = 0 and every later event whose gap exceeds `min_gap_sec`.
    The input frame is left unmodified.
    """
    ordered = df.sort_values([user_col, 'ts'])
    gaps = ordered.groupby([user_col, 'date'])['ts'].diff().dt.total_seconds()

    # assign() aligns the gaps on the index and leaves the caller's frame untouched.
    with_gaps = df.assign(delta_sec=gaps)

    # Keep the first event of the day (no previous event -> NaN) plus events with a real gap.
    keep = with_gaps['delta_sec'].isna() | (with_gaps['delta_sec'] > min_gap_sec)
    # copy() so the fill below writes into an independent frame rather than a filtered slice.
    df_clean = with_gaps.loc[keep].copy()
    df_clean['delta_sec'] = df_clean['delta_sec'].fillna(0)
    return df_clean


# Add the within-day gap feature to every split; rows closer than the cut-off to the
# previous event of the same user and day are removed, so the frames shrink here.
X_train = delta_seconds(X_train, 'user_id')
X_val = delta_seconds(X_val, 'user_id')
X_test = delta_seconds(X_test, 'user_word')

In [298]:
# def working_hours(df, user_col):
#     work_time = (df.groupby([user_col, 'date']).agg(first_in=('ts', 'min'), last_out=('ts', 'max'))).reset_index()
#     work_time['working_hours'] = (work_time['last_out'] - work_time['first_in']).dt.total_seconds() / 3600
#     df = df.merge(work_time[[user_col, 'date', 'working_hours']], on=[user_col, 'date'], how='left')

#     return df


# X_train = working_hours(X_train, 'user_id')
# X_val = working_hours(X_val, 'user_id')
# X_test = working_hours(X_test, 'user_word')

In [299]:
# Adding next and previous gates
def add_prev_next_gates(df, user_col, num_shifts):
    """Add the gates visited before and after each event as `prev_gate_k` / `next_gate_k`.

    The frame is sorted by (`user_col`, 'ts') and, per user, 'gate_id' is shifted by
    k = 1..num_shifts in both directions. Positions with no neighbour (start or end of a
    user's history) are filled with -1. Returns the sorted copy with the new columns,
    added in the order next_1, prev_1, next_2, prev_2, ...
    """
    ordered = df.sort_values([user_col, 'ts']).copy()
    # The grouping only involves user_col and gate_id, so it can be built once up front.
    gates_by_user = ordered.groupby(user_col)['gate_id']

    for step in range(1, num_shifts + 1):
        ordered[f'next_gate_{step}'] = gates_by_user.shift(-step).fillna(-1)
        ordered[f'prev_gate_{step}'] = gates_by_user.shift(step).fillna(-1)

    return ordered


# Two steps of gate history and look-ahead for every split
X_train = add_prev_next_gates(X_train, 'user_id', 2)
X_val = add_prev_next_gates(X_val, 'user_id', 2)
X_test = add_prev_next_gates(X_test, 'user_word', 2)

for label, frame in (('X_train:', X_train), ('X_val:', X_val), ('X_test:', X_test)):
    print(label, frame.shape)

X_train: (20171, 41)
X_val: (4723, 41)
X_test: (4750, 41)


In [300]:
def _add_transition_codes(frame, base=100):
    """Encode gate transitions as single numbers, in place.

    Each gate in a sequence occupies one base-`base` "digit", e.g. with base 100 the
    transition 4 -> 12 becomes 412. Gate ids (and the -1 placeholder for a missing
    neighbour) are assumed to be smaller than `base` in absolute value.

    Adds, in this order: trans_prev_cur, trans_cur_next, trans_2step_prev, trans_2step_next.
    """
    gate = frame['gate_id']
    prev1, prev2 = frame['prev_gate_1'], frame['prev_gate_2']
    next1, next2 = frame['next_gate_1'], frame['next_gate_2']

    # prev -> current
    frame['trans_prev_cur'] = prev1 * base + gate
    # current -> next
    frame['trans_cur_next'] = gate * base + next1
    # prev2 -> prev1 -> current
    frame['trans_2step_prev'] = prev2 * base**2 + prev1 * base + gate
    # current -> next1 -> next2
    frame['trans_2step_next'] = gate * base**2 + next1 * base + next2


# Same encoding for all three splits instead of repeating every formula three times.
for _frame in (X_train, X_val, X_test):
    _add_transition_codes(_frame)

In [301]:
# Inspect the engineered training frame.
X_train

Unnamed: 0,user_id,ts,gate_id,date,top1_gate_daily,anti_top1_gate_daily,top2_gate_daily,anti_top2_gate_daily,top3_gate_daily,anti_top3_gate_daily,...,user_gate_prob_15,delta_sec,next_gate_1,prev_gate_1,next_gate_2,prev_gate_2,trans_prev_cur,trans_cur_next,trans_2step_prev,trans_2step_next
62,0.0,2022-07-29 10:30:17,4,2022-07-29,4,8,5.0,11.0,7.0,5.0,...,0.0,0.0,12.0,-1.0,7.0,-1.0,-96.0,412.0,-10096.0,41207.0
209,0.0,2022-07-29 13:19:21,12,2022-07-29,4,8,5.0,11.0,7.0,5.0,...,0.0,10143.0,7.0,4.0,8.0,-1.0,412.0,1207.0,-9588.0,120708.0
210,0.0,2022-07-29 13:19:48,7,2022-07-29,4,8,5.0,11.0,7.0,5.0,...,0.0,27.0,8.0,12.0,4.0,4.0,1207.0,708.0,41207.0,70804.0
253,0.0,2022-07-29 14:20:15,8,2022-07-29,4,8,5.0,11.0,7.0,5.0,...,0.0,3626.0,4.0,7.0,11.0,12.0,708.0,804.0,120708.0,80411.0
254,0.0,2022-07-29 14:21:26,4,2022-07-29,4,8,5.0,11.0,7.0,5.0,...,0.0,71.0,11.0,8.0,12.0,7.0,804.0,411.0,70804.0,41112.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24837,57.0,2022-10-28 15:07:29,5,2022-10-28,4,11,5.0,12.0,11.0,4.0,...,0.0,19.0,4.0,12.0,0.0,11.0,1205.0,504.0,111205.0,50400.0
26949,57.0,2022-11-18 09:08:56,4,2022-11-18,0,0,4.0,4.0,5.0,5.0,...,0.0,0.0,0.0,5.0,0.0,12.0,504.0,400.0,120504.0,40000.0
26951,57.0,2022-11-18 09:09:56,0,2022-11-18,0,0,4.0,4.0,5.0,5.0,...,0.0,58.0,0.0,4.0,5.0,5.0,400.0,0.0,50400.0,5.0
26952,57.0,2022-11-18 09:12:02,0,2022-11-18,0,0,4.0,4.0,5.0,5.0,...,0.0,126.0,5.0,0.0,-1.0,4.0,0.0,5.0,40000.0,499.0


In [302]:
# Missing values per column; in the recorded output only `user_word` is empty,
# which is the column used earlier to tell train rows from test rows.
X_train.isna().sum()

user_id                     0
ts                          0
gate_id                     0
date                        0
top1_gate_daily             0
anti_top1_gate_daily        0
top2_gate_daily             0
anti_top2_gate_daily        0
top3_gate_daily             0
anti_top3_gate_daily        0
unique_gates_per_day        0
user_word               20171
hour                        0
time                        0
morning                     0
afternoon                   0
evening                     0
night                       0
user_gate_prob_0            0
user_gate_prob_1            0
user_gate_prob_2            0
user_gate_prob_4            0
user_gate_prob_5            0
user_gate_prob_6            0
user_gate_prob_7            0
user_gate_prob_8            0
user_gate_prob_9            0
user_gate_prob_10           0
user_gate_prob_11           0
user_gate_prob_12           0
user_gate_prob_13           0
user_gate_prob_14           0
user_gate_prob_16           0
user_gate_

In [303]:
# Work on copies of the feature frames and pull out the targets.
# The target is the integer user id of the train / validation rows; no target is
# extracted for the test frame.
X_train_cleaned = X_train.copy()
y_train = X_train_cleaned['user_id'].astype(int)

X_val_cleaned = X_val.copy()
y_val = X_val_cleaned['user_id'].astype(int)

X_test_cleaned = X_test.copy()

In [304]:
# List all columns before the non-feature ones are dropped.
X_train_cleaned.columns

Index(['user_id', 'ts', 'gate_id', 'date', 'top1_gate_daily',
       'anti_top1_gate_daily', 'top2_gate_daily', 'anti_top2_gate_daily',
       'top3_gate_daily', 'anti_top3_gate_daily', 'unique_gates_per_day',
       'user_word', 'hour', 'time', 'morning', 'afternoon', 'evening', 'night',
       'user_gate_prob_0', 'user_gate_prob_1', 'user_gate_prob_2',
       'user_gate_prob_4', 'user_gate_prob_5', 'user_gate_prob_6',
       'user_gate_prob_7', 'user_gate_prob_8', 'user_gate_prob_9',
       'user_gate_prob_10', 'user_gate_prob_11', 'user_gate_prob_12',
       'user_gate_prob_13', 'user_gate_prob_14', 'user_gate_prob_16',
       'user_gate_prob_17', 'user_gate_prob_3', 'user_gate_prob_15',
       'delta_sec', 'next_gate_1', 'prev_gate_1', 'next_gate_2', 'prev_gate_2',
       'trans_prev_cur', 'trans_cur_next', 'trans_2step_prev',
       'trans_2step_next'],
      dtype='object')

In [None]:
# Columns kept out of the model: identifiers, raw time columns, the current gate and a few
# indicator / probability columns excluded by hand.
columns_to_drop = ['user_word', 'user_id', 'ts', 'date', 'time', 'gate_id', 'night', 'user_gate_prob_3', 'user_gate_prob_15']

# Save the test identifiers before they are dropped; the submission is grouped by them.
user_word = X_test_cleaned['user_word']

X_train_cleaned = X_train_cleaned.drop(columns=columns_to_drop)
X_val_cleaned = X_val_cleaned.drop(columns=columns_to_drop)
X_test_cleaned = X_test_cleaned.drop(columns=columns_to_drop)

# Put the X_val and X_test columns in the same order as X_train
feature_order = X_train_cleaned.columns
X_val_cleaned = X_val_cleaned[feature_order]
X_test_cleaned = X_test_cleaned[feature_order]

In [325]:
# Pairs of features that are nearly identical (absolute correlation above 0.8).
# Only the strict upper triangle is kept so that every pair is listed once.
corr = X_train_cleaned.corr().abs()
upper_mask = np.triu(np.ones(corr.shape, dtype=bool), k=1)
high_corr = corr.where(upper_mask).stack().sort_values(ascending=False)
high_corr[high_corr > 0.8]

user_gate_prob_1   user_gate_prob_17    1.000000
trans_cur_next     trans_2step_next     1.000000
prev_gate_1        trans_prev_cur       0.999951
prev_gate_2        trans_2step_prev     0.999951
user_gate_prob_13  user_gate_prob_14    0.917430
user_gate_prob_4   user_gate_prob_6     0.870032
dtype: float64

In [318]:
# Features with (near-)zero variance carry no signal and can be thrown away.
for frame in (X_train_cleaned, X_val_cleaned, X_test_cleaned):
    variances = frame.var()
    print(variances[variances < 1e-5].index)

Index(['night', 'user_gate_prob_1', 'user_gate_prob_2', 'user_gate_prob_17'], dtype='object')
Index(['night', 'user_gate_prob_1', 'user_gate_prob_2'], dtype='object')
Index(['user_gate_prob_0', 'user_gate_prob_1', 'user_gate_prob_2',
       'user_gate_prob_9', 'user_gate_prob_17'],
      dtype='object')


In [306]:
# Print the 25 features with the weakest absolute correlation to the target
# (tail of the descending sort); features near zero are candidates for removal.
# NOTE(review): y_train holds integer user ids used as class labels, so a linear
# correlation with it is only a rough heuristic.
correlations = X_train_cleaned.corrwith(y_train).abs().sort_values(ascending=False)
print(correlations.tail(25))


user_gate_prob_12       0.090295
unique_gates_per_day    0.076884
hour                    0.072077
trans_2step_next        0.055725
trans_cur_next          0.055719
trans_prev_cur          0.055137
next_gate_1             0.054850
next_gate_2             0.054635
prev_gate_1             0.054569
trans_2step_prev        0.054445
prev_gate_2             0.053889
evening                 0.041981
user_gate_prob_5        0.036640
anti_top3_gate_daily    0.032380
morning                 0.029145
user_gate_prob_2        0.023431
user_gate_prob_9        0.022383
anti_top2_gate_daily    0.018695
user_gate_prob_17       0.014417
user_gate_prob_1        0.014417
delta_sec               0.009367
anti_top1_gate_daily    0.009354
afternoon               0.008210
top3_gate_daily         0.002411
night                        NaN
dtype: float64


In [307]:
# The 25 features with the strongest absolute correlation to the target.
print(correlations.head(25))

user_gate_prob_4        0.463530
user_gate_prob_6        0.332672
user_gate_prob_7        0.244445
top1_gate_daily         0.192627
user_gate_prob_14       0.188795
user_gate_prob_13       0.177979
user_gate_prob_16       0.158737
user_gate_prob_10       0.127564
user_gate_prob_0        0.104483
user_gate_prob_8        0.101917
user_gate_prob_11       0.101787
top2_gate_daily         0.099066
user_gate_prob_12       0.090295
unique_gates_per_day    0.076884
hour                    0.072077
trans_2step_next        0.055725
trans_cur_next          0.055719
trans_prev_cur          0.055137
next_gate_1             0.054850
next_gate_2             0.054635
prev_gate_1             0.054569
trans_2step_prev        0.054445
prev_gate_2             0.053889
evening                 0.041981
user_gate_prob_5        0.036640
dtype: float64


### CatBoost

In [308]:
# Multiclass CatBoost model predicting the user id from the engineered features.
CatBoostModel = CatBoostClassifier(
    iterations=300,
    learning_rate=0.1,
    depth=6,
    loss_function='MultiClass',
    eval_metric='Accuracy',
    random_seed=RANDOM_STATE,  # shared notebook seed (42) instead of a second hard-coded literal
    verbose=50  # log every 50th iteration
)
CatBoostModel.fit(X_train_cleaned, y_train)

0:	learn: 0.4764761	total: 494ms	remaining: 2m 27s
50:	learn: 0.9997521	total: 23.9s	remaining: 1m 56s
100:	learn: 1.0000000	total: 47.2s	remaining: 1m 33s
150:	learn: 1.0000000	total: 1m 10s	remaining: 1m 9s
200:	learn: 1.0000000	total: 1m 30s	remaining: 44.8s
250:	learn: 1.0000000	total: 1m 44s	remaining: 20.5s
299:	learn: 1.0000000	total: 1m 56s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x19f1e2fc750>

In [309]:
# Feature importances of the fitted model as a table; show the first 25 rows.
feature_importances = CatBoostModel.get_feature_importance(prettified=True)
print(feature_importances.head(25))

              Feature Id  Importances
0      user_gate_prob_12    16.367980
1       user_gate_prob_7    15.712093
2      user_gate_prob_10    15.667845
3       user_gate_prob_5    12.103557
4      user_gate_prob_13     8.406313
5      user_gate_prob_11     7.547496
6       user_gate_prob_6     7.316583
7       user_gate_prob_4     5.544368
8       user_gate_prob_8     4.186676
9      user_gate_prob_16     2.348091
10     user_gate_prob_14     2.325643
11      user_gate_prob_9     1.864123
12      user_gate_prob_0     0.316508
13                  hour     0.052013
14             afternoon     0.037867
15      trans_2step_prev     0.025370
16  anti_top3_gate_daily     0.025150
17           next_gate_1     0.018831
18        trans_prev_cur     0.016336
19      trans_2step_next     0.016252
20           next_gate_2     0.015342
21           prev_gate_1     0.014933
22       top1_gate_daily     0.013626
23  anti_top2_gate_daily     0.011659
24        trans_cur_next     0.011464


In [310]:
# Last 25 rows of the same importance table.
print(feature_importances.tail(25))

              Feature Id  Importances
12      user_gate_prob_0     0.316508
13                  hour     0.052013
14             afternoon     0.037867
15      trans_2step_prev     0.025370
16  anti_top3_gate_daily     0.025150
17           next_gate_1     0.018831
18        trans_prev_cur     0.016336
19      trans_2step_next     0.016252
20           next_gate_2     0.015342
21           prev_gate_1     0.014933
22       top1_gate_daily     0.013626
23  anti_top2_gate_daily     0.011659
24        trans_cur_next     0.011464
25             delta_sec     0.010380
26  anti_top1_gate_daily     0.008270
27           prev_gate_2     0.008210
28      user_gate_prob_1     0.003708
29     user_gate_prob_17     0.002574
30      user_gate_prob_2     0.000667
31       top3_gate_daily     0.000070
32       top2_gate_daily     0.000002
33  unique_gates_per_day     0.000002
34               morning     0.000000
35               evening     0.000000
36                 night     0.000000


In [311]:
# Score the model on the validation rows.
y_val_preds = CatBoostModel.predict(X_val_cleaned)
# NOTE(review): y_val_probs is not referenced again in the visible cells.
y_val_probs = CatBoostModel.predict_proba(X_val_cleaned)

# Accuracy and macro-averaged F1, both printed as percentages.
print("Accuracy =", round(accuracy_score(y_val, y_val_preds)*100, 5))
print("F1_score =", round(f1_score(y_val, y_val_preds, average='macro')*100,5))

Accuracy = 40.6733
F1_score = 22.38363


### Converting y_val and y_train into word-style labels, similar to what has to be predicted at OdsAi

In [313]:
# Build a word -> user_id dictionary for train that mimics the labels to be predicted,
# together with the inverse user_id -> word dictionary. Words are numbered in order of
# first appearance of the user id.
train_users = [u.item() for u in y_train.unique()]
y_train_list_words = {f'word_{i}': u for i, u in enumerate(train_users)}
y_train_list_words_inverse = {u: f'word_{i}' for i, u in enumerate(train_users)}

print("Forward y_train:", dict(list(y_train_list_words.items())[:3]))
print("Inverse y_train:", dict(list(y_train_list_words_inverse.items())[:3]))


# Same pair of dictionaries for the validation target
val_users = [u.item() for u in y_val.unique()]
y_val_list_words = {f'word_{i}': u for i, u in enumerate(val_users)}
y_val_list_words_inverse = {u: f'word_{i}' for i, u in enumerate(val_users)}

print("Forward y_val:", dict(list(y_val_list_words.items())[:3]))
print("Inverse y_val:", dict(list(y_val_list_words_inverse.items())[:3]))

Forward y_train: {'word_0': 0, 'word_1': 1, 'word_2': 3}
Inverse y_train: {0: 'word_0', 1: 'word_1', 3: 'word_2'}
Forward y_val: {'word_0': 0, 'word_1': 1, 'word_2': 2}
Inverse y_val: {0: 'word_0', 1: 'word_1', 2: 'word_2'}


In [314]:
# Re-express y_val and y_train as word labels, similar to the prediction target
y_val_word = y_val.map(y_val_list_words_inverse)
y_train_word = y_train.map(y_train_list_words_inverse)

val_words = pd.DataFrame()
val_words['user_word'] = y_val_word
val_words['true'] = y_val
val_words['preds'] = y_val_preds

# Predict each word with the user_id that was predicted most often for its events
most_frequent_pred = val_words.groupby('user_word')['preds'].agg(lambda x: x.value_counts().index[0])
comp_df = pd.DataFrame(most_frequent_pred)

# True user id behind every word, looked up in the forward dictionary
comp_df['true'] = [y_val_list_words[word] for word in comp_df.index]

comp_df = comp_df.astype(int)
comp_df['comp'] = comp_df['preds'] == comp_df['true']

# The per-user weights are unknown, so equal weights are used for simplicity
comp_df['norm'] = 1

true_answers = (comp_df['comp'] * comp_df['norm']).sum()
total_answers = comp_df['norm'].sum()
precent_true = round((true_answers / total_answers)*100, 1)

print('Оценка val', true_answers, '/', total_answers, '=', precent_true, '%')

Оценка val 12 / 43 = 27.9 %


### Submission

In [2504]:
# Per-event predictions and class probabilities for the test rows
test_preds = CatBoostModel.predict(X_test_cleaned)
probs = CatBoostModel.predict_proba(X_test_cleaned)
max_probs = probs.max(axis=1)

per_event = pd.DataFrame({
    'user_word': user_word.values,
    'preds': test_preds.ravel(),
    'max_prob': max_probs
})

# One row per user_word: the most frequent predicted user id and the highest
# confidence seen among that word's events
submission = per_event.groupby('user_word').agg({
    'preds': lambda x: x.value_counts().idxmax(),
    'max_prob': 'max'
})

# Use -999 only when even the highest probability of the group is below the threshold
threshold = 0.25
submission['preds'] = submission['preds'].mask(submission['max_prob'] < threshold, -999)

# The helper column is not part of the answer file
submission = submission.drop(columns=['max_prob'])

In [2505]:
# Preview the per-word predictions that are written out next.
submission

Unnamed: 0_level_0,preds
user_word,Unnamed: 1_level_1
aucroc,49
binary,12
blue,25
categorical,40
coefficient,1
collinear,33
distributed,0
epsilon,49
f1,6
fit,3


In [2506]:
# Write the answer file; the `user_word` index is saved as the first CSV column.
submission.to_csv("answer.csv")

In [1722]:
# NOTE(review): leftover scratch arithmetic; its purpose is not evident from the notebook.
163 / (540/34)

10.262962962962963