In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.metrics import accuracy_score, f1_score

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

import warnings
warnings.filterwarnings('ignore')

In [2]:
!pip install catboost lightgbm xgboost



In [3]:
RANDOM_STATE = 42

In [4]:
# Load the training events: the first CSV column becomes the index and `ts` is parsed to datetime.
df_train = (
    pd.read_csv('train (1).csv', index_col=0)
    .assign(ts=lambda frame: pd.to_datetime(frame['ts']))
)
df_train.shape

(37518, 3)

In [5]:
# Load the test events: the first CSV column becomes the index and `ts` is parsed to datetime.
df_test = (
    pd.read_csv('test (1).csv', index_col=0)
    .assign(ts=lambda frame: pd.to_datetime(frame['ts']))
)
df_test.shape

(7125, 3)

In [6]:
# Count rows that are exact repeats of an earlier row, for train and then test.
print('Number of duplicated rows:')
print(int(df_train.duplicated().sum()))
print(int(df_test.duplicated().sum()))

Number of duplicated rows:
2346
505


In [178]:
# Remove exact duplicate rows from both splits (drop_duplicates keeps the first occurrence by default).
# The original row index is preserved, so index labels stay unique.
df_train_cleaned = df_train.drop_duplicates()
df_test_cleaned = df_test.drop_duplicates()

In [8]:
# Report the earliest and latest timestamp of each split.
train_ts = df_train_cleaned['ts']
test_ts = df_test_cleaned['ts']

print("-----Train min() and max()-----")
print(train_ts.min())
print(train_ts.max())
print("-----Test min() and max()-----")
print(test_ts.min())
print(test_ts.max())

-----Train min() and max()-----
2022-07-29 09:08:54
2022-12-31 20:39:31
-----Test min() and max()-----
2023-01-03 08:21:00
2023-02-24 19:44:09


In [9]:
# Compare the sets of gate ids seen in each split.
train_gates = set(df_train_cleaned['gate_id'])
test_gates = set(df_test_cleaned['gate_id'])

missing_1 = test_gates.difference(train_gates)
print('Gate(s) that in test but not in train:', missing_1)

missing_2 = train_gates.difference(test_gates)
print('Gate(s) that in train but not in test:', missing_2)

Gate(s) that in test but not in train: {2}
Gate(s) that in train but not in test: {0, 16}


In [10]:
# Users that occur rarely: at most 4 rows in the deduplicated training data
counts = df_train_cleaned.groupby('user_id').size()
counts.loc[counts.le(4)]

user_id
4     1
44    4
51    2
dtype: int64

In [179]:
# Exploratory heuristic: flag a user as a "smoker" when they have at least one day with
# many distinct gates AND at least one day with several 3-10 minute gaps between events.
df = df_train_cleaned.copy()

df['date'] = df['ts'].dt.date

# Per (user, day): number of distinct gates used. Keep users having a day with more than 5.
unique_counts = (df.groupby(['user_id', 'date'])['gate_id'].nunique().reset_index().rename(columns={'gate_id': 'unique_gates_per_day'}))
unique_counts_filtered = unique_counts[unique_counts.unique_gates_per_day > 5]
people_unique_counts = np.sort(unique_counts_filtered['user_id'].unique())

# Seconds since the same user's previous event on the same day (NaN for the first event of a day).
# The diff is computed on a sorted copy and assigned back to `df` by index label.
df['delta_sec'] = (df.sort_values(['user_id', 'ts']).groupby(['user_id', 'date'])['ts'].diff().dt.total_seconds())
# Keep gaps of 180..600 seconds inclusive, then users with more than 3 such gaps on a single day.
df_filtered  = df[(df['delta_sec'] >= 180) & (df['delta_sec'] <= 600)]
counts = df_filtered.groupby(['user_id', 'date']).size().reset_index(name='delta_count')
short_intervals = counts[counts.delta_count > 3]
people_short_intervals = np.sort(short_intervals.user_id.unique())

# Users that satisfy both criteria (the two qualifying days need not coincide).
smokers = np.intersect1d(people_short_intervals, people_unique_counts)

# 1 for every row of a flagged user, 0 otherwise.
df['smokers'] = df['user_id'].isin(smokers).astype(int)
df

Unnamed: 0,user_id,ts,gate_id,date,delta_sec,smokers
0,18,2022-07-29 09:08:54,7,2022-07-29,,1
1,18,2022-07-29 09:09:54,9,2022-07-29,60.0,1
3,18,2022-07-29 09:10:06,5,2022-07-29,12.0,1
4,18,2022-07-29 09:10:08,5,2022-07-29,2.0,1
5,18,2022-07-29 09:10:34,10,2022-07-29,26.0,1
...,...,...,...,...,...,...
37512,6,2022-12-31 17:21:19,10,2022-12-31,21.0,1
37513,6,2022-12-31 20:38:56,11,2022-12-31,11857.0,1
37514,6,2022-12-31 20:39:22,6,2022-12-31,26.0,1
37515,6,2022-12-31 20:39:23,6,2022-12-31,1.0,1


In [1233]:
df

Unnamed: 0,user_id,ts,gate_id,date,global_day_order,user_day_order
0,18,2022-07-29 09:08:54,9,2022-07-29,0,0
1,18,2022-07-29 09:09:54,11,2022-07-29,0,0
3,18,2022-07-29 09:10:06,7,2022-07-29,0,0
4,18,2022-07-29 09:10:08,7,2022-07-29,0,0
5,18,2022-07-29 09:10:34,12,2022-07-29,0,0
...,...,...,...,...,...,...
37512,6,2022-12-31 17:21:19,12,2022-12-31,135,94
37513,6,2022-12-31 20:38:56,13,2022-12-31,135,94
37514,6,2022-12-31 20:39:22,8,2022-12-31,135,94
37515,6,2022-12-31 20:39:23,8,2022-12-31,135,94


In [162]:
# Order events per user and measure the gap (in seconds) to that user's previous event.
df = df_train_cleaned.copy().sort_values(['user_id', 'ts'])
df['delta_sec'] = df.groupby('user_id')['ts'].diff().dt.total_seconds()

SESSION_GAP = 30 * 60  # 30 minutes, expressed in seconds
# A session starts at a user's first event or after a pause longer than SESSION_GAP.
is_first_event = df['delta_sec'].isna()
exceeds_gap = df['delta_sec'] > SESSION_GAP
df['new_session'] = (is_first_event | exceeds_gap).astype(int)
# Running count of session starts gives a 1-based session number per user.
df['session_id'] = df.groupby('user_id')['new_session'].cumsum()

In [163]:
# Attach the number of events of each (user, session) to every event row.
session_len = (
    df.groupby(['user_id', 'session_id'])
    .size()
    .rename('session_len')
)
df = df.merge(session_len, how='left', on=['user_id', 'session_id'])

In [164]:
# First and last timestamp of every (user, session), and the span between them in minutes.
session_time = df.groupby(['user_id', 'session_id'])['ts'].agg(session_start='min', session_end='max')
session_span = session_time['session_end'] - session_time['session_start']
session_time['session_duration_min'] = session_span.dt.total_seconds() / 60
df = df.merge(session_time[['session_duration_min']], how='left', on=['user_id', 'session_id'])

In [165]:
# Number of distinct gates touched within each (user, session), broadcast back to the events.
uniq_gates = (
    df.groupby(['user_id', 'session_id'])['gate_id']
    .nunique()
    .rename('session_unique_gates')
)
df = df.merge(uniq_gates, how='left', on=['user_id', 'session_id'])

In [166]:
# First and last gate of each (user, session) in current row order, broadcast back to the events.
session_first_last = df.groupby(['user_id', 'session_id'])['gate_id'].agg(
    first_gate='first',
    last_gate='last',
)
df = df.merge(session_first_last, how='left', on=['user_id', 'session_id'])

In [180]:
# Order events per user and measure the gap (in seconds) to that user's previous event.
df = df_train_cleaned.copy().sort_values(['user_id', 'ts'])
df['delta_sec'] = df.groupby('user_id')['ts'].diff().dt.total_seconds()

SESSION_GAP = 30 * 60  # 30 minutes, expressed in seconds
# A session starts at a user's first event or after a pause longer than SESSION_GAP.
df['new_session'] = (df['delta_sec'].isna() | df['delta_sec'].gt(SESSION_GAP)).astype(int)
df['session_id'] = df.groupby('user_id')['new_session'].cumsum()

# One row per (user, session) with size, duration in minutes, gate and hour summaries.
session_df = df.groupby(['user_id', 'session_id']).agg(
    session_len=pd.NamedAgg(column='gate_id', aggfunc='size'),
    session_duration=pd.NamedAgg(
        column='ts',
        aggfunc=lambda stamps: (stamps.max() - stamps.min()).total_seconds() / 60,
    ),
    unique_gates=pd.NamedAgg(column='gate_id', aggfunc='nunique'),
    first_gate=pd.NamedAgg(column='gate_id', aggfunc='first'),
    last_gate=pd.NamedAgg(column='gate_id', aggfunc='last'),
    start_hour=pd.NamedAgg(column='ts', aggfunc=lambda stamps: stamps.min().hour),
    end_hour=pd.NamedAgg(column='ts', aggfunc=lambda stamps: stamps.max().hour),
).reset_index()


In [181]:
session_df

Unnamed: 0,user_id,session_id,session_len,session_duration,unique_gates,first_gate,last_gate,start_hour,end_hour
0,0,1,2,0.016667,1,3,3,10,10
1,0,2,3,0.466667,2,11,6,13,13
2,0,3,4,1.683333,3,7,10,14,14
3,0,4,3,0.500000,2,11,4,18,18
4,0,5,4,1.333333,3,7,10,10,11
...,...,...,...,...,...,...,...,...,...
8950,57,136,3,0.316667,2,11,4,16,16
8951,57,137,3,0.466667,2,3,10,11,11
8952,57,138,3,0.333333,2,11,4,15,15
8953,57,139,4,3.100000,2,3,-1,9,9


In [159]:
# Exploratory per-day "work time" features: first/last event of each user on each day.
df = df_train_cleaned.copy()
df['date'] = df['ts'].dt.date
df['time'] = df['ts'].dt.time

# For every (user, day), the index labels of the rows with the earliest and latest timestamp
idx = df.groupby(['user_id', 'date'])['ts'].agg(['idxmin', 'idxmax'])

# Time and gate of the first event (treated as entry) and of the last event (treated as exit)
first = df.loc[idx['idxmin'], ['user_id', 'date', 'time', 'gate_id']].rename(columns={'time': 'first_in_time', 'gate_id': 'first_in_gate'})
last = df.loc[idx['idxmax'], ['user_id', 'date', 'time', 'gate_id']].rename(columns={'time': 'last_out_time', 'gate_id': 'last_out_gate'})

# One row per (user, day) carrying both the first-event and last-event columns
work_time = first.merge(last, on=['user_id', 'date'], how='left')

# 1 when the first and the last event of the day used the same gate
work_time['same_in_out_gates'] = (work_time.first_in_gate == work_time.last_out_gate).astype(int)
# 1 when the first and last event share the same time of day (e.g. a single event that day)
work_time['no_exit_flag'] = (work_time.first_in_time == work_time.last_out_time).astype(int)

# Number of distinct dates on which each user appears, i.e. how many days the user visited the area.
user_days = df.groupby('user_id')['date'].nunique()
# Users seen on fewer than 5 distinct days are flagged as rare
rare_users = user_days[user_days < 5].index
df['rare_user_flag'] = df['user_id'].isin(rare_users).astype(int)

# Broadcast the per-day features back onto every event row
df = df.merge(work_time, on=['user_id', 'date'], how='left')

In [156]:
session_df

Unnamed: 0,user_id,session_id,session_len,session_duration,unique_gates,first_gate,last_gate,start_hour,end_hour
0,0,1,2,0.016667,1,3,3,10,10
1,0,2,3,0.466667,2,11,6,13,13
2,0,3,4,1.683333,3,7,10,14,14
3,0,4,3,0.500000,2,11,4,18,18
4,0,5,4,1.333333,3,7,10,10,11
...,...,...,...,...,...,...,...,...,...
8950,57,136,3,0.316667,2,11,4,16,16
8951,57,137,3,0.466667,2,3,10,11,11
8952,57,138,3,0.333333,2,11,4,15,15
8953,57,139,4,3.100000,2,3,-1,9,9


### Start here

In [109]:
# Rebuild the deduplicated frames from the raw data so the modelling pipeline starts from a clean state.
# NOTE(review): identical to the earlier drop_duplicates cell; presumably repeated so the notebook can be re-run from here.
df_train_cleaned = df_train.drop_duplicates()
df_test_cleaned = df_test.drop_duplicates()

In [12]:
# # Shifting gates by +1 to remove -1 and for coding transitions later 
# all_gates = sorted(set(df_train_cleaned['gate_id'].unique()) | set(df_test_cleaned['gate_id'].unique()))
# min_gate = min(all_gates)
# shift = abs(min_gate)

# df_train_cleaned['gate_id'] = df_train_cleaned['gate_id'] + shift +1
# df_test_cleaned['gate_id'] = df_test_cleaned['gate_id'] + shift +1

# shifted_all_gates = sorted(set(df_train_cleaned['gate_id'].unique()) | set(df_test_cleaned['gate_id'].unique()))

In [183]:
# Stack the deduplicated train and test rows into one frame (row-wise concatenation).
# presumably train rows carry `user_id` and test rows carry `user_word`, leaving the other column NaN;
# the split cell that follows relies on `user_word` being null for train rows - verify against the CSVs.
df_all = pd.concat([df_train_cleaned, df_test_cleaned], axis=0)

# NOTE(review): the `date` column is not created while the lines below stay commented out,
# yet `add_prev_next_gates` groups by `date`.
# df_all['hour'] = df_all['ts'].dt.hour
# df_all['time'] = df_all['ts'].dt.time
# df_all['date'] = df_all['ts'].dt.date


In [250]:
# Split into train, validation and test datasets
train_idx = df_all['user_word'].isnull()  # rows without a user_word are the labelled ones
X = df_all.loc[train_idx]
X_test = df_all.loc[~train_idx]

# Time-based hold-out: labelled events after 2022-11-30 form the validation set
validation_index = X['ts'] > '2022-11-30'
print(f"Size of validation: {round(validation_index.sum() / len(validation_index) * 100, 4)} %")

X_train, X_val = X[~validation_index].copy(), X[validation_index].copy()

for split_label, split_frame in (('X_train:', X_train), ('X_val:', X_val), ('X_test:', X_test)):
    print(split_label, split_frame.shape)

Size of validation: 18.6142 %
X_train: (28625, 4)
X_val: (6547, 4)
X_test: (6620, 4)


In [15]:
# def profile_features(df, user):
#     # group by date to find index of ts max/min
#     idx = df.groupby([user, 'date'])['ts'].agg(['idxmin', 'idxmax'])

#     # first/last time and gates
#     first = df.loc[idx['idxmin'], [user, 'date', 'time', 'gate_id']].rename(columns={'time': 'first_in_time', 'gate_id': 'first_in_gate'})
#     last = df.loc[idx['idxmax'], [user, 'date', 'time', 'gate_id']].rename(columns={'time': 'last_out_time', 'gate_id': 'last_out_gate'})

#     # merge last with first
#     work_time = first.merge(last, on=[user, 'date'], how='left')

#     # Возвращает количество уникальных дат, которые встречаются у user. Сколько раз user visited area.
#     user_days = df.groupby(user)['date'].nunique()
#     rare_users = user_days[user_days < 5].index
#     df['rare_user_flag'] = df[user].isin(rare_users).astype(int)

#     df = df.merge(work_time, on=[user, 'date'], how='left')
#     return df

# X_train = profile_features(X_train, 'user_id')
# X_val = profile_features(X_val, 'user_id')
# X_test = profile_features(X_test, 'user_word')

In [68]:
# def smoker_status(df, user):
#     # count users transitions per day through gates
#     unique_counts = (df.groupby([user, 'date'])['gate_id'].nunique().reset_index().rename(columns={'gate_id': 'unique_gates_per_day'}))
#     unique_counts_filtered = unique_counts[unique_counts.unique_gates_per_day > 5]
#     people_unique_counts = np.sort(unique_counts_filtered[user].unique())

#     # filtering smokers time intervals 3-10 mins and its repeatitions 
#     df['delta_sec'] = (df.sort_values([user, 'ts']).groupby([user])['ts'].diff().dt.total_seconds()) #date or common
#     df_filtered  = df[(df['delta_sec'] >= 180) & (df['delta_sec'] <= 600)]
#     counts = df_filtered.groupby([user, 'date']).size().reset_index(name='delta_count')
#     short_intervals = counts[counts.delta_count > 3]
#     people_short_intervals = np.sort(short_intervals[user].unique())

#     smokers = np.intersect1d(people_short_intervals, people_unique_counts)

#     df['smokers'] = df[user].isin(smokers).astype(int)

#     # чистим: оставляем первое событие дня + события c delta and repeated logs in the system
#     #df_clean = df[(df['delta_sec'].isna()) | (df['delta_sec'] > 3)]
#     #df_clean['delta_sec'] = df_clean['delta_sec'].fillna(0)

#     return df

# X_train = smoker_status(X_train, 'user_id')
# X_val = smoker_status(X_val, 'user_id')
# X_test = smoker_status(X_test, 'user_word')

In [129]:
def sessions(df, user, session_gap_sec=30 * 60):
    """Split each user's events into sessions and attach session features to every event.

    Events are ordered by ``user`` and ``ts``. A new session starts at a user's first
    event and whenever the pause since that user's previous event exceeds
    ``session_gap_sec``.

    Parameters
    ----------
    df : pandas.DataFrame
        Event-level frame with the columns ``user``, ``ts`` (datetime) and ``gate_id``.
        It is not modified.
    user : str
        Name of the column identifying the user (e.g. ``'user_id'`` or ``'user_word'``).
    session_gap_sec : int or float, default 1800
        Pause, in seconds, above which the next event opens a new session.

    Returns
    -------
    pandas.DataFrame
        The sorted events with a fresh ``RangeIndex`` and the added columns
        ``delta_sec``, ``new_session``, ``session_id`` (1-based per user),
        ``session_len``, ``session_duration_min``, ``first_gate`` and ``last_gate``.
    """
    events = df.sort_values([user, 'ts'])
    # Seconds since the same user's previous event; NaN for a user's first event.
    events['delta_sec'] = events.groupby(user)['ts'].diff().dt.total_seconds()

    starts_session = events['delta_sec'].isna() | (events['delta_sec'] > session_gap_sec)
    events['new_session'] = starts_session.astype(int)
    # Running count of session starts yields the per-user session number.
    events['session_id'] = events.groupby(user)['new_session'].cumsum()

    # Compute all per-session statistics in a single pass instead of one groupby per feature.
    keys = [user, 'session_id']
    per_session = events.groupby(keys).agg(
        session_len=('gate_id', 'size'),
        session_start=('ts', 'min'),
        session_end=('ts', 'max'),
        first_gate=('gate_id', 'first'),
        last_gate=('gate_id', 'last'),
    )
    session_span = per_session['session_end'] - per_session['session_start']
    per_session['session_duration_min'] = session_span.dt.total_seconds() / 60

    feature_cols = ['session_len', 'session_duration_min', 'first_gate', 'last_gate']
    # Left merge keeps the sorted event order and broadcasts session values to each event.
    return events.merge(per_session[feature_cols].reset_index(), on=keys, how='left')


# Add event-level session features to each split; the test split is keyed by `user_word` instead of `user_id`.
# NOTE(review): every line overwrites its own input, so re-running this cell feeds already-augmented frames back into `sessions`.
X_train = sessions(X_train, 'user_id')
X_val = sessions(X_val, 'user_id')
X_test = sessions(X_test, 'user_word')

#sessions(X_train, 'user_id')

In [251]:
def session_level(df, user, session_gap_sec=60 * 60):
    """Aggregate event rows into one row per (user, session).

    Events are ordered by ``user`` and ``ts``. A new session starts at a user's first
    event and whenever the pause since that user's previous event exceeds
    ``session_gap_sec``.

    Parameters
    ----------
    df : pandas.DataFrame
        Event-level frame with the columns ``user``, ``ts`` (datetime) and ``gate_id``.
        It is not modified.
    user : str
        Name of the column identifying the user (e.g. ``'user_id'`` or ``'user_word'``).
    session_gap_sec : int or float, default 3600
        Pause, in seconds, above which the next event opens a new session
        (60 minutes by default).

    Returns
    -------
    pandas.DataFrame
        Columns, in order: ``user``, ``session_id`` (1-based per user), ``session_len``
        (number of events), ``session_duration`` (minutes between first and last event),
        ``unique_gates``, ``first_gate``, ``last_gate``, ``start_hour``, ``end_hour``.
    """
    events = df.sort_values([user, 'ts'])
    # Seconds since the same user's previous event; NaN for a user's first event.
    events['delta_sec'] = events.groupby(user)['ts'].diff().dt.total_seconds()

    starts_session = events['delta_sec'].isna() | (events['delta_sec'] > session_gap_sec)
    events['new_session'] = starts_session.astype(int)
    events['session_id'] = events.groupby(user)['new_session'].cumsum()

    # Built-in aggregations only: min/max timestamps are reduced once and reused for
    # duration and hours, instead of calling Python lambdas for every group.
    per_session = (
        events.groupby([user, 'session_id'])
        .agg(
            session_len=('gate_id', 'size'),
            session_start=('ts', 'min'),
            session_end=('ts', 'max'),
            unique_gates=('gate_id', 'nunique'),
            first_gate=('gate_id', 'first'),
            last_gate=('gate_id', 'last'),
        )
        .reset_index()
    )
    session_span = per_session['session_end'] - per_session['session_start']
    per_session['session_duration'] = session_span.dt.total_seconds() / 60
    # Cast keeps the hour columns as int64 regardless of the pandas version.
    per_session['start_hour'] = per_session['session_start'].dt.hour.astype('int64')
    per_session['end_hour'] = per_session['session_end'].dt.hour.astype('int64')

    ordered_cols = [
        user, 'session_id', 'session_len', 'session_duration', 'unique_gates',
        'first_gate', 'last_gate', 'start_hour', 'end_hour',
    ]
    return per_session[ordered_cols]

# Replace each split with one row per (user, session); the test split is keyed by `user_word`.
# NOTE(review): after this cell X_train / X_val / X_test hold only the aggregated session columns
# (no `ts`, `gate_id` or `delta_sec`), so the event-level helpers `timing` and `add_prev_next_gates`
# cannot be applied to them, and re-running this cell on its own output fails for the same reason.
X_train = session_level(X_train, 'user_id')
X_val = session_level(X_val, 'user_id')
X_test = session_level(X_test, 'user_word')

In [91]:
def timing(df, user_col, num_shifts):
    """Add lagged and leading copies of ``delta_sec`` for each user.

    The frame is sorted by ``user_col`` and ``ts``. For every lag ``k`` from 1 to
    ``num_shifts`` the columns ``prev_delta_k`` (value ``k`` rows earlier for the same
    user) and ``next_delta_k`` (value ``k`` rows later) are appended, in that order.
    Positions without a neighbour are NaN. ``df`` must already contain ``delta_sec``
    and is not modified; a sorted copy is returned.
    """
    ordered = df.sort_values([user_col, 'ts']).copy()
    per_user_delta = ordered.groupby([user_col])['delta_sec']

    lag = 1
    while lag <= num_shifts:
        ordered[f'prev_delta_{lag}'] = per_user_delta.shift(lag)
        ordered[f'next_delta_{lag}'] = per_user_delta.shift(-lag)
        lag += 1

    return ordered


# Add 4 previous and 4 next `delta_sec` values per user to each split.
# Requires event-level frames that already contain `delta_sec` (added by `sessions`).
X_train = timing(X_train, 'user_id', 4)
X_val = timing(X_val, 'user_id', 4)
X_test = timing(X_test, 'user_word', 4)

In [18]:
# def compute_user_gate_features(df, user_col, all_gates=None, top_n=3, smoothing=1e-6):
#     """
#     smoothing: маленькая константа для энтропии
#     """
#     # 1) Считаем количество каждого gate по пользователю
#     user_gate_counts = df.groupby([user_col, 'gate_id']).size().rename("count")
#     # 2) Нормализуем по пользователю
#     user_gate_probs = user_gate_counts.groupby(level=user_col).apply(lambda x: x / x.sum())
#     # 3) Убираем лишний уровень индекса после apply
#     user_gate_probs.index = user_gate_probs.index.droplevel(0)
#     # 4) unstack → каждая колонка = один gate
#     user_gate_probs = user_gate_probs.unstack(fill_value=0)
    
#     # 5) Добавляем отсутствующие колонки
#     if all_gates is not None:
#         for g in all_gates:
#             if g not in user_gate_probs.columns:
#                 user_gate_probs[g] = 0
    
#     # 6) Упорядочиваем колонки
#     user_gate_probs = user_gate_probs[[g for g in sorted(user_gate_probs.columns)]]

#     # === СОХРАНЯЕМ gate-prob МАТРИЦУ ===
#     gate_prob_values = user_gate_probs.values.astype(float)

#     # 7) Считаем top-N вероятности
#     sorted_idx = np.argsort(gate_prob_values, axis=1)[:, ::-1]

#     for i in range(top_n):
#         user_gate_probs[f'top{i+1}_gate'] = user_gate_probs.columns[sorted_idx[:, i]]
#         user_gate_probs[f'top{i+1}_prob'] = gate_prob_values[np.arange(len(user_gate_probs)), sorted_idx[:, i]]
    
#     # 8) Считаем энтропию
#     probs_safe = gate_prob_values + smoothing
#     user_gate_probs['gate_entropy'] = -np.sum(probs_safe * np.log(probs_safe), axis=1)

#     # 9) Оставляем только top-N и энтропию
#     final_cols = (
#         [f'top{i+1}_gate' for i in range(top_n)] +
#         [f'top{i+1}_prob' for i in range(top_n)] +
#         ['gate_entropy']
#     )

#     user_gate_probs_final = user_gate_probs[final_cols].reset_index()
    
#     # 10) Сливаем с исходным df
#     df = df.merge(user_gate_probs_final, on=user_col, how='left')
    
#     return df


# X_train = compute_user_gate_features(X_train, 'user_id', all_gates=all_gates, top_n=3)
# X_val   = compute_user_gate_features(X_val,   'user_id', all_gates=all_gates, top_n=3)
# X_test  = compute_user_gate_features(X_test,  'user_word', all_gates=all_gates, top_n=3)

# # Теперь колонки одинаковые: top1_prob, top2_prob, top3_prob, gate_entropy


In [19]:
# def working_hours(df, user_col):
#     work_time = (df.groupby([user_col, 'date']).agg(first_in=('ts', 'min'), last_out=('ts', 'max'))).reset_index()
#     work_time['working_hours'] = (work_time['last_out'] - work_time['first_in']).dt.total_seconds() / 3600

#     # mean и std по пользователю
#     #stats = (work_time.groupby(user_col)['working_hours'].agg(working_hours_mean='mean', working_hours_std='std').reset_index()).fillna(0.0)
#     df = df.merge(work_time[[user_col, 'date', 'working_hours']], on=[user_col, 'date'], how='left')
#     #df = df.merge(stats, on=user_col, how='left')
#     return df


# X_train = working_hours(X_train, 'user_id')
# X_val = working_hours(X_val, 'user_id')
# X_test = working_hours(X_test, 'user_word')

In [92]:
# Adding next and previous gates
def add_prev_next_gates(df, user_col, num_shifts):
    """Add the gates a user passed just before and just after each event on the same day.

    The frame is sorted by ``user_col`` and ``ts``. For every lag ``k`` from 1 to
    ``num_shifts`` the columns ``prev_gate_k`` and ``next_gate_k`` are appended, in
    that order, holding the ``gate_id`` found ``k`` rows earlier / later within the
    same (user, calendar day) group. Positions without a neighbour are NaN.

    The day is taken from the ``date`` column when the frame has one. Otherwise it is
    derived from ``ts`` on the fly, so the function also works on frames where
    ``date`` was never materialised; no ``date`` column is added to the result.

    ``df`` is not modified; a sorted copy is returned.
    """
    ordered = df.sort_values([user_col, 'ts']).copy()

    # Group by the existing `date` column, or by the calendar day of `ts` when it is absent.
    if 'date' in ordered.columns:
        day_key = 'date'
    else:
        day_key = ordered['ts'].dt.date
    gates_by_user_day = ordered.groupby([user_col, day_key])['gate_id']

    for i in range(1, num_shifts + 1):
        ordered[f'prev_gate_{i}'] = gates_by_user_day.shift(i)
        ordered[f'next_gate_{i}'] = gates_by_user_day.shift(-i)

    return ordered


# Add the 9 previous and 9 next gates of every event to each split.
# Requires event-level frames (with `ts` and `gate_id`); the test split is keyed by `user_word`.
X_train = add_prev_next_gates(X_train, 'user_id', 9)
X_val = add_prev_next_gates(X_val, 'user_id', 9)
X_test = add_prev_next_gates(X_test, 'user_word', 9)

In [1001]:
# # prev → current
# X_train['trans_prev_cur'] = X_train['prev_gate_1'] * 100 + X_train['gate_id']
# X_val['trans_prev_cur'] = X_val['prev_gate_1'] * 100 + X_val['gate_id']
# X_test['trans_prev_cur'] = X_test['prev_gate_1'] * 100 + X_test['gate_id']

# # current → next
# #X_train['trans_cur_next'] = X_train['gate_id']*100 + X_train['next_gate_1']
# #X_val['trans_cur_next'] = X_val['gate_id']*100 + X_val['next_gate_1']
# #X_test['trans_cur_next'] = X_test['gate_id']*100 + X_test['next_gate_1']

# # prev2 → prev1 → current
# X_train['trans_2step_prev'] = X_train['prev_gate_2']*10000 + X_train['prev_gate_1']*100 + X_train['gate_id']
# X_val['trans_2step_prev'] = X_val['prev_gate_2']*10000 + X_val['prev_gate_1']*100 + X_val['gate_id']
# X_test['trans_2step_prev'] = X_test['prev_gate_2']*10000 + X_test['prev_gate_1']*100 + X_test['gate_id']

# # current → next1 → next2
# X_train['trans_2step_next'] = X_train['gate_id']*10000 + X_train['next_gate_1']*100 + X_train['next_gate_2']
# X_val['trans_2step_next'] = X_val['gate_id']*10000 + X_val['next_gate_1']*100 + X_val['next_gate_2']
# X_test['trans_2step_next'] = X_test['gate_id']*10000 + X_test['next_gate_1']*100 + X_test['next_gate_2']

In [252]:
X_train.isna().sum()

user_id             0
session_id          0
session_len         0
session_duration    0
unique_gates        0
first_gate          0
last_gate           0
start_hour          0
end_hour            0
dtype: int64

In [253]:
# Work on copies so the feature frames X_train / X_val / X_test stay untouched by the cleaning steps that follow.
X_train_cleaned = X_train.copy()
X_val_cleaned = X_val.copy()
X_test_cleaned = X_test.copy()

# cols_filter = ['gate_id', 'delta_sec', 'prev_delta_1', 'next_delta_1', 'prev_delta_2', \
#        'next_delta_2', 'prev_delta_3', 'next_delta_3', 'prev_delta_4', \
#        'next_delta_4', 'prev_gate_1', 'next_gate_1', 'prev_gate_2', \
#        'next_gate_2', 'prev_gate_3', 'next_gate_3', 'prev_gate_4', \
#        'next_gate_4', 'prev_gate_5', 'next_gate_5', 'prev_gate_6', \
#        'next_gate_6', 'prev_gate_7', 'next_gate_7', 'prev_gate_8', \
#        'next_gate_8', 'prev_gate_9', 'next_gate_9']

# X_train_cleaned = X_train_cleaned.dropna(subset=cols_filter)
# X_val_cleaned = X_val_cleaned.dropna(subset=cols_filter)
# X_test_cleaned = X_test_cleaned.dropna(subset=cols_filter)


In [254]:
X_train.columns

Index(['user_id', 'session_id', 'session_len', 'session_duration',
       'unique_gates', 'first_gate', 'last_gate', 'start_hour', 'end_hour'],
      dtype='object')

In [255]:
# col_order = ['prev_gate_5', 'prev_gate_4', 'prev_gate_3', 'prev_gate_2', 'prev_gate_1', 'gate_id', \
#              'next_gate_1', 'next_gate_2',  'next_gate_3', 'next_gate_4', 'next_gate_5']

# col_train = col_order + ['user_id']
# col_test = col_order + ['user_word']

# X_train_cleaned = X_train_cleaned[col_train]
# X_val_cleaned = X_val_cleaned[col_train]
# X_test_cleaned = X_test_cleaned[col_test]

#cols_to_drop = ['ts', 'hour', 'time', 'date', 'new_session', 'gate_id']
# Columns excluded from the model input.
cols_to_drop = ['session_id', 'end_hour', 'unique_gates']
X_train_cleaned, X_val_cleaned, X_test_cleaned = (
    frame.drop(columns=cols_to_drop)
    for frame in (X_train_cleaned, X_val_cleaned, X_test_cleaned)
)

In [256]:
# Separate the targets (train / val) and the group key (test) from the feature matrices
y_train = X_train_cleaned['user_id'].astype(int)
y_val = X_val_cleaned['user_id'].astype(int)
user_word = X_test_cleaned['user_word']

# Each split holds only its own identifier column, so drop exactly that one.
X_train_cleaned = X_train_cleaned.drop(columns='user_id')
X_val_cleaned = X_val_cleaned.drop(columns='user_id')
X_test_cleaned = X_test_cleaned.drop(columns='user_word')


In [149]:
# X_train_cleaned = X_train_cleaned.astype('Int64')
# X_val_cleaned = X_val_cleaned.astype('Int64')
# X_test_cleaned = X_test_cleaned.astype('Int64')

In [258]:
X_train_cleaned

Unnamed: 0,session_len,session_duration,first_gate,last_gate,start_hour
0,2,0.016667,3,3,10
1,3,0.466667,11,6,13
2,4,1.683333,7,10,14
3,3,0.500000,11,4,18
4,4,1.333333,7,10,10
...,...,...,...,...,...
6004,3,0.316667,11,4,16
6005,3,0.466667,3,10,11
6006,3,0.333333,11,4,15
6007,4,3.100000,3,-1,9


In [None]:
# cat_features = ['gate_id', 'prev_gate_1', 'next_gate_1', 'prev_gate_2', \
#        'next_gate_2', 'prev_gate_3', 'next_gate_3', 'prev_gate_4', \
#        'next_gate_4', 'prev_gate_5', 'next_gate_5', 'prev_gate_6', \
#        'next_gate_6', 'prev_gate_7', 'next_gate_7', 'prev_gate_8', \
#        'next_gate_8', 'prev_gate_9', 'next_gate_9']


# Replace missing values with a sentinel, then cast to int only the columns whose values
# are whole numbers in every split. A blanket `.astype(int)` would truncate fractional
# features such as `session_duration` (minutes) towards zero.
# Assumes all three frames share the same, fully numeric, feature columns.
MISSING_SENTINEL = -999
_filled_splits = [
    frame.fillna(MISSING_SENTINEL)
    for frame in (X_train_cleaned, X_val_cleaned, X_test_cleaned)
]
# Decide per column across all splits so train / val / test end up with identical dtypes.
_whole_number_cols = [
    col
    for col in _filled_splits[0].columns
    if all((split[col] % 1 == 0).all() for split in _filled_splits)
]
X_train_cleaned, X_val_cleaned, X_test_cleaned = (
    split.astype({col: int for col in _whole_number_cols})
    for split in _filled_splits
)


In [259]:
# Feature pairs that are nearly identical: absolute correlation above 0.85 on the training features.
corr = X_train_cleaned.corr().abs()
# Strict upper triangle keeps each unordered pair once and skips the diagonal.
upper_mask = np.triu(np.ones(corr.shape, dtype=bool), k=1)
high_corr = corr.where(upper_mask).stack().sort_values(ascending=False)
high_corr.loc[high_corr > 0.85]

Series([], dtype: float64)

In [244]:
# Feature pairs that are nearly identical: absolute correlation above 0.85 on the validation features.
corr = X_val_cleaned.corr().abs()
# Strict upper triangle keeps each unordered pair once and skips the diagonal.
upper_mask = np.triu(np.ones(corr.shape, dtype=bool), k=1)
high_corr = corr.where(upper_mask).stack().sort_values(ascending=False)
high_corr.loc[high_corr > 0.85]

Series([], dtype: float64)

In [245]:
# Feature pairs that are nearly identical: absolute correlation above 0.85 on the test features.
corr = X_test_cleaned.corr().abs()
# Strict upper triangle keeps each unordered pair once and skips the diagonal.
upper_mask = np.triu(np.ones(corr.shape, dtype=bool), k=1)
high_corr = corr.where(upper_mask).stack().sort_values(ascending=False)
high_corr.loc[high_corr > 0.85]

Series([], dtype: float64)

In [246]:
# Features with (near-)zero variance carry no signal and can be discarded right away.
train_var = X_train_cleaned.var()
val_var = X_val_cleaned.var()
test_var = X_test_cleaned.var()
print(train_var[train_var < 1e-5].index)
print(val_var[val_var < 1e-5].index)
print(test_var[test_var < 1e-5].index)

Index([], dtype='object')
Index([], dtype='object')
Index([], dtype='object')


### CatBoost

In [260]:
# Multi-class CatBoost model predicting the user from session features.
CatBoostModel = CatBoostClassifier(
    iterations=400,
    learning_rate=0.07,
    depth=4,
    loss_function='MultiClass',
    eval_metric='Accuracy',
    bootstrap_type='Bernoulli',
    subsample=0.8,
    rsm=0.8,
    min_data_in_leaf=50,
    leaf_estimation_iterations=1,
    # Use the notebook-wide seed constant instead of repeating the literal.
    random_seed=RANDOM_STATE,
    verbose=100  # log every 100th iteration
)

CatBoostModel.fit(X_train_cleaned, y_train)

0:	learn: 0.0921950	total: 54.7ms	remaining: 21.8s
100:	learn: 0.2622733	total: 2.37s	remaining: 7s
200:	learn: 0.3023798	total: 4.55s	remaining: 4.5s
300:	learn: 0.3243468	total: 6.69s	remaining: 2.2s
399:	learn: 0.3366617	total: 8.99s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x206913a1910>

In [261]:
# Feature importances of the fitted model as a table (prettified=True); show at most the first 50 rows.
feature_importances = CatBoostModel.get_feature_importance(prettified=True)
print(feature_importances.head(50))

         Feature Id  Importances
0  session_duration    29.146282
1        start_hour    27.575422
2        first_gate    16.407972
3         last_gate    15.261999
4       session_len    11.608325


In [262]:
# Row-level quality on the validation split.
y_val_preds = CatBoostModel.predict(X_val_cleaned)
y_val_probs = CatBoostModel.predict_proba(X_val_cleaned)

val_accuracy = accuracy_score(y_val, y_val_preds)
val_macro_f1 = f1_score(y_val, y_val_preds, average='macro')
print("Accuracy =", round(val_accuracy * 100, 5))
print("F1_score =", round(val_macro_f1 * 100, 5))

Accuracy = 18.81322
F1_score = 9.75312


### Making y_val and y_train resemble the word-style labels that must be predicted at ODS.ai

In [239]:
# Build a word -> user_id dictionary for train, mimicking the anonymised labels that must be predicted (plus the inverse dictionary)
y_train_list_words = {f'word_{i}': y_i.item() for i, y_i in enumerate(list(y_train.unique()))}
y_train_list_words_inverse = {y_i.item(): f'word_{i}' for i, y_i in enumerate(list(y_train.unique()))}

# Build the same word -> user_id dictionary for val (plus the inverse dictionary)
y_val_list_words = {f'word_{i}': y_i.item() for i, y_i in enumerate(list(y_val.unique()))}
y_val_list_words_inverse = {y_i.item(): f'word_{i}' for i, y_i in enumerate(list(y_val.unique()))}

# Convert y_val and y_train into the word-style labels that must be predicted
y_val_word = y_val.copy()
y_train_word = y_train.copy()
y_val_word = y_val_word.map(y_val_list_words_inverse)
y_train_word = y_train_word.map(y_train_list_words_inverse)

# One row per validation sample: its pseudo word, the true user_id and the model prediction
val_words = pd.DataFrame()
val_words['user_word'] = y_val_word
val_words['true'] = y_val
val_words['preds'] = y_val_preds

# For each word, predict the most frequent user_id among its rows
comp_df = pd.DataFrame(val_words.groupby('user_word')['preds'].agg(lambda x: x.value_counts().index[0]))
# Look up the true user_id behind each word
for idx in comp_df.index:
    comp_df.loc[idx, 'true'] = y_val_list_words[idx]

comp_df = comp_df.astype(int)
comp_df['comp'] = comp_df['preds'] == comp_df['true']

# User weights are unknown, so use equal weights for simplicity
comp_df['norm'] = 1
true_answers = (comp_df['comp'] * comp_df['norm']).sum()
total_answers = comp_df['norm'].sum()
# Share of words whose majority prediction equals the true user, in percent
precent_true = round((true_answers / total_answers)*100, 1)
print('Оценка val', true_answers, '/', total_answers, '=', precent_true, '%')

Оценка val 13 / 43 = 30.2 %


### Submission

In [1183]:
# Row-level predictions and class probabilities for the test rows
test_preds = CatBoostModel.predict(X_test_cleaned)
probs = CatBoostModel.predict_proba(X_test_cleaned)

#test_preds = RF_model.predict(X_test_cleaned)
#probs = RF_model.predict_proba(X_test_cleaned)

# Confidence of each row = probability of its most likely class
max_probs = probs.max(axis=1)

submission = pd.DataFrame({
    'user_word': user_word.values,
    'preds': test_preds.ravel(), 
    'max_prob': max_probs
})

# Collapse to one row per user_word: majority-vote prediction plus the best row confidence
submission = submission.groupby('user_word').agg({
    'preds': lambda x: x.value_counts().idxmax(),
    'max_prob': 'max'  # highest confidence within the group
})

# Set -999 only when the group's maximum probability is below the threshold
threshold = 0.2
submission.loc[submission['max_prob'] < threshold, 'preds'] = -999

# Drop the max_prob column
submission = submission.drop(columns='max_prob')

In [1184]:
submission

Unnamed: 0_level_0,preds
user_word,Unnamed: 1_level_1
aucroc,49
binary,12
blue,46
categorical,14
coefficient,15
collinear,54
distributed,0
epsilon,49
f1,6
fit,15


In [1185]:
submission.to_csv("answer.csv")

In [None]:
# NOTE(review): ad-hoc calculation with unexplained constants (163, 540, 34); document its meaning or remove the cell.
163 / (540/34)

In [None]:
# 