In [1]:
import pickle as pk
from typing import Optional, Tuple
from warnings import filterwarnings

import numpy as np
import pandas as pd
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error
from tqdm.auto import tqdm

# Suppress all Python warnings for the rest of the notebook.
filterwarnings('ignore')

# Locations of the input files inside the Kaggle dataset.
# Edge list with the target column x1 used for training.
TRAIN_PATH = '/kaggle/input/vkgraphwithattrs/train_dataset_VK/train.csv'
# Per-user attributes (age, city_id, sex, school, university).
ATTR_PATH = '/kaggle/input/vkgraphwithattrs/attr.csv'
# Edge list used for local evaluation and as the feature source for the submission.
TEST_PATH = '/kaggle/input/vkgraphwithattrs/train_dataset_VK/test.csv'
# Submission template holding the (ego_id, u, v) triples to predict.
SUBMISSION_PATH = '/kaggle/input/vkgraphwithattrs/train_dataset_VK/submission.csv'



# Загрузка и предобработка данных

Так как вся обучающая выборка занимает слишком много места в оперативной памяти, мы используем для обучения только первые 10 000 эго-графов.

In [3]:
train = pd.read_csv(TRAIN_PATH)
print('Train shape before:', train.shape)
# Restrict the training data to the first 10k ego graphs (in order of first appearance)
first_ego_ids = train['ego_id'].unique()[:10000]
train = train.loc[train['ego_id'].isin(first_ego_ids)]
print('Train shape after: ', train.shape)
train.head()

Train shape before: (122280372, 7)
Train shape after:  (19918223, 7)


Unnamed: 0,ego_id,u,v,t,x1,x2,x3
0,0,131,84,148.0,5.6692e-07,0.0,0.0
1,0,135,164,396.7,0.06246274,0.0,0.0
2,0,47,15,,0.0,0.0,1.0
3,0,5,4,594.5,0.04962974,0.0,0.0
4,0,176,219,45.5,1.237935,0.0,0.0


Загрузим атрибуты пользователей и заменим значения -1 на NaN. Рассчитаем признак nan_cnt — количество пропусков в атрибутах пользователя.

In [4]:
# Load user attributes and turn every -1 into NaN in one pass
attr = pd.read_csv(ATTR_PATH).replace(-1, np.nan)
# Per-row number of missing values, stored as the nan_cnt feature
attr['nan_cnt'] = attr.isnull().sum(axis='columns')
print(attr.shape)
attr.head()

(14930748, 8)


Unnamed: 0,ego_id,u,age,city_id,sex,school,university,nan_cnt
0,0,227,68.0,,1.0,778293348.0,,2
1,0,45,38.0,237065842.0,1.0,82803468.0,238500268.0,0
2,0,142,60.0,237065842.0,1.0,196560139.0,,1
3,0,280,66.0,,2.0,963209731.0,720783270.0,1
4,0,41,18.0,,2.0,308862409.0,,2


Создадим функцию для предобработки данных и разделения признаков и целевой переменной.
Она создает следующие признаки:

- t - время в днях, прошедшее с возникновения дружбы между парой пользователей (может быть дробным; встречаются пропуски)
- x2 - неизвестная величина, отражающая интенсивность взаимодействия между пользователями
- x3 - неизвестная величина, отражающая интенсивность взаимодействия между пользователями
- age_x/y - возраст пользователя
- nan_cnt_x/y - количество пропусков в атрибутах пользователя
- same_school/university/city_id - совпадают ли атрибуты пары пользователей
- sex_x/y_1.0/2.0 - dummy кодирование признака пола

In [5]:
def generate_features(
    data: pd.DataFrame,
    attributes: Optional[pd.DataFrame] = None,
    sex_values: Tuple[float, ...] = (1.0, 2.0),
) -> Tuple[pd.DataFrame, pd.Series]:
    """Build the model feature matrix and the target from a frame of edges.

    Args:
        data: Edges with at least the columns ``ego_id``, ``u``, ``v`` and
            ``x1``; every other column (e.g. ``t``, ``x2``, ``x3``) is passed
            through to the feature matrix unchanged.
        attributes: Per-user attribute table keyed by ``(ego_id, u)`` with the
            columns ``age``, ``city_id``, ``sex``, ``school``, ``university``
            and ``nan_cnt``. Defaults to the notebook-level ``attr`` frame.
        sex_values: Values of ``sex`` that get an indicator column. The set is
            fixed up front so that every call yields the same columns in the
            same order, even when a value is absent from ``data``.

    Returns:
        A ``(features, target)`` pair where ``target`` is the ``x1`` column
        and ``features`` has the identifier, target and raw categorical
        columns removed.
    """
    if attributes is None:
        # Fall back to the attribute table loaded earlier in the notebook.
        attributes = attr

    # Attach the attributes of both endpoints; pandas suffixes the clashing
    # attribute columns with _x (user u) and _y (user v).
    data = pd.merge(data, attributes, how='left', on=['ego_id', 'u'])
    data = pd.merge(
        data, attributes.rename(columns={'u': 'v'}), how='left', on=['ego_id', 'v']
    )

    # Add same school, university and city_id features; a missing value on
    # either side never counts as a match.
    for name in ('school', 'university', 'city_id'):
        left, right = data[f'{name}_x'], data[f'{name}_y']
        data[f'same_{name}'] = (left == right) & left.notna()

    # Indicator columns for the sex feature. Explicit comparisons are used
    # instead of pd.get_dummies so the column set does not depend on which
    # values happen to occur in this particular frame (missing sex -> all 0).
    for side in ('x', 'y'):
        for value in sex_values:
            data[f'sex_{side}_{value}'] = (data[f'sex_{side}'] == value).astype('uint8')

    # Create train features and target sets
    data_x = data.drop(
        [
            'ego_id', 'u', 'v', 'x1',
            'sex_x', 'sex_y',
            'city_id_x', 'city_id_y',
            'school_x', 'school_y',
            'university_x', 'university_y',
        ],
        axis=1
    )
    data_y = data['x1']

    return data_x, data_y

In [6]:
# Build the feature matrix and the target (x1) for the training subset
model_train_data_x, model_train_data_y = generate_features(train)
print('Размеры тренировочного сета:', model_train_data_x.shape)
model_train_data_x.head()

Размеры тренировочного сета: (19918223, 14)


Unnamed: 0,t,x2,x3,age_x,nan_cnt_x,age_y,nan_cnt_y,same_school,same_university,same_city_id,sex_x_1.0,sex_x_2.0,sex_y_1.0,sex_y_2.0
0,148.0,0.0,0.0,62.0,2.0,50.0,0.0,False,False,True,1,0,1,0
1,396.7,0.0,0.0,45.0,0.0,92.0,2.0,False,True,False,0,1,1,0
2,,0.0,1.0,41.0,3.0,42.0,0.0,False,False,False,0,1,0,1
3,594.5,0.0,0.0,49.0,0.0,40.0,1.0,False,False,False,1,0,0,1
4,45.5,0.0,0.0,21.0,0.0,21.0,2.0,False,False,True,1,0,1,0


# Обучение модели

В качестве модели мы выбрали CatBoost, параметры iterations=1000 и depth=8 были подобраны при помощи кросс-валидации.

In [7]:
# Gradient-boosted regressor optimising RMSE; the seed is fixed for repeatable training
model = CatBoostRegressor(iterations=1000, depth=8, loss_function='RMSE', random_seed=69)

In [8]:
# Train on the prepared features; the trailing semicolon hides the returned model repr
model.fit(X=model_train_data_x, y=model_train_data_y);

Learning rate set to 0.195642
0:	learn: 1.1971672	total: 2.03s	remaining: 33m 48s
1:	learn: 1.0828204	total: 3.79s	remaining: 31m 31s
2:	learn: 0.9986401	total: 5.69s	remaining: 31m 31s
3:	learn: 0.9383609	total: 7.49s	remaining: 31m 6s
4:	learn: 0.8939611	total: 9.48s	remaining: 31m 26s
5:	learn: 0.8623033	total: 11.2s	remaining: 30m 51s
6:	learn: 0.8384719	total: 13.2s	remaining: 31m 16s
7:	learn: 0.8222176	total: 15.1s	remaining: 31m 16s
8:	learn: 0.8099843	total: 17s	remaining: 31m 15s
9:	learn: 0.8005525	total: 18.8s	remaining: 31m 3s
10:	learn: 0.7940415	total: 21.1s	remaining: 31m 33s
11:	learn: 0.7888516	total: 23.6s	remaining: 32m 21s
12:	learn: 0.7849807	total: 26.3s	remaining: 33m 13s
13:	learn: 0.7824314	total: 28.1s	remaining: 32m 56s
14:	learn: 0.7799825	total: 29.9s	remaining: 32m 43s
15:	learn: 0.7779678	total: 32s	remaining: 32m 45s
16:	learn: 0.7762528	total: 33.9s	remaining: 32m 41s
17:	learn: 0.7750662	total: 35.7s	remaining: 32m 27s
18:	learn: 0.7741748	total: 37.4

Выведем важность каждого признака

In [9]:
# Pair every feature name with its importance and list the strongest first
pd.DataFrame(
    dict(
        feature=model_train_data_x.columns,
        importance=model.feature_importances_,
    )
).sort_values(by='importance', ascending=False)

Unnamed: 0,feature,importance
0,t,43.701215
1,x2,40.232613
3,age_x,5.020651
5,age_y,2.989902
10,sex_x_1.0,1.837256
12,sex_y_1.0,1.694171
2,x3,1.598246
4,nan_cnt_x,1.400057
6,nan_cnt_y,0.876728
9,same_city_id,0.23007


Сохраним модель

In [10]:
# Save model
with open('/kaggle/working/model9.pk', 'wb+') as f:
    pk.dump(model, f)

# Локальное тестирование модели

Тестовый набор данных также довольно большой, поэтому для тестирования мы берем случайные 5% записей.

In [12]:
test = pd.read_csv(TEST_PATH)
print('Test shape before:', test.shape)
# Take only 5% rows from test set. The seed is fixed (same value as the model's
# random_seed) so that the sample, and therefore the metric computed on it,
# is identical on every run.
test = test.sample(frac=0.05, random_state=69)
print('Test shape after: ', test.shape)
test.head()

Test shape before: (40548780, 7)
Test shape after:  (2027439, 7)


Unnamed: 0,ego_id,u,v,t,x1,x2,x3
23146497,987842478169,28,22,192.0,6e-06,0.0,0.0
8348635,360777253560,8,258,,0.0,0.0,1.0
837820,34359738898,97,161,,0.001824,0.0,0.0
410844,17179869363,0,214,80.0,1.8e-05,0.0,0.0
32515615,1374389535183,85,59,30.4,0.0,0.0,0.0


Так как в целевой переменной встречаются пропуски, и невозможно рассчитать метрику на таких значениях, уберем их.

In [13]:
# Drop rows with nans in target, because we cannot calculate metric on them
test = test.dropna(subset=['x1'])
test.shape

(1622440, 7)

Сгенерируем признаки на тестовых данных для предсказания моделью.

In [14]:
# Build the feature matrix and the target (x1) for the sampled test rows
model_test_data_x, model_test_data_y = generate_features(test)
print('Размеры тестового сета:', model_test_data_x.shape)
model_test_data_x.head()

Размеры тестового сета: (1622440, 14)


Unnamed: 0,t,x2,x3,age_x,nan_cnt_x,age_y,nan_cnt_y,same_school,same_university,same_city_id,sex_x_1.0,sex_x_2.0,sex_y_1.0,sex_y_2.0
0,192.0,0.0,0.0,47.0,2.0,29.0,0.0,False,False,False,1,0,0,1
1,,0.0,1.0,31.0,3.0,32.0,3.0,False,False,False,0,1,0,1
2,,0.0,0.0,28.0,3.0,31.0,2.0,False,False,False,1,0,0,1
3,80.0,0.0,0.0,30.0,1.0,27.0,1.0,False,False,True,1,0,1,0
4,30.4,0.0,0.0,30.0,1.0,41.0,2.0,False,False,True,1,0,1,0


Рассчитаем метрику RMSE на созданном тестовом сете

In [15]:
# RMSE on the local test sample. The root is taken explicitly because the
# `squared=False` flag of mean_squared_error is deprecated since
# scikit-learn 1.4 and scheduled for removal.
np.sqrt(
    mean_squared_error(
        model_test_data_y,
        model.predict(model_test_data_x),
    )
)

0.7646433046532958

# Формирование и отправка решения

In [16]:
# Load the submission template: the (ego_id, u, v) triples together with an x1 column
submission = pd.read_csv(SUBMISSION_PATH)
submission.head()

Unnamed: 0,ego_id,u,v,x1
0,8,0,93,0.0
1,8,0,143,0.0
2,8,0,151,1.606742
3,8,1,24,0.026496
4,8,5,4,0.159857


Так как в файле submission нет признаков t, x2 и x3, возьмем их из test файла.

In [17]:
# Read only the merge keys and the feature columns the submission lacks.
# Skipping the test file's own x1 column saves memory and avoids the
# x1_x / x1_y suffix clash, so no drop/rename step is needed afterwards.
# A separate name is used so the sampled `test` frame from the local
# evaluation above is not overwritten.
test_features = pd.read_csv(
    TEST_PATH,
    usecols=['ego_id', 'u', 'v', 't', 'x2', 'x3'],
)

submission = pd.merge(
    submission,
    test_features,
    how='left',
    on=['ego_id', 'u', 'v']
).drop_duplicates(subset=['ego_id', 'u', 'v'])

# Delete the loaded test features to free space
del test_features

Сгенерируем признаки.

In [18]:
# Only the feature matrix is needed here; the returned x1 column is discarded
model_submission_data_x, _ = generate_features(submission)

Сделаем предсказания обученной моделью и сохраним их в файл для отправки на проверку.

In [19]:
# Overwrite x1 with the model predictions; values are assigned by row position
submission['x1'] = model.predict(model_submission_data_x)

In [20]:
# Write only the identifier columns and the prediction, without the index
submission.to_csv('submission9.csv', columns=['ego_id', 'u', 'v', 'x1'], index=False)