In [1]:
from collections import defaultdict
import pickle as pk

import numpy as np
import pandas as pd
from tqdm.auto import tqdm

from catboost import CatBoostRegressor, Pool, cv
from sklearn.metrics import mean_squared_error

from warnings import filterwarnings
filterwarnings('ignore')



# Загрузка и предобработка данных

Так как вся обучающая выборка занимает слишком много места в оперативной памяти, мы используем для обучения только первые 10 000 эго кластеров.

In [2]:
train = pd.read_csv('/kaggle/input/vkgraphwithattrs/train_dataset_VK/train.csv')
print(train.shape)
# Take only 10k first ego graphs from train set
train = train[train['ego_id'].isin(set(train['ego_id'].unique()[:10000]))]
train.head()

(122280372, 7)


Unnamed: 0,ego_id,u,v,t,x1,x2,x3
0,0,131,84,148.0,5.6692e-07,0.0,0.0
1,0,135,164,396.7,0.06246274,0.0,0.0
2,0,47,15,,0.0,0.0,1.0
3,0,5,4,594.5,0.04962974,0.0,0.0
4,0,176,219,45.5,1.237935,0.0,0.0


In [3]:
train.shape

(19918223, 7)

In [4]:
attr = pd.read_csv('/kaggle/input/vkgraphwithattrs/attr.csv')
attr = attr.replace(-1, np.nan)
print(attr.shape)
attr.head()

(14930748, 7)


Unnamed: 0,ego_id,u,age,city_id,sex,school,university
0,0,227,68.0,,1.0,778293348.0,
1,0,45,38.0,237065842.0,1.0,82803468.0,238500268.0
2,0,142,60.0,237065842.0,1.0,196560139.0,
3,0,280,66.0,,2.0,963209731.0,720783270.0
4,0,41,18.0,,2.0,308862409.0,


In [5]:
# Add nan cnt feature
attr['nan_cnt'] = attr.isna().sum(axis=1)
attr.head()

Unnamed: 0,ego_id,u,age,city_id,sex,school,university,nan_cnt
0,0,227,68.0,,1.0,778293348.0,,2
1,0,45,38.0,237065842.0,1.0,82803468.0,238500268.0,0
2,0,142,60.0,237065842.0,1.0,196560139.0,,1
3,0,280,66.0,,2.0,963209731.0,720783270.0,1
4,0,41,18.0,,2.0,308862409.0,,2


In [6]:
# Merge train and attr datasets
train_full = pd.merge(
    train, attr, how='left', on=['ego_id','u']
)

train_full = pd.merge(
    train_full, attr.rename(columns={'u': 'v'}), how='left', on=['ego_id','v']
)

train_full.head()

Unnamed: 0,ego_id,u,v,t,x1,x2,x3,age_x,city_id_x,sex_x,school_x,university_x,nan_cnt_x,age_y,city_id_y,sex_y,school_y,university_y,nan_cnt_y
0,0,131,84,148.0,5.6692e-07,0.0,0.0,62.0,237065842.0,1.0,,,2.0,50.0,237065842.0,1.0,182400947.0,310894832.0,0.0
1,0,135,164,396.7,0.06246274,0.0,0.0,45.0,238321946.0,2.0,461075276.0,991369526.0,0.0,92.0,,1.0,,991369526.0,2.0
2,0,47,15,,0.0,0.0,1.0,41.0,,2.0,,,3.0,42.0,237065842.0,2.0,434584929.0,240636691.0,0.0
3,0,5,4,594.5,0.04962974,0.0,0.0,49.0,237065842.0,1.0,929914814.0,900704564.0,0.0,40.0,,2.0,829727092.0,541978296.0,1.0
4,0,176,219,45.5,1.237935,0.0,0.0,21.0,237065842.0,1.0,967669435.0,149014508.0,0.0,21.0,237065842.0,1.0,,,2.0


In [7]:
# Add same school, university and city_id features
train_full['same_school'] = ((train_full['school_x'] == train_full['school_y']) & train_full['school_x'].notna())
train_full['same_university'] = ((train_full['university_x'] == train_full['university_y']) & train_full['university_x'].notna())
train_full['same_city_id'] = ((train_full['city_id_x'] == train_full['city_id_y']) & train_full['city_id_x'].notna())

train_full.head()

Unnamed: 0,ego_id,u,v,t,x1,x2,x3,age_x,city_id_x,sex_x,...,nan_cnt_x,age_y,city_id_y,sex_y,school_y,university_y,nan_cnt_y,same_school,same_university,same_city_id
0,0,131,84,148.0,5.6692e-07,0.0,0.0,62.0,237065842.0,1.0,...,2.0,50.0,237065842.0,1.0,182400947.0,310894832.0,0.0,False,False,True
1,0,135,164,396.7,0.06246274,0.0,0.0,45.0,238321946.0,2.0,...,0.0,92.0,,1.0,,991369526.0,2.0,False,True,False
2,0,47,15,,0.0,0.0,1.0,41.0,,2.0,...,3.0,42.0,237065842.0,2.0,434584929.0,240636691.0,0.0,False,False,False
3,0,5,4,594.5,0.04962974,0.0,0.0,49.0,237065842.0,1.0,...,0.0,40.0,,2.0,829727092.0,541978296.0,1.0,False,False,False
4,0,176,219,45.5,1.237935,0.0,0.0,21.0,237065842.0,1.0,...,0.0,21.0,237065842.0,1.0,,,2.0,False,False,True


In [8]:
# Generate dummy vars for sex feature
train_full = train_full.join(
    pd.get_dummies(train_full['sex_x'], prefix='sex_x')
).join(
    pd.get_dummies(train_full['sex_y'], prefix='sex_y')
)

train_full.head()

Unnamed: 0,ego_id,u,v,t,x1,x2,x3,age_x,city_id_x,sex_x,...,school_y,university_y,nan_cnt_y,same_school,same_university,same_city_id,sex_x_1.0,sex_x_2.0,sex_y_1.0,sex_y_2.0
0,0,131,84,148.0,5.6692e-07,0.0,0.0,62.0,237065842.0,1.0,...,182400947.0,310894832.0,0.0,False,False,True,1,0,1,0
1,0,135,164,396.7,0.06246274,0.0,0.0,45.0,238321946.0,2.0,...,,991369526.0,2.0,False,True,False,0,1,1,0
2,0,47,15,,0.0,0.0,1.0,41.0,,2.0,...,434584929.0,240636691.0,0.0,False,False,False,0,1,0,1
3,0,5,4,594.5,0.04962974,0.0,0.0,49.0,237065842.0,1.0,...,829727092.0,541978296.0,1.0,False,False,False,1,0,0,1
4,0,176,219,45.5,1.237935,0.0,0.0,21.0,237065842.0,1.0,...,,,2.0,False,False,True,1,0,1,0


In [9]:
# Create train features and target sets
model_train_data_x = train_full.drop(
    [
        'ego_id', 'u', 'v', 'x1',
        'sex_x', 'sex_y',
        'city_id_x', 'city_id_y',
        'school_x', 'school_y',
        'university_x', 'university_y',
    ],
    axis=1
)
model_train_data_y = train_full['x1']

model_train_data_x.shape, model_train_data_y.shape

((19918223, 14), (19918223,))

In [11]:
model_train_data_x.head()

Unnamed: 0,t,x2,x3,age_x,nan_cnt_x,age_y,nan_cnt_y,same_school,same_university,same_city_id,sex_x_1.0,sex_x_2.0,sex_y_1.0,sex_y_2.0
0,148.0,0.0,0,62.0,2.0,50.0,0.0,False,False,True,1,0,1,0
1,396.7,0.0,0,45.0,0.0,92.0,2.0,False,True,False,0,1,1,0
2,,0.0,1,41.0,3.0,42.0,0.0,False,False,False,0,1,0,1
3,594.5,0.0,0,49.0,0.0,40.0,1.0,False,False,False,1,0,0,1
4,45.5,0.0,0,21.0,0.0,21.0,2.0,False,False,True,1,0,1,0


После обработки данных получили следующие признаки:

- t - целое число дней, прошедшее с возникновения дружбы между парой пользователей
- x2 - незивестная величина, отражающая интенсивность взаимодействия между пользователями
- x3 - незивестная величина, отражающая интенсивность взаимодействия между пользователями
- age_x/y - возраст пользователя
- nan_cnt_x/y - количество пропусков в атрибутах пользователя
- same_school/university/city_id - совпадают ли атрибуты пары пользователей
- sex_x/y_1.0/2.0 - dummy кодирование признака пола

# Model

В качестве модели мы выбрали CatBoost, параметры iterations=1000 и depth=8 были подобраны при помощи кросс-валидации

In [12]:
model = CatBoostRegressor(
    loss_function='RMSE',
    iterations=1000,
    depth=8,
    random_seed=69,
)

In [13]:
model.fit(
    model_train_data_x,
    model_train_data_y,
)

Learning rate set to 0.195642
0:	learn: 1.2032407	total: 1.31s	remaining: 21m 47s
1:	learn: 1.0931280	total: 2.56s	remaining: 21m 18s
2:	learn: 1.0120503	total: 3.75s	remaining: 20m 45s
3:	learn: 0.9536716	total: 5.04s	remaining: 20m 56s
4:	learn: 0.9112911	total: 6.28s	remaining: 20m 49s
5:	learn: 0.8838651	total: 7.4s	remaining: 20m 26s
6:	learn: 0.8609336	total: 8.53s	remaining: 20m 9s
7:	learn: 0.8459720	total: 9.7s	remaining: 20m 3s
8:	learn: 0.8334609	total: 11.1s	remaining: 20m 26s
9:	learn: 0.8226893	total: 12.4s	remaining: 20m 27s
10:	learn: 0.8157709	total: 13.7s	remaining: 20m 33s
11:	learn: 0.8089601	total: 15s	remaining: 20m 38s
12:	learn: 0.8032139	total: 16.2s	remaining: 20m 30s
13:	learn: 0.7981584	total: 17.5s	remaining: 20m 29s
14:	learn: 0.7938216	total: 18.8s	remaining: 20m 36s
15:	learn: 0.7915128	total: 20s	remaining: 20m 27s
16:	learn: 0.7881692	total: 21.3s	remaining: 20m 32s
17:	learn: 0.7860352	total: 22.7s	remaining: 20m 36s
18:	learn: 0.7839088	total: 23.9s	

<catboost.core.CatBoostRegressor at 0x7a89b37b54b0>

Выведем важность каждого признака

In [14]:
pd.DataFrame({
    'feature': model_train_data_x.columns,
    'importance': model.feature_importances_,
}).sort_values('importance', ascending=False)

Unnamed: 0,feature,importance
1,x2,44.86164
0,t,42.765603
3,age_x,3.770717
5,age_y,2.253869
10,sex_x_1.0,1.443324
4,nan_cnt_x,1.382102
12,sex_y_1.0,1.297652
2,x3,1.244968
6,nan_cnt_y,0.502695
13,sex_y_2.0,0.149427


Сохраним модель

In [16]:
# Save model
with open('/kaggle/working/model9.pk', 'wb+') as f:
    pk.dump(model, f)

# Test

Тестовый набор данных так же довольно большой, поэтому для тестирования мы берем случайные 5% записей.

In [15]:
test = pd.read_csv('/kaggle/input/vkgraphwithattrs/train_dataset_VK/test.csv')
print(test.shape)
test = test.sample(frac=0.05)
test.head()

(40548780, 7)


Unnamed: 0,ego_id,u,v,t,x1,x2,x3
14787102,644245094663,73,79,39.2,0.001091,0.0,0.0
25520976,1082331758638,184,290,81.8,0.001069,0.0,0.0
27639334,1168231105022,141,12,72.7,,0.0,0.0
8617194,377957122285,107,224,126.7,1.270076,0.693147,0.0
21833379,927712936474,7,10,49.1,0.594009,0.0,0.0


Так как в целевой переменной встречаются пропуски, и невозможно рассчитать метрику на таких значениях, уберем их.

In [17]:
# Drop rows with nans in target, because we cannot calculate metric on them
test = test.dropna(subset=['x1'])
test.shape

(1622052, 7)

In [18]:
# Merger test and attr dataframes
test_full = pd.merge(
    test, attr, how='left', on=['ego_id','u']
)

test_full = pd.merge(
    test_full, attr.rename(columns={'u': 'v'}), how='left', on=['ego_id','v']
)

test_full.head()

Unnamed: 0,ego_id,u,v,t,x1,x2,x3,age_x,city_id_x,sex_x,school_x,university_x,nan_cnt_x,age_y,city_id_y,sex_y,school_y,university_y,nan_cnt_y
0,644245094663,73,79,39.2,0.001091,0.0,0.0,51.0,,1.0,,,3.0,29.0,576853581.0,1.0,41623893.0,63422139.0,0.0
1,1082331758638,184,290,81.8,0.001069,0.0,0.0,52.0,369101331.0,1.0,111612013.0,,1.0,,,,,,
2,377957122285,107,224,126.7,1.270076,0.693147,0.0,15.0,,1.0,,,3.0,42.0,999727620.0,1.0,,3153412.0,1.0
3,927712936474,7,10,49.1,0.594009,0.0,0.0,24.0,730054785.0,2.0,308737646.0,,1.0,19.0,35000563.0,2.0,295097121.0,,1.0
4,455266533490,175,132,,0.0,0.0,1.0,41.0,695332068.0,2.0,789672116.0,,1.0,,,,,,


In [19]:
test_full['same_school'] = ((test_full['school_x'] == test_full['school_y']) & test_full['school_x'].notna())
test_full['same_university'] = ((test_full['university_x'] == test_full['university_y']) & test_full['university_x'].notna())
test_full['same_city_id'] = ((test_full['city_id_x'] == test_full['city_id_y']) & test_full['city_id_x'].notna())

test_full.head()

Unnamed: 0,ego_id,u,v,t,x1,x2,x3,age_x,city_id_x,sex_x,...,nan_cnt_x,age_y,city_id_y,sex_y,school_y,university_y,nan_cnt_y,same_school,same_university,same_city_id
0,644245094663,73,79,39.2,0.001091,0.0,0.0,51.0,,1.0,...,3.0,29.0,576853581.0,1.0,41623893.0,63422139.0,0.0,False,False,False
1,1082331758638,184,290,81.8,0.001069,0.0,0.0,52.0,369101331.0,1.0,...,1.0,,,,,,,False,False,False
2,377957122285,107,224,126.7,1.270076,0.693147,0.0,15.0,,1.0,...,3.0,42.0,999727620.0,1.0,,3153412.0,1.0,False,False,False
3,927712936474,7,10,49.1,0.594009,0.0,0.0,24.0,730054785.0,2.0,...,1.0,19.0,35000563.0,2.0,295097121.0,,1.0,False,False,False
4,455266533490,175,132,,0.0,0.0,1.0,41.0,695332068.0,2.0,...,1.0,,,,,,,False,False,False


In [20]:
test_full = test_full.join(
    pd.get_dummies(test_full['sex_x'], prefix='sex_x')
).join(
    pd.get_dummies(test_full['sex_y'], prefix='sex_y')
)

test_full.head()

Unnamed: 0,ego_id,u,v,t,x1,x2,x3,age_x,city_id_x,sex_x,...,school_y,university_y,nan_cnt_y,same_school,same_university,same_city_id,sex_x_1.0,sex_x_2.0,sex_y_1.0,sex_y_2.0
0,644245094663,73,79,39.2,0.001091,0.0,0.0,51.0,,1.0,...,41623893.0,63422139.0,0.0,False,False,False,1,0,1,0
1,1082331758638,184,290,81.8,0.001069,0.0,0.0,52.0,369101331.0,1.0,...,,,,False,False,False,1,0,0,0
2,377957122285,107,224,126.7,1.270076,0.693147,0.0,15.0,,1.0,...,,3153412.0,1.0,False,False,False,1,0,1,0
3,927712936474,7,10,49.1,0.594009,0.0,0.0,24.0,730054785.0,2.0,...,295097121.0,,1.0,False,False,False,0,1,0,1
4,455266533490,175,132,,0.0,0.0,1.0,41.0,695332068.0,2.0,...,,,,False,False,False,0,1,0,0


In [21]:
model_test_data_x = test_full.drop(
    [
        'ego_id', 'u', 'v', 'x1',
        'sex_x', 'sex_y',
        'city_id_x', 'city_id_y',
        'school_x', 'school_y',
        'university_x', 'university_y',
    ],
    axis=1
)
model_test_data_y = test_full['x1']

model_test_data_x.shape, model_test_data_y.shape

((1622052, 14), (1622052,))

In [23]:
model_test_data_x.head()

Unnamed: 0,t,x2,x3,age_x,nan_cnt_x,age_y,nan_cnt_y,same_school,same_university,same_city_id,sex_x_1.0,sex_x_2.0,sex_y_1.0,sex_y_2.0
0,39.2,0.0,0,51.0,3.0,29.0,0.0,False,False,False,1,0,1,0
1,81.8,0.0,0,52.0,1.0,,,False,False,False,1,0,0,0
2,126.7,0.693147,0,15.0,3.0,42.0,1.0,False,False,False,1,0,1,0
3,49.1,0.0,0,24.0,1.0,19.0,1.0,False,False,False,0,1,0,1
4,,0.0,1,41.0,1.0,,,False,False,False,0,1,0,0


Рассчитаем метрику RMSE на созданном тренировочном сете

In [24]:
mean_squared_error(
    model_test_data_y,
    model.predict(model_test_data_x),
    squared=False,
)

0.7660960110138678

# Submission

In [25]:
submission = pd.read_csv('/kaggle/input/vkgraphwithattrs/train_dataset_VK/submission.csv')
print(submission.shape)

(810976, 4)


Так как в файле submission нет признаков t, x2 и x3, возьмем их из test файла.

In [26]:
test = pd.read_csv('/kaggle/input/vkgraphwithattrs/train_dataset_VK/test.csv')
print(test.shape)

(40548780, 7)


In [27]:
submission = pd.merge(
    submission,
    test,
    how='left',
    on=['ego_id', 'u', 'v']
).drop_duplicates(subset=['ego_id', 'u', 'v'])

In [28]:
del test

In [29]:
submission.head()

Unnamed: 0,ego_id,u,v,x1_x,t,x1_y,x2,x3
0,8,0,93,0.0,359.6,,0.0,0.0
1,8,0,143,0.0,6.1,,0.0,0.0
2,8,0,151,1.606742,0.2,,1.386294,0.0
3,8,1,24,0.026496,594.5,,0.0,0.0
4,8,5,4,0.159857,461.5,,0.0,0.0


In [30]:
submission = submission.\
    drop('x1_y', axis=1).\
    rename(columns={
        'v_x': 'v',
        'x1_x': 'x1',
    })

In [31]:
submission_full = pd.merge(
    submission, attr, how='left', on=['ego_id','u']
)

submission_full = pd.merge(
    submission_full, attr.rename(columns={'u': 'v'}), how='left', on=['ego_id','v']
)

submission_full.head()

Unnamed: 0,ego_id,u,v,x1,t,x2,x3,age_x,city_id_x,sex_x,school_x,university_x,nan_cnt_x,age_y,city_id_y,sex_y,school_y,university_y,nan_cnt_y
0,8,0,93,0.0,359.6,0.0,0.0,36.0,979281502.0,2.0,213987831.0,845825535.0,0.0,36.0,979281502.0,2.0,734952557.0,566091832.0,0.0
1,8,0,143,0.0,6.1,0.0,0.0,36.0,979281502.0,2.0,213987831.0,845825535.0,0.0,43.0,979281502.0,2.0,,,2.0
2,8,0,151,1.606742,0.2,1.386294,0.0,36.0,979281502.0,2.0,213987831.0,845825535.0,0.0,18.0,979281502.0,2.0,,,2.0
3,8,1,24,0.026496,594.5,0.0,0.0,120.0,56833659.0,1.0,370230497.0,779615128.0,0.0,36.0,104874069.0,2.0,213987831.0,562436811.0,0.0
4,8,5,4,0.159857,461.5,0.0,0.0,37.0,979281502.0,1.0,814552332.0,,1.0,37.0,,1.0,213987831.0,,2.0


In [32]:
submission_full['same_school'] = ((submission_full['school_x'] == submission_full['school_y']) & submission_full['school_x'].notna())
submission_full['same_university'] = ((submission_full['university_x'] == submission_full['university_y']) & submission_full['university_x'].notna())
submission_full['same_city_id'] = ((submission_full['city_id_x'] == submission_full['city_id_y']) & submission_full['city_id_x'].notna())

submission_full.head()

Unnamed: 0,ego_id,u,v,x1,t,x2,x3,age_x,city_id_x,sex_x,...,nan_cnt_x,age_y,city_id_y,sex_y,school_y,university_y,nan_cnt_y,same_school,same_university,same_city_id
0,8,0,93,0.0,359.6,0.0,0.0,36.0,979281502.0,2.0,...,0.0,36.0,979281502.0,2.0,734952557.0,566091832.0,0.0,False,False,True
1,8,0,143,0.0,6.1,0.0,0.0,36.0,979281502.0,2.0,...,0.0,43.0,979281502.0,2.0,,,2.0,False,False,True
2,8,0,151,1.606742,0.2,1.386294,0.0,36.0,979281502.0,2.0,...,0.0,18.0,979281502.0,2.0,,,2.0,False,False,True
3,8,1,24,0.026496,594.5,0.0,0.0,120.0,56833659.0,1.0,...,0.0,36.0,104874069.0,2.0,213987831.0,562436811.0,0.0,False,False,False
4,8,5,4,0.159857,461.5,0.0,0.0,37.0,979281502.0,1.0,...,1.0,37.0,,1.0,213987831.0,,2.0,False,False,False


In [33]:
submission_full = submission_full.join(
    pd.get_dummies(submission_full['sex_x'], prefix='sex_x')
).join(
    pd.get_dummies(submission_full['sex_y'], prefix='sex_y')
)

submission_full.head()

Unnamed: 0,ego_id,u,v,x1,t,x2,x3,age_x,city_id_x,sex_x,...,school_y,university_y,nan_cnt_y,same_school,same_university,same_city_id,sex_x_1.0,sex_x_2.0,sex_y_1.0,sex_y_2.0
0,8,0,93,0.0,359.6,0.0,0.0,36.0,979281502.0,2.0,...,734952557.0,566091832.0,0.0,False,False,True,0,1,0,1
1,8,0,143,0.0,6.1,0.0,0.0,36.0,979281502.0,2.0,...,,,2.0,False,False,True,0,1,0,1
2,8,0,151,1.606742,0.2,1.386294,0.0,36.0,979281502.0,2.0,...,,,2.0,False,False,True,0,1,0,1
3,8,1,24,0.026496,594.5,0.0,0.0,120.0,56833659.0,1.0,...,213987831.0,562436811.0,0.0,False,False,False,1,0,0,1
4,8,5,4,0.159857,461.5,0.0,0.0,37.0,979281502.0,1.0,...,213987831.0,,2.0,False,False,False,1,0,1,0


In [34]:
model_submission_data_x = submission_full.drop(
    [
        'ego_id', 'u', 'v', 'x1',
        'sex_x', 'sex_y',
        'city_id_x', 'city_id_y',
        'school_x', 'school_y',
        'university_x', 'university_y',
    ],
    axis=1
)

model_submission_data_x.shape

(810976, 14)

Сделаем предсказания обученной моделью и сохраним их в файл для отправки на проверку.

In [36]:
submission_full['x1'] = model.predict(model_submission_data_x)

In [44]:
submission_full[['ego_id', 'u', 'v', 'x1']].to_csv('submission5.csv', index=False)