In [1]:
import pickle as pk

import numpy as np
import pandas as pd

from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error

from warnings import filterwarnings
filterwarnings('ignore')



# Загрузка и предобработка данных

Загрузим ранее предобработанные данные для обучения. В них присутствуют признаки:

- t - целое число дней, прошедшее с возникновения дружбы между парой пользователей
- x2 - незивестная величина, отражающая интенсивность взаимодействия между пользователями
- x3 - незивестная величина, отражающая интенсивность взаимодействия между пользователями
- age_x/y - возраст пользователя
- sex_x/y_1.0/2.0 - dummy кодирование признака пола
- nan_cnt_x/y - количество пропусков в атрибутах пользователя
- same_school/university/city_id - совпадают ли атрибуты школы/университета/города у пары пользователей
- friend_cnt_x/y - количество друзей у пользователя
- common_friends_cnt - количество общих друзей у пары пользователей

In [2]:
train_data = pd.read_csv('/kaggle/input/vkgraphfulldata/train_part2.csv', index_col=0)
print('Размер тренировочных данных', train_data.shape)
train_data.head()

Размер тренировочных данных (27055777, 23)


Unnamed: 0_level_0,ego_id,u,v,t,x1,x2,x3,common_friends_cnt,age_x,nan_cnt_x,...,friend_cnt_y,same_city_id,same_school,same_university,sex_x_1.0,sex_x_2.0,sex_x_nan,sex_y_1.0,sex_y_2.0,sex_y_nan
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1340029796885,56,83,31.2,0.390965,0.0,0.0,8,17.0,2.0,...,18.0,True,False,False,0.0,1.0,0,1.0,0.0,0
1,1340029796885,9,62,52.0,2.467793,3.178054,0.0,9,24.0,3.0,...,29.0,False,False,False,1.0,0.0,0,1.0,0.0,0
2,1340029796885,74,80,46.3,0.372363,0.0,0.0,5,42.0,2.0,...,15.0,True,False,False,1.0,0.0,0,1.0,0.0,0
3,1340029796885,69,71,1.9,0.706004,0.693147,1.0,5,55.0,3.0,...,10.0,False,False,False,1.0,0.0,0,1.0,0.0,0
4,1340029796885,33,46,290.9,0.070883,0.0,0.0,11,24.0,2.0,...,18.0,False,False,False,1.0,0.0,0,1.0,0.0,0


Разобьем загруженные данные на признаки и целевую переменную для обучения.

In [3]:
model_train_data_x = train_data.drop(
    [
        'ego_id', 'u', 'v', 'x1',
        'sex_x_nan', 'sex_y_nan',
    ],
    axis=1
)
model_train_data_y = train_data['x1']

model_train_data_x.head()

Unnamed: 0_level_0,t,x2,x3,common_friends_cnt,age_x,nan_cnt_x,friend_cnt_x,age_y,nan_cnt_y,friend_cnt_y,same_city_id,same_school,same_university,sex_x_1.0,sex_x_2.0,sex_y_1.0,sex_y_2.0
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
0,31.2,0.0,0.0,8,17.0,2.0,27.0,16.0,1.0,18.0,True,False,False,0.0,1.0,1.0,0.0
1,52.0,3.178054,0.0,9,24.0,3.0,33.0,15.0,2.0,29.0,False,False,False,1.0,0.0,1.0,0.0
2,46.3,0.0,0.0,5,42.0,2.0,13.0,17.0,2.0,15.0,True,False,False,1.0,0.0,1.0,0.0
3,1.9,0.693147,1.0,5,55.0,3.0,14.0,36.0,3.0,10.0,False,False,False,1.0,0.0,1.0,0.0
4,290.9,0.0,0.0,11,24.0,2.0,22.0,21.0,2.0,18.0,False,False,False,1.0,0.0,1.0,0.0


# Обучение модели

В качестве модели мы выбрали CatBoost, параметры iterations=500 и depth=8 были подобраны при помощи кросс-валидации.

In [4]:
model = CatBoostRegressor(
    loss_function='RMSE',
    iterations=500,
    depth=8,
    random_seed=69,
)

In [5]:
model.fit(
    model_train_data_x,
    model_train_data_y,
);

Learning rate set to 0.360756
0:	learn: 1.0865843	total: 2.37s	remaining: 19m 45s
1:	learn: 0.9442471	total: 4.5s	remaining: 18m 40s
2:	learn: 0.8699440	total: 7.01s	remaining: 19m 21s
3:	learn: 0.8315034	total: 9.11s	remaining: 18m 49s
4:	learn: 0.8112020	total: 11.8s	remaining: 19m 24s
5:	learn: 0.7989653	total: 14.3s	remaining: 19m 34s
6:	learn: 0.7922606	total: 16.6s	remaining: 19m 28s
7:	learn: 0.7880843	total: 18.8s	remaining: 19m 17s
8:	learn: 0.7853409	total: 21.8s	remaining: 19m 46s
9:	learn: 0.7827129	total: 24.5s	remaining: 19m 59s
10:	learn: 0.7809482	total: 27s	remaining: 20m
11:	learn: 0.7798478	total: 29.1s	remaining: 19m 43s
12:	learn: 0.7788700	total: 31.6s	remaining: 19m 42s
13:	learn: 0.7778351	total: 34.4s	remaining: 19m 52s
14:	learn: 0.7767121	total: 36.7s	remaining: 19m 46s
15:	learn: 0.7757705	total: 39.2s	remaining: 19m 45s
16:	learn: 0.7749645	total: 42.1s	remaining: 19m 56s
17:	learn: 0.7743320	total: 44.6s	remaining: 19m 53s
18:	learn: 0.7739105	total: 47.1s

Выведем важность каждого признака

In [6]:
pd.DataFrame({
    'feature': model_train_data_x.columns,
    'importance': model.feature_importances_,
}).sort_values('importance', ascending=False)

Unnamed: 0,feature,importance
0,t,50.206732
1,x2,32.872488
4,age_x,4.236101
7,age_y,2.682199
15,sex_y_1.0,1.48859
13,sex_x_1.0,1.472379
6,friend_cnt_x,1.435498
2,x3,1.379365
9,friend_cnt_y,1.364487
5,nan_cnt_x,1.09149


Сохраним модель

In [7]:
# Save model
with open('/kaggle/working/model12.pk', 'wb+') as f:
    pk.dump(model, f)

# Локальное тестирование модели

Загрузим заранее предобработанный тестовый набор данных. Так как он тоже довольно большой, для тестирования мы берем случайные 10% записей.

In [2]:
test_data = pd.read_csv('/kaggle/input/test-with-n-common-friends/test_friends_total.csv', index_col=0)
# Take only 10% rows from test set
test_data = test_data.sample(frac=0.1)
print('Размеры тестового сета:', test_data.shape)
test_data.head()

Размеры тестового сета: (4054878, 23)


Unnamed: 0_level_0,ego_id,u,v,t,x1,x2,x3,common_friends_cnt,age_x,nan_cnt_x,...,friend_cnt_y,same_city_id,same_school,same_university,sex_x_1.0,sex_x_2.0,sex_x_nan,sex_y_1.0,sex_y_2.0,sex_y_nan
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
469,1288490189111,39,43,6.5,1.060247,0.0,0.0,2,14.0,0.0,...,23.0,False,False,False,0.0,1.0,0,0.0,1.0,0
1345,773094113450,43,244,59.1,,0.0,0.0,2,38.0,0.0,...,38.0,True,False,False,1.0,0.0,0,0.0,1.0,0
580,1623497637925,45,87,409.6,6.688231e-08,0.0,0.0,7,,1.0,...,14.0,False,False,False,1.0,0.0,0,1.0,0.0,0
4904,1425929142522,87,201,4.2,1.906019,0.0,0.0,17,34.0,0.0,...,47.0,False,False,False,1.0,0.0,0,0.0,1.0,0
2239,798863917221,217,240,67.8,0.0,0.0,0.0,0,,,...,1.0,False,False,False,0.0,0.0,1,0.0,1.0,0


Так как в целевой переменной встречаются пропуски, и невозможно рассчитать метрику на таких значениях, уберем их.

In [4]:
# Drop rows with nans in target, because we cannot calculate metric on them
test_data = test_data.dropna(subset=['x1'])
print('Размеры тестового сета после удалени пропусков:', test_data.shape)

Размеры тестового сета после удалени пропусков: (3243327, 23)


Разобьем загруженные данные на признаки и целевую переменную для построения предсказания расчета метрики.

In [5]:
model_test_data_x = test_data.drop(
    [
        'ego_id', 'u', 'v', 'x1',
        'sex_x_nan', 'sex_y_nan',
    ],
    axis=1
)
model_test_data_y = test_data['x1']

model_test_data_x.head()

Unnamed: 0_level_0,t,x2,x3,common_friends_cnt,age_x,nan_cnt_x,friend_cnt_x,age_y,nan_cnt_y,friend_cnt_y,same_city_id,same_school,same_university,sex_x_1.0,sex_x_2.0,sex_y_1.0,sex_y_2.0
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
469,6.5,0.0,0.0,2,14.0,0.0,13.0,21.0,2.0,23.0,False,False,False,0.0,1.0,0.0,1.0
580,409.6,0.0,0.0,7,,1.0,16.0,38.0,3.0,14.0,False,False,False,1.0,0.0,1.0,0.0
4904,4.2,0.0,0.0,17,34.0,0.0,53.0,34.0,2.0,47.0,False,False,False,1.0,0.0,0.0,1.0
2239,67.8,0.0,0.0,0,,,,53.0,1.0,1.0,False,False,False,0.0,0.0,0.0,1.0
2337,13.8,0.0,1.0,10,18.0,3.0,39.0,21.0,3.0,51.0,False,False,False,1.0,0.0,1.0,0.0


Рассчитаем метрику RMSE.

In [9]:
mean_squared_error(
    model_test_data_y,
    model.predict(model_test_data_x),
    squared=False,
)

0.7581874715711081

# Формирование и отправка решения

In [10]:
submission = pd.read_csv('/kaggle/input/vkgraphwithattrs/train_dataset_VK/submission.csv')
submission.head()

Unnamed: 0,ego_id,u,v,x1
0,8,0,93,0.0
1,8,0,143,0.0
2,8,0,151,1.606742
3,8,1,24,0.026496
4,8,5,4,0.159857


Так как в файле submission нет признаков, возьмем их из test файла.

In [11]:
test = pd.read_csv('/kaggle/input/test-with-n-common-friends/test_friends_total.csv')

submission = pd.merge(
    submission,
    test,
    how='left',
    on=['ego_id', 'u', 'v']
).drop_duplicates(subset=['ego_id', 'u', 'v'])

# Delete test to free space
del test

submission = submission.\
    drop('x1_y', axis=1).\
    rename(columns={
        'v_x': 'v',
        'x1_x': 'x1',
    })

submission.head()

Unnamed: 0.1,ego_id,u,v,x1,Unnamed: 0,t,x2,x3,common_friends_cnt,age_x,...,friend_cnt_y,same_city_id,same_school,same_university,sex_x_1.0,sex_x_2.0,sex_x_nan,sex_y_1.0,sex_y_2.0,sex_y_nan
0,8,0,93,0.0,938,359.6,0.0,0.0,5,36.0,...,7.0,True,False,False,0.0,1.0,0,0.0,1.0,0
1,8,0,143,0.0,1006,6.1,0.0,0.0,3,36.0,...,4.0,True,False,False,0.0,1.0,0,0.0,1.0,0
2,8,0,151,1.606742,183,0.2,1.386294,0.0,4,36.0,...,5.0,True,False,False,0.0,1.0,0,0.0,1.0,0
3,8,1,24,0.026496,654,594.5,0.0,0.0,10,120.0,...,26.0,False,False,False,1.0,0.0,0,0.0,1.0,0
4,8,5,4,0.159857,317,461.5,0.0,0.0,9,37.0,...,13.0,False,False,False,1.0,0.0,0,1.0,0.0,0


In [12]:
model_submission_data_x = submission.drop(
    [
        'ego_id', 'u', 'v', 'x1',
        'sex_x_nan', 'sex_y_nan',
        'Unnamed: 0'
    ],
    axis=1
)

model_submission_data_x.head()

Unnamed: 0,t,x2,x3,common_friends_cnt,age_x,nan_cnt_x,friend_cnt_x,age_y,nan_cnt_y,friend_cnt_y,same_city_id,same_school,same_university,sex_x_1.0,sex_x_2.0,sex_y_1.0,sex_y_2.0
0,359.6,0.0,0.0,5,36.0,0.0,156.0,36.0,0.0,7.0,True,False,False,0.0,1.0,0.0,1.0
1,6.1,0.0,0.0,3,36.0,0.0,156.0,43.0,2.0,4.0,True,False,False,0.0,1.0,0.0,1.0
2,0.2,1.386294,0.0,4,36.0,0.0,156.0,18.0,2.0,5.0,True,False,False,0.0,1.0,0.0,1.0
3,594.5,0.0,0.0,10,120.0,0.0,15.0,36.0,0.0,26.0,False,False,False,1.0,0.0,0.0,1.0
4,461.5,0.0,0.0,9,37.0,1.0,12.0,37.0,2.0,13.0,False,False,False,1.0,0.0,1.0,0.0


Сделаем предсказания обученной моделью и сохраним их в файл для отправки на проверку.

In [13]:
submission['x1'] = model.predict(model_submission_data_x)

In [14]:
submission[['ego_id', 'u', 'v', 'x1']].to_csv('submission10.csv', index=False)