In [1]:
from collections import defaultdict

import numpy as np
import pandas as pd
from tqdm.auto import tqdm

from catboost import CatBoostRegressor, Pool
from sklearn.metrics import mean_squared_error

from warnings import filterwarnings
filterwarnings('ignore')



### Data generation

In [2]:
def get_chunk_features(train_chunk: pd.DataFrame, attr_chunk: pd.DataFrame) -> pd.DataFrame:
    """Build pairwise (u, v) features for the edges of one ego network.

    Neither input frame is modified; all work happens on private copies.

    Parameters
    ----------
    train_chunk : pd.DataFrame
        Edge rows. Must contain the columns ``ego_id``, ``u`` and ``v``;
        every other column is passed through unchanged.
    attr_chunk : pd.DataFrame
        Node attribute rows. Must contain ``ego_id``, ``u``, ``city_id``,
        ``sex``, ``school`` and ``university``. May be empty, in which case
        all attribute-derived columns are NaN / False.

    Returns
    -------
    pd.DataFrame
        One row per input edge (same order, fresh ``RangeIndex``) with:

        * ``common_friends_cnt`` - number of nodes adjacent to both u and v;
        * the remaining attribute columns plus ``nan_cnt`` and ``friend_cnt``,
          once for node u (suffix ``_x``) and once for node v (suffix ``_y``);
        * ``same_city_id`` / ``same_school`` / ``same_university`` - True only
          when both values are present and equal;
        * one-hot columns for ``sex_x`` and ``sex_y`` (including a NaN column).

        The raw ``city_id``, ``sex``, ``school`` and ``university`` columns are
        dropped.
    """
    # Copy first: callers pass slices of larger frames, and adding columns to
    # them would either leak into the caller's data or trigger
    # SettingWithCopyWarning.
    edges = train_chunk.copy()
    nodes = attr_chunk.copy()

    # Number of missing values per attribute row (counted over every column
    # present at this point).
    nodes['nan_cnt'] = nodes.isna().sum(axis=1)

    # Undirected adjacency: each edge (u, v) makes u and v friends of each other.
    friends = defaultdict(set)
    for u, v in zip(edges['u'], edges['v']):
        friends[u].add(v)
        friends[v].add(u)

    # Built as explicit Series (rather than row-wise DataFrame.apply) so that
    # empty chunks still yield a proper, correctly typed column.
    no_friends = frozenset()
    nodes['friend_cnt'] = pd.Series(
        [len(friends.get(node, no_friends)) for node in nodes['u']],
        index=nodes.index,
        dtype='int64',
    )
    edges['common_friends_cnt'] = pd.Series(
        [
            len(friends.get(u, no_friends) & friends.get(v, no_friends))
            for u, v in zip(edges['u'], edges['v'])
        ],
        index=edges.index,
        dtype='int64',
    )

    # Attach node attributes twice: first for u, then for v. After the second
    # merge the shared attribute columns carry the default `_x` (u) and
    # `_y` (v) suffixes.
    edges = pd.merge(
        edges,
        nodes,
        how='left',
        on=['ego_id', 'u']
    )
    edges = pd.merge(
        edges,
        nodes.rename(columns={'u': 'v'}),
        how='left',
        on=['ego_id', 'v']
    )

    # Equality flags; the notna() guard is kept so a missing value on the
    # u side always yields False.
    for column in ('city_id', 'school', 'university'):
        left, right = edges[f'{column}_x'], edges[f'{column}_y']
        flag_name = 'same_city_id' if column == 'city_id' else f'same_{column}'
        edges[flag_name] = (left == right) & left.notna()

    # One-hot encode sex for both endpoints.
    # NOTE(review): the dummy columns depend on the values present in this
    # chunk, so chunks may differ in their column sets - confirm that the
    # consumer tolerates that.
    edges = edges.join(
        pd.get_dummies(edges['sex_x'], dummy_na=True, prefix='sex_x')
    ).join(
        pd.get_dummies(edges['sex_y'], dummy_na=True, prefix='sex_y')
    )

    # Drop the raw columns that have been replaced by derived features.
    edges = edges.drop([
        'city_id_x',
        'city_id_y',
        'sex_x',
        'sex_y',
        'school_x',
        'school_y',
        'university_x',
        'university_y',
    ], axis=1)

    return edges

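Candidate features (the function above implements only part of this list; for example the `t` statistics, `friends_age` and the `*_friends_*_mode` features are not computed):

age (u, v)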

sex (u, v)

friend_cnt (u, v)

t (u, v): mean, median, min, max, std

nan_cnt (u, v) (age, school, university, city)

friends_age (u, v): mean, median, min, max, std

same_school

same_university

same_city_id

common_friends_cnt

same_friends_school_mode

same_friends_university_mode

same_friends_city_mode

match_friends_school_mode (u, v)

match_friends_university_mode (u, v)

match_friends_city_mode (u, v)

# Test

In [3]:
# Shell escape: list the contents of the Kaggle input directory.
!ls /kaggle/input/

vkgraphwithattrs


In [4]:
# Read the test edge list; the displayed head shows the columns
# ego_id, u, v, t, x1, x2, x3.
test = pd.read_csv(
    '/kaggle/input/vkgraphwithattrs/train_dataset_VK/test.csv',
)
print(test.shape)
# For quick experiments the frame can be subsampled, e.g.:
#     test = test.sample(frac=0.05)
test.head()

(40548780, 7)


Unnamed: 0,ego_id,u,v,t,x1,x2,x3
0,8,20,19,185.7,0.0003839089,0.0,0.0
1,8,131,125,161.4,0.4034464,0.0,0.0
2,8,73,56,127.0,8.554643e-05,0.0,0.0
3,8,0,4,594.5,0.2886418,0.0,0.0
4,8,63,73,127.0,4.281692e-07,0.0,0.0


In [5]:
# Read the node attributes and turn every -1 into NaN
# (presumably -1 is the dataset's missing-value sentinel - confirm).
attr = pd.read_csv('/kaggle/input/vkgraphwithattrs/attr.csv').replace(-1, np.nan)
attr.head()

Unnamed: 0,ego_id,u,age,city_id,sex,school,university
0,0,227,68.0,,1.0,778293348.0,
1,0,45,38.0,237065842.0,1.0,82803468.0,238500268.0
2,0,142,60.0,237065842.0,1.0,196560139.0,
3,0,280,66.0,,2.0,963209731.0,720783270.0
4,0,41,18.0,,2.0,308862409.0,


In [6]:
!mkdir /kaggle/working/test_chunks

In [8]:
# Half-open range of positions (in order of first appearance in `test`) of the
# ego networks processed by this run.
EGO_ID_START, EGO_ID_STOP = 5000, 10000

ego_ids = test['ego_id'].unique()[EGO_ID_START:EGO_ID_STOP]
print(ego_ids)

# Restrict both tables to the selected ego networks once and split them with a
# single groupby, instead of re-scanning the full frames with a boolean mask
# on every iteration.
test_selected = test[test['ego_id'].isin(ego_ids)]
attr_by_ego = {
    ego_id: chunk
    for ego_id, chunk in attr[attr['ego_id'].isin(ego_ids)].groupby('ego_id')
}
# Fallback for ego networks that have no attribute rows at all.
empty_attr = attr.iloc[0:0]

for ego_id, test_chunk in tqdm(
    test_selected.groupby('ego_id', sort=False),
    total=len(ego_ids),
):
    # Attribute rows of the same ego network
    attr_chunk = attr_by_ego.get(ego_id, empty_attr)

    # Compute features
    test_chunk = get_chunk_features(test_chunk, attr_chunk)

    # Save chunk
    # NOTE(review): the row index is written too, so re-reading the file yields
    # an extra `Unnamed: 0` column; kept as-is for compatibility - confirm
    # whether downstream code expects that column.
    test_chunk.to_csv(f'/kaggle/working/test_chunks/test_ego_id_{ego_id}.csv')

  0%|          | 0/5000 [00:00<?, ?it/s]

In [10]:
import os

In [12]:
# Collect the chunk files written earlier. os.listdir returns entries in
# arbitrary order, so sort them to make the row order of the merged frame
# reproducible, and skip anything that is not a CSV file.
file_paths = [
    '/kaggle/working/test_chunks/' + file_name
    for file_name in sorted(os.listdir('/kaggle/working/test_chunks'))
    if file_name.endswith('.csv')
]

In [13]:
# Load every chunk file and stack them into one frame with a fresh 0..n-1 index.
chunk_frames = (pd.read_csv(path) for path in file_paths)
test_merged = pd.concat(chunk_frames, ignore_index=True)

In [14]:
# Sanity check: (rows, columns) of the merged feature table.
test_merged.shape

(9705475, 24)

In [15]:
# Persist the merged feature table to the current working directory,
# without writing the row index.
test_merged.to_csv('test_merged.csv', index=False)