In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np
import gc
import time

In [3]:
# Read the two pickled splits. Note that `test_data` is loaded from
# valid_data.pkl, i.e. the "test" frame here is the validation split.
# (pd.read_pickle unpickles arbitrary objects - only point it at trusted files.)
train_data = pd.read_pickle('./pkl/train_data.pkl')
test_data = pd.read_pickle('./pkl/valid_data.pkl')

# Stack train on top of validation and renumber the rows 0..n-1.
data = pd.concat([train_data, test_data], ignore_index=True)

In [4]:
# Load the two pre-computed "cross" feature frames, in this order.
# (pd.read_pickle unpickles arbitrary objects - only point it at trusted files.)
author_org_cross, keyword_venue_cross = (
    pd.read_pickle(feat_path)
    for feat_path in (
        './feat/author_org_cross.pkl',
        './feat/keyword_venue_cross.pkl',
    )
)

In [5]:
# Load the pre-computed "*_st_bert" feature frames plus feat_v3_bert.
# The target names and the paths below are matched by position - keep both
# sequences in the same order when adding or removing a feature file.
# (pd.read_pickle unpickles arbitrary objects - only point it at trusted files.)
(
    abstract_sims_st_bert,
    keywords_sims_st_bert,
    title_sims_st_bert,
    venue_sims_st_bert,
    authors_sims_st_bert,
    orgs_sims_st_bert,
    feat_v3_bert,
) = map(pd.read_pickle, (
    './feat/abstract_sims_st_bert.pkl',
    './feat/keywords_sims_st_bert.pkl',
    './feat/title_sims_st_bert.pkl',
    './feat/venue_sims_st_bert.pkl',
    './feat/authors_sims_st_bert.pkl',
    './feat/orgs_sims_st_bert.pkl',
    './feat/feat_v3_bert.pkl',
))

In [6]:
# All BERT-derived feature frames, in the order they are appended to `data`.
bert_feat = [
    abstract_sims_st_bert,
    keywords_sims_st_bert,
    title_sims_st_bert,
    venue_sims_st_bert,
    authors_sims_st_bert,
    orgs_sims_st_bert,
    feat_v3_bert,
]

In [7]:
# Glue every feature frame onto `data` column-wise. axis=1 concat aligns rows
# on the index, so this assumes each feature frame shares data's 0..n-1 row
# index - TODO confirm against the scripts that wrote the pickles.
# NOTE: `data` is overwritten, so re-running this cell appends the feature
# columns a second time.
feature_frames = [data, author_org_cross, keyword_venue_cross, *bert_feat]
data = pd.concat(feature_frames, axis=1)
print(data.columns)

Index(['author_id', 'author_name', 'author_org', 'label', 'paper_id',
       'author_org_in_orgs_b_times', 'author_interset_num',
       'author_interset_num/paper_ids_len', 'venue_a_in_venue_b_num',
       'keywords_interset_num', 'venue_a_in_venue_b_num/paper_ids_len',
       'keywords_interset_num/paper_ids_len', 'abstract_sims_min',
       'abstract_sims_max', 'abstract_sims_mean', 'abstract_sims_std',
       'abstract_sims_mm2', 'keywords_sims_min', 'keywords_sims_max',
       'keywords_sims_mean', 'keywords_sims_std', 'keywords_sims_mm2',
       'title_sims_min', 'title_sims_max', 'title_sims_mean', 'title_sims_std',
       'title_sims_mm2', 'venue_sims_min', 'venue_sims_max', 'venue_sims_mean',
       'venue_sims_std', 'venue_sims_mm2', 'authors_sims_min',
       'authors_sims_max', 'authors_sims_mean', 'authors_sims_std',
       'authors_sims_mm2', 'orgs_sims_min', 'orgs_sims_max', 'orgs_sims_mean',
       'orgs_sims_std', 'orgs_sims_mm2', 'paper_num', 'orgs_num',
       'orgs_

In [8]:
# Columns that get a within-group rank feature, organised by name prefix.
# The flattened order below is the order the ranks are computed and saved in.
_rank_col_groups = [
    # non-similarity columns
    ['paper_num', 'author_org_in_orgs_b_times', 'author_interset_num',
     'author_interset_num/paper_ids_len', 'venue_a_in_venue_b_num',
     'keywords_interset_num', 'venue_a_in_venue_b_num/paper_ids_len',
     'keywords_interset_num/paper_ids_len'],
    # abstract_sims_*
    ['abstract_sims_min', 'abstract_sims_max', 'abstract_sims_mean',
     'abstract_sims_std', 'abstract_sims_mm2'],
    # keywords_sims_*
    ['keywords_sims_min', 'keywords_sims_max', 'keywords_sims_mean',
     'keywords_sims_std', 'keywords_sims_mm2'],
    # title_sims_*
    ['title_sims_min', 'title_sims_max', 'title_sims_mean',
     'title_sims_std', 'title_sims_mm2'],
    # venue_sims_*
    ['venue_sims_min', 'venue_sims_max', 'venue_sims_mean',
     'venue_sims_std', 'venue_sims_mm2'],
    # authors_sims_*
    ['authors_sims_min', 'authors_sims_max', 'authors_sims_mean',
     'authors_sims_std', 'authors_sims_mm2'],
    # orgs_sims_*
    ['orgs_sims_min', 'orgs_sims_max', 'orgs_sims_mean',
     'orgs_sims_std', 'orgs_sims_mm2'],
    # orgs_sims_*2
    ['orgs_sims_max2', 'orgs_sims_min2', 'orgs_sims_mean2',
     'orgs_sims_std2'],
]
col_to_rank = [name for group in _rank_col_groups for name in group]

In [9]:
# Build the grouping key "<author_name>_<paper_id>"; rows sharing the same
# key are later ranked against each other.
name_part = data['author_name']
paper_part = data['paper_id']
data['pair_name'] = name_part + '_' + paper_part

In [10]:
from sklearn.preprocessing import LabelEncoder

# Swap the string pair keys for integer codes (one code per distinct key).
# The column is overwritten in place, so the original strings are gone after
# this cell runs.
pair_encoder = LabelEncoder()
data['pair_name'] = pair_encoder.fit_transform(data['pair_name'])

In [11]:
# For every column in `col_to_rank`, rank each row against the other rows
# that share its `pair_name`:
#   <col>_rank_a - ascending rank  (smallest value gets rank 1)
#   <col>_rank_b - descending rank (largest value gets rank 1)
# Ties use pandas' default rank method. `ranks` ends up holding the Series in
# the order a, b, a, b, ... following `col_to_rank`.

# The grouping does not depend on the column being ranked, so build it once
# instead of calling data.groupby('pair_name') twice per column.
pair_groups = data.groupby('pair_name')

ranks = []
for i, c in enumerate(col_to_rank):
    print(i, c)  # progress trace: index and column name
    col_by_pair = pair_groups[c]
    rank_a = col_by_pair.rank(ascending=True).rename('%s_rank_a' % c)
    rank_b = col_by_pair.rank(ascending=False).rename('%s_rank_b' % c)
    ranks.append(rank_a)
    ranks.append(rank_b)

0 paper_num
1 author_org_in_orgs_b_times
2 author_interset_num
3 author_interset_num/paper_ids_len
4 venue_a_in_venue_b_num
5 keywords_interset_num
6 venue_a_in_venue_b_num/paper_ids_len
7 keywords_interset_num/paper_ids_len
8 abstract_sims_min
9 abstract_sims_max
10 abstract_sims_mean
11 abstract_sims_std
12 abstract_sims_mm2
13 keywords_sims_min
14 keywords_sims_max
15 keywords_sims_mean
16 keywords_sims_std
17 keywords_sims_mm2
18 title_sims_min
19 title_sims_max
20 title_sims_mean
21 title_sims_std
22 title_sims_mm2
23 venue_sims_min
24 venue_sims_max
25 venue_sims_mean
26 venue_sims_std
27 venue_sims_mm2
28 authors_sims_min
29 authors_sims_max
30 authors_sims_mean
31 authors_sims_std
32 authors_sims_mm2
33 orgs_sims_min
34 orgs_sims_max
35 orgs_sims_mean
36 orgs_sims_std
37 orgs_sims_mm2
38 orgs_sims_max2
39 orgs_sims_min2
40 orgs_sims_mean2
41 orgs_sims_std2


In [12]:
# Append every rank Series to `data` as a new column (rows aligned on index).
# NOTE: `data` is overwritten, so re-running this cell duplicates the rank
# columns.
data = pd.concat([data, *ranks], axis=1)

In [13]:
# Names of the generated rank columns: all ascending ("_rank_a") ones first,
# then all descending ("_rank_b") ones, each following col_to_rank's order.
cols = [name + suffix for suffix in ('_rank_a', '_rank_b') for name in col_to_rank]

# Persist only the rank features.
rank_features = data[cols]
rank_features.to_pickle('./feat/rank_feat_bert.pkl')