In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np
import gc
import time
import os

In [3]:
# Directory that receives the feature files written by this notebook.
feat_dir = './feat/'
# makedirs(exist_ok=True) is a no-op when the directory already exists and
# also creates missing parent directories; it avoids the check-then-create
# race of os.path.exists() followed by os.mkdir().
os.makedirs(feat_dir, exist_ok=True)

In [4]:
# Load the two pickled frames; the validation pickle is bound to `test_data`.
train_data, test_data = (
    pd.read_pickle(pickle_path)
    for pickle_path in ('./pkl/train_data.pkl', './pkl/valid_data.pkl')
)

In [5]:
# Stack the train and validation rows into one frame with a fresh 0..n-1 index.
data = pd.concat([train_data, test_data], ignore_index=True)
data.head()

Unnamed: 0,author_id,author_name,author_org,label,paper_id
0,8GjtUmBs,li_guo,Institute of Pharmacology and Toxicology,0.0,P9a1gcvg
1,EShnTfSe,li_guo,Institute of Pharmacology and Toxicology,1.0,P9a1gcvg
2,t1ruuB9N,li_guo,Institute of Pharmacology and Toxicology,0.0,P9a1gcvg
3,xLLXKy6I,li_guo,Institute of Pharmacology and Toxicology,0.0,P9a1gcvg
4,jTu2AZES,li_guo,Institute of Pharmacology and Toxicology,0.0,P9a1gcvg


In [6]:
# Two pickled tables that each carry `author_id` and `paper_ids` columns
# (both are merged on `author_id` in the next cell).
train_author_paper_ids_path = './pkl/train_author_paper_ids.pkl'
whole_author_name_paper_ids_path = './pkl/whole_author_name_paper_ids.pkl'
train_author_paper_ids = pd.read_pickle(train_author_paper_ids_path)
whole_author_name_paper_ids = pd.read_pickle(whole_author_name_paper_ids_path)

In [7]:
def _union_paper_ids(primary, secondary):
    """Return the de-duplicated union of two paper-id collections as a list.

    The result order is deterministic: ids of `primary` come first (first
    occurrence wins), followed by ids that only appear in `secondary`.
    A `secondary` that is not list-like (the NaN a left merge produces for
    authors missing from the right table) is treated as empty.
    """
    extra = secondary if pd.api.types.is_list_like(secondary) else []
    seen = set()
    merged = []
    for paper_id in list(primary) + list(extra):
        if paper_id not in seen:
            seen.add(paper_id)
            merged.append(paper_id)
    return merged


# Left-join the train table onto the whole table; the shared `paper_ids`
# column becomes `paper_ids_x` (whole) and `paper_ids_y` (train, NaN if absent).
author_pub_ids = whole_author_name_paper_ids[['author_id', 'paper_ids']].merge(
    train_author_paper_ids, how='left', on='author_id')

# Positions inside `paper_ids` are used as indices further down (`pidx`, then
# `del v[idx]` on `orgs`), so the union must not go through a bare set():
# set iteration order is arbitrary and changes between interpreter runs.
# NOTE(review): whether `orgs` is positionally aligned with `paper_ids_x` is
# not visible in this notebook -- confirm against the code that builds it.
author_pub_ids['paper_ids'] = pd.Series(
    [
        _union_paper_ids(whole_ids, train_ids)
        for whole_ids, train_ids in zip(author_pub_ids['paper_ids_x'],
                                        author_pub_ids['paper_ids_y'])
    ],
    index=author_pub_ids.index,
    dtype=object,
)

author_pub_ids['paper_ids_len'] = author_pub_ids['paper_ids'].apply(len)

# Only the merged list and its length are kept.
author_pub_ids = author_pub_ids.drop(columns=['paper_ids_x', 'paper_ids_y'])

author_pub_ids.head()

Unnamed: 0,author_id,paper_ids,paper_ids_len
0,004mBKh6,"[TY0MYj83, S6rklGvj, 0vcMqEj6, iQkpCzIk, lRl7m...",9
1,008b5hIc,[nPuf0kYs],1
2,00BAgHFr,"[kn6wo3ot, 0f2awX7U, TuEkjf5i]",3
3,00M93JDI,"[CuWJcKM7, lKPhLbs5, 9ydPvnCP, N5kY5MnL, 0WyZf...",309
4,00ShLPmg,"[3X9wO1bx, KvHObx7O, IPKNYRm2, LV5duZL5, vg64a...",18


In [8]:
author_pub_ids.shape

(25911, 3)

In [9]:
data['author_org'].nunique()

76340

In [10]:
# Table with one row per `author_id` and a list-valued `orgs` column.
author_id_org_map_path = './pkl/author_id_org_map.pkl'
author_id_org_map = pd.read_pickle(author_id_org_map_path)

In [11]:
author_id_org_map.head()

Unnamed: 0,author_id,orgs
0,004mBKh6,"[, Department of Biomedical Engineering, Natio..."
1,008b5hIc,[State Key Laboratory Breeding Base of Nuclear...
2,00BAgHFr,"[, , School of Information System and Management]"
3,00M93JDI,"[None, None, None, academia sinica, None, None..."
4,00ShLPmg,"[.Department of Microbiology and Immunology,Gu..."


In [12]:
author_id_org_map['orgs'].apply(len).sum()

266615

In [13]:
# org embedding
# Ordered mapping org string -> vector. Every non-empty, non-missing
# `author_org` value is registered with a NaN placeholder.
from collections import OrderedDict

org_embedding = OrderedDict()
for org_name in data['author_org']:
    is_missing = org_name == '' or pd.isna(org_name)
    if not is_missing:
        if type(org_name) == float:
            # A float that is not NaN is unexpected here; report it.
            print('err %f' % org_name)
        org_embedding[org_name] = np.nan

In [14]:
# Register every organisation string found in the per-author `orgs` lists as
# a key of `org_embedding` (NaN placeholder value).
# The outer loop variable is deliberately not called `os`: that name would
# rebind the imported `os` module to a list for all code that runs afterwards.
for author_orgs in author_id_org_map['orgs']:
    for org_name in author_orgs:
        # Skip empty strings and missing values.
        if org_name == '' or pd.isna(org_name):
            continue
        if type(org_name) == float:
            # A float that is not NaN is unexpected here; report it.
            print('err %f' % org_name)
        org_embedding[org_name] = np.nan

In [15]:
len(org_embedding)

94374

In [16]:
# Snapshot of the dictionary keys in insertion order; this list is what gets encoded.
orgs = list(org_embedding)

In [17]:
orgs[:5]

['Institute of Pharmacology and Toxicology',
 'Dept. of Pharmaceutical Chemistry',
 'Department of Chemical Engineering',
 'Sichuan Union University',
 'West China School of Pharmacy']

In [18]:
from random import randint

import numpy as np
import torch
import pandas as pd
import re
import os
os.environ["CUDA_VISIBLE_DEVICES"] ='0'

In [19]:
# max_len=256
# def func(s):
#     s = s.lower()
#     s = re.sub(r'[^\w\s]', ' ', s)
#     wds = s.split()[:max_len]
#     return ' '.join(wds)

In [20]:
# orgs = list(map(func, orgs))
# orgs[:5]

In [21]:
# Build the sentence encoder from the relative path
# './bert-large-nli-mean-tokens' (resolved against the working directory).
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('./bert-large-nli-mean-tokens')

In [22]:
# Earlier call form, kept for reference:
# org_embed = model.encode(orgs, bsize=32, tokenize=False, verbose=True)
# Encode every organisation string; results come back in the order of `orgs`.
encode_options = {'show_progress_bar': True, 'batch_size': 128}
org_embed = model.encode(orgs, **encode_options)

Batches: 100%|██████████| 738/738 [05:15<00:00,  1.31s/it]


In [24]:
print(len(org_embed))
print(org_embed[0].shape)

94374
(1024,)


In [25]:
# Replace each placeholder with its vector: the n-th key receives the n-th
# row of `org_embed`.
org_keys = list(org_embedding.keys())
for position in range(len(org_keys)):
    org_embedding[org_keys[position]] = org_embed[position]

In [26]:
# Two left joins on `author_id`: first the paper-id lists, then the org lists.
data = data.merge(author_pub_ids, how='left', on='author_id')
data = data.merge(author_id_org_map, how='left', on='author_id')

In [27]:
data.head()

Unnamed: 0,author_id,author_name,author_org,label,paper_id,paper_ids,paper_ids_len,orgs
0,8GjtUmBs,li_guo,Institute of Pharmacology and Toxicology,0.0,P9a1gcvg,[J5KPOi3Y],1,[College of Geological Exploration Prospecting...
1,EShnTfSe,li_guo,Institute of Pharmacology and Toxicology,1.0,P9a1gcvg,[P9a1gcvg],1,[Institute of Pharmacology and Toxicology]
2,t1ruuB9N,li_guo,Institute of Pharmacology and Toxicology,0.0,P9a1gcvg,[T35c9ZAd],1,[Geological Survey Institute of Beijing]
3,xLLXKy6I,li_guo,Institute of Pharmacology and Toxicology,0.0,P9a1gcvg,[OQ7F7UAh],1,[Guangdong Electric Power Design Institute]
4,jTu2AZES,li_guo,Institute of Pharmacology and Toxicology,0.0,P9a1gcvg,[NRR6kM3z],1,[Department of Neurology]


In [28]:
def pidx(p, ps):
    """Return the position of the first element of `ps` that equals `p`.

    Returns `np.nan` when no element of `ps` equals `p` (this includes an
    empty `ps`).
    """
    matches = (position for position, candidate in enumerate(ps) if p == candidate)
    return next(matches, np.nan)
# For rows with label == 1, `idx` is the position of the row's `paper_id`
# inside its `paper_ids`; every other row gets NaN.
data['idx'] = [
    pidx(paper_id, paper_ids) if label == 1 else np.nan
    for paper_id, paper_ids, label in zip(data['paper_id'], data['paper_ids'], data['label'])
]
display(data.head(3))

Unnamed: 0,author_id,author_name,author_org,label,paper_id,paper_ids,paper_ids_len,orgs,idx
0,8GjtUmBs,li_guo,Institute of Pharmacology and Toxicology,0.0,P9a1gcvg,[J5KPOi3Y],1,[College of Geological Exploration Prospecting...,
1,EShnTfSe,li_guo,Institute of Pharmacology and Toxicology,1.0,P9a1gcvg,[P9a1gcvg],1,[Institute of Pharmacology and Toxicology],0.0
2,t1ruuB9N,li_guo,Institute of Pharmacology and Toxicology,0.0,P9a1gcvg,[T35c9ZAd],1,[Geological Survey Institute of Beijing],


In [29]:
print('del...')
from tqdm import tqdm_notebook

# For every row that has an `idx`, drop the entry at that position from a copy
# of the row's `orgs`; rows without an `idx` keep their value unchanged.
# The column is rebuilt in a single pass and assigned once:
#   * DataFrame.set_value was deprecated in pandas 0.21 and removed in 1.0;
#   * per-row `.loc` scalar access is very slow on a frame of this size.
# An out-of-range `idx` still raises IndexError, as before.
# NOTE(review): `idx` is a position in `paper_ids`; this assumes `orgs` is
# positionally aligned with `paper_ids` -- confirm.
updated_orgs = []
for row_orgs, hit_idx in tqdm_notebook(zip(data['orgs'], data['idx']), total=len(data)):
    if pd.isna(hit_idx):
        updated_orgs.append(row_orgs)
        continue
    remaining = list(row_orgs)
    del remaining[int(hit_idx)]
    updated_orgs.append(remaining)
# dtype=object keeps each cell a Python list.
data['orgs'] = pd.Series(updated_orgs, index=data.index, dtype=object)

del...


HBox(children=(IntProgress(value=0, max=1647782), HTML(value='')))




In [30]:
data.head()

Unnamed: 0,author_id,author_name,author_org,label,paper_id,paper_ids,paper_ids_len,orgs,idx
0,8GjtUmBs,li_guo,Institute of Pharmacology and Toxicology,0.0,P9a1gcvg,[J5KPOi3Y],1,[College of Geological Exploration Prospecting...,
1,EShnTfSe,li_guo,Institute of Pharmacology and Toxicology,1.0,P9a1gcvg,[P9a1gcvg],1,[],0.0
2,t1ruuB9N,li_guo,Institute of Pharmacology and Toxicology,0.0,P9a1gcvg,[T35c9ZAd],1,[Geological Survey Institute of Beijing],
3,xLLXKy6I,li_guo,Institute of Pharmacology and Toxicology,0.0,P9a1gcvg,[OQ7F7UAh],1,[Guangdong Electric Power Design Institute],
4,jTu2AZES,li_guo,Institute of Pharmacology and Toxicology,0.0,P9a1gcvg,[NRR6kM3z],1,[Department of Neurology],


In [31]:
# Collapse duplicate organisation entries within each row; the element order
# afterwards is whatever set iteration yields.
data['orgs'] = data['orgs'].apply(lambda row_orgs: list(set(row_orgs)))

In [32]:
from scipy import spatial
def cos_sim(a, b):
    """Return the cosine similarity of vectors `a` and `b`.

    Computed as one minus SciPy's cosine distance.
    """
    distance = spatial.distance.cosine(a, b)
    return 1 - distance

In [33]:
from tqdm import tqdm_notebook
# For each row, collect the cosine similarity between the embedding of
# `author_org` and the embedding of every non-empty entry of the row's `orgs`.
# Rows with no usable pair get the single placeholder value [0].
org_sims = []
# The per-row list is named `candidate_orgs` so the notebook-level `orgs`
# list (the strings that were encoded) is not overwritten by this loop.
for author_org, candidate_orgs in tqdm_notebook(data[['author_org', 'orgs']].values):
    sim = []
    if not (author_org == '' or pd.isna(author_org)):
        # Looked up once per row instead of once per candidate.
        author_vec = org_embedding[author_org]
        for o in candidate_orgs:
            if o == '' or pd.isna(o):
                continue
            sim.append(cos_sim(author_vec, org_embedding[o]))
    if len(sim) == 0:
        sim = [0]
    org_sims.append(sim)

HBox(children=(IntProgress(value=0, max=1647782), HTML(value='')))




In [34]:
data['org_sims'] = org_sims
data.head()

Unnamed: 0,author_id,author_name,author_org,label,paper_id,paper_ids,paper_ids_len,orgs,idx,org_sims
0,8GjtUmBs,li_guo,Institute of Pharmacology and Toxicology,0.0,P9a1gcvg,[J5KPOi3Y],1,[College of Geological Exploration Prospecting...,,[0.4848203957080841]
1,EShnTfSe,li_guo,Institute of Pharmacology and Toxicology,1.0,P9a1gcvg,[P9a1gcvg],1,[],0.0,[0]
2,t1ruuB9N,li_guo,Institute of Pharmacology and Toxicology,0.0,P9a1gcvg,[T35c9ZAd],1,[Geological Survey Institute of Beijing],,[0.48196685314178467]
3,xLLXKy6I,li_guo,Institute of Pharmacology and Toxicology,0.0,P9a1gcvg,[OQ7F7UAh],1,[Guangdong Electric Power Design Institute],,[0.5620583891868591]
4,jTu2AZES,li_guo,Institute of Pharmacology and Toxicology,0.0,P9a1gcvg,[NRR6kM3z],1,[Department of Neurology],,[0.6908237934112549]


In [35]:
# Reduce each row's similarity list to four scalar features.
stat_columns = (
    ('orgs_sims_max2', np.max),
    ('orgs_sims_min2', np.min),
    ('orgs_sims_mean2', np.mean),
    ('orgs_sims_std2', np.std),
)
for column_name, reducer in stat_columns:
    data[column_name] = data['org_sims'].apply(reducer)

In [36]:
data.head(3)

Unnamed: 0,author_id,author_name,author_org,label,paper_id,paper_ids,paper_ids_len,orgs,idx,org_sims,orgs_sims_max2,orgs_sims_min2,orgs_sims_mean2,orgs_sims_std2
0,8GjtUmBs,li_guo,Institute of Pharmacology and Toxicology,0.0,P9a1gcvg,[J5KPOi3Y],1,[College of Geological Exploration Prospecting...,,[0.4848203957080841],0.48482,0.48482,0.48482,0.0
1,EShnTfSe,li_guo,Institute of Pharmacology and Toxicology,1.0,P9a1gcvg,[P9a1gcvg],1,[],0.0,[0],0.0,0.0,0.0,0.0
2,t1ruuB9N,li_guo,Institute of Pharmacology and Toxicology,0.0,P9a1gcvg,[T35c9ZAd],1,[Geological Survey Institute of Beijing],,[0.48196685314178467],0.481967,0.481967,0.481967,0.0


In [37]:
# Length of the paper list, minus one on rows whose label equals 1.
data['paper_num'] = data['paper_ids_len'].sub(data['label'].eq(1).astype(int))

In [38]:
# Number of entries left in `orgs`, plus one on rows whose label equals 1.
data['orgs_num'] = data['orgs'].map(len).add(data['label'].eq(1).astype(int))

In [39]:
# Persist the six engineered feature columns. The target is built from
# `feat_dir` (declared and created at the top of the notebook) so the output
# location is defined in exactly one place; with feat_dir = './feat/' this
# resolves to './feat/feat_v3_bert.pkl'.
feature_columns = [
    'paper_num', 'orgs_num',
    'orgs_sims_max2', 'orgs_sims_min2', 'orgs_sims_mean2', 'orgs_sims_std2',
]
data[feature_columns].to_pickle(os.path.join(feat_dir, 'feat_v3_bert.pkl'))