In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] ='0'

In [3]:
import pandas as pd
import numpy as np
import gc
import time

In [4]:
out_dir = './out/'
if not os.path.exists(out_dir):
    os.mkdir(out_dir)

In [5]:
from sklearn.preprocessing import LabelEncoder

In [6]:
from catboost import CatBoostClassifier, Pool

In [7]:
train_data = pd.read_pickle('./pkl/train_data.pkl')
test_data = pd.read_pickle('./pkl/valid_data.pkl')

In [8]:
data = pd.concat([train_data, test_data]).reset_index(drop=True)

In [9]:
data.head()

Unnamed: 0,author_id,author_name,author_org,label,paper_id
0,8GjtUmBs,li_guo,Institute of Pharmacology and Toxicology,0.0,P9a1gcvg
1,EShnTfSe,li_guo,Institute of Pharmacology and Toxicology,1.0,P9a1gcvg
2,t1ruuB9N,li_guo,Institute of Pharmacology and Toxicology,0.0,P9a1gcvg
3,xLLXKy6I,li_guo,Institute of Pharmacology and Toxicology,0.0,P9a1gcvg
4,jTu2AZES,li_guo,Institute of Pharmacology and Toxicology,0.0,P9a1gcvg


In [10]:
print(train_data.shape)
print(test_data.shape)
print(data.shape)

(1194996, 5)
(452786, 4)
(1647782, 5)


In [11]:
##
time_feat_a = pd.read_pickle('./feat/time_feat_a.pkl')
rank_feat = pd.read_pickle('./feat/rank_feat3.pkl')

In [12]:
feats = [time_feat_a, rank_feat]

In [13]:
data = pd.concat([data] + feats, axis=1)

In [14]:
data.head()

Unnamed: 0,author_id,author_name,author_org,label,paper_id,year_a,year_b_min,year_b_max,year_b_mean,year_b_std,...,authors_sims_mm2_bert_rank_b,orgs_sims_min_bert_rank_b,orgs_sims_max_bert_rank_b,orgs_sims_mean_bert_rank_b,orgs_sims_std_bert_rank_b,orgs_sims_mm2_bert_rank_b,orgs_sims_max2_bert_rank_b,orgs_sims_min2_bert_rank_b,orgs_sims_mean2_bert_rank_b,orgs_sims_std2_bert_rank_b
0,8GjtUmBs,li_guo,Institute of Pharmacology and Toxicology,0.0,P9a1gcvg,1996,2013,2013,2013.0,0.0,...,5.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,3.5
1,EShnTfSe,li_guo,Institute of Pharmacology and Toxicology,1.0,P9a1gcvg,1996,0,0,0.0,0.0,...,6.0,6.0,6.0,6.0,4.0,6.0,6.0,6.0,6.0,3.5
2,t1ruuB9N,li_guo,Institute of Pharmacology and Toxicology,0.0,P9a1gcvg,1996,2011,2011,2011.0,0.0,...,4.0,5.0,5.0,5.0,4.0,5.0,5.0,5.0,5.0,3.5
3,xLLXKy6I,li_guo,Institute of Pharmacology and Toxicology,0.0,P9a1gcvg,1996,2008,2008,2008.0,0.0,...,1.0,3.0,3.0,3.0,4.0,3.0,3.0,3.0,3.0,3.5
4,jTu2AZES,li_guo,Institute of Pharmacology and Toxicology,0.0,P9a1gcvg,1996,2011,2011,2011.0,0.0,...,3.0,1.0,1.0,1.0,4.0,1.0,1.0,1.0,1.0,3.5


In [29]:
drop_feat = ['author_id', 'author_name', 'author_org', 'paper_id', 'label']
used_feat = [c for c in data.columns if c not in drop_feat]
print(len(used_feat))
print(used_feat)

95
['year_a', 'year_b_min', 'year_b_max', 'year_b_mean', 'year_b_std', 'year_b_mm2', 'year_b_min-year_a', 'year_b_max-year_a', 'year_b_mean-year_a', 'year_b_mm2-year_a', 'year_inside_range', 'paper_num_rank_a', 'author_org_in_orgs_b_times_rank_a', 'author_interset_num_rank_a', 'author_interset_num/paper_ids_len_rank_a', 'venue_a_in_venue_b_num_rank_a', 'keywords_interset_num_rank_a', 'venue_a_in_venue_b_num/paper_ids_len_rank_a', 'keywords_interset_num/paper_ids_len_rank_a', 'abstract_sims_min_bert_rank_a', 'abstract_sims_max_bert_rank_a', 'abstract_sims_mean_bert_rank_a', 'abstract_sims_std_bert_rank_a', 'abstract_sims_mm2_bert_rank_a', 'keywords_sims_min_bert_rank_a', 'keywords_sims_max_bert_rank_a', 'keywords_sims_mean_bert_rank_a', 'keywords_sims_std_bert_rank_a', 'keywords_sims_mm2_bert_rank_a', 'title_sims_min_bert_rank_a', 'title_sims_max_bert_rank_a', 'title_sims_mean_bert_rank_a', 'title_sims_std_bert_rank_a', 'title_sims_mm2_bert_rank_a', 'venue_sims_min_bert_rank_a', 'venue_

In [30]:
train = data[:len(train_data)]
test = data[len(train_data):]

In [31]:
test_x = test[used_feat]

In [32]:
# cv split according to author names
train_author_name = train['author_name'].unique()
print(len(train_author_name))

204


In [33]:
def gen_dict(df, label):
    df = df[['paper_id', 'author_name', 'author_id', label]]
    res = df.groupby(['paper_id', 'author_name'])[label].apply(np.argmax).reset_index()
    res.columns = ['paper_id', 'author_name', 'index']
    idx_name = df[['author_id']].reset_index()
    res = res.merge(idx_name, 'left', 'index')
    from collections import defaultdict
    res_dict = defaultdict(list)
    for pid, aid in res[['paper_id', 'author_id']].values:
        res_dict[aid].append(pid)
    return res_dict

In [34]:
def f1_score(pred_dict, true_dict):
    total_unassigned_paper = np.sum([len(l) for l in true_dict.values()])
    print('total_unassigned_paper: ', total_unassigned_paper)
    print('true author num: ', len(true_dict))
    author_weight = dict((k, len(v) / total_unassigned_paper) for k, v in true_dict.items())
    author_precision = {}
    author_recall = {}
    for author in author_weight.keys():
        # total pred, total belong, correct pred
        total_belong = len(true_dict[author])
        total_pred = (len(pred_dict[author]) if author in pred_dict else 0)
        correct_pred = len(set(true_dict[author]) & (set(pred_dict[author]) if author in pred_dict else set()))
        author_precision[author] = (correct_pred/total_pred) if total_pred > 0 else 0
        author_recall[author] = correct_pred / total_belong
        
    weighted_precision = 0
    weighted_recall = 0
    for author, weight in author_weight.items():
        weighted_precision += weight * author_precision[author]
        weighted_recall += weight * author_recall[author]
    weighted_f1 = 2 * weighted_precision * weighted_recall / (weighted_precision + weighted_recall)
    print('weighted_precision: %f, weighted_recall: %f, weighted_f1: %f' %(weighted_precision, weighted_recall, weighted_f1))
    return weighted_precision, weighted_recall, weighted_f1

In [35]:
from sklearn.model_selection import KFold
preds = np.zeros((test.shape[0], 2))
scores = []
f1_scores = []
has_saved = False
imp = pd.DataFrame()
imp['feat'] = used_feat

kfold = KFold(n_splits=5, shuffle=True, random_state=42)
for index, (tr_idx, va_idx) in enumerate(kfold.split(train_author_name)):
    print('*' * 30)
    trn_aname, val_aname = train_author_name[tr_idx], train_author_name[va_idx]
    trn_dat = train[train['author_name'].isin(trn_aname)]
    val_dat = train[train['author_name'].isin(val_aname)]
    X_train, y_train, X_valid, y_valid = trn_dat[used_feat], trn_dat['label'], val_dat[used_feat], val_dat['label']
    cate_features = []
    train_pool = Pool(X_train, y_train, cat_features=cate_features)
    eval_pool = Pool(X_valid, y_valid,cat_features=cate_features)
    if not has_saved: 
        cbt_model = CatBoostClassifier(iterations=10000,
                           learning_rate=0.1,
                           eval_metric='AUC',
                           use_best_model=True,
                           random_seed=42,
                           logging_level='Verbose',
                           task_type='GPU',
                           devices='0',
#                            gpu_ram_part=0.5,
                           early_stopping_rounds=300,
                           loss_function='Logloss',
#                            depth=12,
                           )
        cbt_model.fit(train_pool, eval_set=eval_pool, verbose=100)
#         with open('./models/fold%d_cbt_v1.mdl' % index, 'wb') as file:
#             pickle.dump(cbt_model, file)
#     else:
#         with open('./models/fold%d_cbt_v1.mdl' % index, 'rb') as file:
#             cbt_model = pickle.load(file)
    
    imp['score%d' % (index+1)] = cbt_model.feature_importances_
    
    val_dat['pred'] = cbt_model.predict_proba(X_valid)[:, 1]
    val_pred_dict = gen_dict(val_dat, 'pred')
    val_true_dict = gen_dict(val_dat, 'label')
    precision, recall, f1 = f1_score(val_pred_dict, val_true_dict)
    f1_scores.append(f1)
    
    score = cbt_model.best_score_['validation']['AUC']
    scores.append(score)
    print('fold %d round %d : auc: %.6f | mean auc %.6f | F1: %.6f | mean F1: %.6f' % (index+1, cbt_model.best_iteration_, score,np.mean(scores), f1, np.mean(f1_scores))) 
    preds += cbt_model.predict_proba(test_x)  
#     break
    del cbt_model, train_pool, eval_pool
    del X_train, y_train, X_valid, y_valid
    import gc
    gc.collect()
    
#     mdls.append(cbt_model)

******************************




0:	learn: 0.9975497	test: 0.9971379	best: 0.9971379 (0)	total: 15.8ms	remaining: 2m 38s
100:	learn: 0.9997494	test: 0.9996755	best: 0.9996755 (100)	total: 1.31s	remaining: 2m 8s
200:	learn: 0.9998242	test: 0.9997479	best: 0.9997479 (200)	total: 2.56s	remaining: 2m 5s
300:	learn: 0.9998572	test: 0.9997733	best: 0.9997734 (299)	total: 3.83s	remaining: 2m 3s
400:	learn: 0.9998775	test: 0.9997894	best: 0.9997895 (399)	total: 5.11s	remaining: 2m 2s
500:	learn: 0.9998966	test: 0.9997950	best: 0.9997954 (489)	total: 6.39s	remaining: 2m 1s
600:	learn: 0.9999099	test: 0.9998027	best: 0.9998028 (595)	total: 7.71s	remaining: 2m
700:	learn: 0.9999231	test: 0.9998082	best: 0.9998084 (695)	total: 8.99s	remaining: 1m 59s
800:	learn: 0.9999328	test: 0.9998132	best: 0.9998132 (800)	total: 10.3s	remaining: 1m 58s
900:	learn: 0.9999403	test: 0.9998142	best: 0.9998147 (874)	total: 11.6s	remaining: 1m 56s
1000:	learn: 0.9999473	test: 0.9998164	best: 0.9998177 (989)	total: 12.8s	remaining: 1m 55s
1100:	lear



0:	learn: 0.9972583	test: 0.9969720	best: 0.9969720 (0)	total: 46.3ms	remaining: 7m 42s
100:	learn: 0.9997396	test: 0.9996647	best: 0.9996647 (100)	total: 2.63s	remaining: 4m 18s
200:	learn: 0.9998118	test: 0.9997188	best: 0.9997193 (197)	total: 5.29s	remaining: 4m 18s
300:	learn: 0.9998518	test: 0.9997449	best: 0.9997452 (298)	total: 7.98s	remaining: 4m 17s
400:	learn: 0.9998756	test: 0.9997635	best: 0.9997640 (399)	total: 10.7s	remaining: 4m 17s
500:	learn: 0.9998925	test: 0.9997688	best: 0.9997688 (500)	total: 13.5s	remaining: 4m 16s
600:	learn: 0.9999071	test: 0.9997710	best: 0.9997720 (557)	total: 16.1s	remaining: 4m 12s
700:	learn: 0.9999195	test: 0.9997789	best: 0.9997789 (700)	total: 18.8s	remaining: 4m 9s
800:	learn: 0.9999287	test: 0.9997822	best: 0.9997822 (796)	total: 21.4s	remaining: 4m 5s
900:	learn: 0.9999379	test: 0.9997869	best: 0.9997870 (895)	total: 24s	remaining: 4m 2s
1000:	learn: 0.9999461	test: 0.9997880	best: 0.9997884 (988)	total: 26.8s	remaining: 4m 1s
1100:	l



0:	learn: 0.9974340	test: 0.9970190	best: 0.9970190 (0)	total: 17ms	remaining: 2m 49s
100:	learn: 0.9997501	test: 0.9996758	best: 0.9996758 (100)	total: 2.5s	remaining: 4m 4s
200:	learn: 0.9998254	test: 0.9997292	best: 0.9997292 (200)	total: 5.11s	remaining: 4m 9s
300:	learn: 0.9998624	test: 0.9997545	best: 0.9997545 (300)	total: 7.62s	remaining: 4m 5s
400:	learn: 0.9998850	test: 0.9997683	best: 0.9997689 (396)	total: 10.1s	remaining: 4m 1s
500:	learn: 0.9999032	test: 0.9997736	best: 0.9997737 (497)	total: 12.7s	remaining: 4m 1s
600:	learn: 0.9999161	test: 0.9997758	best: 0.9997759 (597)	total: 15.3s	remaining: 3m 59s
700:	learn: 0.9999282	test: 0.9997799	best: 0.9997799 (683)	total: 17.9s	remaining: 3m 58s
800:	learn: 0.9999378	test: 0.9997867	best: 0.9997867 (800)	total: 20.6s	remaining: 3m 56s
900:	learn: 0.9999455	test: 0.9997877	best: 0.9997891 (884)	total: 23.1s	remaining: 3m 53s
1000:	learn: 0.9999517	test: 0.9997847	best: 0.9997891 (884)	total: 25.7s	remaining: 3m 51s
1100:	lea

In [36]:
imp.sort_values(by='score1', ascending=False)

Unnamed: 0,feat,score1,score2,score3,score4,score5
55,author_interset_num_rank_b,8.298452,9.578436,7.598800,7.475774,8.116196
2,year_b_max,5.735328,6.535335,3.740404,5.142827,8.352572
91,orgs_sims_max2_bert_rank_b,5.313472,5.694128,7.393179,7.184583,7.615661
13,author_interset_num_rank_a,4.948066,4.680774,5.389117,5.553717,5.218932
4,year_b_std,3.532221,3.057120,3.247804,3.318082,3.121897
49,orgs_sims_max2_bert_rank_a,3.236305,4.152073,5.122202,3.326928,3.131560
11,paper_num_rank_a,2.297650,2.131351,2.037330,2.180784,2.117218
3,year_b_mean,2.252438,1.168923,2.083037,4.591285,1.958823
52,orgs_sims_std2_bert_rank_a,2.173362,2.504588,1.786622,2.619718,2.884649
57,venue_a_in_venue_b_num_rank_b,2.056701,3.294738,1.887856,3.237639,1.778620


In [37]:
test_data['pred'] = preds[:, 1]

In [38]:
test_data.head()

Unnamed: 0,paper_id,author_name,author_org,author_id,pred
0,F3Mha4HG,lei_shi,State Key Laboratory of Catalysis,0gL4hj4n,0.041708
1,F3Mha4HG,lei_shi,State Key Laboratory of Catalysis,0jknXeWe,0.058157
2,F3Mha4HG,lei_shi,State Key Laboratory of Catalysis,1vE3Drg0,0.457224
3,F3Mha4HG,lei_shi,State Key Laboratory of Catalysis,2IVNIGb6,0.456184
4,F3Mha4HG,lei_shi,State Key Laboratory of Catalysis,5QgbLXhl,0.43432


In [39]:
result_dict = gen_dict(test_data, 'pred')

In [40]:
len(result_dict)

1291

In [41]:
import json
import time
localtime = time.localtime(time.time())
save_path = './out/result_%02d%02d%02d%02d.json' % (localtime[1], localtime[2], localtime[3], localtime[4])
with open(save_path, 'w') as file:
    file.write(json.dumps(result_dict))