In [1]:
import torch
from torch import tanh
from torch import nn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
graph_input = '/home/duyongkang/PaperRec/Data/Vector/Metapath2vec.pkl'
text_input = '/home/duyongkang/PaperRec/Data/Vector/Word2vec.pkl'

In [2]:
data_t = pd.read_pickle(text_input)
data_g = pd.read_pickle(graph_input)#header=None,names=['node','vector']

## 合并网络特征向量和文本特征向量

In [3]:
#将data_t和data_g转换成字典，key是节点编号，value是向量
g_dic = data_g[['node','vector']].set_index('node').to_dict()['vector']
t_dic = data_t[['node','vector']].set_index('node').to_dict()['vector']

In [4]:
#遍历文本向量，根据节点名称找到字典中对应的key，然后将文本特征添加上
#因为g_dic中包含会议节点，但训练时不需要会议节点，所以遍历t_dic
output_dic = {}   #key为节点名称，value为特征向量
for key in t_dic:
    try:
        c = np.hstack((g_dic[key], np.array(t_dic[key])))
#         t_dic[key].extend(g_dic[key])
        output_dic[key] = c
    except KeyError:
        continue

In [5]:
#将拼好的向量转换成dataframe类型
df = pd.DataFrame(pd.Series(output_dic), columns=['vector'])
df = df.reset_index().rename(columns = {'index':'node'})
df

Unnamed: 0,node,vector
0,a678283,"[0.024763891, 0.0030672506, 0.00028448, -0.018..."
1,a644976,"[-0.080528535, 0.06352691, 0.022937424, -0.019..."
2,a107969,"[0.0035548138, 0.01666692, -0.0043901615, 0.00..."
3,a16959,"[0.039123327, -0.01776713, -0.01280804, -0.010..."
4,a500399,"[0.008487368, 0.021440862, -0.007883114, -0.00..."
...,...,...
76113,p1968682,"[0.005413892, -0.0063772737, -0.050358698, 0.0..."
76114,p892693,"[0.014541109, -0.0013100328, -0.019380659, -0...."
76115,p1872275,"[-0.018633783, -0.01836606, 0.03491105, -0.010..."
76116,p379274,"[0.05887307, -0.017219385, -0.082777806, -0.06..."


## 提取训练数据

In [6]:
from numpy import multiply
info = pd.read_csv('/home/duyongkang/aminer/aminer.author.paper.link.csv')
ppr_list = info['paper_id'].values.tolist()
auth_list = info['author_id'].values.tolist()

In [7]:
# 作者-论文数据集中找到作者和论文的特征向量，相乘后存在mul列表中
# 因为ppr_list或auth_list中的节点可能不存在于output_dic中，所以要记录留下的节点
ppr = []
auth = []
for i in range(len(ppr_list)):
    p = 'p' + str(ppr_list[i])
    a = 'a' + str(auth_list[i])
    if p in output_dic.keys() and a in output_dic.keys():
        ppr.append(p)
        auth.append(a)
    else:
        continue

In [8]:
# 生成dataframe文件
from pandas.core.frame import DataFrame
data_dic = {"author":auth, "paper":ppr}
dataset = DataFrame(data_dic)
dataset['label'] = 1

In [9]:
# 统计每篇文章的作者，key为论文标号，value为作者列表
pa_dic = {}
for i in range(len(ppr_list)):
    p = 'p' + str(ppr_list[i])
    a = 'a' + str(auth_list[i])
    try:
        pa_dic[p].append(a)
    except KeyError:
        pa_dic[p] = []
        pa_dic[p].append(a)
# 统计每个作者的文章，key为作者编号，value为论文列表
ap_dic = {}
for i in range(len(auth_list)):
    p = 'p' + str(ppr_list[i])
    a = 'a' + str(auth_list[i])
    try:
        ap_dic[a].append(p)
    except KeyError:
        ap_dic[a] = []
        ap_dic[a].append(p)

## 在训练集中添加负样本

In [12]:
df_train=dataset[:int(0.8*len(dataset) )]

In [13]:
import random
mul = []
lenth = len(df_train)
for i in range(lenth):
    a = df_train.author[i]
    while(1):
        # 生成随机论文
        p = dataset.paper[random.randint(0,len(dataset)-1)]
        #判断随机找到的论文是否在该作者的论文列表中
        if p in ap_dic[a] :
            continue
        else:
            break
    #将随机作者和该论文向量点乘
    new = pd.DataFrame({'author':a,'paper':p,'label':0},index=[1])
    df_train = df_train.append(new,ignore_index=True)   # ignore_index=True,表示不按原来的索引，从0开始自动递增

KeyboardInterrupt: 

## 构建测试集

In [10]:
df_test=dataset[int(0.8*len(dataset)):]
df_test = df_test.sort_values(by="author" , ascending=False) #by 指定列 ascending
df_test = df_test.reset_index(drop=True)

In [15]:
list_aa = []
for i in range(len(df_test)):
    if df_test.author[i] not in list_aa:
        list_aa.append(df_test.author[i])

In [26]:
list_aa[:200]

['a9997',
 'a99940',
 'a99935',
 'a998654',
 'a998162',
 'a997953',
 'a997434',
 'a99715',
 'a996982',
 'a996922',
 'a99686',
 'a99670',
 'a996239',
 'a99589',
 'a995775',
 'a995640',
 'a995466',
 'a99514',
 'a99485',
 'a99451',
 'a994275',
 'a994253',
 'a994141',
 'a99413',
 'a994051',
 'a993959',
 'a993710',
 'a99353',
 'a99271',
 'a992684',
 'a992679',
 'a992363',
 'a992349',
 'a992328',
 'a992186',
 'a99214',
 'a990915',
 'a990845',
 'a99080',
 'a990589',
 'a990556',
 'a990261',
 'a99019',
 'a990166',
 'a990139',
 'a98997',
 'a989685',
 'a98967',
 'a98964',
 'a989592',
 'a989577',
 'a989440',
 'a989364',
 'a98927',
 'a989195',
 'a989037',
 'a988950',
 'a98877',
 'a988524',
 'a988520',
 'a988338',
 'a988332',
 'a988299',
 'a988236',
 'a987751',
 'a987681',
 'a987538',
 'a98679',
 'a986592',
 'a986233',
 'a98605',
 'a986038',
 'a985689',
 'a985683',
 'a98566',
 'a985414',
 'a985351',
 'a985194',
 'a98505',
 'a984443',
 'a983991',
 'a98358',
 'a983048',
 'a98262',
 'a98189',
 'a981624

In [67]:
df_test=df_test[0:13]

In [68]:
df_test['source'] = '学者'
#显示所有列
pd.set_option('display.max_rows', 13)
df_test

Unnamed: 0,author,paper,label,source
0,a9997,p1919974,1,学者
1,a99940,p1872287,1,学者
2,a99940,p1872341,1,学者
3,a99935,p1893935,1,学者
4,a99935,p1939226,1,学者
5,a99935,p1744910,1,学者
6,a99935,p1970375,1,学者
7,a99935,p1939019,1,学者
8,a998654,p1918311,1,学者
9,a998654,p1918306,1,学者


In [69]:
count = {}
ap_test = {}
for i in range(len(df_test)):    
    try:
        count[df_test.author[i]]+=1
        ap_test[df_test.author[i]].append(df_test.paper[i])
    except:
        count[df_test.author[i]] = 1
        ap_test[df_test.author[i]] = []
        ap_test[df_test.author[i]].append(df_test.paper[i])

In [70]:
import copy
info = pd.read_csv('/home/duyongkang/aminer/aminer.paper.info.csv')
ppr_list = info['paper_id'].values.tolist()             # 论文列表
text_list = info['text'].values.tolist()                # 文本列表
cit_list = info['citations'].values.tolist()            #引用列表
confer_list = info['conference'].values.tolist()        #会议列表

# 论文的引用
cit_dic = {}      # key为论文编号，value为引用论文编号
for i in range(len(ppr_list)):
    ppr = 'p' + str(ppr_list[i])
    cit_dic[ppr] = []
    lenth = len(cit_list[i])
    if lenth == 1:
        continue
    elif lenth > 1 and lenth < 8:
        cit = 'p' + str(cit_list[i])
        cit_dic[ppr].append(cit)
    elif lenth > 8:
        cit = cit_list[i].split(';')
        cit = ['p'+num for num in cit]
        cit_dic[ppr].extend(cit)
        
def Cit_Paper( paper ):
    return cit_dic[paper]

# 同一个会议的论文
pc_dic = dict(zip(ppr_list,confer_list))              #key为论文，value为会议
confer_dic = {}
for i in range(len(ppr_list)):
    confer = confer_list[i]
    ppr = 'p' + str(ppr_list[i])
    if confer not in confer_dic.keys():
        confer_dic[confer] = []
    confer_dic[confer].append(ppr)
    
def Same_Confer_Paper( paper ):
    confer = pc_dic[int(paper[1:])]
    temp = copy.deepcopy(confer_dic[confer])
    temp.remove(paper)
    return temp

In [71]:
list_test = ['a9997', 'a99940', 'a99935', 'a998654', 'a998162']

In [72]:
i = 0
# ap_test.keys()
for author in list_test:
    i += 1
    full = False
    for paper in ap_test[author]:
        if full:
            break
        # 找该论文引用的论文，加到负样本中
        for cit_p in Cit_Paper(paper):
            # 如果cit_p已在列表中或cit_p没有对应的特征向量， 就跳到下一个
            if cit_p in ap_test[author] or cit_p not in output_dic.keys():
                continue
            try:
                new = pd.DataFrame({'author':author,'paper':cit_p,'label':0,'source':"引用"},index=[1])
                df_test = df_test.append(new,ignore_index=True)
                count[author]+=1
                if(count[author] == 50):
                    full = True
                    break
            except:
                continue
        # 如果样本不够，再找同一个会议的论文，加入样本中
        if not full:
            for same_confer_p in Same_Confer_Paper(paper):
                if same_confer_p in ap_test[author] or same_confer_p not in output_dic.keys():
                    continue
                try:
                    new = pd.DataFrame({'author':author,'paper':same_confer_p,'label':0,'source':"会议"},index=[1])
                    df_test = df_test.append(new,ignore_index=True)
                    count[author]+=1
                    if(count[author] == 50):
                        full = True
                        break
                except:
                    continue
    if i % 100 == 0:
        print(i)

In [73]:
pd.set_option('display.max_rows', 250)
df_test = df_test.sort_values(by="author" , ascending=False) #by 指定列 ascending
df_test = df_test.reset_index(drop=True)
df_test

Unnamed: 0,author,paper,label,source
0,a9997,p1919974,1,学者
1,a9997,p2006220,0,会议
2,a9997,p1919899,0,会议
3,a9997,p2008187,0,会议
4,a9997,p1918387,0,会议
5,a9997,p2006071,0,会议
6,a9997,p1919829,0,会议
7,a9997,p1918384,0,会议
8,a9997,p1292764,0,会议
9,a9997,p2006030,0,会议


In [78]:
list_source = df_test['source'].values.tolist()
list_source[200:250]

['会议',
 '会议',
 '会议',
 '会议',
 '会议',
 '会议',
 '会议',
 '会议',
 '会议',
 '会议',
 '会议',
 '会议',
 '会议',
 '会议',
 '会议',
 '会议',
 '会议',
 '会议',
 '会议',
 '会议',
 '会议',
 '会议',
 '会议',
 '会议',
 '引用',
 '会议',
 '引用',
 '学者',
 '学者',
 '学者',
 '引用',
 '引用',
 '引用',
 '引用',
 '引用',
 '引用',
 '引用',
 '引用',
 '会议',
 '引用',
 '引用',
 '引用',
 '引用',
 '引用',
 '引用',
 '引用',
 '会议',
 '会议',
 '会议',
 '会议']

In [29]:
df_test.drop(['vector'],axis=1)

KeyError: "['vector'] not found in axis"

In [28]:
import random
i = 0
for a in count:
    while(count[a] < 50):
        temp = []
        while(1):
            p = dataset.paper[random.randint(0,len(dataset)-1)]
            if p in ap_dic[a] and p in temp:
                continue
            else:
                temp.append(p)
                break
        new = pd.DataFrame({'author':a,'paper':p,'label':0},index=[1])
        df_test = df_test.append(new,ignore_index=True)
        count[a]+=1
    i += 1
    if(i%1000==0):
        print(i)

1000
2000
3000
4000
5000
6000
7000
8000
9000
10000


In [31]:
a = df_test.author.to_list()
p = df_test.paper.to_list()
mul = []
for i in range(len(df_test)):
    a = df_test.author[i]
    p = df_test.paper[i]
    v = multiply(output_dic[a],output_dic[p])
    mul.append(v)

df_test['vector'] = mul

In [33]:
df_test = df_test.sort_values(by="author" , ascending=False) #by 指定列 ascending
df_test = df_test.reset_index(drop=True)

## 计算指标

In [28]:
df_train = df_train.sort_index(ascending=True)
df_train.to_pickle('/home/duyongkang/PaperRec/Data/M2v+W2v_train.pkl')
df_test.to_pickle('/home/duyongkang/PaperRec/Data/M2v+W2v_test.pkl')# df_train.sort_values(by="author" , ascending=False)

In [41]:
df_test.to_pickle('/home/duyongkang/PaperRec/Data/Test_2feature.pkl')