In [298]:

import numpy as np
import scipy.sparse
import scipy.io
import pandas as pd
import pickle
from sklearn.feature_extraction.text import CountVectorizer
import networkx as nx
import utils1.preprocess
from sklearn.model_selection import train_test_split

from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS as sklearn_stopwords

from nltk import word_tokenize
from nltk.corpus import stopwords as nltk_stopwords
from nltk.stem import WordNetLemmatizer

from utils1.data import load_glove_vectors
import pathlib

In [299]:
save_prefix = 'data/preprocessed/DBLP_processed/'
num_ntypes = 4

In [300]:
author_label = pd.read_csv('data/raw/DBLP/author_label.txt', sep='\t', header=None, names=['author_id', 'label', 'author_name'], keep_default_na=False, encoding='utf-8')
paper_author = pd.read_csv('data/raw/DBLP/paper_author.txt', sep='\t', header=None, names=['paper_id', 'author_id'], keep_default_na=False, encoding='utf-8')
paper_conf = pd.read_csv('data/raw/DBLP/paper_conf.txt', sep='\t', header=None, names=['paper_id', 'conf_id'], keep_default_na=False, encoding='utf-8')
paper_term = pd.read_csv('data/raw/DBLP/paper_term.txt', sep='\t', header=None, names=['paper_id', 'term_id'], keep_default_na=False, encoding='utf-8')
papers = pd.read_csv('data/raw/DBLP/paper.txt', sep='\t', header=None, names=['paper_id', 'paper_title'], keep_default_na=False, encoding='cp1252')
terms = pd.read_csv('data/raw/DBLP/term.txt', sep='\t', header=None, names=['term_id', 'term'], keep_default_na=False, encoding='utf-8')
confs = pd.read_csv('data/raw/DBLP/conf.txt', sep='\t', header=None, names=['conf_id', 'conf'], keep_default_na=False, encoding='utf-8')

glove_dim = 50
glove_vectors = load_glove_vectors(dim=glove_dim)
# 打印前几行
print("First few GloVe vectors:")
for i, (word, vector) in enumerate(glove_vectors.items()):
    if i < 5:  # 打印前5个向量
        print(f"Word: {word}\nVector: {vector}\n")
    else:
        break

Loading GloVe pretrained word vectors
Done. 400000 words loaded!
First few GloVe vectors:
Word: the
Vector: [ 4.1800e-01  2.4968e-01 -4.1242e-01  1.2170e-01  3.4527e-01 -4.4457e-02
 -4.9688e-01 -1.7862e-01 -6.6023e-04 -6.5660e-01  2.7843e-01 -1.4767e-01
 -5.5677e-01  1.4658e-01 -9.5095e-03  1.1658e-02  1.0204e-01 -1.2792e-01
 -8.4430e-01 -1.2181e-01 -1.6801e-02 -3.3279e-01 -1.5520e-01 -2.3131e-01
 -1.9181e-01 -1.8823e+00 -7.6746e-01  9.9051e-02 -4.2125e-01 -1.9526e-01
  4.0071e+00 -1.8594e-01 -5.2287e-01 -3.1681e-01  5.9213e-04  7.4449e-03
  1.7778e-01 -1.5897e-01  1.2041e-02 -5.4223e-02 -2.9871e-01 -1.5749e-01
 -3.4758e-01 -4.5637e-02 -4.4251e-01  1.8785e-01  2.7849e-03 -1.8411e-01
 -1.1514e-01 -7.8581e-01]

Word: ,
Vector: [ 0.013441  0.23682  -0.16899   0.40951   0.63812   0.47709  -0.42852
 -0.55641  -0.364    -0.23938   0.13001  -0.063734 -0.39575  -0.48162
  0.23291   0.090201 -0.13324   0.078639 -0.41634  -0.15428   0.10068
  0.48891   0.31226  -0.1252   -0.037512 -1.5179    0.1

In [301]:
# filter out all nodes which does not associated with labeled authors
labeled_authors = author_label['author_id'].to_list()
paper_author = paper_author[paper_author['author_id'].isin(labeled_authors)].reset_index(drop=True)
valid_papers = paper_author['paper_id'].unique()
papers = papers[papers['paper_id'].isin(valid_papers)].reset_index(drop=True)
paper_conf = paper_conf[paper_conf['paper_id'].isin(valid_papers)].reset_index(drop=True)
paper_term = paper_term[paper_term['paper_id'].isin(valid_papers)].reset_index(drop=True)
valid_terms = paper_term['term_id'].unique()
terms = terms[terms['term_id'].isin(valid_terms)].reset_index(drop=True)

In [302]:
# term lemmatization and grouping
lemmatizer = WordNetLemmatizer()
lemma_id_mapping = {}
lemma_list = []
lemma_id_list = []
i = 0
for _, row in terms.iterrows():
    i += 1
    lemma = lemmatizer.lemmatize(row['term'])
    lemma_list.append(lemma)
    if lemma not in lemma_id_mapping:
        lemma_id_mapping[lemma] = row['term_id']
    lemma_id_list.append(lemma_id_mapping[lemma])
terms['lemma'] = lemma_list
terms['lemma_id'] = lemma_id_list

term_lemma_mapping = {row['term_id']: row['lemma_id'] for _, row in terms.iterrows()}
lemma_id_list = []
for _, row in paper_term.iterrows():
    lemma_id_list.append(term_lemma_mapping[row['term_id']])
paper_term['lemma_id'] = lemma_id_list

paper_term = paper_term[['paper_id', 'lemma_id']]
paper_term.columns = ['paper_id', 'term_id']
paper_term = paper_term.drop_duplicates()
terms = terms[['lemma_id', 'lemma']]
terms.columns = ['term_id', 'term']
terms = terms.drop_duplicates()

In [303]:
# filter out stopwords from terms
stopwords = sklearn_stopwords.union(set(nltk_stopwords.words('english')))
stopword_id_list = terms[terms['term'].isin(stopwords)]['term_id'].to_list()
paper_term = paper_term[~(paper_term['term_id'].isin(stopword_id_list))].reset_index(drop=True)
terms = terms[~(terms['term'].isin(stopwords))].reset_index(drop=True)



In [304]:
# remove terms not found in GloVe
#terms_not_in_glove = []
#for _, row in terms.iterrows():
#    if row['term'] not in glove_vectors:
#        terms_not_in_glove.append(row['term'])
#term_ids_not_in_glove = terms[terms['term'].isin(terms_not_in_glove)]['term_id'].to_list()
#terms = terms[~(terms['term'].isin(terms_not_in_glove))].reset_index(drop=True)
#paper_term = paper_term[~(paper_term['term_id'].isin(term_ids_not_in_glove))].reset_index(drop=True)

In [305]:
# consider only terms associated with at least two papers
# if having meaningful word vectors for terms (e.g. GloVe)?

In [306]:
author_label = author_label.sort_values('author_id').reset_index(drop=True)
papers = papers.sort_values('paper_id').reset_index(drop=True)
terms = terms.sort_values('term_id').reset_index(drop=True)
confs = confs.sort_values('conf_id').reset_index(drop=True)

In [307]:
# extract labels of authors
labels = author_label['label'].to_numpy()

In [308]:
# 计算维度
dim = len(author_label) + len(papers) + len(terms) + len(confs)
print(f"dim: {dim}")

# 创建 type_mask
type_mask = np.zeros((dim), dtype=int)
type_mask[len(author_label):len(author_label)+len(papers)] = 1
type_mask[len(author_label)+len(papers):len(author_label)+len(papers)+len(terms)] = 2
type_mask[len(author_label)+len(papers)+len(terms):] = 3
print(f"type_mask shape: {type_mask.shape}")
print(f"type_mask sample: {type_mask[:10]}")

# 统计不同类型的数量
unique_types, counts = np.unique(type_mask, return_counts=True)
print(f"Unique types in type_mask: {unique_types}")
print(f"Counts of each type: {counts}")
# 创建 ID 映射
author_id_mapping = {row['author_id']: i for i, row in author_label.iterrows()}
paper_id_mapping = {row['paper_id']: i + len(author_label) for i, row in papers.iterrows()}
term_id_mapping = {row['term_id']: i + len(author_label) + len(papers) for i, row in terms.iterrows()}
conf_id_mapping = {row['conf_id']: i + len(author_label) + len(papers) + len(terms) for i, row in confs.iterrows()}

# 打印 ID 映射的样本
print(f"author_id_mapping sample: {list(author_id_mapping.items())[:]}")
print(f"paper_id_mapping sample: {list(paper_id_mapping.items())[:]}")
print(f"term_id_mapping sample: {list(term_id_mapping.items())[:]}")
print(f"conf_id_mapping sample: {list(conf_id_mapping.items())[:]}")

adjM = np.zeros((dim, dim), dtype=int)
for _, row in paper_author.iterrows():
    idx1 = paper_id_mapping[row['paper_id']]
    idx2 = author_id_mapping[row['author_id']]
    adjM[idx1, idx2] = 1
    adjM[idx2, idx1] = 1
for _, row in paper_term.iterrows():
    idx1 = paper_id_mapping[row['paper_id']]
    idx2 = term_id_mapping[row['term_id']]
    adjM[idx1, idx2] = 1
    adjM[idx2, idx1] = 1
for _, row in paper_conf.iterrows():
    idx1 = paper_id_mapping[row['paper_id']]
    idx2 = conf_id_mapping[row['conf_id']]
    adjM[idx1, idx2] = 1
    adjM[idx2, idx1] = 1
    

dim: 2641
type_mask shape: (2641,)
type_mask sample: [0 0 0 0 0 0 0 0 0 0]
Unique types in type_mask: [0 1 2 3]
Counts of each type: [ 696  684 1257    4]
author_id_mapping sample: [(435, 0), (448, 1), (798, 2), (1245, 3), (2290, 4), (2588, 5), (3460, 6), (3759, 7), (4285, 8), (4319, 9), (4469, 10), (4472, 11), (4473, 12), (4516, 13), (4595, 14), (4642, 15), (4681, 16), (4802, 17), (4830, 18), (4840, 19), (4842, 20), (4927, 21), (7046, 22), (10138, 23), (12729, 24), (13103, 25), (15157, 26), (15206, 27), (15290, 28), (15350, 29), (15359, 30), (15385, 31), (15440, 32), (15441, 33), (15481, 34), (15493, 35), (15575, 36), (15592, 37), (15601, 38), (15610, 39), (15645, 40), (15704, 41), (15706, 42), (15710, 43), (15713, 44), (15787, 45), (15791, 46), (15816, 47), (15829, 48), (15876, 49), (15904, 50), (15973, 51), (15993, 52), (16048, 53), (16122, 54), (16142, 55), (16162, 56), (16224, 57), (16269, 58), (16284, 59), (16322, 60), (16338, 61), (16347, 62), (16399, 63), (16431, 64), (16453, 6

In [309]:
# use HAN paper's preprocessed data as the features of authors (https://github.com/Jhy1993/HAN)

mat = scipy.io.loadmat('data/raw/DBLP/DBLP4057_GAT_with_idx.mat')

features_author = np.array(list(zip(*sorted(zip(labeled_authors, mat['features']), key=lambda tup: tup[0])))[1])

features_author = scipy.sparse.csr_matrix(features_author)
print(features_author)

  (0, 55)	1
  (0, 74)	1
  (0, 135)	1
  (0, 242)	1
  (0, 296)	1
  (0, 316)	1
  (1, 23)	1
  (1, 29)	1
  (1, 98)	1
  (1, 149)	1
  (1, 215)	1
  (1, 296)	1
  (1, 302)	1
  (2, 13)	1
  (2, 39)	1
  (2, 50)	1
  (2, 190)	1
  (2, 193)	1
  (2, 299)	1
  (3, 11)	1
  (3, 13)	1
  (3, 46)	1
  (3, 54)	1
  (3, 55)	1
  (3, 63)	1
  :	:
  (692, 124)	1
  (692, 220)	1
  (692, 222)	1
  (692, 253)	1
  (692, 299)	1
  (692, 321)	1
  (693, 113)	1
  (693, 176)	1
  (693, 321)	1
  (694, 10)	1
  (694, 54)	1
  (694, 116)	1
  (694, 152)	1
  (694, 212)	1
  (694, 213)	1
  (694, 273)	1
  (694, 315)	1
  (695, 55)	1
  (695, 126)	1
  (695, 148)	1
  (695, 149)	1
  (695, 200)	1
  (695, 217)	1
  (695, 237)	1
  (695, 295)	1


In [310]:
# 获取和合并停用词
stopwords = list(sklearn_stopwords.union(set(nltk_stopwords.words('english'))))

# 过滤terms中的停用词
stopword_id_list = terms[terms['term'].isin(stopwords)]['term_id'].to_list()
paper_term = paper_term[~paper_term['term_id'].isin(stopword_id_list)].reset_index(drop=True)
terms = terms[~terms['term'].isin(stopwords)].reset_index(drop=True)

# use bag-of-words representation of paper titles as the features of papers
class LemmaTokenizer:
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, doc):
        return [self.wnl.lemmatize(t) for t in word_tokenize(doc)]
vectorizer = CountVectorizer(min_df=2, stop_words=stopwords, tokenizer=LemmaTokenizer())
features_paper = vectorizer.fit_transform(papers['paper_title'].values)



In [311]:
# use pretrained GloVe vectors as the features of terms
features_term = np.zeros((len(terms), glove_dim))
for i, row in terms.iterrows():
    features_term[i] = glove_vectors.get(row['term'], glove_vectors['the'])

In [320]:
expected_metapaths = [
    [(0, 1, 0), (0, 1, 2, 1, 0), (0, 1, 3, 1, 0)],
    [(1, 0, 1), (1, 2, 1), (1, 3, 1)],
    [(2, 1, 2), (2, 1, 0, 1, 2), (2, 1, 3, 1, 2)],
    [(3, 1, 3), (3, 1, 0, 1, 3), (3, 1, 2, 1, 3)]
]
# create the directories if they do not exist
for i in range(1):
    pathlib.Path(save_prefix + '{}'.format(i)).mkdir(parents=True, exist_ok=True)
for i in range(1):
    # get metapath based neighbor pairs
    neighbor_pairs = utils1.preprocess.get_metapath_neighbor_pairs(adjM, type_mask, expected_metapaths[i])
    # construct and save metapath-based networks
    G_list = utils1.preprocess.get_networkx_graph(neighbor_pairs, type_mask, i)
    
    # save data
    # networkx graph (metapath specific)
    for G, metapath in zip(G_list, expected_metapaths[i]):
        nx.write_adjlist(G, save_prefix + '{}/'.format(i) + '-'.join(map(str, metapath)) + '.adjlist')
    # node indices of edge metapaths
    all_edge_metapath_idx_array = utils1.preprocess.get_edge_metapath_idx_array(neighbor_pairs)
    for metapath, edge_metapath_idx_array in zip(expected_metapaths[i], all_edge_metapath_idx_array):
        np.save(save_prefix + '{}/'.format(i) + '-'.join(map(str, metapath)) + '_idx.npy', edge_metapath_idx_array)
# save data
# all nodes adjacency matrix
scipy.sparse.save_npz(save_prefix + 'adjM.npz', scipy.sparse.csr_matrix(adjM))
# all nodes (authors, papers, terms and conferences) features
# currently only have features of authors, papers and terms
scipy.sparse.save_npz(save_prefix + 'features_{}.npz'.format(0), features_author)
scipy.sparse.save_npz(save_prefix + 'features_{}.npz'.format(1), features_paper)
np.save(save_prefix + 'features_{}.npy'.format(2), features_term)
# all nodes (authors, papers, terms and conferences) type labels
np.save(save_prefix + 'node_types.npy', type_mask)
# author labels
np.save(save_prefix + 'labels.npy', labels)
# author train/validation/test splits
rand_seed = 1566911444
# train_idx, val_idx = train_test_split(np.arange(len(labels)), test_size=0.2, random_state=rand_seed)
# train_idx, test_idx = train_test_split(train_idx, test_size=0.2, random_state=rand_seed)

# train_idx, temp_idx = train_test_split(np.arange(len(labels)), test_size=440, stratify=labels, random_state=rand_seed)
# val_idx, test_idx = train_test_split(temp_idx, test_size=400, stratify=labels[temp_idx], random_state=rand_seed)

train_idx, val_idx = train_test_split(np.arange(len(labels)), test_size=290, random_state=rand_seed)
train_idx, test_idx = train_test_split(train_idx, test_size=96, random_state=rand_seed)

# type_mask = np.array([0] * 590 + [1] * 11 + [2] * 55 + [3] * 10)
# labels = np.random.randint(0, 2, size=len(type_mask))  # 确保 labels 的长度与 type_mask 一致
# 
# train_idx, temp_idx = train_test_split(np.arange(len(labels)), test_size=0.2, stratify=type_mask, random_state=rand_seed)
# val_idx, test_idx = train_test_split(temp_idx, test_size=0.5, stratify=type_mask[temp_idx], random_state=rand_seed)

train_idx.sort()
val_idx.sort()
test_idx.sort()
np.savez(save_prefix + 'train_val_test_idx.npz',
         val_idx=val_idx,
         train_idx=train_idx,
         test_idx=test_idx)

In [321]:
# post-processing for mini-batched training
target_idx_list = np.arange(1000)
for metapath in [(0, 1, 0), (0, 1, 2, 1, 0), (0, 1, 3, 1, 0)]:
    edge_metapath_idx_array = np.load(save_prefix + '{}/'.format(0) + '-'.join(map(str, metapath)) + '_idx.npy')
    target_metapaths_mapping = {}
    for target_idx in target_idx_list:
        target_metapaths_mapping[target_idx] = edge_metapath_idx_array[edge_metapath_idx_array[:, 0] == target_idx][:, ::-1]
    out_file = open(save_prefix + '{}/'.format(0) + '-'.join(map(str, metapath)) + '_idx.pickle', 'wb')
    pickle.dump(target_metapaths_mapping, out_file)
    out_file.close()