In [1]:
import pathlib

import numpy as np
import scipy.sparse
import scipy.io
import pandas as pd
import pickle
from sklearn.feature_extraction.text import CountVectorizer
import networkx as nx
import utils.preprocess
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS as sklearn_stopwords
from nltk import word_tokenize
from nltk.corpus import stopwords as nltk_stopwords
from nltk.stem import WordNetLemmatizer

from utils.data import load_glove_vectors



In [2]:
# Path prefix under which the preprocessed artifacts of this notebook are written.
save_prefix = 'data/preprocessed/ACM_processed/'
# NOTE(review): the type mask built later in this notebook only assigns the three
# codes 0, 1 and 2, so the value 4 looks like a leftover from another dataset — TODO confirm.
num_ntypes = 4

In [3]:
def _read_raw_table(path, columns, encoding='utf-8'):
    """Read one header-less, tab-separated raw table into a DataFrame.

    ``keep_default_na=False`` is passed to ``pd.read_csv`` for every table.
    """
    return pd.read_csv(path, sep='\t', header=None, names=columns,
                       keep_default_na=False, encoding=encoding)


# labeled papers
paper_label = _read_raw_table('data/raw/ACM/paper_label.txt', ['paper_id', 'label', 'paper_abstract'])

# relation tables
paper_author = _read_raw_table('data/raw/ACM/paper_author.txt', ['paper_id', 'author_id'])
paper_paper = _read_raw_table('data/raw/ACM/paper_paper.txt', ['paper_id_1', 'paper_id_2'])
paper_subject = _read_raw_table('data/raw/ACM/paper_subject.txt', ['paper_id', 'subject_id'])
# node tables; paper.txt is the only file decoded as cp1252 instead of utf-8
papers = _read_raw_table('data/raw/ACM/paper.txt', ['paper_id', 'paper_abstract'], encoding='cp1252')
authors = _read_raw_table('data/raw/ACM/author.txt', ['author_id', 'author_name'])
subjects = _read_raw_table('data/raw/ACM/subject.txt', ['subject_id', 'subject'])

In [4]:
# Drop every node and edge that is not associated with a labeled paper.
# The paper-author table is restricted first; the papers and authors that survive
# there define what is kept in all other tables.
labeled_papers = paper_label['paper_id'].to_list()
paper_author = paper_author.loc[paper_author['paper_id'].isin(labeled_papers)].reset_index(drop=True)
valid_papers = paper_author['paper_id'].unique()
valid_authors = paper_author['author_id'].unique()
authors = authors.loc[authors['author_id'].isin(valid_authors)].reset_index(drop=True)

papers = papers.loc[papers['paper_id'].isin(valid_papers)].reset_index(drop=True)
paper_label = paper_label.loc[paper_label['paper_id'].isin(valid_papers)].reset_index(drop=True)
# a paper-paper edge is kept only when both of its endpoints are valid papers
both_ends_valid = paper_paper['paper_id_1'].isin(valid_papers) & paper_paper['paper_id_2'].isin(valid_papers)
paper_paper = paper_paper.loc[both_ends_valid].reset_index(drop=True)
paper_subject = paper_subject.loc[paper_subject['paper_id'].isin(valid_papers)].reset_index(drop=True)
valid_subjects = paper_subject['subject_id'].unique()
subjects = subjects.loc[subjects['subject_id'].isin(valid_subjects)].reset_index(drop=True)

# report the size of every table after filtering
for table in (paper_label, papers, subjects, authors, paper_paper, paper_subject, paper_author):
    print(table.shape)

(4019, 3)
(4019, 2)
(60, 2)
(7167, 2)
(9615, 2)
(4019, 2)
(13407, 2)


In [5]:
# Sort every node table by its id and renumber the rows, so that a row's position
# can serve as a reproducible node index.
paper_label = paper_label.sort_values('paper_id').reset_index(drop=True)
# `papers` is sorted the same way as `paper_label`: anything computed row-by-row from
# `papers` (e.g. text features) must follow the same paper order as the labels.
papers = papers.sort_values('paper_id').reset_index(drop=True)
authors = authors.sort_values('author_id').reset_index(drop=True)
subjects = subjects.sort_values('subject_id').reset_index(drop=True)

In [6]:
# extract the label of every paper, in the current row order of paper_label
labels = paper_label['label'].to_numpy()

In [7]:
# build the adjacency matrix for the graph consisting of papers, authors and subjects
# node type codes: 0 for papers, 1 for authors, 2 for subjects
# global node indices are laid out as [papers | authors | subjects]
num_papers, num_authors, num_subjects = len(paper_label), len(authors), len(subjects)
dim = num_papers + num_authors + num_subjects
type_mask = np.zeros((dim), dtype=int)
type_mask[num_papers:num_papers + num_authors] = 1
type_mask[num_papers + num_authors:] = 2

# raw id -> global node index (row position in its table plus the offset of its node type)
paper_id_mapping = {pid: i for i, pid in enumerate(paper_label['paper_id'])}
author_id_mapping = {aid: i + num_papers for i, aid in enumerate(authors['author_id'])}
subject_id_mapping = {sid: i + num_papers + num_authors for i, sid in enumerate(subjects['subject_id'])}

# (edge table, source column, source mapping, destination column, destination mapping)
edge_sources = (
    (paper_paper, 'paper_id_1', paper_id_mapping, 'paper_id_2', paper_id_mapping),
    (paper_author, 'paper_id', paper_id_mapping, 'author_id', author_id_mapping),
    (paper_subject, 'paper_id', paper_id_mapping, 'subject_id', subject_id_mapping),
)

adjM = np.zeros((dim, dim), dtype=int)
for frame, src_col, src_map, dst_col, dst_map in edge_sources:
    # dictionary lookups raise KeyError on an id that has no node index
    src = np.array([src_map[raw_id] for raw_id in frame[src_col]], dtype=int)
    dst = np.array([dst_map[raw_id] for raw_id in frame[dst_col]], dtype=int)
    # every edge is stored in both directions (undirected graph)
    adjM[src, dst] = 1
    adjM[dst, src] = 1

adjM_sparse = scipy.sparse.csr_matrix(adjM)
print(adjM_sparse.shape)
print(adjM_sparse.getnnz())

# this is the first write below save_prefix, so make sure the directory exists
pathlib.Path(save_prefix).mkdir(parents=True, exist_ok=True)
scipy.sparse.save_npz(save_prefix + 'adjM.npz', adjM_sparse)

(11246, 11246)
53991


In [8]:
# typed adjacency matrix: built from the same three edge tables as adjM, but every
# entry stores an edge-type code instead of 1
#   1: paper - paper (written in both directions)
#   2: paper -> author     3: author -> paper
#   4: paper -> subject    5: subject -> paper
# (edge table, source column, source mapping, destination column, destination mapping,
#  code for source->destination, code for destination->source)
typed_edge_sources = (
    (paper_paper, 'paper_id_1', paper_id_mapping, 'paper_id_2', paper_id_mapping, 1, 1),
    (paper_author, 'paper_id', paper_id_mapping, 'author_id', author_id_mapping, 2, 3),
    (paper_subject, 'paper_id', paper_id_mapping, 'subject_id', subject_id_mapping, 4, 5),
)

adjMM = np.zeros((dim, dim), dtype=int)
for frame, src_col, src_map, dst_col, dst_map, forward_code, backward_code in typed_edge_sources:
    # dictionary lookups raise KeyError on an id that has no node index
    src = np.array([src_map[raw_id] for raw_id in frame[src_col]], dtype=int)
    dst = np.array([dst_map[raw_id] for raw_id in frame[dst_col]], dtype=int)
    adjMM[src, dst] = forward_code
    adjMM[dst, src] = backward_code

adjMM_sparse = scipy.sparse.csr_matrix(adjMM)
print(adjMM_sparse.shape)
print(adjMM_sparse.getnnz())
# make sure the output directory exists before writing into it
pathlib.Path(save_prefix).mkdir(parents=True, exist_ok=True)
scipy.sparse.save_npz(save_prefix + 'adjMM.npz', adjMM_sparse)

(11246, 11246)
53991


In [11]:
# Add self-loops to the typed adjacency matrix and save two variants.
# adjMM is modified in place; existing diagonal entries are overwritten.
num_etype = adjMM.max()
diag = np.arange(adjMM.shape[0])

# variant 1: a single self-loop edge type (num_etype + 1) shared by all nodes
adjMM[diag, diag] = num_etype + 1
adjMM_wsl = scipy.sparse.csr_matrix(adjMM)
print(adjMM_wsl.getnnz())
scipy.sparse.save_npz(save_prefix + 'adjMM_wsl.npz', adjMM_wsl)

# variant 2: the self-loop edge type additionally depends on the node's type code
adjMM[diag, diag] = num_etype + 1 + type_mask
adjMM_wsl_2 = scipy.sparse.csr_matrix(adjMM)
print(adjMM_wsl_2.getnnz())
scipy.sparse.save_npz(save_prefix + 'adjMM_wsl_2.npz', adjMM_wsl_2)

65226
65226


In [12]:
# use HAN paper's preprocessed data as the features of authors (https://github.com/Jhy1993/HAN)
# NOTE(review): this cell loads a file from data/raw/DBLP/ and reads `labeled_authors`,
# a name that no visible cell of this notebook defines. It looks like a leftover from
# another dataset's preprocessing and presumably fails on a fresh kernel — TODO confirm or remove.
mat = scipy.io.loadmat('data/raw/DBLP/DBLP4057_GAT_with_idx.mat')
# pair every entry of `labeled_authors` with a row of mat['features'], sort the pairs by
# the first element, and keep the feature rows in that sorted order
features_author = np.array(list(zip(*sorted(zip(labeled_authors, mat['features']), key=lambda tup: tup[0])))[1])
features_author = scipy.sparse.csr_matrix(features_author)

In [13]:
# use bag-of-words representation of paper abstracts as the features of papers
class LemmaTokenizer:
    """Callable tokenizer for CountVectorizer: NLTK word tokenization followed by WordNet lemmatization."""
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, doc):
        """Return the list of lemmatized tokens of the string ``doc``."""
        return [self.wnl.lemmatize(t) for t in word_tokenize(doc)]

# `stopwords` was used below without being defined anywhere; build it from the two
# imported stop-word sources. A list is used because CountVectorizer documents
# `stop_words` as 'english', a list, or None.
stopwords = sorted(sklearn_stopwords.union(nltk_stopwords.words('english')))
# `papers` is read with the columns paper_id / paper_abstract (there is no 'paper_title').
# Re-order the abstracts to the row order of paper_label, which is the order the paper
# node indices are derived from, so that feature row i belongs to paper node i.
paper_abstracts = papers.set_index('paper_id')['paper_abstract'].reindex(paper_label['paper_id'])
assert paper_abstracts.notna().all(), 'some labeled papers have no row in `papers`'
vectorizer = CountVectorizer(min_df=2, stop_words=stopwords, tokenizer=LemmaTokenizer())
features_paper = vectorizer.fit_transform(paper_abstracts.values)

  'stop_words.' % sorted(inconsistent))


In [14]:
# use pretrained GloVe vectors as the features of terms
# NOTE(review): `terms`, `glove_dim` and `glove_vectors` are not defined in any visible
# cell of this notebook, and no term table is loaded from the raw data — this cell
# looks like a leftover from another dataset's preprocessing; TODO confirm or remove.
features_term = np.zeros((len(terms), glove_dim))
for i, row in terms.iterrows():
    # presumably `glove_vectors` is dict-like: a term without an entry gets the vector of 'the'
    features_term[i] = glove_vectors.get(row['term'], glove_vectors['the'])

In [None]:
# Metapaths to extract, grouped by the type code of the node they start and end at
# (expected_metapaths[k] lists the metapaths for type code k).
# NOTE(review): these tuples use the type codes 0-3, while the type mask built earlier in
# this notebook only contains the codes 0, 1 and 2 — presumably carried over from a
# four-type dataset; TODO confirm the metapaths intended for this graph.
expected_metapaths = [
    [(0, 1, 0), (0, 1, 2, 1, 0), (0, 1, 3, 1, 0)],
    [(1, 0, 1), (1, 2, 1), (1, 3, 1)],
    [(2, 1, 2), (2, 1, 0, 1, 2), (2, 1, 3, 1, 2)],
    [(3, 1, 3), (3, 1, 0, 1, 3), (3, 1, 2, 1, 3)]
]
# create the directories if they do not exist
for i in range(1):
    pathlib.Path(save_prefix + '{}'.format(i)).mkdir(parents=True, exist_ok=True)
# range(1): only the metapaths in expected_metapaths[0] are processed
for i in range(1):
    # get metapath based neighbor pairs
    neighbor_pairs = utils.preprocess.get_metapath_neighbor_pairs(adjM, type_mask, expected_metapaths[i])
    # construct and save metapath-based networks
    G_list = utils.preprocess.get_networkx_graph(neighbor_pairs, type_mask, i)
    
    # save data
    # networkx graph (metapath specific), one '<metapath>.adjlist' file per metapath
    for G, metapath in zip(G_list, expected_metapaths[i]):
        nx.write_adjlist(G, save_prefix + '{}/'.format(i) + '-'.join(map(str, metapath)) + '.adjlist')
    # node indices of edge metapaths, one '<metapath>_idx.npy' file per metapath
    all_edge_metapath_idx_array = utils.preprocess.get_edge_metapath_idx_array(neighbor_pairs)
    for metapath, edge_metapath_idx_array in zip(expected_metapaths[i], all_edge_metapath_idx_array):
        np.save(save_prefix + '{}/'.format(i) + '-'.join(map(str, metapath)) + '_idx.npy', edge_metapath_idx_array)
# save data
# all nodes adjacency matrix
# (writes the same 'adjM.npz' file that is already saved right after adjM is built)
scipy.sparse.save_npz(save_prefix + 'adjM.npz', scipy.sparse.csr_matrix(adjM))
# node features, one file per feature set
# NOTE(review): features_author and features_term come from cells that reference names
# not defined in this notebook, and the file numbers 0/1/2 presumably do not match the
# type codes used by type_mask — TODO confirm which feature files this dataset needs.
scipy.sparse.save_npz(save_prefix + 'features_{}.npz'.format(0), features_author)
scipy.sparse.save_npz(save_prefix + 'features_{}.npz'.format(1), features_paper)
np.save(save_prefix + 'features_{}.npy'.format(2), features_term)
# type code of every node
np.save(save_prefix + 'node_types.npy', type_mask)
# labels (taken from the 'label' column of paper_label)
np.save(save_prefix + 'labels.npy', labels)
# train/validation/test splits over the label indices:
# first hold out 400 indices for validation, then 3257 of the remainder for testing;
# whatever is left is the training set
# NOTE(review): the sizes 400 / 3257 are hard-coded — TODO confirm they are intended
# for the number of labels in this dataset.
rand_seed = 1566911444
train_idx, val_idx = train_test_split(np.arange(len(labels)), test_size=400, random_state=rand_seed)
train_idx, test_idx = train_test_split(train_idx, test_size=3257, random_state=rand_seed)
train_idx.sort()
val_idx.sort()
test_idx.sort()
np.savez(save_prefix + 'train_val_test_idx.npz',
         val_idx=val_idx,
         train_idx=train_idx,
         test_idx=test_idx)

(32789, 3)
(41633537, 5)
(30803571, 5)


In [17]:
# post-processing for mini-batched training
# For every target node, collect the rows of the saved '<metapath>_idx.npy' array whose
# first column equals that node (each row is stored with its columns reversed) and pickle
# the resulting {target index: rows} dict next to the .npy file.
# Target nodes are the nodes with type code 0; they are derived from type_mask instead of
# a hard-coded count (the previous 4057 did not match the number of type-0 nodes here).
target_idx_list = np.flatnonzero(type_mask == 0)
# NOTE(review): this metapath list includes the type code 3, which type_mask never
# assigns — TODO confirm the metapaths intended for this graph.
for metapath in [(0, 1, 0), (0, 1, 2, 1, 0), (0, 1, 3, 1, 0)]:
    metapath_name = '-'.join(map(str, metapath))
    edge_metapath_idx_array = np.load(save_prefix + '{}/'.format(0) + metapath_name + '_idx.npy')
    target_metapaths_mapping = {
        target_idx: edge_metapath_idx_array[edge_metapath_idx_array[:, 0] == target_idx][:, ::-1]
        for target_idx in target_idx_list
    }
    # the context manager closes the file even if pickling fails
    with open(save_prefix + '{}/'.format(0) + metapath_name + '_idx.pickle', 'wb') as out_file:
        pickle.dump(target_metapaths_mapping, out_file)