In [1]:
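# One-time environment setup (CPU-only builds); uncomment and run once if the packages are missing.
# NOTE: the PyG wheel index below targets torch 1.12.0+cpu, so the installed torch version must match it.
#pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cpu
#pip install torch-scatter torch-sparse torch-cluster torch-spline-conv torch-geometric -f https://data.pyg.org/whl/torch-1.12.0+cpu.html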


In [2]:
from torch_geometric.data import HeteroData
import torch

import pandas as pd


In [3]:
# Use case to process; it selects the data directory and the year window below.
#   NEW : Literature review 2021
#   BIB1: Bibliometric Study with seminal seed DOIs
#   BIB2: Bibliometric Study with random seed DOIs
#   IND1: Indicators Review with seminal seed DOIs
#   IND2: Indicators Review with random seed DOIs
#   COM : Community Review with random seed DOIs
usecase = "IND2"

# (start_year, end_year) per use-case family, keyed by the first three
# characters of `usecase`; any other prefix (e.g. "NEW") gets 1970-2020.
_YEAR_WINDOWS = {
    "BIB": (2010, 2020),
    "IND": (1985, 2015),
    "COM": (1970, 2009),
}
start_year, end_year = _YEAR_WINDOWS.get(usecase[:3], (1970, 2020))

# Every file path below lives in the use case's own directory.
usecaseDIR = f"../{usecase}/"

# Year-stamped citation network and publication metadata.
citNet = f"{usecaseDIR}{start_year}COCI{end_year}.pkl"
metadata = f"{usecaseDIR}{start_year}metadata{end_year}.pkl"

# Author citation network pickles.
autCitNet = f"{usecaseDIR}autCOCI.pkl"
autCitNetLst = f"{usecaseDIR}autCOCILst.pkl"

# Centrality tables, one per node type.
pubCentPkl = f"{usecaseDIR}pubCent.pkl"
autCentPkl = f"{usecaseDIR}autCent.pkl"
venCentPkl = f"{usecaseDIR}venCent.pkl"

# DOI lists and reference table.
DOIPkl = f"{usecaseDIR}DOIs.pkl"
DOItestPkl = f"{usecaseDIR}DOItest.pkl"
refDf = f"{usecaseDIR}refDf.pkl"

# Feature matrix pickle and the serialized HeteroData object.
FMatPkl = f"{usecaseDIR}FMat.pkl"
ptHeteroData = f"{usecaseDIR}HeteroData.pt"

# Graph / hash file pairs, one per node type.
PublicationGraph = f"{usecaseDIR}Publication.graph"
PublicationHash = f"{usecaseDIR}Publication.hash"

VenueGraph = f"{usecaseDIR}Venue.graph"
VenueHash = f"{usecaseDIR}Venue.hash"

AuthorGraph = f"{usecaseDIR}Author.graph"
AuthorHash = f"{usecaseDIR}Author.hash"

In [4]:
# Empty heterogeneous graph container; node features and edge indices are attached to it in the following cells.
data = HeteroData()

In [5]:
# Load the publication, venue and author centrality tables (in that order).
# NOTE(review): read_pickle can execute arbitrary code; only load files this project produced.
pubCent, venCent, autCent = (
    pd.read_pickle(pkl_path)
    for pkl_path in (pubCentPkl, venCentPkl, autCentPkl)
)


In [6]:
# Attach each table's raw values as the feature matrix `x` of the matching node type.
# Row order is kept as-is, so row i of a table is node i of that type.
# NOTE(review): torch.tensor infers its dtype from the NumPy values (float64 for
# float columns) — confirm downstream code expects that precision.
for node_type, cent_table in (
    ('Publication', pubCent),
    ('Venue', venCent),
    ('Author', autCent),
):
    data[node_type].x = torch.tensor(cent_table.values)

In [7]:
# Citation tables (author-level and publication-level, judging by the names).
autCOCI, COCI = pd.read_pickle(autCitNet), pd.read_pickle(citNet)

# Publication metadata; reset_index() turns the pickled frame's index into an ordinary column.
DF = pd.read_pickle(metadata).reset_index()

In [8]:
# Publication citation table: the citation rows without their venue and title columns.
_venue_and_title_cols = ['Venue_citing', 'Venue_cited', 'Title_citing', 'Title_cited']
pubCN = COCI.drop(columns=_venue_and_title_cols)

In [9]:
# Venue citation table: the citation rows without their citing/cited id and title columns.
_pub_and_title_cols = ['citing', 'cited', 'Title_citing', 'Title_cited']
venCN = COCI.drop(columns=_pub_and_title_cols)

In [10]:
# Author citation table: the author-level rows without their venue and citing/cited id columns.
# NOTE(review): autCN is not referenced again in the visible cells — confirm whether
# author edges were meant to be added to `data`.
_venue_and_pub_cols = ['Venue_citing', 'Venue_cited', 'citing', 'cited']
autCN = autCOCI.drop(columns=_venue_and_pub_cols)

In [11]:
# Turn pubCent into a lookup table: publication id -> row position (column `index`).
# The positions come from the fresh 0..n-1 index created by reset_index(), i.e. the
# same row order that was used to build data['Publication'].x.
# NOTE: this rebinds pubCent, so the cell is not safe to run twice.
_centrality_cols = ['IN', 'OUT', 'ACC', 'Hub', 'Auth', 'PR', 'BC', 'CC']
pubCent = (
    pubCent
    .reset_index()
    .assign(index=lambda frame: frame.index)
    .set_index('Publication')
    .drop(columns=_centrality_cols)
)


In [12]:
# Turn venCent into a lookup table: venue name -> row position (column `index`).
# The positions come from the fresh 0..n-1 index created by reset_index(), i.e. the
# same row order that was used to build data['Venue'].x.
# NOTE: this rebinds venCent, so the cell is not safe to run twice.
_centrality_cols = ['IN', 'OUT', 'ACC', 'Hub', 'Auth', 'PR', 'BC', 'CC']
venCent = (
    venCent
    .reset_index()
    .assign(index=lambda frame: frame.index)
    .set_index('Venue')
    .drop(columns=_centrality_cols)
)


In [13]:
# Turn autCent into a lookup table: author -> row position (column `index`).
# The positions come from the fresh 0..n-1 index created by reset_index(), i.e. the
# same row order that was used to build data['Author'].x.
# NOTE: this rebinds autCent, so the cell is not safe to run twice.
_centrality_cols = ['IN', 'OUT', 'ACC', 'Hub', 'Auth', 'PR', 'BC', 'CC']
autCent = (
    autCent
    .reset_index()
    .assign(index=lambda frame: frame.index)
    .set_index('Author')
    .drop(columns=_centrality_cols)
)


In [14]:
# Resolve both endpoints of every citation to publication row positions.
# Inner joins: citations whose citing or cited id is missing from pubCent are dropped.
# The second join sees `index` on both sides, so the suffixes produce the
# columns `index_citing` (from the first join) and `index_cited`.
_pub_with_citing = pubCN.merge(pubCent, how='inner', left_on='citing', right_index=True)
pubCN = _pub_with_citing.merge(
    pubCent,
    how='inner',
    left_on='cited',
    right_index=True,
    suffixes=('_citing', '_cited'),
)

In [15]:
# Resolve both endpoints of every citation to venue row positions.
# Inner joins: rows whose citing or cited venue is missing from venCent are dropped.
# The second join sees `index` on both sides, so the suffixes produce the
# columns `index_citing` (from the first join) and `index_cited`.
_ven_with_citing = venCN.merge(venCent, how='inner', left_on='Venue_citing', right_index=True)
venCN = _ven_with_citing.merge(
    venCent,
    how='inner',
    left_on='Venue_cited',
    right_index=True,
    suffixes=('_citing', '_cited'),
)

In [16]:
# Venue -> Venue citation edges, expressed as row positions into data['Venue'].x.
# NOTE(review): venCN has one row per citation, so the same venue pair (and
# self-loops) may occur many times — confirm multi-edges are intended.
src = venCN['index_citing'].tolist()
dst = venCN['index_cited'].tolist()

# Explicit long dtype: with no rows, dtype inference would yield a float32
# [2, 0] tensor, which is not a valid edge_index.
edge_index = torch.tensor([src, dst], dtype=torch.long)

data['Venue', 'cite', 'Venue'].edge_index = edge_index

In [17]:
# Publication -> Publication citation edges, expressed as row positions into
# data['Publication'].x (one edge per row of pubCN).
src = pubCN['index_citing'].tolist()
dst = pubCN['index_cited'].tolist()

# Explicit long dtype: with no rows, dtype inference would yield a float32
# [2, 0] tensor, which is not a valid edge_index.
edge_index = torch.tensor([src, dst], dtype=torch.long)

data['Publication', 'cite', 'Publication'].edge_index = edge_index

In [18]:
# Attach each metadata row's publication row position (column `index` from pubCent).
# Inner join: metadata rows whose DOI is missing from pubCent are dropped.
DF = DF.merge(
    pubCent,
    how='inner',
    left_on='DOI',
    right_index=True,
)

In [19]:
# Attach each metadata row's venue row position as well. Both frames carry an
# `index` column here, so the suffixes yield `index_DOI` and `index_Venue`.
# Inner join: rows whose venue is missing from venCent are dropped.
DF = DF.merge(
    venCent,
    how='inner',
    left_on='Venue',
    right_index=True,
    suffixes=('_DOI', '_Venue'),
)

In [20]:
# Collapse the metadata to one row per publication position: drop the descriptive
# columns, then keep the first non-null value of every remaining column per index_DOI.
_descriptive_cols = ['DOI', 'Title', 'Venue', 'Year', 'Author']
DF = (
    DF
    .drop(columns=_descriptive_cols)
    .groupby('index_DOI')
    .first()
    .reset_index()
)

In [21]:
# Publication -> Venue membership edges: source positions index data['Publication'].x,
# destination positions index data['Venue'].x (one edge per row of DF).
src = DF['index_DOI'].tolist()
dst = DF['index_Venue'].tolist()

# Explicit long dtype: with no rows, dtype inference would yield a float32
# [2, 0] tensor, which is not a valid edge_index.
edge_index = torch.tensor([src, dst], dtype=torch.long)

data['Publication', 'in', 'Venue'].edge_index = edge_index

In [22]:
# Display the assembled graph: feature-matrix shapes per node type and edge_index shapes per edge type.
data

HeteroData(
  Publication={ x=[6662, 8] },
  Venue={ x=[2290, 8] },
  Author={ x=[11884, 8] },
  (Venue, cite, Venue)={ edge_index=[2, 15089] },
  (Publication, cite, Publication)={ edge_index=[2, 15089] },
  (Publication, in, Venue)={ edge_index=[2, 6354] }
)

In [23]:
# Persist the assembled HeteroData object to the path configured for this use case.
torch.save(obj=data, f=ptHeteroData)