In [1]:
import pandas as pd
import networkx as nx
import snap



In [34]:
# Use-case tag: it picks the year window and names the directory holding every file below.
usecase = "IND2"

'''
NEW: Literature review 2021
BIB1: Bibliometric Study with seminal seed DOIs
BIB2: Bibliometric Study with random seed DOIs
IND1: Indicators Review with seminal seed DOIs
IND2: Indicators Review with random seed DOIs
COM: Community Review with random seed DOIs
'''

# Year window per use-case family, keyed on the first three characters of the tag.
_year_windows = {
    "BIB": (2010, 2020),
    "IND": (1985, 2015),
    "COM": (1970, 2009),
}
# Any tag not listed above (for example "NEW") gets the 1970-2020 window.
start_year, end_year = _year_windows.get(usecase[:3], (1970, 2020))

# Every path below is relative to a sibling directory named after the use case.
usecaseDIR = "".join(["../", usecase, "/"])

# Pickled citation tables; the first file name embeds the year window.
citNet = "".join([usecaseDIR, str(start_year), "COCI", str(end_year), ".pkl"])
autCitNet = usecaseDIR + "autCOCI.pkl"
autCitNetLst = usecaseDIR + "autCOCILst.pkl"

# Centrality tables written by the ComputeCentrality cells.
pubCentPkl = usecaseDIR + "pubCent.pkl"
autCentPkl = usecaseDIR + "autCent.pkl"
venCentPkl = usecaseDIR + "venCent.pkl"

# DOI collections and reference table.
DOIPkl = usecaseDIR + "DOIs.pkl"
DOItestPkl = usecaseDIR + "DOItest.pkl"
refDf = usecaseDIR + "refDf.pkl"

# Final feature matrix.
FMatPkl = usecaseDIR + "FMat.pkl"

# SNAP graph / name-hash pairs, one pair per entity type.
PublicationGraph = usecaseDIR + "Publication.graph"
PublicationHash = usecaseDIR + "Publication.hash"

VenueGraph = usecaseDIR + "Venue.graph"
VenueHash = usecaseDIR + "Venue.hash"

AuthorGraph = usecaseDIR + "Author.graph"
AuthorHash = usecaseDIR + "Author.hash"

In [35]:
def ComputeCentrality(Hash, Graph):
    HashFile = snap.TFIn(Hash) #Loading hash file
    mapping = snap.TStrIntSH (HashFile) #Creating mapping list containing NodeIDs and Names
    graphFile = snap.TFIn(Graph) #Loading graph file.
    Network = snap.TNEANet.Load(graphFile) #Loading graph file into variable
    #Network = snap.GenRndGnm(snap.TNEANet, 100, 1000)

    print("started GetNodeClustCfAll")
    ACC = snap.TIntFltH()
    ACC = snap.GetNodeClustCfAll(Network)

    print("started GetHits")
    Hub = snap.TIntFltH()
    Auth = snap.TIntFltH()
    %time snap.GetHits(Network, Hub, Auth)

    print("started GetPageRank")
    PR = snap.TIntFltH() #Creating Integer float hash for PR
    %time snap.GetPageRank(Network, PR, 0.85, 1e-8, Network.GetNodes())
    
    print("started GetBetweennessCentr")
    NodesBC = snap.TIntFltH()
    EdgesBC = snap.TIntPrFltH()
    %time snap.GetBetweennessCentr(Network, NodesBC, EdgesBC, 1.0, True)

    print("started GetNodeDegV")
    InDegV = Network.GetNodeInDegV() #Using Snap's InDegree function to get InDegrees
    OutDegV = Network.GetNodeOutDegV() #Using Snap's OutDegree function to get OutDegrees

    IN = {item.GetVal1(): item.GetVal2() for item in InDegV}
    OUT = {item.GetVal1(): item.GetVal2() for item in OutDegV}

    print("started GetClosenessCentr")
    CC = {NI.GetId(): Network.GetClosenessCentr(NI.GetId(), True, True) for NI in Network.Nodes()}

    d = []
    for NI in Network.Nodes():
      d.append({'ID':mapping.GetKey(NI.GetId()), 'IN':IN[NI.GetId()], 'OUT':OUT[NI.GetId()], 'ACC':ACC(NI.GetId()), 'Hub':Hub(NI.GetId()), 'Auth':Auth(NI.GetId()), 'PR':PR(NI.GetId()), 'BC':NodesBC(NI.GetId()), 'CC':CC[NI.GetId()]})


    df = pd.DataFrame(d).set_index('ID')

    cut_labels = [1,2,3,4,5,6]
    cut_bins = [0,0.5,0.75,0.90,0.95,0.99,1.0]

    for column in df:
        df[column]=df[column].rank(method='dense',pct=True)
    for column in df:
        df[column] = pd.cut(df[column], bins=cut_bins, labels=cut_labels).astype(int)
    return df

In [36]:
# Bucketed centralities for the publication graph, index labelled and pickled.
pubCent = ComputeCentrality(PublicationHash, PublicationGraph).rename_axis("Publication")
pubCent.to_pickle(pubCentPkl)

started GetNodeClustCfAll
started GetHits
Wall time: 148 ms
started GetPageRank
Wall time: 97.2 ms
started GetBetweennessCentr
Wall time: 1.76 s
started GetNodeDegV
started GetClosenessCentr


In [37]:
# Bucketed centralities for the venue graph, index labelled and pickled.
venCent = ComputeCentrality(VenueHash, VenueGraph).rename_axis("Venue")
venCent.to_pickle(venCentPkl)

started GetNodeClustCfAll
started GetHits
Wall time: 111 ms
started GetPageRank
Wall time: 100 ms
started GetBetweennessCentr
Wall time: 5.7 s
started GetNodeDegV
started GetClosenessCentr


In [38]:
# Bucketed centralities for the author graph, index labelled and pickled.
autCent = ComputeCentrality(AuthorHash, AuthorGraph).rename_axis("Author")
autCent.to_pickle(autCentPkl)

started GetNodeClustCfAll
started GetHits
Wall time: 1.19 s
started GetPageRank
Wall time: 1.41 s
started GetBetweennessCentr
Wall time: 4min 45s
started GetNodeDegV
started GetClosenessCentr


In [7]:
# Load the pickled citation table; later cells read its citing, cited,
# autID_citing and autID_cited columns.
COCI = pd.read_pickle(autCitNet)

In [8]:
# Lookup table indexed by autID, with the Title column removed.
autLst = (
    pd.read_pickle(autCitNetLst)
    .drop(columns=['Title'])
    .set_index('autID')
)

In [9]:
# Attach the autLst columns for the citing side by matching autID_citing to the autID index.
COCI = pd.merge(COCI, autLst, left_on='autID_citing', right_index=True)

In [10]:
# Attach the autLst columns again for the cited side. Columns that clash with the
# previous join get the _citing / _cited suffixes, then both id keys are dropped.
COCI = pd.merge(
    COCI,
    autLst,
    left_on='autID_cited',
    right_on='autID',
    suffixes=('_citing', '_cited'),
).drop(columns=['autID_citing', 'autID_cited'])

In [11]:
# DOIs that appear as citing but never as cited.
CtgLst = list(set(COCI["citing"]) - set(COCI["cited"]))

In [12]:
# Publication centrality of the citing DOI; columns end in _Pctg_.
COCI = pd.merge(
    COCI,
    pubCent.add_suffix('_Pctg_'),
    left_on='citing',
    right_index=True,
)

In [13]:
# Venue centrality of the citing side (_Vctg_); the join key is dropped afterwards.
COCI = pd.merge(
    COCI,
    venCent.add_suffix('_Vctg_'),
    left_on='Venue_citing',
    right_index=True,
).drop(columns=['Venue_citing'])

In [14]:
# Author centrality of the citing side (_Actg_); the join key is dropped afterwards.
COCI = pd.merge(
    COCI,
    autCent.add_suffix('_Actg_'),
    left_on='Author_citing',
    right_index=True,
).drop(columns=['Author_citing'])

In [15]:
# One row per citing DOI: first() keeps the first non-null value of each column
# within a group; the cited-side columns are then removed.
Ctg = (
    COCI
    .groupby("citing")
    .first()
    .drop(columns=['Author_cited', 'cited', 'Venue_cited'])
)

In [16]:
# Remove the citing DOI column now that Ctg has been built.
COCI = COCI.drop(columns=['citing'])

In [17]:
# Publication centrality of the cited DOI; columns end in _Pctd_.
COCI = pd.merge(
    COCI,
    pubCent.add_suffix('_Pctd_'),
    left_on='cited',
    right_index=True,
)

In [18]:
# Venue centrality of the cited side; columns end in _Vctd_.
COCI = pd.merge(
    COCI,
    venCent.add_suffix('_Vctd_'),
    left_on='Venue_cited',
    right_index=True,
)

In [19]:
# Author centrality of the cited side (_Actd_); both cited-side join keys are dropped afterwards.
COCI = pd.merge(
    COCI,
    autCent.add_suffix('_Actd_'),
    left_on='Author_cited',
    right_index=True,
).drop(columns=['Author_cited', 'Venue_cited'])

In [20]:
# One row per cited DOI holding the mean of every remaining column.
# NOTE(review): assumes all remaining columns are numeric; recent pandas
# versions raise on non-numeric columns here. Confirm against the installed version.
Ctd = COCI.groupby("cited").mean()

In [21]:
# Stack the cited-DOI rows on top of the citing-only rows (DOIs listed in CtgLst).
# verify_integrity=True makes concat raise if any DOI would appear twice in the index.
FMat = pd.concat(
    [
        Ctd,
        Ctg.loc[Ctg.index.isin(CtgLst)],
    ],
    verify_integrity=True,
)

In [22]:
# Label the index as the DOI of each row.
FMat = FMat.rename_axis("DOI")

In [23]:
# Rows taken from Ctg have no cited-side (_ctd_) columns; fill those gaps with 0.
FMat = FMat.fillna(value=0)

In [24]:
# Collection of DOIs used as the positive label.
y = pd.read_pickle(DOItestPkl)

# Flag each row by testing its DOI (the index label) for membership in y.
# Iterating the index directly gives the same result as the previous row-wise
# DataFrame.apply(axis=1), without building a Series for every row.
# NOTE(review): `in` tests elements for a list/set but index labels for a
# pandas Series. Confirm the type stored in DOItestPkl.
FMat['y'] = [doi in y for doi in FMat.index]


In [25]:
# Show only the rows flagged as positive.
FMat.loc[FMat["y"]]

Unnamed: 0_level_0,IN_Pctg_,OUT_Pctg_,ACC_Pctg_,Hub_Pctg_,Auth_Pctg_,PR_Pctg_,BC_Pctg_,CC_Pctg_,IN_Vctg_,OUT_Vctg_,...,CC_Vctd_,IN_Actd_,OUT_Actd_,ACC_Actd_,Hub_Actd_,Auth_Actd_,PR_Actd_,BC_Actd_,CC_Actd_,y
DOI,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10.1002/cplx.20166,1.0,1.00000,2.875000,3.375000,1.250000,1.250000,1.500000,2.875000,1.000000,1.750000,...,5.0,1.0,1.0,2.000000,3.250000,1.000000,1.000000,1.500000,2.500000,True
10.1002/prot.20799,1.0,1.00000,1.823529,2.764706,1.411765,1.000000,1.411765,3.058824,1.117647,1.235294,...,5.0,1.0,1.0,1.000000,4.428571,1.428571,2.285714,2.428571,3.000000,True
10.1007/11427995_3,1.0,1.12987,2.311688,3.155844,1.246753,1.077922,1.454545,2.701299,1.519481,1.896104,...,4.0,1.0,1.0,3.333333,3.666667,3.000000,3.000000,3.333333,2.333333,True
10.1007/11569596_31,1.0,1.00000,3.218750,3.406250,1.000000,1.062500,1.125000,3.406250,1.281250,1.656250,...,3.0,1.0,1.5,1.000000,5.000000,3.500000,2.500000,3.500000,4.000000,True
10.1007/978-3-540-24688-6_137,1.0,1.00000,2.838710,3.258065,2.903226,1.774194,2.612903,3.064516,4.419355,5.064516,...,4.0,1.0,1.5,1.000000,5.500000,4.000000,3.500000,3.500000,4.500000,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10.1088/1367-2630/11/4/043025,1.0,1.00000,5.000000,3.000000,1.000000,1.000000,1.000000,4.000000,3.000000,5.000000,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,True
10.1103/physreve.66.036117,1.0,1.00000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,6.000000,6.000000,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,True
10.1103/physreve.79.036111,1.0,1.00000,4.000000,3.000000,1.000000,1.000000,1.000000,4.000000,6.000000,6.000000,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,True
10.1103/physreve.80.026129,1.0,1.00000,4.000000,3.000000,1.000000,1.000000,1.000000,4.000000,6.000000,6.000000,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,True


In [26]:
# Persist the labelled feature matrix to the use-case directory.
FMat.to_pickle(FMatPkl)

In [27]:
# Read the file back and display it as a check on what was written.
pd.read_pickle(FMatPkl)

Unnamed: 0_level_0,IN_Pctg_,OUT_Pctg_,ACC_Pctg_,Hub_Pctg_,Auth_Pctg_,PR_Pctg_,BC_Pctg_,CC_Pctg_,IN_Vctg_,OUT_Vctg_,...,CC_Vctd_,IN_Actd_,OUT_Actd_,ACC_Actd_,Hub_Actd_,Auth_Actd_,PR_Actd_,BC_Actd_,CC_Actd_,y
DOI,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10.1002/(sici)1096-9861(19960902)372:4<487::aid-cne1>3.0.co;2-0,1.0,1.5,1.0,1.25,1.8125,4.1875,4.5625,1.75,4.750,3.5000,...,2.0,2.0,1.0,1.5,1.5,2.5,5.0,3.0,1.5,False
10.1002/(sici)1096-9861(19970217)378:3<320::aid-cne2>3.0.co;2-5,1.0,1.0,1.5,1.00,2.5000,4.5000,4.5000,1.00,2.500,2.5000,...,2.0,1.8,1.0,3.0,1.0,1.4,3.8,2.2,1.0,False
10.1002/(sici)1096-9861(19980601)395:2<177::aid-cne3>3.0.co;2-#,1.0,1.0,1.0,1.00,1.0000,5.0000,3.0000,1.00,2.000,2.0000,...,2.0,1.0,1.0,3.0,1.5,2.0,3.0,2.5,1.5,False
10.1002/(sici)1096-9861(19990322)405:4<472::aid-cne3>3.0.co;2-p,1.0,1.0,1.0,1.00,2.5000,4.3125,3.3750,1.00,1.875,2.0625,...,2.0,1.0,1.0,2.5,1.0,1.5,3.5,2.5,1.0,False
10.1002/(sici)1096-9861(19990712)409:4<647::aid-cne9>3.0.co;2-3,1.0,1.0,1.0,1.00,1.0000,3.0000,1.0000,1.00,2.000,2.0000,...,2.0,1.0,1.0,2.5,1.0,1.0,1.0,1.0,1.0,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10.9746/sicetr.46.713,1.0,1.0,1.0,1.00,1.0000,1.0000,1.0000,1.00,1.000,1.0000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False
10.9746/sicetr.46.723,1.0,1.0,1.0,2.00,1.0000,1.0000,1.0000,2.00,1.000,1.0000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False
10.9746/sicetr.46.774,1.0,1.0,1.0,1.00,1.0000,1.0000,1.0000,2.00,1.000,1.0000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False
10.9746/sicetr.46.783,1.0,1.0,1.0,2.00,1.0000,1.0000,1.0000,2.00,1.000,1.0000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False
