In [None]:
%matplotlib inline
import leidenalg
import igraph as ig
import csv
from collections import defaultdict, Counter
from clusim.clustering import Clustering, print_clustering
import clusim.sim as sim
import gc
import pandas as pd
from tqdm.notebook import tqdm
from multiprocessing import Pool
from functools import partial
import psutil
from matplotlib import pyplot as plt

In [None]:
!conda install -c conda-forge leidenalg

In [None]:
def genRawNodesplitNetwork_BC_CC(raw_edgelist): # edge format: (from, to)
    """Build the unweighted node-split edge list for the CC and BC layers.

    Every paper id is multiplied by 10 and the last digit encodes the node type:
    0 = clustering node, 1 = citing node, 2 = cited node (3-9 are reserved).

    For each raw edge (from, to) two split edges are produced, always with the
    clustering node first:
      * (to * 10,   from * 10 + 1)  -- citing node cites clustering node (CC)
      * (from * 10, to * 10 + 2)    -- clustering node cites cited node (BC)

    Returns:
        (edges, strengths) where ``edges`` is the list of split edges and
        ``strengths`` is a defaultdict mapping each clustering node to a
        3-slot list of edge counts indexed by the target node type
        (slot 0 is never incremented by this function).
    """
    layer_count = 3
    strengths = defaultdict(lambda: [0] * layer_count)
    edges = []

    for edge in raw_edgelist:
        citing_paper, cited_paper = edge[0], edge[1]
        cited_hub = cited_paper * 10     # clustering node of the cited paper
        citing_hub = citing_paper * 10   # clustering node of the citing paper

        strengths[cited_hub][1] += 1
        strengths[citing_hub][2] += 1

        edges.extend((
            (cited_hub, citing_paper * 10 + 1),
            (citing_hub, cited_paper * 10 + 2),
        ))

    return edges, strengths

def normNodesplitNetwork_BC_CC(raw_edgelist, r_layerslist = [0, 0.5, 0.5]): # edge format: (from, to)
    """Return the weighted node-split edge list for the CC and BC layers.

    Each split edge (clustering node, typed node) gets the weight
    ``1 / strength * r_layerslist[layer]``, where ``layer`` is the last digit of
    the typed node and ``strength`` is the number of edges the clustering node
    has into that layer. Edges whose weight is exactly 0 are dropped.

    Args:
        raw_edgelist: iterable of (from, to) citation edges.
        r_layerslist: per-layer share, indexed by the node-type digit.

    Returns:
        List of (clustering node, typed node, weight) tuples.
    """
    split_edges, strength = genRawNodesplitNetwork_BC_CC(raw_edgelist)
    weighted_edges = []
    for edge in split_edges:
        hub, typed_node = edge[0], edge[1]
        layer = typed_node % 10
        w = 1 / strength[hub][layer] * r_layerslist[layer]
        if w != 0:
            weighted_edges.append((hub, typed_node, w))
    return weighted_edges
        
def genRawNodesplitNetwork_BC_CC_DCT1(raw_edgelist): # edge format: (from, to)
    """Build the unweighted node-split edge list for the CC, BC and DC (type 1) layers.

    Every paper id is multiplied by 10 and the last digit encodes the node type:
    0 = clustering node, 1 = citing node, 2 = cited node (3-9 are reserved).

    For each raw edge (from, to) three split edges are produced:
      * (to * 10,   from * 10 + 1)  -- citing node cites clustering node (CC)
      * (from * 10, to * 10 + 2)    -- clustering node cites cited node (BC)
      * (from * 10, to * 10)        -- clustering node cites clustering node (DC)

    Returns:
        (edges, strengths) where ``strengths`` is a defaultdict mapping each
        clustering node to a 3-slot list of counts; slot 0 counts the DC edges
        the node takes part in (as either endpoint), slot 1 its CC edges and
        slot 2 its BC edges.
    """
    layer_count = 3
    strengths = defaultdict(lambda: [0] * layer_count)
    edges = []

    for edge in raw_edgelist:
        citing_paper, cited_paper = edge[0], edge[1]
        cited_hub = cited_paper * 10     # clustering node of the cited paper
        citing_hub = citing_paper * 10   # clustering node of the citing paper

        strengths[cited_hub][1] += 1
        strengths[citing_hub][2] += 1
        # The direct-citation edge counts towards both of its endpoints.
        strengths[cited_hub][0] += 1
        strengths[citing_hub][0] += 1

        edges.extend((
            (cited_hub, citing_paper * 10 + 1),
            (citing_hub, cited_paper * 10 + 2),
            (citing_hub, cited_hub),
        ))
    return edges, strengths

def normNodesplitNetwork_BC_CC_DCT1(raw_edgelist, r_layerslist = [1/3, 1/3, 1/3]): # edge format: (from, to)
    """Return the weighted, directed node-split edge list for CC, BC and DC (type 1).

    Every split edge is emitted in both directions. The forward weight is
    ``1 / strength[source][layer] * r_layerslist[layer]`` with ``layer`` being
    the last digit of the target node. The reverse edge reuses that weight,
    except for layer 0 (clustering node to clustering node) where it is
    normalised by the strength of the other endpoint instead.

    Unlike the other norm* helpers, the result describes a directed weighted
    network, and zero-weight edges are kept.

    Returns:
        List of (source, target, weight) tuples.
    """
    split_edges, strength = genRawNodesplitNetwork_BC_CC_DCT1(raw_edgelist)
    directed_edges = []
    for edge in split_edges:
        src, dst = edge[0], edge[1]
        layer = dst % 10
        forward_w = 1 / strength[src][layer] * r_layerslist[layer]
        directed_edges.append((src, dst, forward_w))
        # Sanity check: the first node of a split edge should be a clustering node.
        if src % 10 != 0:
            print("error!")
        if layer == 0:
            backward_w = 1 / strength[dst][layer] * r_layerslist[layer]
        else:
            backward_w = forward_w
        directed_edges.append((dst, src, backward_w))
    return directed_edges

def genRawNodesplitNetwork_BC_CC_DCT2(raw_edgelist): # edge format: (from, to)
    """Build the unweighted node-split edge list for the CC, BC and DC (type 2) layers.

    Every paper id is multiplied by 10 and the last digit encodes the node type:
    0 = clustering node, 1 = citing node, 2 = cited node, 3 = pseudo node that
    stands for one direct citation (4-9 are reserved).

    For the i-th raw edge (from, to) four split edges are produced:
      * (to * 10,   from * 10 + 1)  -- citing node cites clustering node (CC)
      * (from * 10, to * 10 + 2)    -- clustering node cites cited node (BC)
      * (from * 10, i * 10 + 3)     -- citing side of the direct citation (DC)
      * (to * 10,   i * 10 + 3)     -- cited side of the direct citation (DC)

    Returns:
        (edges, strengths) where ``strengths`` is a defaultdict mapping each
        clustering node to a 4-slot list of edge counts indexed by the target
        node type (slot 0 is never incremented by this function).
    """
    layer_count = 4
    strengths = defaultdict(lambda: [0] * layer_count)
    edges = []

    # The running edge index gives every direct citation its own pseudo node.
    for dc_index, edge in enumerate(raw_edgelist):
        citing_paper, cited_paper = edge[0], edge[1]
        cited_hub = cited_paper * 10     # clustering node of the cited paper
        citing_hub = citing_paper * 10   # clustering node of the citing paper
        dc_pseudo = dc_index * 10 + 3

        edges.extend((
            (cited_hub, citing_paper * 10 + 1),
            (citing_hub, cited_paper * 10 + 2),
            (citing_hub, dc_pseudo),
            (cited_hub, dc_pseudo),
        ))

        strengths[cited_hub][1] += 1
        strengths[citing_hub][2] += 1
        strengths[cited_hub][3] += 1
        strengths[citing_hub][3] += 1

    return edges, strengths

def normNodesplitNetwork_BC_CC_DCT2(raw_edgelist, r_layerslist = [0, 1/3, 1/3, 1/3]): # edge format: (from, to)
    """Return the weighted node-split edge list for CC, BC and DC (type 2).

    Each split edge (clustering node, typed node) gets the weight
    ``1 / strength * r_layerslist[layer]``, where ``layer`` is the last digit of
    the typed node and ``strength`` is the number of edges the clustering node
    has into that layer. Zero-weight edges are kept.

    Returns:
        List of (clustering node, typed node, weight) tuples.
    """
    split_edges, strength = genRawNodesplitNetwork_BC_CC_DCT2(raw_edgelist)
    return [
        (edge[0], edge[1], 1 / strength[edge[0]][edge[1] % 10] * r_layerslist[edge[1] % 10])
        for edge in split_edges
    ]

def calc_granularity_clusters(cluster_list):
    """Return the granularity of a clustering: ``N / sum(n_i ** 2)``.

    ``N`` is the total number of elements and ``n_i`` the size of cluster i.

    Args:
        cluster_list: iterable of clusters; each cluster is a sized collection
            of node indexes. The input is traversed once, so one-shot
            iterables are accepted as well.

    Returns:
        The granularity as a float.

    Raises:
        ZeroDivisionError: if there are no clusters or all clusters are empty.
    """
    total_count = 0
    gran_denom = 0
    for cluster in cluster_list:
        size = len(cluster)
        total_count += size
        gran_denom += size ** 2
    return total_count / gran_denom

def get_filelength(fname):
    """Return the number of lines in the text file ``fname`` (0 for an empty file)."""
    with open(fname) as f:
        # Counting with a generator avoids the unbound loop variable that an
        # enumerate()-based count leaves behind when the file has no lines.
        return sum(1 for _ in f)

def f_out_clusters(clusterdict, filename):
    """Write ``clusterdict`` to ``filename`` as tab-separated ``key<TAB>value`` lines.

    The file is opened in write mode (an existing file is overwritten) and is
    closed even if writing fails part-way.
    """
    with open(filename, "w") as f_out:
        for k, v in clusterdict.items():
            f_out.write(str(k) + "\t" + str(v) + "\n")
       
def get_cluster_score(ig_network, header, fosname, r_layers, res):
    """Cluster ``ig_network`` with Leiden, save the clustering and score it.

    The network is partitioned with ``leidenalg.RBConfigurationVertexPartition``
    at resolution ``res`` using the "weight" edge attribute. Only clustering
    nodes (vertex name ending in digit 0) are kept; their paper id is the
    vertex name divided by 10. The paper -> cluster mapping is written to
    ``./Cluster_out/<header>_<fosname>_r_<r_layers>_<res>.clu`` and compared,
    via NMI, with the field labels in the module-level ``paperfielddict`` for
    the papers present in both.

    Args:
        ig_network: igraph graph with integer "name" vertex attributes and a
            "weight" edge attribute.
        header: network type label, used in the file name and the result row.
        fosname: field-of-study name, used in the file name and the result row.
        r_layers: layer ratios, used in the file name and the result row.
        res: Leiden resolution parameter.

    Returns:
        ``[header, fosname, r_layers, res, granularity, nmi]``; granularity is
        computed over all clusters of the partition (all node types).
    """
    import os  # local import: only needed to make sure the output directory exists

    part_now_leidenRB = leidenalg.find_partition(ig_network, leidenalg.RBConfigurationVertexPartition, resolution_parameter=res, weights ="weight")
    cur_elm2clu_dict = {}
    for idx, cluster in enumerate(part_now_leidenRB):
        for node in cluster:
            cur_node = part_now_leidenRB.graph.vs[node]["name"]
            if cur_node % 10 == 0:
                # Integer division keeps large ids exact (float division does not).
                cur_elm2clu_dict[cur_node // 10] = idx

    # Membership tests against the dict avoid materialising a set of every key
    # of paperfielddict on each call. Only the order of the common papers
    # changes, and NMI does not depend on how elements are numbered.
    common_papers = [paper for paper in cur_elm2clu_dict if paper in paperfielddict]
    dict1 = {}
    dict2 = {}
    for idx_tmp, val in enumerate(common_papers):
        dict1[idx_tmp] = paperfielddict[val]
        dict2[idx_tmp] = [cur_elm2clu_dict[val]]
    c1 = Clustering(elm2clu_dict = dict1)
    c2 = Clustering(elm2clu_dict = dict2)

    out_dir = "./Cluster_out/"
    # exist_ok makes this safe when several worker processes get here at once.
    os.makedirs(out_dir, exist_ok=True)
    outclu_fname = out_dir + header + "_" + fosname + "_r_" + "_".join([format(x, ".2f") for x in r_layers]) + "_" + format(res, ".2f") + ".clu"
    f_out_clusters(cur_elm2clu_dict, outclu_fname)

    granularity = calc_granularity_clusters(part_now_leidenRB)
    nmi_sim = sim.nmi(c1, c2)
    return([header, fosname, r_layers, res, granularity, nmi_sim])

def get_cluster_score_nofileout(ig_network, header, fosname, r_layers, res):
    """Cluster ``ig_network`` with Leiden and score it, without writing any file.

    The network is partitioned with ``leidenalg.RBConfigurationVertexPartition``
    at resolution ``res`` using the "weight" edge attribute. Only clustering
    nodes (vertex name ending in digit 0) are kept; their paper id is the
    vertex name divided by 10. The resulting paper -> cluster mapping is
    compared, via NMI, with the field labels in the module-level
    ``paperfielddict`` for the papers present in both.

    Args:
        ig_network: igraph graph with integer "name" vertex attributes and a
            "weight" edge attribute.
        header: network type label, copied into the result row.
        fosname: field-of-study name, copied into the result row.
        r_layers: layer ratios, copied into the result row.
        res: Leiden resolution parameter.

    Returns:
        ``[header, fosname, r_layers, res, granularity, nmi]``; granularity is
        computed over all clusters of the partition (all node types).
    """
    part_now_leidenRB = leidenalg.find_partition(ig_network, leidenalg.RBConfigurationVertexPartition, resolution_parameter=res, weights ="weight")
    cur_elm2clu_dict = {}
    for idx, cluster in enumerate(part_now_leidenRB):
        for node in cluster:
            cur_node = part_now_leidenRB.graph.vs[node]["name"]
            if cur_node % 10 == 0:
                # Integer division keeps large ids exact (float division does not).
                cur_elm2clu_dict[cur_node // 10] = idx

    # Membership tests against the dict avoid materialising a set of every key
    # of paperfielddict on each call. Only the order of the common papers
    # changes, and NMI does not depend on how elements are numbered.
    common_papers = [paper for paper in cur_elm2clu_dict if paper in paperfielddict]
    dict1 = {}
    dict2 = {}
    for idx_tmp, val in enumerate(common_papers):
        dict1[idx_tmp] = paperfielddict[val]
        dict2[idx_tmp] = [cur_elm2clu_dict[val]]
    c1 = Clustering(elm2clu_dict = dict1)
    c2 = Clustering(elm2clu_dict = dict2)
    granularity = calc_granularity_clusters(part_now_leidenRB)
    nmi_sim = sim.nmi(c1, c2)
    return([header, fosname, r_layers, res, granularity, nmi_sim])

In [None]:
#FieldReferences = pd.read_csv("./Processed_data/MAG_ALLReferenceL0FieldMerged.tsv", sep="\t", )
#FieldReferences

In [None]:
# Load the fields-of-study table (tab-separated, no header row), keeping only
# columns 0, 3, 5 and 6, which are named below.
FOS_datafile = "./MAG_dataset/advanced/FieldsOfStudy.txt"
df_fosinfo = pd.read_csv(FOS_datafile, sep="\t", header=None, usecols=[0, 3, 5, 6], quoting=csv.QUOTE_NONE)
df_fosinfo.columns = ["FieldofStudyId", "FosName", "FosLevel", "PaperCount"]
# Ids of all level-0 fields of study.
L0_FOSID_List = df_fosinfo[df_fosinfo["FosLevel"] == 0]["FieldofStudyId"].tolist()

In [None]:
# Display the level-0 fields of study.
df_fosinfo[df_fosinfo["FosLevel"] == 0]

In [None]:
# Map level-0 field id -> field name.
fos_l0_namedict = df_fosinfo[df_fosinfo["FosLevel"] == 0].set_index("FieldofStudyId")["FosName"].to_dict()

In [None]:
# Build paperfielddict: int(column 0) -> list of int(column 3) values, one per row.
# presumably paper id -> level-2 field ids (going by the file name) -- TODO confirm
input_file_mag_l2_field = "./fastnvme01//MAG_PaperFieldL2_MAX.txt"
paperfielddict = defaultdict(list)
with open(input_file_mag_l2_field, 'r') as csvfile:
    reader = csv.reader(csvfile, delimiter='\t')
    next(reader)  # skip the header row
    for row in reader:  
        paperfielddict[int(row[0])].append(int(row[3]))
# Number of distinct keys read.
print(len(paperfielddict))

In [None]:
# Shared experiment state used by the cells below.
TargetFOSList = L0_FOSID_List
# Resolution parameters to sweep: 100 values from 0.2 up to about 20 in steps of 0.2.
reslist = [0.2*x for x in range(1, 101)]
# Result rows of the BC/CC experiments.
result_list = []
# (TargetFOS, typename) -> number of clustering nodes counted in the giant component.
count_clustering_nodes = defaultdict(int)

In [None]:
# Build field_edge_list_dict: field id -> list of (int(column 0), int(column 2)) edges,
# keeping only rows whose columns 1 and 3 are equal.
# presumably columns are (citing id, citing field, cited id, cited field), so only
# citations inside one level-0 field are kept -- TODO confirm against the file
input_file_reference_with_l0 = "./fastnvme01//MAG_ALLReferenceL0FieldMerged.tsv"
field_edge_list_dict = defaultdict(list)
with open(input_file_reference_with_l0, 'r') as csvfile:
    reader = csv.reader(csvfile, delimiter='\t')
    next(reader)  # skip the header row
    #for row in tqdm(reader, total=get_filelength(input_file_reference_with_l0)): # for a new file
    for row in tqdm(reader, total=956198971): # Hard coded to improve the speed
        # The two field columns are compared as strings, exactly as read.
        if(row[1] == row[3]):
            field_edge_list_dict[int(row[1])].append((int(row[0]), int(row[2])))
            

In [None]:
# List the fields by number of collected edges, smallest first: name, id, edge count.
for fos in sorted(field_edge_list_dict, key=lambda k: len(field_edge_list_dict[k]), reverse=False):
    print(fos_l0_namedict[fos], fos, len(field_edge_list_dict[fos]))


In [None]:
# Restrict the experiments to these six field ids (replaces the earlier TargetFOSList).
TargetFOSList = [144024400, 41008148, 162324750, 192562407, 33923547, 121332964]


In [None]:
no_process = 10

# (typename, r_layers) of the BC/CC networks to evaluate, in execution order.
# r_layers is indexed by node-type digit: [clustering, citing (CC), cited (BC)].
bc_cc_configs = [
    ("NS-BC-CC-Hybrid", [0, 1/2, 1/2]),
    ("NS-CC", [0, 1, 0]),
    ("NS-BC", [0, 0, 1]),
]

for typename, r_layers in bc_cc_configs:
    for TargetFOS in tqdm(TargetFOSList):
        FOSNAME = fos_l0_namedict[TargetFOS]
        # Plain assignment only aliases the list; the edge list is not copied.
        lst_citation_subset = field_edge_list_dict[TargetFOS]
        nodesplitBCCC = normNodesplitNetwork_BC_CC(lst_citation_subset, r_layers)
        G_NS_HB_BC_CC = ig.Graph.TupleList(nodesplitBCCC, directed=False, weights=True)
        # Keep only the largest connected component.
        G_NS_HB_BC_CC = G_NS_HB_BC_CC.components().giant()
        # Drop the intermediate edge list before the worker processes are started.
        del(lst_citation_subset)
        del(nodesplitBCCC)
        gc.collect()
        # Count the clustering nodes (name ending in 0) that are left in the graph.
        for node in G_NS_HB_BC_CC.vs:
            if(node["name"] % 10 == 0):
                count_clustering_nodes[(TargetFOS, typename)] += 1
        func = partial(get_cluster_score, G_NS_HB_BC_CC, typename, FOSNAME, r_layers)
        # The context manager shuts the workers down even if a task raises.
        with Pool(processes=no_process) as pool:
            result_list += list(tqdm(pool.imap(func, reslist), total=len(reslist), desc = FOSNAME))

In [None]:
no_process = 10

# Two more BC/CC hybrids with unequal shares; r_layers is indexed by
# node-type digit: [clustering, citing (CC), cited (BC)].
# NOTE(review): count_clustering_nodes is keyed by (TargetFOS, typename) only, so
# both runs here add to the same "NS-BC-CC-Hybrid" counters as the 1/2-1/2 run.
hybrid_ratio_list = [
    [0, 1/10, 9/10],
    [0, 9/10, 1/10],
]

for r_layers in hybrid_ratio_list:
    for TargetFOS in tqdm(TargetFOSList):
        typename =  "NS-BC-CC-Hybrid"
        FOSNAME = fos_l0_namedict[TargetFOS]
        # Plain assignment only aliases the list; the edge list is not copied.
        lst_citation_subset = field_edge_list_dict[TargetFOS]
        nodesplitBCCC = normNodesplitNetwork_BC_CC(lst_citation_subset, r_layers)
        G_NS_HB_BC_CC = ig.Graph.TupleList(nodesplitBCCC, directed=False, weights=True)
        # Keep only the largest connected component.
        G_NS_HB_BC_CC = G_NS_HB_BC_CC.components().giant()
        del(lst_citation_subset)
        del(nodesplitBCCC)
        gc.collect()
        # Count the clustering nodes (name ending in 0) that are left in the graph.
        for node in G_NS_HB_BC_CC.vs:
            if(node["name"] % 10 == 0):
                count_clustering_nodes[(TargetFOS, typename)] += 1
        func = partial(get_cluster_score, G_NS_HB_BC_CC, typename, FOSNAME, r_layers)
        # The context manager shuts the workers down even if a task raises.
        with Pool(processes=no_process) as pool:
            result_list += list(tqdm(pool.imap(func, reslist), total=len(reslist), desc = FOSNAME))

In [None]:
# Number of result rows collected so far.
len (result_list)

In [None]:
# One row per (network type, field, layer ratio, resolution) run, in the order
# of the list returned by get_cluster_score.
df_l2_sim_BC_CC = pd.DataFrame(result_list)
df_l2_sim_BC_CC.columns = ["NetworkType", "L0FOS", "LayerRatio", "ResParm", "Granularity", "L2NMI"]

In [None]:
# uncomment if you want to save the sim file
#df_l2_sim_BC_CC.to_csv("./MAG_L2_Similarity/20200907_BC-CC.tsv", sep = "\t", index = None)

In [None]:
# Field names present in the results, in order of first appearance.
DF_FOS_LIST = df_l2_sim_BC_CC["L0FOS"].unique()

In [None]:
# (NetworkType, LayerRatio) of every curve drawn per field, in plotting order.
curve_specs = [
    ("NS-BC-CC-Hybrid", (0, 0.5, 0.5)),
    ("NS-BC-CC-Hybrid", (0, 0.1, 0.9)),
    ("NS-BC-CC-Hybrid", (0, 0.9, 0.1)),
    ("NS-BC", (0, 0, 1)),
    ("NS-CC", (0, 1, 0)),
]
# LayerRatio holds lists; tuples are needed for isin(). Computed once for all plots.
layer_ratio_tuples = df_l2_sim_BC_CC["LayerRatio"].map(tuple)

# One log-log figure per field: L2NMI against granularity, one curve per network.
for FOS in DF_FOS_LIST:
    fig, ax = plt.subplots()
    ax.set_xscale("log")
    ax.set_yscale("log")
    for network_type, ratio in curve_specs:
        selected = df_l2_sim_BC_CC[(df_l2_sim_BC_CC["NetworkType"] == network_type)
                                   & (layer_ratio_tuples.isin([ratio]))
                                   & (df_l2_sim_BC_CC["L0FOS"] == FOS)]
        # The label makes the curves distinguishable in the legend.
        selected.set_index("Granularity")["L2NMI"].plot(ax=ax, label="{} {}".format(network_type, ratio))
    ax.set_xlabel("Granularity")
    ax.set_ylabel("L2NMI")
    ax.set_title(FOS)
    ax.legend()
    plt.show()
    plt.close(fig)

In [None]:
# How many result rows have the layer ratio (0, 0.5, 0.5) and how many do not.
df_l2_sim_BC_CC["LayerRatio"].map(tuple).isin([(0,0.5, 0.5)]).value_counts()

In [None]:
# BC + CC + DC hybrid (type 2, direct citations via pseudo nodes) for every
# field except 121332964, which is commented out of the list.
TargetFOSList = [144024400, 41008148, 162324750, 192562407, 33923547]#, 121332964]
#TargetFOSList = [121332964]
result_list2 = []
no_process = 6

for TargetFOS in tqdm(TargetFOSList):
    typename =  "NS-BC-CC-DC-Hybrid-T2"
    FOSNAME = fos_l0_namedict[TargetFOS]
    # Plain assignment only aliases the list; the edge list is not copied.
    lst_citation_subset = field_edge_list_dict[TargetFOS]
    # r_layers is indexed by node-type digit: [clustering, citing, cited, DC pseudo].
    r_layers = [0, 1/3, 1/3, 1/3]
    nodesplitBCCC = normNodesplitNetwork_BC_CC_DCT2(lst_citation_subset, r_layers)
    G_NS_HB_BC_CC = ig.Graph.TupleList(nodesplitBCCC, directed=False, weights=True)
    # Keep only the largest connected component.
    G_NS_HB_BC_CC = G_NS_HB_BC_CC.components().giant()
    del(lst_citation_subset)
    del(nodesplitBCCC)
    gc.collect()
    # Count the clustering nodes (name ending in 0) that are left in the graph.
    for node in G_NS_HB_BC_CC.vs:
        if(node["name"] % 10 == 0):
            count_clustering_nodes[(TargetFOS, typename)] += 1
    func = partial(get_cluster_score, G_NS_HB_BC_CC, typename, FOSNAME, r_layers)
    # The context manager shuts the workers down even if a task raises.
    with Pool(processes=no_process) as pool:
        result_list2 += list(tqdm(pool.imap(func, reslist), total=len(reslist), desc = FOSNAME))
    


In [None]:
# BC + CC + DC hybrid (type 1, direct clustering-to-clustering edges) for every
# field except 121332964, which is commented out of the list.
TargetFOSList = [144024400, 41008148, 162324750, 192562407, 33923547]#, 121332964]
result_list3 = []

no_process = 6

# Layer ratios to evaluate, indexed by node-type digit: [DC, citing, cited].
# NOTE(review): count_clustering_nodes is keyed by (TargetFOS, typename) only, so
# both ratios add to the same "NS-BC-CC-DC-Hybrid-T1" counters.
t1_ratio_list = [
    [1/3, 1/3, 1/3],
    [1/10, 9/20, 9/20],
]

for r_layers in t1_ratio_list:
    for TargetFOS in tqdm(TargetFOSList):
        typename =  "NS-BC-CC-DC-Hybrid-T1"
        FOSNAME = fos_l0_namedict[TargetFOS]
        # Plain assignment only aliases the list; the edge list is not copied.
        lst_citation_subset = field_edge_list_dict[TargetFOS]
        nodesplitBCCC = normNodesplitNetwork_BC_CC_DCT1(lst_citation_subset, r_layers)
        # The T1 hybrid is a directed weighted network.
        G_NS_HB_BC_CC = ig.Graph.TupleList(nodesplitBCCC, directed=True, weights=True)
        G_NS_HB_BC_CC = G_NS_HB_BC_CC.components().giant()
        del(lst_citation_subset)
        del(nodesplitBCCC)
        gc.collect()
        # Count the clustering nodes (name ending in 0) that are left in the graph.
        for node in G_NS_HB_BC_CC.vs:
            if(node["name"] % 10 == 0):
                count_clustering_nodes[(TargetFOS, typename)] += 1
        func = partial(get_cluster_score, G_NS_HB_BC_CC, typename, FOSNAME, r_layers)
        # The context manager shuts the workers down even if a task raises.
        with Pool(processes=no_process) as pool:
            result_list3 += list(tqdm(pool.imap(func, reslist), total=len(reslist), desc = FOSNAME))

In [None]:
# Tabulate the result_list3 rows and save them as a tab-separated file without the index.
df_l2_sim_BC_CC3 = pd.DataFrame(result_list3)
df_l2_sim_BC_CC3.columns = ["NetworkType", "L0FOS", "LayerRatio", "ResParm", "Granularity", "L2NMI"]
df_l2_sim_BC_CC3.to_csv("./MAG_L2_Similarity/20200923_BC_CC_DC_T1_1.tsv", sep = "\t", index = None)

In [None]:
# Run Physics Separately
# TargetFOSList = [144024400, 41008148, 162324750, 192562407, 33923547, 121332964]
TargetFOSList = [121332964]
result_list4 = []
no_process = 4

for TargetFOS in tqdm(TargetFOSList):
    typename =  "NS-BC-CC-DC-Hybrid-T2"
    FOSNAME = fos_l0_namedict[TargetFOS]
    # Plain assignment only aliases the list; the edge list is not copied.
    lst_citation_subset = field_edge_list_dict[TargetFOS]
    # r_layers is indexed by node-type digit: [clustering, citing, cited, DC pseudo].
    r_layers = [0, 1/3, 1/3, 1/3]
    nodesplitBCCC = normNodesplitNetwork_BC_CC_DCT2(lst_citation_subset, r_layers)
    G_NS_HB_BC_CC = ig.Graph.TupleList(nodesplitBCCC, directed=False, weights=True)
    # Keep only the largest connected component.
    G_NS_HB_BC_CC = G_NS_HB_BC_CC.components().giant()
    del(lst_citation_subset)
    del(nodesplitBCCC)
    gc.collect()
    # Count the clustering nodes (name ending in 0) that are left in the graph.
    for node in G_NS_HB_BC_CC.vs:
        if(node["name"] % 10 == 0):
            count_clustering_nodes[(TargetFOS, typename)] += 1
    func = partial(get_cluster_score, G_NS_HB_BC_CC, typename, FOSNAME, r_layers)
    # The context manager shuts the workers down even if a task raises.
    with Pool(processes=no_process) as pool:
        result_list4 += list(tqdm(pool.imap(func, reslist), total=len(reslist), desc = FOSNAME))

In [None]:
# Tabulate the result_list4 rows and save them as a tab-separated file without the index.
df_l2_sim_BC_CC4 = pd.DataFrame(result_list4)
df_l2_sim_BC_CC4.columns = ["NetworkType", "L0FOS", "LayerRatio", "ResParm", "Granularity", "L2NMI"]
df_l2_sim_BC_CC4.to_csv("./MAG_L2_Similarity/20200923_BC_CC_DC_T2_2.tsv", sep = "\t", index = None)

In [None]:
# BC + CC + DC hybrid (type 1) for field 121332964 only.
TargetFOSList = [121332964]
result_list5 = []
no_process = 4

# Layer ratios to evaluate, indexed by node-type digit: [DC, citing, cited].
# NOTE(review): count_clustering_nodes is keyed by (TargetFOS, typename) only, so
# both ratios add to the same "NS-BC-CC-DC-Hybrid-T1" counters.
t1_ratio_list = [
    [1/3, 1/3, 1/3],
    [1/10, 9/20, 9/20],
]

for r_layers in t1_ratio_list:
    for TargetFOS in tqdm(TargetFOSList):
        typename =  "NS-BC-CC-DC-Hybrid-T1"
        FOSNAME = fos_l0_namedict[TargetFOS]
        # Plain assignment only aliases the list; the edge list is not copied.
        lst_citation_subset = field_edge_list_dict[TargetFOS]
        nodesplitBCCC = normNodesplitNetwork_BC_CC_DCT1(lst_citation_subset, r_layers)
        # The T1 hybrid is a directed weighted network.
        G_NS_HB_BC_CC = ig.Graph.TupleList(nodesplitBCCC, directed=True, weights=True)
        G_NS_HB_BC_CC = G_NS_HB_BC_CC.components().giant()
        del(lst_citation_subset)
        del(nodesplitBCCC)
        gc.collect()
        # Count the clustering nodes (name ending in 0) that are left in the graph.
        for node in G_NS_HB_BC_CC.vs:
            if(node["name"] % 10 == 0):
                count_clustering_nodes[(TargetFOS, typename)] += 1
        func = partial(get_cluster_score, G_NS_HB_BC_CC, typename, FOSNAME, r_layers)
        # The context manager shuts the workers down even if a task raises.
        with Pool(processes=no_process) as pool:
            result_list5 += list(tqdm(pool.imap(func, reslist), total=len(reslist), desc = FOSNAME))

In [None]:
# Tabulate the result_list5 rows and save them as a tab-separated file without the index.
df_l2_sim_BC_CC5 = pd.DataFrame(result_list5)
df_l2_sim_BC_CC5.columns = ["NetworkType", "L0FOS", "LayerRatio", "ResParm", "Granularity", "L2NMI"]
df_l2_sim_BC_CC5.to_csv("./MAG_L2_Similarity/20200923_BC_CC_DC_T1_2.tsv", sep = "\t", index = None)

In [None]:
# DC-only networks for every field except 121332964, which is commented out of
# the list here and run in a separate cell.
TargetFOSList = [144024400, 41008148, 162324750, 192562407, 33923547]#, 121332964]
#TargetFOSList = [121332964]
result_list6 = []
no_process = 6

# (typename, edge-list builder, r_layers, directed?) in execution order.
# With these ratios only the direct-citation layer carries non-zero weight.
dc_only_configs = [
    # T1: direct clustering-to-clustering edges, treated as a directed weighted network.
    ("DC-T1", normNodesplitNetwork_BC_CC_DCT1, [1, 0, 0], True),
    # T2: direct citations routed through pseudo nodes, undirected.
    ("DC-T2", normNodesplitNetwork_BC_CC_DCT2, [0, 0, 0, 1], False),
]

for typename, build_weighted_edges, r_layers, is_directed in dc_only_configs:
    for TargetFOS in tqdm(TargetFOSList):
        FOSNAME = fos_l0_namedict[TargetFOS]
        # Plain assignment only aliases the list; the edge list is not copied.
        lst_citation_subset = field_edge_list_dict[TargetFOS]
        nodesplitBCCC = build_weighted_edges(lst_citation_subset, r_layers)
        G_NS_HB_BC_CC = ig.Graph.TupleList(nodesplitBCCC, directed=is_directed, weights=True)
        # Keep only the largest connected component.
        G_NS_HB_BC_CC = G_NS_HB_BC_CC.components().giant()
        del(lst_citation_subset)
        del(nodesplitBCCC)
        gc.collect()
        # Count the clustering nodes (name ending in 0) that are left in the graph.
        for node in G_NS_HB_BC_CC.vs:
            if(node["name"] % 10 == 0):
                count_clustering_nodes[(TargetFOS, typename)] += 1
        func = partial(get_cluster_score, G_NS_HB_BC_CC, typename, FOSNAME, r_layers)
        # The context manager shuts the workers down even if a task raises.
        with Pool(processes=no_process) as pool:
            result_list6 += list(tqdm(pool.imap(func, reslist), total=len(reslist), desc = FOSNAME))

In [None]:
# Tabulate the result_list6 rows and save them as a tab-separated file without the index.
df_l2_sim_BC_CC6 = pd.DataFrame(result_list6)
df_l2_sim_BC_CC6.columns = ["NetworkType", "L0FOS", "LayerRatio", "ResParm", "Granularity", "L2NMI"]
df_l2_sim_BC_CC6.to_csv("./MAG_L2_Similarity/202001001_DC_1.tsv", sep = "\t", index = None)

In [None]:
# Run Physics Separately
# TargetFOSList = [144024400, 41008148, 162324750, 192562407, 33923547, 121332964]
TargetFOSList = [121332964]
result_list7 = []
no_process = 4

# (typename, edge-list builder, r_layers, directed?) in execution order.
# With these ratios only the direct-citation layer carries non-zero weight.
dc_only_configs = [
    # T2: direct citations routed through pseudo nodes, undirected.
    ("DC-T2", normNodesplitNetwork_BC_CC_DCT2, [0, 0, 0, 1], False),
    # T1: direct clustering-to-clustering edges, treated as a directed weighted network.
    ("DC-T1", normNodesplitNetwork_BC_CC_DCT1, [1, 0, 0], True),
]

for typename, build_weighted_edges, r_layers, is_directed in dc_only_configs:
    for TargetFOS in tqdm(TargetFOSList):
        FOSNAME = fos_l0_namedict[TargetFOS]
        # Plain assignment only aliases the list; the edge list is not copied.
        lst_citation_subset = field_edge_list_dict[TargetFOS]
        nodesplitBCCC = build_weighted_edges(lst_citation_subset, r_layers)
        G_NS_HB_BC_CC = ig.Graph.TupleList(nodesplitBCCC, directed=is_directed, weights=True)
        # Keep only the largest connected component.
        G_NS_HB_BC_CC = G_NS_HB_BC_CC.components().giant()
        del(lst_citation_subset)
        del(nodesplitBCCC)
        gc.collect()
        # Count the clustering nodes (name ending in 0) that are left in the graph.
        for node in G_NS_HB_BC_CC.vs:
            if(node["name"] % 10 == 0):
                count_clustering_nodes[(TargetFOS, typename)] += 1
        func = partial(get_cluster_score, G_NS_HB_BC_CC, typename, FOSNAME, r_layers)
        # Both variants go into result_list7, the list this cell creates
        # (the DC-T1 rows were previously appended to result_list5).
        with Pool(processes=no_process) as pool:
            result_list7 += list(tqdm(pool.imap(func, reslist), total=len(reslist), desc = FOSNAME))


In [None]:
# Tabulate the result_list7 rows and save them as a tab-separated file without the index.
df_l2_sim_BC_CC7 = pd.DataFrame(result_list7)
df_l2_sim_BC_CC7.columns = ["NetworkType", "L0FOS", "LayerRatio", "ResParm", "Granularity", "L2NMI"]
df_l2_sim_BC_CC7.to_csv("./MAG_L2_Similarity/202001001_DC_2.tsv", sep = "\t", index = None)

# Why does DC-T2 perform so well? Does DC-T2 on its own also perform well?

In [None]:


# Sweep the direct-citation share of the T1 hybrid over all six fields.
# NOTE(review): the original list was missing a comma before 121332964; it is
# restored here so that field is included -- confirm this was the intent.
TargetFOSList = [144024400, 41008148, 162324750, 192562407, 33923547, 121332964]
#TargetFOSList = [121332964]
result_list8 = []
no_process = 4

# Shares given to the DC layer; the remainder is split evenly between the
# citing and cited layers.
r_dc_list = [0.9, 0.5, 0.01, 0.001, 0.0001, 0.00001]

for TargetFOS in tqdm(TargetFOSList):
    for r_dc in r_dc_list:
        typename =  "NS-BC-CC-DC-Hybrid-T1"
        FOSNAME = fos_l0_namedict[TargetFOS]
        # Plain assignment only aliases the list; the edge list is not copied.
        lst_citation_subset = field_edge_list_dict[TargetFOS]
        r_ccbc = (1-r_dc)/2
        r_layers = [r_dc, r_ccbc, r_ccbc]
        nodesplitBCCC = normNodesplitNetwork_BC_CC_DCT1(lst_citation_subset, r_layers)
        # The T1 hybrid is a directed weighted network.
        G_NS_HB_BC_CC = ig.Graph.TupleList(nodesplitBCCC, directed=True, weights=True)
        G_NS_HB_BC_CC = G_NS_HB_BC_CC.components().giant()
        del(lst_citation_subset)
        del(nodesplitBCCC)
        gc.collect()
        # NOTE(review): the counter key has no r_dc component, so every r_dc
        # value adds to the same (TargetFOS, typename) entry.
        for node in G_NS_HB_BC_CC.vs:
            if(node["name"] % 10 == 0):
                count_clustering_nodes[(TargetFOS, typename)] += 1
        func = partial(get_cluster_score, G_NS_HB_BC_CC, typename, FOSNAME, r_layers)
        # The context manager shuts the workers down even if a task raises.
        with Pool(processes=no_process) as pool:
            result_list8 += list(tqdm(pool.imap(func, reslist), total=len(reslist), desc = FOSNAME))


In [None]:
# Tabulate the result_list8 rows and save them as a tab-separated file without the index.
df_l2_sim_BC_CC8 = pd.DataFrame(result_list8)
df_l2_sim_BC_CC8.columns = ["NetworkType", "L0FOS", "LayerRatio", "ResParm", "Granularity", "L2NMI"]
# Written to its own file: "202001001_DC_2.tsv" is already used for the
# result_list7 table and would be overwritten.
# NOTE(review): the new file name is a suggestion -- rename to match the project's scheme.
df_l2_sim_BC_CC8.to_csv("./MAG_L2_Similarity/202001001_BC_CC_DC_T1_rdc_sweep.tsv", sep = "\t", index = None)