In [None]:
import pandas as pd
import csv

In [None]:
def calc_granularity_clusters(cluster_list):
    """Return the granularity of a clustering, N / sum(n_c ** 2).

    N is the total number of nodes and n_c the size of cluster c, so the
    value is 1.0 when every cluster is a singleton and 1 / N when all nodes
    share one cluster.

    Parameters
    ----------
    cluster_list : sequence of sized collections
        ``cluster_list[cluster_idx]`` holds the node indexes of that cluster;
        only the length of each entry is used.

    Returns
    -------
    float
        The granularity, or NaN when there are no nodes at all (empty list or
        only empty clusters), where the ratio is undefined.
    """
    sizes = [len(members) for members in cluster_list]
    gran_denom = sum(size ** 2 for size in sizes)
    if gran_denom == 0:
        # No nodes: avoid ZeroDivisionError and report "undefined" instead.
        return float("nan")
    return sum(sizes) / gran_denom

In [None]:
# Tab-separated reference table; later cells read its CitingId, CitedId,
# FOSCiting and FOSCited columns.
input_file_reference_with_l0 = "./fastnvme01//MAG_ALLReferenceL0FieldMerged.tsv"
df_neighbor = pd.read_csv(input_file_reference_with_l0, sep="\t")

In [None]:
# Fields-of-study table: tab-separated, no header row, and quote characters
# are kept as literal text (QUOTE_NONE). Only columns 0, 3, 5 and 6 are loaded.
FOS_datafile = "./MAG_dataset/advanced/FieldsOfStudy.txt"
df_fosinfo = pd.read_csv(
    FOS_datafile,
    sep="\t",
    header=None,
    usecols=[0, 3, 5, 6],
    quoting=csv.QUOTE_NONE,
)
df_fosinfo.columns = ["FieldofStudyId", "FosName", "FosLevel", "PaperCount"]

# FieldofStudyId -> FosName, restricted to rows whose FosLevel is 0.
is_level0 = df_fosinfo["FosLevel"] == 0
fos_l0_namedict = df_fosinfo.loc[is_level0].set_index("FieldofStudyId")["FosName"].to_dict()

In [None]:
# Show the FieldofStudyId -> FosName mapping of the level-0 fields.
fos_l0_namedict

In [None]:
# Show the loaded reference table (the notebook renders a truncated preview).
df_neighbor

In [None]:
# Field-of-study IDs to analyse. Each one is looked up in fos_l0_namedict and
# matched against the FOSCiting / FOSCited columns of df_neighbor; the six
# entries fill the 3 x 2 panel grid of the figure cell.
TargetFOSList = [144024400, 41008148, 162324750, 192562407, 33923547, 121332964]

In [None]:
from tqdm.notebook import tqdm

# Resolution parameters swept for every field and variant: 0.2, 0.4, ..., 20.0.
reslist = [0.2*x for x in range(1, 101)]
out_columns = ["res", "granularity", "num_same", "num_notsame", "num_same_fraction"]
# (file-name header, layer weights) of the two clustering variants; both values
# are only used to build the input and output file names.
cluster_variants = [("DC-T1", [1, 0, 0]), ("DC-T2", [0, 0, 0, 1])]


def _cluster_file_name(header, fosname, r_layers, res):
    """Build the path of the .clu file for one variant, field and resolution."""
    layer_tag = "_".join([format(x, ".2f") for x in r_layers])
    return "./Cluster_out/" + header + "_" + fosname + "_r_" + layer_tag + "_" + format(res, ".2f") + ".clu"


def _coclustering_row(df_edges, infile_fname, res):
    """Compute one result row for a single cluster file.

    Parameters
    ----------
    df_edges : pandas.DataFrame
        Edges to evaluate, with ``CitingId`` and ``CitedId`` columns.
    infile_fname : str
        Tab-separated, header-less file with PaperId and ClusterId columns.
    res : float
        Resolution value, copied into the row unchanged.

    Returns
    -------
    list
        ``[res, granularity, num_same, num_notsame, num_same_fraction]``.
        The fraction is NaN when no edge has both endpoints in the file.
    """
    df_cluster = pd.read_csv(infile_fname, sep="\t", header=None, names=["PaperId", "ClusterId"])
    cluster_dict = df_cluster.set_index("PaperId")["ClusterId"].to_dict()

    citing_cluster = df_edges["CitingId"].map(cluster_dict)
    cited_cluster = df_edges["CitedId"].map(cluster_dict)
    # Only edges whose two endpoints both received a cluster are counted;
    # papers missing from the file map to NaN.
    both_known = citing_cluster.notna() & cited_cluster.notna()
    num_same = int((both_known & (citing_cluster == cited_cluster)).sum())
    num_notsame = int(both_known.sum()) - num_same
    num_total = num_same + num_notsame
    # Guard the empty case instead of failing with ZeroDivisionError.
    same_fraction = num_same / num_total if num_total else float("nan")

    cluster_list = df_cluster.groupby('ClusterId')['PaperId'].apply(list).tolist()
    granularity = calc_granularity_clusters(cluster_list)
    return [res, granularity, num_same, num_notsame, same_fraction]


for fos_id in tqdm(TargetFOSList):
    fosname = fos_l0_namedict[fos_id]
    # The within-field edge set does not depend on the cluster file, so it is
    # selected once per field rather than once per resolution.
    within_field = (df_neighbor["FOSCiting"] == fos_id) & (df_neighbor["FOSCited"] == fos_id)
    df_targetfos = df_neighbor.loc[within_field, ["CitingId", "CitedId"]]

    for header, r_layers in cluster_variants:
        rows = [
            _coclustering_row(df_targetfos, _cluster_file_name(header, fosname, r_layers, res), res)
            for res in tqdm(reslist, desc=header + " " + fosname)
        ]
        outfile_fname = "./MAG_same_cluster/" + header + "_" + fosname + ".tsv"
        pd.DataFrame(rows, columns=out_columns).to_csv(outfile_fname, sep="\t")

In [None]:
from matplotlib import pyplot as plt

def cm2inch(value):
    """Convert a length given in centimetres to inches."""
    cm_per_inch = 2.54
    return value / cm_per_inch

# 18 cm x 24 cm canvas holding a 3 x 2 grid, one panel per target field.
fig_width, fig_height = cm2inch(9)*2, cm2inch(12)*2
fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(fig_width, fig_height))
axes = axes.flatten()

for idx, fos_id in enumerate(tqdm(TargetFOSList)):
    now_ax = axes[idx]
    now_ax.set_xscale("log")
    fosname = fos_l0_namedict[fos_id]

    # One curve per clustering variant, read from the saved summary tables.
    # NOTE: df_T1 is reused for both variants, so after the loop it holds the
    # DC-T2 table of the last field.
    for header, curve_label in (("DC-T1", "DC T1"), ("DC-T2", "DC T2")):
        infile_fname = "./MAG_same_cluster/" + header + "_" + fosname + ".tsv"
        df_T1 = pd.read_csv(infile_fname, sep="\t")
        fraction_by_res = df_T1.set_index("res")["num_same_fraction"]
        fraction_by_res.plot(label=curve_label, ax=now_ax)

    now_ax.legend(frameon=False, fontsize=8)
    now_ax.set_title(fosname, size=12)
    # Set after plotting so the x label is not replaced by the index name.
    now_ax.set_ylabel("Co-clustering Fraction")
    now_ax.set_xlabel("Resolution Parameter")

# Panel tags "(a)" ... "(p)"; only as many as there are axes get used.
labellist = ["({})".format(chr(ord("a") + k)) for k in range(16)]
for index, ax in enumerate(fig.get_axes()):
    for axis_label in (ax.yaxis.label, ax.xaxis.label):
        axis_label.set_fontsize(12)
    ax.tick_params(axis='both', labelsize=12)
    # Bold tag placed just outside the top-left corner, in axes coordinates.
    ax.text(-0.2, 1.05, labellist[index], fontsize=14, weight='bold', transform=ax.transAxes)

plt.tight_layout()
for figure_path in ("./SubmissionFigures/Figure8.png", "./SubmissionFigures/Figure8.pdf"):
    plt.savefig(figure_path)
plt.show()
plt.close()

In [None]:
# Show the summary table left in df_T1 by the figure cell.
df_T1