In [None]:
import pandas as pd
from matplotlib import pyplot as plt
import glob
from collections import defaultdict
from collections import Counter
import csv
from clusim.clustering import Clustering, print_clustering
import clusim.sim as sim
import math
from multiprocessing import Pool
from tqdm.notebook import tqdm
import itertools

In [None]:
def get_nmi_subset(target_dict, ref_dict):
    """Return the NMI between two labelings, restricted to their shared keys.

    Parameters
    ----------
    target_dict : dict
        Maps an element key to a single cluster label.
    ref_dict : dict
        Maps an element key to a single reference label.

    Returns
    -------
    The value returned by ``clusim.sim.nmi`` for the two clusterings built
    from the keys present in both dictionaries.

    Notes
    -----
    Shared keys are re-indexed to 0..n-1 and every label is wrapped in a
    one-element list before being handed to ``Clustering(elm2clu_dict=...)``.
    NOTE(review): behaviour when no key is shared depends on how clusim
    handles an empty clustering; callers should check for that case.
    """
    # Test membership against ref_dict directly instead of materialising a
    # set of every reference key on each call; the shared key set is the same.
    shared_keys = [key for key in target_dict if key in ref_dict]
    dict1 = {idx: [target_dict[key]] for idx, key in enumerate(shared_keys)}
    dict2 = {idx: [ref_dict[key]] for idx, key in enumerate(shared_keys)}
    c1 = Clustering(elm2clu_dict=dict1)
    c2 = Clustering(elm2clu_dict=dict2)
    return sim.nmi(c1, c2)

In [None]:
# Read columns 0, 3, 5 and 6 of the tab-separated FieldsOfStudy file
# (no header row, quote characters are not treated specially).
FOS_datafile = "./MAG_dataset/advanced/FieldsOfStudy.txt"
df_fosinfo = pd.read_csv(FOS_datafile, sep="\t", header=None, usecols=[0, 3, 5, 6], quoting=csv.QUOTE_NONE)
df_fosinfo.columns = ["FieldofStudyId", "FosName", "FosLevel", "PaperCount"]
# Lookup from field name to FieldofStudyId, restricted to rows with FosLevel == 0.
fos_l0_namedict = df_fosinfo[df_fosinfo["FosLevel"] == 0].set_index("FosName")["FieldofStudyId"].to_dict()

In [None]:
# Build paperfielddict: int(column 0) -> int(column 3) for every row of the
# tab-separated file after the first row, which is skipped.
# NOTE(review): presumably column 0 is a paper id and column 3 its level-2
# field id -- confirm against the file that produces this input.
input_file_mag_l2_field = "./fastnvme01//MAG_PaperFieldL2_MAX.txt"
paperfielddict = defaultdict(int)
# newline='' is what the csv module asks for on files handed to csv.reader.
with open(input_file_mag_l2_field, 'r', newline='') as csvfile:
    reader = csv.reader(csvfile, delimiter='\t')
    next(reader)  # skip the first row
    for row in reader:
        paperfielddict[int(row[0])] = int(row[3])
print(len(paperfielddict))

In [None]:
# Tab-separated table; later cells read its PaperId, L0Field and
# InFieldCitationCounts columns.
df_infield = pd.read_csv("./Processed_data/MAG_L0CitationCounts.tsv", sep="\t")

In [None]:
# IF_PCT: percentile rank of InFieldCitationCounts, computed separately
# within each L0Field group.
df_infield["IF_PCT"] = df_infield.groupby("L0Field")["InFieldCitationCounts"].rank(pct=True)

In [None]:
# Tab-separated table; later cells read its PaperId and CitationCounts columns.
df_all = pd.read_csv("./Processed_data/MAG_CitationCounts.tsv", sep="\t")

In [None]:
# ALL_PCT: percentile rank of CitationCounts over the whole table (no grouping).
df_all["ALL_PCT"] = df_all["CitationCounts"].rank(pct=True)

In [None]:
# Attach L0Field to df_all with an outer merge. No `on=` is given, so the join
# uses whatever columns the two frames share -- presumably just PaperId; verify.
# NOTE(review): this rebinds df_all, so re-running the cell merges the
# already-merged frame again.
df_all = df_infield[["PaperId", "L0Field"]].merge(df_all, how="outer")

In [None]:
def calc_granularity_cludict(cluster_dict):
    """Return the granularity of a clustering given as element -> cluster label.

    Granularity is computed as ``N / sum(size_c ** 2)``, where ``N`` is the
    number of elements and ``size_c`` is the number of elements carrying
    cluster label ``c``.

    Parameters
    ----------
    cluster_dict : dict
        Maps each element to its (hashable) cluster label.

    Returns
    -------
    float

    Raises
    ------
    ZeroDivisionError
        If ``cluster_dict`` is empty.
    """
    cluster_sizes = Counter(cluster_dict.values())
    gran_denom = sum(size * size for size in cluster_sizes.values())
    return len(cluster_dict) / gran_denom

def get_params_from_filename(fname):
    """Parse run parameters out of a cluster-output file name.

    Only the last ``/``-separated component of ``fname`` is parsed, so the
    function works for any directory depth (or none). The component is
    split on ``_``:

    * the first token is the network type;
    * the last token, minus its final ``.ext`` suffix, is the resolution;
    * the field name is the second token, or the second and third tokens
      joined by a space when the third token does not start with ``"r"``;
    * the remaining tokens between the token starting with ``"r"`` and the
      resolution form the ``r_layer`` tuple of strings.

    Parameters
    ----------
    fname : str
        Path or bare file name, e.g.
        ``"./fastnvme01/Cluster_out/cit_Computer_science_r_1_2_0.5.clu"``.

    Returns
    -------
    list
        ``[net_type, fieldname, r_layer, res]``.

    Raises
    ------
    IndexError
        If the name has too few ``_``-separated tokens.
    """
    subname = fname.rsplit("/", 1)[-1]
    net_type = subname.split("_", 1)[0]
    res = subname.rsplit("_", 1)[1].rsplit(".", 1)[0]
    temp = subname.split("_", 3)
    if temp[2][0] == "r":
        # One-word field name: temp[3] already starts after the "r" token.
        fieldname = temp[1]
        layer_part = temp[3]
    else:
        # Two-word field name: temp[3] still starts with the "r" token; drop it.
        fieldname = temp[1] + " " + temp[2]
        layer_part = temp[3].split("_", 1)[1]
    # Strip the trailing "_<res>.<ext>" and keep the remaining tokens.
    r_layer = tuple(layer_part.rsplit("_", 1)[0].split("_"))
    return [net_type, fieldname, r_layer, res]

In [None]:
def nmi_calculate_quantiles(fname):
    """Compute NMI and granularity per in-field citation quartile for one file.

    Reads the clustering in ``fname`` (tab-separated ``PaperId``,
    ``ClusterNum``; no header), looks up the level-0 field id for the field
    name encoded in the file name, and for each quartile of ``IF_PCT``
    compares the clustering restricted to that quartile's papers against
    ``paperfielddict``.

    Relies on the notebook globals ``df_infield``, ``fos_l0_namedict`` and
    ``paperfielddict`` and on ``get_params_from_filename``,
    ``get_nmi_subset`` and ``calc_granularity_cludict``.

    Parameters
    ----------
    fname : str
        Path of a cluster-output file.

    Returns
    -------
    list of list
        One row per non-empty quartile:
        ``[net_type, fieldname, r_layer, res, lb, ub, nmi, granularity,
        nmi / granularity]``.
    """
    param_list = get_params_from_filename(fname)
    clusters = pd.read_csv(fname, sep="\t", header=None, names=["PaperId", "ClusterNum"])
    bound = [0, 0.25, 0.5, 0.75, 1.0]
    L0FOSID = fos_l0_namedict[param_list[1]]

    # The field filter does not depend on the bin, so do it once.
    field_papers = df_infield.loc[df_infield["L0Field"] == L0FOSID, ["PaperId", "IF_PCT"]]
    pct = field_papers["IF_PCT"]

    now_quantile_list = []
    last_bin = len(bound) - 2
    for i, (lb, ub) in enumerate(zip(bound, bound[1:])):
        # Bins are half-open [lb, ub) except the last, which is closed:
        # a percentile rank can be exactly 1.0 and a strict "< 1.0" would
        # silently drop that paper from the top quartile.
        below_ub = (pct <= ub) if i == last_bin else (pct < ub)
        paper_ids = field_papers.loc[(pct >= lb) & below_ub, "PaperId"]
        subclusters = clusters[clusters["PaperId"].isin(paper_ids)]
        if len(subclusters) == 0:
            continue
        clusterdict = subclusters.set_index("PaperId")["ClusterNum"].to_dict()
        nmi_val = get_nmi_subset(clusterdict, paperfielddict)
        subgran = calc_granularity_cludict(clusterdict)
        now_quantile_list.append(param_list + [lb, ub, nmi_val, subgran, nmi_val/subgran])
    return now_quantile_list

In [None]:
# Run nmi_calculate_quantiles over every cluster file in parallel and collect
# the rows into quartile_result (first entry is the header row).
no_process = 32
filenames = sorted(glob.glob("./fastnvme01/Cluster_out/*.clu"))
quartile_result = []
quartile_result.append(["net_type", "field_name", "r_layer", "res", "pct_lb", "pct_ub", "nmi", "granularity", "nmi_per_gran"])

# The context manager shuts the pool down even if a worker raises; all results
# are consumed inside the block, before the pool is torn down.
with Pool(processes=no_process) as pool:
    sub_quartile_result = list(tqdm(pool.imap(nmi_calculate_quantiles, filenames), total=len(filenames), desc = "QUANTILE"))
# Each worker returns a list of rows; flatten them into one list.
quartile_result += list(itertools.chain(*sub_quartile_result))

In [None]:
# First entry of quartile_result is the header row; the rest are data rows.
quantile_df = pd.DataFrame(quartile_result[1:], columns=quartile_result[0])

In [None]:
# Write the in-field quartile results as a tab-separated file without the index.
# NOTE(review): "INFILED" in the file name looks like a typo for "INFIELD";
# left unchanged because other code may read this exact path.
quantile_df.to_csv("./MAG_L2_Similarity/20201028_NMI_BY_CITATION_INFILED_QUANTILE.tsv", index=None, sep="\t")

In [None]:
# Show the in-field quartile results as the cell output.
quantile_df

In [None]:
def nmi_calculate_quantiles_all(fname):
    """Compute NMI and granularity per overall-citation quartile for one file.

    Reads the clustering in ``fname`` (tab-separated ``PaperId``,
    ``ClusterNum``; no header), looks up the level-0 field id for the field
    name encoded in the file name, and for each quartile of ``ALL_PCT``
    compares the clustering restricted to that field's papers in the
    quartile against ``paperfielddict``.

    Relies on the notebook globals ``df_all``, ``fos_l0_namedict`` and
    ``paperfielddict`` and on ``get_params_from_filename``,
    ``get_nmi_subset`` and ``calc_granularity_cludict``.

    Parameters
    ----------
    fname : str
        Path of a cluster-output file.

    Returns
    -------
    list of list
        One row per non-empty quartile:
        ``[net_type, fieldname, r_layer, res, lb, ub, nmi, granularity,
        nmi / granularity]``.
    """
    param_list = get_params_from_filename(fname)
    clusters = pd.read_csv(fname, sep="\t", header=None, names=["PaperId", "ClusterNum"])
    bound = [0, 0.25, 0.5, 0.75, 1.0]
    L0FOSID = fos_l0_namedict[param_list[1]]

    # The field filter does not depend on the bin, so do it once.
    field_papers = df_all.loc[df_all["L0Field"] == L0FOSID, ["PaperId", "ALL_PCT"]]
    pct = field_papers["ALL_PCT"]

    now_quantile_list = []
    last_bin = len(bound) - 2
    for i, (lb, ub) in enumerate(zip(bound, bound[1:])):
        # Bins are half-open [lb, ub) except the last, which is closed:
        # a percentile rank can be exactly 1.0 and a strict "< 1.0" would
        # silently drop that paper from the top quartile.
        below_ub = (pct <= ub) if i == last_bin else (pct < ub)
        paper_ids = field_papers.loc[(pct >= lb) & below_ub, "PaperId"]
        subclusters = clusters[clusters["PaperId"].isin(paper_ids)]
        if len(subclusters) == 0:
            continue
        clusterdict = subclusters.set_index("PaperId")["ClusterNum"].to_dict()
        nmi_val = get_nmi_subset(clusterdict, paperfielddict)
        subgran = calc_granularity_cludict(clusterdict)
        now_quantile_list.append(param_list + [lb, ub, nmi_val, subgran, nmi_val/subgran])
    return now_quantile_list

In [None]:
# Run nmi_calculate_quantiles_all over every cluster file in parallel and
# collect the rows into all_quartile_result (first entry is the header row).
no_process = 32
filenames = sorted(glob.glob("./fastnvme01/Cluster_out/*.clu"))
all_quartile_result = []
all_quartile_result.append(["net_type", "field_name", "r_layer", "res", "pct_lb", "pct_ub", "nmi", "granularity", "nmi_per_gran"])

# The context manager shuts the pool down even if a worker raises; all results
# are consumed inside the block, before the pool is torn down.
with Pool(processes=no_process) as pool:
    sub_all_quartile_result = list(tqdm(pool.imap(nmi_calculate_quantiles_all, filenames), total=len(filenames), desc = "QUANTILE"))
# Each worker returns a list of rows; flatten them into one list.
all_quartile_result += list(itertools.chain(*sub_all_quartile_result))

In [None]:
# First entry of all_quartile_result is the header row; the rest are data rows.
all_quantile_df = pd.DataFrame(all_quartile_result[1:], columns=all_quartile_result[0])
# Write the table as a tab-separated file without the index.
all_quantile_df.to_csv("./MAG_L2_Similarity/20201028_NMI_BY_CITATION_ALL_QUANTILE.tsv", index=None, sep="\t")

In [None]:
def nmi_calculate_citation_count(fname):
    """Compute NMI and granularity per in-field citation-count bin for one file.

    Bins are ``[multip**i, multip**(i+1))`` for successive ``i`` starting at
    0, where the largest bound is the highest power of ``multip`` that does
    not exceed the field's maximum ``InFieldCitationCounts``.

    Relies on the notebook globals ``multip``, ``df_infield``,
    ``fos_l0_namedict`` and ``paperfielddict`` and on
    ``get_params_from_filename``, ``get_nmi_subset`` and
    ``calc_granularity_cludict``. ``multip`` must be defined before this
    function is called.

    NOTE(review): with these bounds, papers whose count is 0, or is at or
    above the largest bound (this includes the field maximum), fall in no
    bin. That matches the original binning; confirm it is intended.

    Parameters
    ----------
    fname : str
        Path of a cluster-output file (tab-separated ``PaperId``,
        ``ClusterNum``; no header).

    Returns
    -------
    list of list
        One row per non-empty bin:
        ``[net_type, fieldname, r_layer, res, lb, ub, nmi, granularity,
        nmi / granularity]``. Empty when the field has no count >= multip.

    Raises
    ------
    ValueError
        If ``multip`` is not greater than 1.
    """
    if multip <= 1:
        raise ValueError("multip must be greater than 1")
    param_list = get_params_from_filename(fname)
    clusters = pd.read_csv(fname, sep="\t", header=None, names=["PaperId", "ClusterNum"])
    L0FOSID = fos_l0_namedict[param_list[1]]

    # The field filter does not depend on the bin, so do it once.
    field_papers = df_infield.loc[df_infield["L0Field"] == L0FOSID, ["PaperId", "InFieldCitationCounts"]]
    counts = field_papers["InFieldCitationCounts"]
    max_count = counts.max()

    # Build the powers by multiplication rather than int(math.log(max, multip)):
    # the float logarithm is slightly low at exact powers (math.log(243, 3) is
    # 4.999...), which loses a bin, and it raises for a maximum of 0 or NaN.
    bound = [1]
    while bound[-1] * multip <= max_count:
        bound.append(bound[-1] * multip)

    now_count_list = []
    for lb, ub in zip(bound, bound[1:]):
        paper_ids = field_papers.loc[(counts >= lb) & (counts < ub), "PaperId"]
        subclusters = clusters[clusters["PaperId"].isin(paper_ids)]
        if len(subclusters) == 0:
            continue
        clusterdict = subclusters.set_index("PaperId")["ClusterNum"].to_dict()
        nmi_val = get_nmi_subset(clusterdict, paperfielddict)
        subgran = calc_granularity_cludict(clusterdict)
        now_count_list.append(param_list + [lb, ub, nmi_val, subgran, nmi_val/subgran])
    return now_count_list

In [None]:
# Run nmi_calculate_citation_count over every cluster file in parallel and
# collect the rows into count_result (first entry is the header row).
no_process = 32
filenames = sorted(glob.glob("./fastnvme01/Cluster_out/*.clu"))
count_result = []
count_result.append(["net_type", "field_name", "r_layer", "res", "citation_infield_lb", "citation_infield_ub", "nmi", "granularity", "nmi_per_gran"])
# Base of the citation-count bins; nmi_calculate_citation_count reads this
# global, so it has to be set before the pool is created.
multip = 3
# The context manager shuts the pool down even if a worker raises; all results
# are consumed inside the block, before the pool is torn down.
with Pool(processes=no_process) as pool:
    sub_count_result = list(tqdm(pool.imap(nmi_calculate_citation_count, filenames), total=len(filenames), desc = "QUANTILE"))
# Extend count_result (the original extended the undefined name `count_list`,
# which raised NameError and left count_result with only its header row).
count_result += list(itertools.chain(*sub_count_result))

In [None]:
# First entry of count_result is the header row; the rest are data rows.
count_df = pd.DataFrame(count_result[1:], columns=count_result[0])
# Write the table as a tab-separated file without the index.
count_df.to_csv("./MAG_L2_Similarity/20201028_NMI_BY_CITATION_INFIELD.tsv", index=None, sep="\t")

In [None]:
def nmi_calculate_citation_count_all(fname, multip=3):
    """Compute NMI and granularity per overall citation-count bin for one file.

    Bins are ``[multip**i, multip**(i+1))`` for successive ``i`` starting at
    0, where the largest bound is the highest power of ``multip`` that does
    not exceed the field's maximum ``CitationCounts``.

    Relies on the notebook globals ``df_all``, ``fos_l0_namedict`` and
    ``paperfielddict`` and on ``get_params_from_filename``,
    ``get_nmi_subset`` and ``calc_granularity_cludict``.

    NOTE(review): with these bounds, papers whose count is 0, or is at or
    above the largest bound (this includes the field maximum), fall in no
    bin. That matches the original binning; confirm it is intended.

    Parameters
    ----------
    fname : str
        Path of a cluster-output file (tab-separated ``PaperId``,
        ``ClusterNum``; no header).
    multip : int, optional
        Base of the bin boundaries. Defaults to 3, the value the original
        body assigned locally (after its first use, which made every call
        raise UnboundLocalError).

    Returns
    -------
    list of list
        One row per non-empty bin:
        ``[net_type, fieldname, r_layer, res, lb, ub, nmi, granularity,
        nmi / granularity]``. Empty when the field has no count >= multip.

    Raises
    ------
    ValueError
        If ``multip`` is not greater than 1.
    """
    if multip <= 1:
        raise ValueError("multip must be greater than 1")
    param_list = get_params_from_filename(fname)
    L0FOSID = fos_l0_namedict[param_list[1]]
    clusters = pd.read_csv(fname, sep="\t", header=None, names=["PaperId", "ClusterNum"])

    # The field filter does not depend on the bin, so do it once.
    field_papers = df_all.loc[df_all["L0Field"] == L0FOSID, ["PaperId", "CitationCounts"]]
    counts = field_papers["CitationCounts"]
    max_count = counts.max()

    # Build the powers by multiplication rather than int(math.log(max, multip)):
    # the float logarithm is slightly low at exact powers (math.log(243, 3) is
    # 4.999...), which loses a bin, and it raises for a maximum of 0 or NaN.
    bound = [1]
    while bound[-1] * multip <= max_count:
        bound.append(bound[-1] * multip)

    now_count_list = []
    for lb, ub in zip(bound, bound[1:]):
        paper_ids = field_papers.loc[(counts >= lb) & (counts < ub), "PaperId"]
        subclusters = clusters[clusters["PaperId"].isin(paper_ids)]
        if len(subclusters) == 0:
            continue
        clusterdict = subclusters.set_index("PaperId")["ClusterNum"].to_dict()
        nmi_val = get_nmi_subset(clusterdict, paperfielddict)
        subgran = calc_granularity_cludict(clusterdict)
        now_count_list.append(param_list + [lb, ub, nmi_val, subgran, nmi_val/subgran])
    return now_count_list

In [None]:
# Run nmi_calculate_citation_count_all over every cluster file in parallel and
# collect the rows into all_count_result (first entry is the header row).
no_process = 32
filenames = sorted(glob.glob("./fastnvme01/Cluster_out/*.clu"))
all_count_result = []
# NOTE(review): the bound columns keep the "citation_infield_*" names although
# these bins are over CitationCounts; renaming would change the output schema.
all_count_result.append(["net_type", "field_name", "r_layer", "res", "citation_infield_lb", "citation_infield_ub", "nmi", "granularity", "nmi_per_gran"])

# Map the all-citation count function (the original mapped
# nmi_calculate_quantiles here, so these results were in-field quartiles).
# The context manager shuts the pool down even if a worker raises; all results
# are consumed inside the block, before the pool is torn down.
with Pool(processes=no_process) as pool:
    sub_all_count_result = list(tqdm(pool.imap(nmi_calculate_citation_count_all, filenames), total=len(filenames), desc = "QUANTILE"))
# Each worker returns a list of rows; flatten them into one list.
all_count_result += list(itertools.chain(*sub_all_count_result))

In [None]:
# First entry of all_count_result is the header row; the rest are data rows.
all_count_df = pd.DataFrame(all_count_result[1:], columns=all_count_result[0])

In [None]:
# Write the all-citation results; the original wrote count_df (the in-field
# table) to this path, so the "ALL" file held the wrong data.
all_count_df.to_csv("./MAG_L2_Similarity/20201028_NMI_BY_CITATION_ALL.tsv", index=None, sep="\t")