In [None]:
import pandas as pd
import json
from tqdm import tqdm
import pickle as pkl
import math

In [None]:
# filter by rules
# Streams the predication CSV chunk by chunk, keeps the rows that pass the
# rule filters loaded from constraints.json, and accumulates the subject /
# relation / object co-occurrence counts that the scoring cell consumes.
with open("constraints.json") as fh:
    constr = json.load(fh)

preds_lst = constr['included_predicates']
# NOTE(review): the variable is called "bidir" but is read from the
# 'directed_predicates' key. Rows whose predicate is in this list also get a
# reversed copy below -- confirm this key really lists the predicates to mirror.
bidir_lst = constr['directed_predicates']
semty_lst = constr['excluded_semantic_types']
ad_list = ["C0750901", "C0494463", "C0338450", "C0276496", "C0002395"]
# Offset added to a row key to build the key of its reversed copy, so both
# copies can live in the same dict. Assumes the file has fewer rows than
# this -- TODO confirm.
n_semmed = 120000000

# Hashed views of the rule lists: membership is tested once per input row,
# so avoid a linear list scan each time.
preds_set = frozenset(preds_lst)
bidir_set = frozenset(bidir_lst)
semty_set = frozenset(semty_lst)

reader = pd.read_csv('semmedVER43_2022_R_PREDICATION_try.csv.gz',compression="gzip", iterator=True, chunksize=10000, header=None,encoding="cp1252")

predicate_dict = {}      # kept records that touch none of the ad_list concepts
ad_predicate_dict = {}   # kept records whose subject or object is in ad_list
in_degree = {}  # gram_o
out_degree = {} # gram_s
triple_score = {}
G2_score = {}
gram_r = {}
gram_sr = {}
gram_so = {}
gram_ro = {}
gram_sro = {}


def _bump(counter, key):
    """Increment ``counter[key]`` by one, starting from zero for an unseen key."""
    counter[key] = counter.get(key, 0) + 1


def _count_triple(s, r, o):
    """Record one (subject, relation, object) occurrence in every n-gram counter."""
    _bump(in_degree, o)
    _bump(out_degree, s)
    _bump(gram_sr, s + "|" + r)
    _bump(gram_so, s + "|" + o)
    _bump(gram_ro, r + "|" + o)
    _bump(gram_sro, s + "|" + r + "|" + o)


for chunk in tqdm(reader):
    chunk_dict = chunk.to_dict("index")

    for k, v in chunk_dict.items():
        # Columns are addressed by position because the CSV has no header.
        # Presumably positions 7 and 11 are the subject/object novelty flags
        # of the SemMedDB PREDICATION layout -- confirm against the dump.
        if not (v[7] == 1 and v[11] == 1):
            continue

        pred = v[3]
        if pred not in preds_set:
            continue

        # gram_r counts every row with an included predicate, including rows
        # that the exclusion filter below rejects.
        _bump(gram_r, pred)

        # Only the first "|"-separated token of each side is used.
        sub = v[4].split("|")[0]
        obj = v[8].split("|")[0]
        touches_ad = (sub in ad_list) or (obj in ad_list)
        target = ad_predicate_dict if touches_ad else predicate_dict

        # NOTE(review): sub/obj are compared against 'excluded_semantic_types';
        # if columns 4/8 hold concept identifiers rather than semantic types
        # (columns 6/10 look like the type fields), this filter never matches
        # -- confirm.
        if (sub not in semty_set) and (obj not in semty_set):
            target[k] = {0: v[0], 1: v[1], 2: v[2], 3: pred,
                         4: sub, 5: v[5], 6: v[6],
                         8: obj, 9: v[9], 10: v[10]}
            _count_triple(sub, pred, obj)

        # NOTE(review): the mirrored copy is produced even when the exclusion
        # filter above rejected the row (original behaviour, kept) -- confirm
        # that this is intended.
        if pred in bidir_set:
            # Mirror of the forward record: the subject fields (4, 5, 6) and
            # the object fields (8, 9, 10) swap places. Slot 10 therefore
            # takes v[6]; it previously repeated v[5], duplicating slot 9.
            target[k + n_semmed] = {0: v[0], 1: v[1], 2: v[2], 3: pred,
                                    4: obj, 5: v[9], 6: v[10],
                                    8: sub, 9: v[5], 10: v[6]}
            _count_triple(obj, pred, sub)

In [None]:
# calculate score
# For every non-AD record: compute a G2 (log-likelihood style) association
# value for its subject|relation|object key, then combine the min-max
# normalised subject out-degree, object in-degree and G2 into v[11].

sum_tri = sum(gram_sro.values())


def _g2_term(observed, total, marg_a, marg_b, marg_c):
    """Return ``observed * log(observed * total**2 / (marg_a * marg_b * marg_c))``.

    Yields 0 when the term is undefined: a zero or negative ratio makes
    ``math.log`` raise ValueError, and a zero denominator raises
    ZeroDivisionError (an ArithmeticError). Only those are swallowed, so
    interrupts and genuine programming errors still surface.
    """
    try:
        return observed * math.log(observed * total * total / (marg_a * marg_b * marg_c))
    except (ValueError, ArithmeticError):
        return 0


def _min_max(value, low, high):
    """Min-max rescale ``value`` using ``low``/``high``; 0.0 when the range is empty."""
    span = high - low
    return (value - low) / span if span else 0.0


for k, v in tqdm(predicate_dict.items()):
    s = v[4]
    r = v[3]
    o = v[8]
    tri = s + "|" + r + "|" + o
    sr = s + "|" + r
    so = s + "|" + o
    ro = r + "|" + o

    # Contingency Table for Trigrams
    # Index order is (subject, relation, object); 1 = matches this triple's
    # item, 2 = does not.
    n111 = gram_sro[tri]
    n112 = gram_sr[sr] - gram_sro[tri]
    n121 = gram_so[so] - gram_sro[tri]
    n122 = out_degree[s] - n111 - n112 - n121
    n211 = gram_ro[ro] - gram_sro[tri]
    n212 = gram_r[r] - n111 - n112 - n211
    n221 = in_degree[o] - n111 - n211 - n121
    n222 = sum_tri - n111 - n112 - n121 - n122 - n211 - n212 - n221

    # NOTE(review): each denominator multiplies the three two-way marginals of
    # its cell (e.g. n111 + n211 equals gram_ro[ro]) -- confirm this matches
    # the intended trigram log-likelihood definition.
    i111 = _g2_term(n111, sum_tri, n211 + n111, n121 + n111, n112 + n111)
    i112 = _g2_term(n112, sum_tri, n212 + n112, n122 + n112, n111 + n112)
    i121 = _g2_term(n121, sum_tri, n221 + n121, n111 + n121, n122 + n121)
    i122 = _g2_term(n122, sum_tri, n222 + n122, n112 + n122, n121 + n122)
    i211 = _g2_term(n211, sum_tri, n111 + n211, n221 + n211, n212 + n211)
    i212 = _g2_term(n212, sum_tri, n112 + n212, n222 + n212, n211 + n212)
    i221 = _g2_term(n221, sum_tri, n121 + n221, n211 + n221, n222 + n221)
    i222 = _g2_term(n222, sum_tri, n122 + n222, n212 + n222, n221 + n222)

    G2 = 2 * (i111 + i112 + i121 + i122 + i211 + i212 + i221 + i222)

    G2_score[tri] = G2

# default=0 keeps the cell runnable when a counter is empty; in that case the
# scoring loop below has nothing to iterate over anyway.
max_in = max(in_degree.values(), default=0)
min_in = min(in_degree.values(), default=0)
max_out = max(out_degree.values(), default=0)
min_out = min(out_degree.values(), default=0)
max_G2 = max(G2_score.values(), default=0)
min_G2 = min(G2_score.values(), default=0)

for k, v in tqdm(predicate_dict.items()):
    # Read subject/object from the CURRENT record. The degree terms were
    # previously looked up with the `s`/`o` left over from the last iteration
    # of the loop above, so every triple received identical degree components.
    s = v[4]
    o = v[8]
    tri = s + "|" + v[3] + "|" + o
    G2 = G2_score[tri]
    v[11] = _min_max(out_degree[s], min_out, max_out) + _min_max(in_degree[o], min_in, max_in) + _min_max(G2, min_G2, max_G2)
    triple_score[tri] = v[11]

# Records involving an ad_list concept are not scored; they get a fixed 0.
for k, v in tqdm(ad_predicate_dict.items()):
    v[11] = 0

In [None]:
# filter by score
# Keeps the records of predicate_dict whose score v[11] does not exceed the
# score of the max_triple-th lowest-scoring triple.
max_triple = 1000000

# Ascending sort, so the slice holds the max_triple LOWEST-scoring triples.
# NOTE(review): confirm that low scores are the ones meant to be retained.
triple_score_filter = sorted(triple_score.items(), key=lambda x: x[1])[:max_triple]

# The slice is empty when nothing was scored (or max_triple is 0); indexing
# [-1] would then raise IndexError, so fall back to a cut-off nothing passes.
threshold = triple_score_filter[-1][1] if triple_score_filter else float("-inf")

# Compared with <=, so records tied with the cut-off score are all kept and
# the result can hold more than max_triple entries.
# NOTE(review): only predicate_dict is scanned; the records in
# ad_predicate_dict (scored 0 in the previous cell) never reach
# predicate_dict_filter -- confirm whether they should be merged in here.
predicate_dict_filter = {}
for k, v in tqdm(predicate_dict.items()):
    if v[11] <= threshold:
        predicate_dict_filter[k] = v

In [None]:
# save
# Serialise the score-filtered records to disk using the highest pickle
# protocol this interpreter supports.
with open("filtered_triples.pkl", mode='wb') as out_file:
    pkl.dump(predicate_dict_filter, out_file, protocol=pkl.HIGHEST_PROTOCOL)