In [1]:
import json
import re
import pandas as pd
pd.set_option('display.max_colwidth', None)

import torch

from transformers import BertTokenizer
from transformers import BertForSequenceClassification
from tqdm import tqdm

# Alternative checkpoint: the model that reached MCC 0.55. This model uses cross-entropy
# loss. Its hyperparameters are saved in the folder below (uncomment to use it instead).
# output_dir = './model_save_24_10_google_collab_55'

# Active checkpoint: the model that reached MCC 0.57. This model uses focal loss. Its
# hyperparameters are saved in the folder below.
output_dir = './google_collab_focal_loss'

# Load the tokenizer that was saved alongside the fine-tuned model.
tokenizer = BertTokenizer.from_pretrained(output_dir)

# Load the fine-tuned sequence-classification model from the same folder.
# (Per the transformers documentation, from_pretrained returns the model in evaluation mode.)
model = BertForSequenceClassification.from_pretrained(output_dir)

In [3]:
# Load the raw search results and regroup the candidate triples by input sentence.
#
# Layout of each entry, as implied by the indexing below (confirm against the producer
# of the file):
#   value[0][0]                   -> the sentence, prefixed with the literal "$input_txt:$ "
#   value[0][1]["deduplicated:"]  -> mapping: candidate triple string -> sequence whose
#                                    element [1] is stored next to the triple
input_file_path = 'search_res_pretty.json'

# JSON text is UTF-8 by specification; decode it explicitly instead of relying on the
# platform's default encoding.
with open(input_file_path, 'r', encoding='utf-8') as file:
    data_dict = json.load(file)

# Prefix that is stripped from every sentence.
pattern_1 = r'\$input_txt:\$ '

# sentence -> list of [candidate_triple, value]; sentences without candidates get no entry.
res_dict = {}
for value in data_dict.values():
    res = re.sub(pattern_1, '', value[0][0])
    for k, v in value[0][1]["deduplicated:"].items():
        res_dict.setdefault(res, []).append([k, v[1]])

# Number of entries processed (same final value as the original per-iteration counter).
cnt = len(data_dict)

In [4]:
## EXPERIMENT: compare which triples are extracted when ranking by sentence correctness,
## by attention score, or by the average of both. This piece of code only processes the
## first three sentences.

# Number of triples kept per ranking criterion in this experiment.
TOP_K_EXPERIMENT = 5


def _keep_top_k(scores, k):
    """Return the ``k`` highest-scoring entries of ``scores``.

    Entries are streamed in ``scores``' iteration order; whenever more than ``k``
    entries are held, the one with the smallest score is evicted (on ties, the
    earliest-inserted one). Surviving entries keep their original relative order.

    Args:
        scores: mapping of name -> comparable score.
        k: maximum number of entries to keep.

    Returns:
        A new dict with at most ``k`` entries.
    """
    kept = {}
    for name, score in scores.items():
        kept[name] = score
        if len(kept) > k:
            del kept[min(kept, key=kept.get)]
    return kept


final_dict = {}

# Restrict the experiment to the first three sentences.
x = dict(list(res_dict.items())[:3])

# NOTE(review): the three result dicts are rebuilt on every iteration and never stored or
# displayed, so after the loop only the last sentence's rankings remain.
for key, values in x.items():
    temp = {}
    # Inference only: no_grad avoids building an autograd graph for every forward pass.
    with torch.no_grad():
        for value in values:
            input_text = value[0].replace("[SEP] ", "").lower()

            input_id = tokenizer(input_text, return_tensors="pt")["input_ids"].squeeze(1).cpu()
            output = model(input_id)
            # Softmax probability of class index 1 for the single input sequence.
            pos = torch.softmax(output.logits, dim=1)[0][1].item()
            # triple string -> [value carried over from res_dict, model probability]
            temp[value[0]] = [value[1], pos]

    # NOTE(review): normalize_dict is not defined in any visible cell; it must already
    # exist in the kernel (defined elsewhere) for this cell to run.
    temp = normalize_dict(temp)

    # Top-k by model probability (element [1]).
    temp_1_sentence_correctness = _keep_top_k(
        {k: v[1] for k, v in temp.items()}, TOP_K_EXPERIMENT
    )
    # Top-k by the value carried over from res_dict (element [0]).
    temp_1_attention_score = _keep_top_k(
        {k: v[0] for k, v in temp.items()}, TOP_K_EXPERIMENT
    )
    # Top-k by the arithmetic mean of both elements.
    temp_1_combination = _keep_top_k(
        {k: (v[0] + v[1]) / 2 for k, v in temp.items()}, TOP_K_EXPERIMENT
    )

In [9]:
# Score every candidate triple of every sentence with the classifier, keep the TOP_K
# candidates with the highest model probability, and store them sorted per sentence.

# Number of candidate triples retained per sentence.
TOP_K = 9

# sentence -> list of (triple string, [normalized value, normalized probability]) pairs
final_dict = {}

for key, values in tqdm(res_dict.items()):
    temp = {}
    # Inference only: no_grad avoids building an autograd graph for every forward pass,
    # which saves memory and time over the many forward passes of this loop.
    with torch.no_grad():
        for value in values:
            input_text = value[0].replace("[SEP] ", "")
            input_id = tokenizer(input_text, return_tensors="pt")["input_ids"]
            output = model(input_id)
            # Softmax probability of class index 1 for the single input sequence.
            pos = torch.softmax(output.logits, dim=1)[0][1].item()
            # triple string -> [value carried over from res_dict, model probability]
            temp[value[0]] = [value[1], pos]

    # NOTE(review): normalize_dict is not defined in any visible cell; it must already
    # exist in the kernel (defined elsewhere) for this cell to run.
    temp = normalize_dict(temp)
    temp_1_sentence_correctness = {}

    for k, v in temp.items():
        temp_1_sentence_correctness[k] = [v[0], v[1]]
        if len(temp_1_sentence_correctness) > TOP_K:
            # Evict the entry with the lowest model probability (element [1]).
            key_to_delete = min(
                temp_1_sentence_correctness,
                key=lambda name: temp_1_sentence_correctness[name][1],
            )
            del temp_1_sentence_correctness[key_to_delete]

    # NOTE(review): the sort key is the whole [v[0], v[1]] list, so the survivors are
    # ordered primarily by element [0], not by the probability used for selection above.
    # Confirm this is the intended ranking.
    final_dict[key] = sorted(
        temp_1_sentence_correctness.items(), key=lambda x: x[1], reverse=True
    )
   

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 635/635 [5:04:17<00:00, 28.75s/it]


In [10]:
# Persist the ranked triples next to the model checkpoint.
with open('google_collab_focal_loss/sorted_model_output_focal_loss.json', mode='w') as out_handle:
    json.dump(final_dict, fp=out_handle)

In [15]:
# Reload the ranked triples written earlier, so the formatting cells below can run
# without repeating the scoring loop.
with open('google_collab_focal_loss/sorted_model_output_focal_loss.json') as in_handle:
    final_dict = json.loads(in_handle.read())

In [16]:
# Format the output as a DataFrame so that it is easier to write it out in ClausIE format.
# One row per sentence; each triple_N cell holds the triple's "[SEP] "-separated segments
# followed by a score (the product of the two values stored with the triple).
column_names = ["sentence", "triple_1", "triple_2", "triple_3"]

# Collect all rows first and build the frame once, instead of growing it cell by cell.
records = []
for key, values in final_dict.items():
    record = {"sentence": key}
    # Only the first three entries of each sentence's list are used.
    for idx, value in enumerate(values[:3]):
        segments = value[0].split("[SEP] ")
        segments.append(value[1][0] * value[1][1])
        record["triple_" + str(idx + 1)] = segments
    records.append(record)

# Rows are labelled 1..N; sentences with fewer than three triples get NaN in the
# remaining triple columns.
final_df = pd.DataFrame(
    records,
    columns=column_names,
    index=pd.RangeIndex(1, len(records) + 1),
)
    

In [17]:
# Write the output in ClausIE format: for every sentence, one line with the sentence
# itself followed by one tab-separated line per triple:
#   <row id> TAB "<segment 0>" TAB "<segment 1>" TAB "<segment 2>" TAB <score>
# Note: the target file name ends in ".json" although its content is tab-separated text.

file_name = "sorted_model_output_focal_loss.json"
triple_columns = ("triple_1", "triple_2", "triple_3")

with open("google_collab_focal_loss/tab_separated_" + file_name, "w") as f:
    for ID, record in final_df.iterrows():
        f.write(record['sentence'] + "\n")
        for column in triple_columns:
            triple = record[column]
            # Sentences with fewer than three triples hold NaN in the unused columns;
            # skip those instead of failing when subscripting a float.
            if not isinstance(triple, list):
                continue
            fields = (
                [str(ID)]
                + ['"' + segment + '"' for segment in triple[:3]]
                + [str(triple[3])]
            )
            f.write('\t'.join(fields) + '\n')
