In [None]:
# Uncomment for google drive use
# from google.colab import drive
# drive.mount('/content/gdrive')
# %cd gdrive/MyDrive/Colab\ Notebooks/Forward/joint_score_func
# ! pip install datasets
# ! pip install transformers
# ! pip install wikipedia

In [1]:
import json
import csv
import os
import copy
import random
import pandas as pd
from datasets import load_dataset
import sys

sys.path.append('..')
from tools.TextProcessing import build_word_tree, process_keywords, nlp, clean_text
from tools.BasicUtils import my_write, my_csv_read, my_read, my_json_read
from tools.OpenIEUtils import processed_file_reader, openie_my_map

In [24]:
# Human-readable index of the generated artifacts; each list entry is one
# README line of the form "<path> ---- <description>\n".
file_description = [
    # NOTE(review): the keyword cell in this notebook writes 'data/keyword.txt';
    # confirm whether 'keyword_f.txt' is a separate, filtered file.
    "data/keyword_f.txt ---- CS keywords\n",
    "data/wordtree.json ---- word tree for cs keywords\n",
    "data/entity.txt ---- Reformed cs keywords with '_' replacing ' '\n",
    "data/co_occur.txt ---- Each line shows the keywords that appear in that line of sentence\n",
    "data/occur.json ---- Tell which lines do each keyword occur\n",
    "data/eid2ent.json ---- Mapping from entity id to entity name in wikidata\n",
    # This entry previously lacked its trailing comma (and had a stray "\na"),
    # so it was silently concatenated with the next entry into one string.
    "data/rid2rel.json ---- Mapping from relation id to relation name in wikidata\n",
    "data/kg_cs_triples.csv ---- eid-rid-eid triples with eid be referring to possible cs keywords\n",
    # NOTE(review): the knowledge-graph cell writes 'data/kg_datasets.csv' (plural)
    # and keeps pairs with *more than* 10 co-occurrences; confirm which is intended.
    "data/kg_dataset.csv ---- ent-rel-ent triples constructed on knowledge graph with each entity pair co-occurs no less than 10 times in small_sent.txt\n",
    "data/ollie_pos_dataset.csv ---- data containing triples and sentences with confidence greater than 0.9 in csv form\n",
    "data/ollie_pos_dataset.json ---- data containing triples and sentences with confidence greater than 0.9\n",
    "data/ollie_neg_dataset_1.json ---- data containing triples and sentences with confidence less than 0.3\n",
    "data/ollie_neg_dataset_2.json ---- data containing triples and sentences where no extraction is made\n",
    "data/my_dataset.json ---- data containing pos, neg_1 and neg_2, splited to train and valid part\n",
    "data/my_dataset_temp.json ---- smaller set of my_dataset.json\n",
    "data/single-ollie ---- transformers.dataset style file\n"
]
    
# Write the artifact descriptions to README.md through the project helper
# my_write (how it joins the entries is defined in tools.BasicUtils, not shown here).
my_write('README.md', file_description)

<h2> Generate basic keyword file </h2>

In [None]:
# Collect keywords from terms-cs-cfl-epoch200.txt
# Each tab-separated row is read as (term, score, ...); only terms whose score
# is above 0.1 are kept as candidates.
r = my_csv_read('../data/raw_data/terms-cs-cfl-epoch200.txt', delimiter='\t')
candidate_kw_list = [item[0] for item in r if float(item[1]) > 0.1]
# process_keywords returns a (stable, unstable) pair; only the stable part is saved.
stable_kw, unstable_kw = process_keywords(candidate_kw_list)
# Save keywords (exist_ok keeps directory creation safe when the cell is re-run)
os.makedirs('data', exist_ok=True)
my_write('data/keyword.txt', stable_kw)
# Generate word tree (25 seconds)
build_word_tree('data/keyword.txt', 'data/wordtree.json', 'data/entity.txt')

In [None]:
# Go to the py folder and run the following commands in the backend
# "python gen_co_occur.py ../joint_score_func/data/wordtree.json ../data/corpus/small_sent.txt ../joint_score_func/data/co_occur.txt"
# "python gen_occur.py ../joint_score_func/data/keyword.txt ../joint_score_func/data/co_occur.txt ../joint_score_func/data/occur.json"

<h2> Generate dataset using Wikidata knowledge graph </h2>

In [None]:
# Load known cs keywords
kw_set = set(my_read('data/keyword.txt'))
# Map id to text; entities are kept only when their lower-cased name is a known keyword
eid2ent_dict = {eid:ent.lower() for eid, ent in my_csv_read('../data/raw_data/wikidata/entity_names.txt', delimiter='\t') if ent.lower() in kw_set}
rid2rel_dict = {rid:rel.lower() for rid, rel in my_csv_read('../data/raw_data/wikidata/relation_names.txt', delimiter='\t')}
# Get the subgraph that have both entities be potential cs keywords and relation be valid
kg_cs_triples = [(eid1, eid2, rid) for eid1, eid2, rid in my_csv_read('../data/raw_data/wikidata/triples.txt', delimiter=' ') if eid1 in eid2ent_dict and eid2 in eid2ent_dict and rid in rid2rel_dict]
# Get valid cs entities and relations from subgraph
cs_eid_set = set()
cs_rid_set = set()
for eid1, eid2, rid in kg_cs_triples:
    cs_eid_set.update((eid1, eid2))
    cs_rid_set.add(rid)
# Drop ids that never appear in a surviving triple
eid2ent_dict = {eid:ent for eid, ent in eid2ent_dict.items() if eid in cs_eid_set}
rid2rel_dict = {rid:rel for rid, rel in rid2rel_dict.items() if rid in cs_rid_set}
# Save files; context managers guarantee each file is flushed and closed
with open('data/eid2ent.json', 'w') as eid_file:
    json.dump(eid2ent_dict, eid_file)
with open('data/rid2rel.json', 'w') as rid_file:
    json.dump(rid2rel_dict, rid_file)
# newline='' is what the csv module requires so no extra blank rows appear on Windows
with open('data/kg_cs_triples.csv', 'w', newline='') as triples_file:
    csv.writer(triples_file).writerows(kg_cs_triples)

In [None]:
# Filter out pairs that have little co-occurrence
with open('data/eid2ent.json') as eid_file:
    eid2ent_dict = json.load(eid_file)
with open('data/rid2rel.json') as rid_file:
    rid2rel_dict = json.load(rid_file)
kg_cs_triples = list(my_csv_read('data/kg_cs_triples.csv', delimiter=','))
occur_dict = my_json_read('data/occur.json')
# Sets make the per-pair intersection below cheap
occur_dict = {k:set(v) for k, v in occur_dict.items()}
# An entity missing from occur.json is treated as never occurring instead of raising KeyError
no_occurrence = frozenset()
acceptable_triple_data = []
for eid1, eid2, rid in kg_cs_triples:
    ent1, ent2 = eid2ent_dict[eid1], eid2ent_dict[eid2]
    shared_occurrences = occur_dict.get(ent1, no_occurrence) & occur_dict.get(ent2, no_occurrence)
    # Keep the triple only when both entities share more than 10 occurrence entries
    if len(shared_occurrences) > 10 and rid in rid2rel_dict:
        acceptable_triple_data.append((ent1, ent2, rid2rel_dict[rid]))
# newline='' is what the csv module requires so no extra blank rows appear on Windows
with open('data/kg_datasets.csv', 'w', newline='') as dataset_file:
    csv.writer(dataset_file).writerows(acceptable_triple_data)

<h2> Generate dataset using public OpenIE training data </h2>

In [None]:
# Load the public OpenIE triples; the context manager closes the file once parsing is done
with open('../data/corpus/openie_triples.json') as openie_file:
    openie_triples = json.load(openie_file)

In [None]:
# Work on a deep copy so that openie_triples itself is left unmodified
filtered_triples = copy.deepcopy(openie_triples)
for item in filtered_triples:
    # Keep only the triples whose first element is at least 0.9
    item['triples'] = list(filter(lambda tri: tri[0] >= 0.9, item['triples']))

In [None]:
# Spot-check one record after filtering (compare with the same index of openie_triples)
filtered_triples[3]

In [None]:
# Spot-check the same record in the unfiltered data
openie_triples[3]

<h2> Generate dataset using Ollie extraction on arxiv corpus </h2>

In [None]:
# Read the three processed Ollie output files (low / high / all), in that order
low_score_list, high_score_list, all_score_list = map(
    processed_file_reader,
    (
        '../openie/ollie_test/small_processed_low.txt',
        '../openie/ollie_test/small_processed_high.txt',
        '../openie/ollie_test/small_processed_all.txt',
    ),
)

In [None]:
# Records with more than one element hold something beyond their first element
# (later cells read element 0 as the sentence and the rest as triples)
low_score_triple = list(filter(lambda rec: len(rec) > 1, low_score_list))
high_score_triple = list(filter(lambda rec: len(rec) > 1, high_score_list))
# Records with exactly one element are kept as the "no extraction" pool
no_extraction = list(filter(lambda rec: len(rec) == 1, all_score_list))
# One count per line: low, high, no-extraction
print(len(low_score_triple), len(high_score_triple), len(no_extraction), sep='\n')

In [None]:
# Generate pos dataset: one labelled example (labels=1) per triple of each high-score record
pos_dataset = []
for item in high_score_triple:
    # item[0] is the sentence; every following element is an 'ent1;rel;ent2' string
    sent = clean_text(item[0])
    for triple in item[1:]:
        ent1, rel, ent2 = triple.split(';')
        ent1, rel, ent2 = clean_text(ent1), clean_text(rel), clean_text(ent2)
        pos_dataset.append({'labels' : 1, 'ent1' : ent1, 'rel' : rel, 'ent2' : ent2, 'sent' : sent})
# The context manager guarantees the JSON file is flushed and closed
with open('data/ollie_pos_dataset.json', 'w') as pos_file:
    json.dump(pos_dataset, pos_file)
pd.DataFrame(pos_dataset).to_csv('data/ollie_pos_dataset.csv', index=False)

In [None]:
# Preview the first four positive examples
pos_dataset[:4]

In [None]:
# Generate neg dataset from low score triples: one labelled example (labels=0) per triple
neg_dataset_1 = []
for item in low_score_triple:
    # item[0] is the sentence; every following element is an 'ent1;rel;ent2' string
    sent = clean_text(item[0])
    for triple in item[1:]:
        ent1, rel, ent2 = triple.split(';')
        ent1, rel, ent2 = clean_text(ent1), clean_text(rel), clean_text(ent2)
        neg_dataset_1.append({'labels' : 0, 'ent1' : ent1, 'rel' : rel, 'ent2' : ent2, 'sent' : sent})
# The context manager guarantees the JSON file is flushed and closed
with open('data/ollie_neg_dataset_1.json', 'w') as neg_file:
    json.dump(neg_dataset_1, neg_file)

In [None]:
# Preview the first four low-score negative examples
neg_dataset_1[:4]

In [None]:
# Generate neg dataset from no extraction sentences with noun chunks
# A dedicated, seeded generator makes the sampled pairs reproducible across runs
# without touching the state of the global `random` module.
neg_pair_rng = random.Random(42)
neg_dataset_2 = []
for item in no_extraction:
    sent = clean_text(item[0])
    noun_chunks = list(nlp(sent).noun_chunks)
    # Two distinct noun chunks are needed to form an entity pair
    if len(noun_chunks) <= 1:
        continue
    ents = neg_pair_rng.sample(noun_chunks, 2)
    # '_' is the placeholder relation for these sentences
    neg_dataset_2.append({'labels' : 0, 'ent1' : str(ents[0]), 'rel' : '_', 'ent2' : str(ents[1]), 'sent' : sent})
# The context manager guarantees the JSON file is flushed and closed
with open('data/ollie_neg_dataset_2.json', 'w') as neg_file:
    json.dump(neg_dataset_2, neg_file)

In [None]:
# Forming dataset json: merge pos / neg_1 / neg_2, shuffle, then split 80% train / 20% valid
with open('data/ollie_pos_dataset.json') as pos_file:
    pos_dataset = json.load(pos_file)
with open('data/ollie_neg_dataset_1.json') as neg_file_1:
    neg_dataset_1 = json.load(neg_file_1)
with open('data/ollie_neg_dataset_2.json') as neg_file_2:
    neg_dataset_2 = json.load(neg_file_2)
dataset = (pos_dataset + neg_dataset_1 + neg_dataset_2)
# Seeded shuffle so the train/valid split is the same on every run
random.Random(42).shuffle(dataset)
split_point = int(len(dataset) * 0.8)
final_dataset = {'train' : dataset[:split_point], 'valid' : dataset[split_point:]}
# Close (and therefore flush) the file so later cells read the complete JSON
with open('data/my_dataset.json', 'w') as dataset_file:
    json.dump(final_dataset, dataset_file)

In [None]:
# Forming dataset in datasets format
# Each field of my_dataset.json is loaded on its own ('train' first, then 'valid');
# both loaded objects expose their rows under the 'train' key.
temp_train, temp_valid = (
    load_dataset('json', data_files='data/my_dataset.json', field=split_field)
    for split_field in ('train', 'valid')
)
# Attach the validation rows next to the training rows, then persist both together
temp_train['valid'] = temp_valid['train']
temp_train.save_to_disk('data/single-ollie')

In [None]:
# Reframe the dataset if needed: keep only the first 10000 train / 2000 valid examples
with open('data/my_dataset.json') as full_file:
    temp_dict = json.load(full_file)
temp_dict['train'] = temp_dict['train'][:10000]
temp_dict['valid'] = temp_dict['valid'][:2000]
# The file must be closed (and therefore flushed) before load_dataset reads it back below
with open('data/my_dataset_temp.json', 'w') as temp_file:
    json.dump(temp_dict, temp_file)
temp_train = load_dataset('json', data_files='data/my_dataset_temp.json', field='train')
temp_valid = load_dataset('json', data_files='data/my_dataset_temp.json', field='valid')
# Attach the validation rows next to the training rows, then persist both together
temp_train['valid'] = temp_valid['train']
temp_train.save_to_disk('data/single-ollie2')