# Extract Sentences from Wikipedia
+ This notebook collects sentences from Wikipedia that describe the relationship between two entities, selected using dependency-path patterns
+ **This notebook is fully valid under Owl3 machine (using the /scratch/data/wikipedia/full_text-2021-03-20 data)**

## Load necessary resource

In [29]:
import re
from bs4 import BeautifulSoup
import pandas as pd
import sys
import wikipedia
import os
from wikipedia2vec import Wikipedia2Vec
from collections import Counter
import bz2
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import tqdm
from typing import List
from nltk.corpus import stopwords
self_define_stopwords = set(['-', ',', '.'])
sw = set(stopwords.words('english'))
import math
import json
import random
random.seed(0)
import torch

sys.path.append('..')

from tools.BasicUtils import my_write, my_read_pickle, my_write_pickle
from tools.TextProcessing import (
                my_sentence_tokenize,
                my_sentence_tokenize, filter_specific_keywords, nlp, 
                exact_match
                )

from extract_wiki import (
    save_path, entity_occur_from_cooccur_file, graph_file, single_sent_graph_file, 
    w2vec_dump_file, sub_path_pattern_count_file, 
    w2vec_keyword2idx_file, 
    test_path, path_test_file, 
    path_pattern_count_file, 
    save_sub_folders, wiki_sub_folders, 
    wiki_files, save_sent_files, save_cooccur_files, save_selected_files, 
    p, patterns, FeatureProcess, wikipedia_entity_file, CalFreq, 
    note2line, cal_score_from_df, cal_freq_from_df, 
    gen_kw_from_wiki_ent, get_entity_page, gen_corepath_pattern, find_triangles, find_path_between_pair, 
    generate_sample,
    sample_to_neo4j, get_sentence, informativeness_demo, process_list
)

# Generate the save dirs.
# ``os.makedirs(..., exist_ok=True)`` replaces the check-then-mkdir pattern:
# it also creates missing parent directories and is safe to re-run.
os.makedirs(save_path, exist_ok=True)
os.makedirs(test_path, exist_ok=True)

for save_dir in save_sub_folders:
    os.makedirs(save_dir, exist_ok=True)

# Get all files under wikipedia/full_text-2021-03-20

print('wiki sub folder example:', wiki_sub_folders[0])
print('save sub folder example:', save_sub_folders[0])
print('wiki file example:', wiki_files[0])
print('save sentence file example:', save_sent_files[0])
print('save cooccur file example:', save_cooccur_files[0])
print('save selected sentence file example:', save_selected_files[0])

wiki sub folder example: ../data/wikipedia/full_text-2021-03-20/BE
save sub folder example: data/extract_wiki/wiki_sent_collect/BE
wiki file example: ../data/wikipedia/full_text-2021-03-20/BE/wiki_00
save sentence file example: data/extract_wiki/wiki_sent_collect/BE/wiki_00.dat
save cooccur file example: data/extract_wiki/wiki_sent_collect/BE/wiki_00_co.dat
save selected sentence file example: data/extract_wiki/wiki_sent_collect/BE/wiki_00_se.dat


In [None]:
# [Load] wikipedia2vec
with bz2.open(w2vec_dump_file) as f_in:
    w2vec = Wikipedia2Vec.load(f_in)

In [None]:
# [Test] wikipedia2vec

# Find similar words or entities
# ent1 = 'Python (programming language)'
# w2vec.most_similar_by_vector(w2vec.get_entity_vector(ent1), 20)

# Get similarity between two entities
# ent1 = 'Machine learning'
# ent2 = 'Information science'
# cosine_similarity(w2vec.get_entity_vector(ent1).reshape(1, -1), w2vec.get_entity_vector(ent2).reshape(1, -1))[0, 0]

# Check the entity count and document count
# ent1 = 'Hidden Markov model'
# e = w2vec.get_entity(ent1)
# print(e.count)
# print(e.doc_count)

## [Preparation] Collect sentences, entities, entity co-occurrences, and titles from the Wikipedia dump

### Roughly collect sentences, entity co-occurrences, titles

In [None]:
# [Test] Test get_sentence function
get_sentence(wiki_files[0], 'sent.txt', 'cooccur.txt', 'title.txt')

In [None]:
# python extract_wiki.py collect_sent_and_cooccur (8 hours)

### Correct entity mapping in co-occurrence files

In [None]:
# python extract_wiki.py correct_mapping_in_cooccur (6 mins)

### Collect cooccur similarity

In [None]:
# python extract_wiki.py cal_cooccur_similarity

In [10]:
# [Test]
lines = get_entity_page('Information retrieval')
sents = [note2line(note) for note in lines]
occurs = [note2line(note, '_co_.dat') for note in lines]
ori_occurs = [note2line(note, '_co.dat') for note in lines]
pairs_list = [note2line(note, '_pr.dat') for note in lines]
my_write('sent_check.txt', sents)
my_write('occur_check.txt', occurs)
my_write('ori_occur_check.txt', ori_occurs)
my_write('pair_check.txt', pairs_list)
note2line(lines[0])

'Information retrieval is the process of obtaining information system resources that are relevant to an information need from a collection of those resources.\n'

### Generate entity occurrence from dataset

In [None]:
# Generate the entity occurrence dict from co-occurrence info [collect_ent_occur_from_cooccur]

### [Not necessary] Mapping keyword mention to wikipedia2vec entities

In [None]:
# Read the entity list: one entry per line of the file.
with open(wikipedia_entity_file) as f_in:
    wikipedia_entity = set(f_in.read().split('\n'))
w2vec  # referencing the name raises NameError right away if the model was not loaded

# Maps a keyword mention to the list of wikipedia2vec entity indices that share it.
w2vec_keyword2idx = {}

for entity in tqdm.tqdm(wikipedia_entity):
    w2vec_entity = w2vec.get_entity(entity)
    if w2vec_entity is None:
        # entity unknown to wikipedia2vec
        continue
    kw = gen_kw_from_wiki_ent(entity)
    indices = w2vec_keyword2idx.setdefault(kw, [])
    if w2vec_entity.index not in indices:
        indices.append(w2vec_entity.index)

# Keep only the mentions that pass filter_specific_keywords, then persist the mapping.
w2vec_kws = filter_specific_keywords(list(w2vec_keyword2idx))
filter_keyword_from_w2vec = set(w2vec_kws)
w2vec_keyword2idx = {
    mention: idx_list
    for mention, idx_list in w2vec_keyword2idx.items()
    if mention in filter_keyword_from_w2vec
}
my_write_pickle(w2vec_keyword2idx_file, w2vec_keyword2idx)
len(w2vec_keyword2idx)

In [None]:
# [Load] w2vec_keyword2idx
w2vec_keyword2idx = my_read_pickle(w2vec_keyword2idx_file)

In [None]:
# [Test] w2vec_keyword2idx
kw = 'feature engineering'
kw_in_mention = kw in w2vec_keyword2idx
print(kw_in_mention)
if kw_in_mention:
    for idx in w2vec_keyword2idx[kw]:
        print(w2vec.dictionary.get_item_by_index(idx))

## [Preparation] Collect dataset

### Collect pattern frequency counter

In [None]:
# python extract_wiki.py collect_subpath_pattern_freq
# python extract_wiki.py collect_pattern_freq (12 min)

In [12]:
# [Load] cal_freq function
cal_freq = CalFreq(path_pattern_count_file)
cal_freq.c.most_common(10)

[('i_nsubj attr prep pobj', 150),
 ('i_nsubj prep pobj', 146),
 ('i_nsubj attr', 89),
 ('i_nsubj attr prep pobj prep pobj', 81),
 ('i_nsubj dobj', 79),
 ('i_nsubjpass prep pobj', 72),
 ('i_nsubj prep pobj prep pobj', 70),
 ('i_nsubjpass prep pobj prep pobj', 52),
 ('i_nsubj dobj prep pobj', 47),
 ('i_nsubjpass agent pobj', 25)]

In [13]:
# [Test] cal_freq function
cal_freq.cal_freq_from_path('i_nsubj prep pobj prep pobj prep pobj')

0.58685962811188

In [14]:
test_df = pd.read_csv(path_test_file, sep='\t')
test_df.head()

Unnamed: 0,sim,kw1,kw1_span,kw1_ent,kw2,kw2_span,kw2_ent,sent,dep_path,pattern,dep_coverage
0,0.701339,Archbishop of Cologne,"(3, 5)",Archbishop of Cologne,Anno II,"(0, 1)",Anno II,Anno II was Archbishop of Cologne from 1056 un...,i_attr nsubj,i_nsubj attr,0.949983
1,0.74733,The Doobie Brothers,"(16, 18)",The Doobie Brothers,Livin ' on the Fault Line,"(0, 5)",Livin' on the Fault Line,Livin' on the Fault Line is the seventh studio...,i_appos i_pobj i_prep i_attr nsubj,i_nsubj attr prep pobj,0.990815
2,0.631102,Little Darling,"(3, 4)",Little Darling (I Need You),Marvin Gaye,"(11, 12)",Marvin Gaye,"The track ""Little Darling "" is a remake of the...",i_appos i_nsubj attr prep pobj,i_nsubj attr prep pobj,0.845042
3,0.6142,Lombardy,"(13, 13)",Lombardy,Bagolino,"(0, 0)",Bagolino,"Bagolino is a ""comune"" in the province of Bres...",i_pobj i_prep i_attr nsubj,i_nsubj attr prep pobj,0.888238
4,0.806008,Valle Sabbia,"(30, 31)",Valle Sabbia,Bagolino,"(0, 0)",Bagolino,"Bagolino is a ""comune"" in the province of Bres...",i_pobj i_prep i_pobj i_prep nsubj,i_nsubj prep pobj prep pobj,0.342975


### Collect data

In [None]:
# python extract_wiki.py collect_dataset (18 hours)

## [Preparation] Sentence-edged Graph

### Generate graph

In [None]:
# python extract_wiki.py generate_graph

### Generate sentence graph

In [None]:
# python extract_wiki.py generate_sent_graph

## [Test] Check the score function

In [23]:
# [Load] Graph
graph:nx.Graph = my_read_pickle(graph_file)

print('num of nodes:', len(graph.nodes))
print('num of edges:', len(graph.edges))

single_sent_graph:nx.Graph = my_read_pickle(single_sent_graph_file)
print('num of nodes:', len(single_sent_graph.nodes))
print('num of edges:', len(single_sent_graph.edges))

cal_freq = CalFreq(path_pattern_count_file)
d = my_read_pickle(entity_occur_from_cooccur_file)

fp = FeatureProcess(sub_path_pattern_count_file)

In [None]:
# [Test] graph
ent1 = 'Machine learning'
# Check the neighbours of an entity
list(graph.neighbors(ent1))

# Check the edges of two entities
# edges = graph.edges[ent1, 'Hinge loss']
# edges

### Collect highest scored sentence for pairs

In [19]:
# Shuffle the edges with a fixed seed, then export the first sentence record
# (sentence text and score) of the first 100 edges.
edges = list(single_sent_graph.edges)
random.Random(0).shuffle(edges)
sents = []
for edge in edges[:100]:
    data = single_sent_graph.get_edge_data(*edge)
    first_record = data['data'][0]
    row = {'ent1' : edge[0],
           'ent2' : edge[1],
           'sent' : note2line(first_record['note']).strip(),
           'score' : first_record['score']}
    sents.append(row)
highest_sents_df = pd.DataFrame(sents)
highest_sents_df.to_csv('highest_sents.csv', index=False)

### Single sentence significance test

In [21]:
df = informativeness_demo("Many operating systems let a program return a result when its process terminates;", 'Operating system', 'Process', fp)
df.to_csv('temp.csv', index=False)

In [22]:
fp.feature_process(nlp("Many operating systems let a program return a result when its process terminates;"), 'Operating system', 'Process')

[{'kw1_span': (1, 2),
  'kw2_span': (11, 11),
  'pattern': 'i_nsubj ccomp advcl nsubj',
  'dep_path': 'i_nsubj ccomp advcl nsubj',
  'dep_coverage': 0.8732452507251984}]

### Check score function on sentences from a pair of entities

In [24]:
ent1 = 'Marko Attila Hoare'
ent2 = 'Serbs'
sents = [note2line(note).strip() for note in d[ent1] & d[ent2]]
len(sents)

4

In [26]:
b = cal_freq_from_df(pd.DataFrame(process_list(sents, [str((0, ent1, ent2))]*len(sents), fp.batched_feature_process)), cal_freq)
b = cal_score_from_df(b)
b = b.sort_values(by=['score'], ascending=False)
b.to_csv('sentences.csv', index=False, columns=['kw1', 'kw1_span', 'kw2', 'kw2_span', 'sent', 'dep_coverage', 'pattern_freq', 'pattern', 'score'])

100%|██████████| 4/4 [00:00<00:00, 35.50it/s]


### Check score function on sentences from pairs containing one entity

In [27]:
# For every graph neighbour of `ent`, score the sentences in which both
# entities occur and collect one score-sorted DataFrame per neighbour.
examples = []
ent = 'Machine learning'
for neighbor in tqdm.tqdm(list(graph.neighbors(ent))):
    if neighbor not in d:
        continue
    s = list(d[ent] & d[neighbor])
    sents = [note2line(note).strip() for note in s]
    pairs = [{'kw1' : gen_kw_from_wiki_ent(ent, False), 'kw2' : gen_kw_from_wiki_ent(neighbor, False)}]
    temp_list = []
    for note, sent in zip(s, sents):
        res = fp.batched_feature_process(sent, pairs)
        for i in res:
            # remember which sentence / note produced this feature row
            i['sent'] = sent
            i['note'] = note
        temp_list.extend(res)
    if temp_list:
        df = cal_score_from_df(cal_freq_from_df(pd.DataFrame(temp_list), cal_freq))
        df = df.sort_values(by=['score'], ascending=False)
        examples.append(df)

test_df = pd.concat(examples)
output_columns = ['kw1', 'kw1_span', 'kw2', 'kw2_span', 'note', 'sent', 'dep_coverage', 'pattern_freq', 'pattern', 'score']
test_df.to_csv(ent + '.csv', index=False, columns=output_columns)
len(test_df)

100%|██████████| 232/232 [00:59<00:00,  3.91it/s]


313

### Collect test data

In [None]:
# python extract_wiki.py collect_score_function_eval_dataset

### Test score function with human evaluation

In [34]:
def get_score(sent:str, ent1:str, ent2:str):
    """Score one sentence for an entity pair.

    The score is the harmonic mean of the dependency coverage and the pattern
    frequency of the first feature returned by ``fp.batched_feature_process``.

    Args:
        sent: the sentence to score.
        ent1: first entity name (converted to a keyword with ``gen_kw_from_wiki_ent``).
        ent2: second entity name (converted the same way).

    Returns:
        ``-1`` when no feature is extracted for the pair, ``0.0`` when both
        components are zero, otherwise ``2*c*f / (c + f)``.
    """
    kw1 = gen_kw_from_wiki_ent(ent1)
    kw2 = gen_kw_from_wiki_ent(ent2)
    data = fp.batched_feature_process(sent, [{'kw1' : kw1, 'kw2' : kw2}])
    if not data:
        return -1
    data = data[0]
    pattern_freq = cal_freq.cal_freq_from_path(gen_corepath_pattern(data['dep_path']))
    dep_coverage = data['dep_coverage']
    denominator = pattern_freq + dep_coverage
    if denominator == 0:
        # Both components are zero: the harmonic mean is taken as 0 instead of
        # raising ZeroDivisionError.
        return 0.0
    return (2*dep_coverage*pattern_freq) / denominator

test_data = pd.read_csv('test.tsv', sep='\t')
score_function_result = test_data.copy()
score_function_result['score'] = score_function_result.apply(lambda x: get_score(x['sentence'], x['entity 1'], x['entity 2']), axis=1)
score_function_result.to_csv('score_function_result.tsv', sep='\t', index=False)

In [None]:
# Score-function output: last tab-separated column of every row after the header.
with open('score_function_result.tsv') as f_in:
    lines = f_in.readlines()
sf_score = np.array([float(line.strip().split('\t')[-1]) for line in lines[1:]])

# Human labels: read the file once and derive both the records and the
# normalised score vector (label / 5) from the same rows.
with open('user_label.tsv') as f_in:
    lines = f_in.readlines()
data = []
for line in lines[1:]:
    ent1, ent2, sent, user_score_ = line.strip().split('\t')
    data.append({'entity 1' : ent1, 'entity 2' : ent2, 'sentence' : sent, 'user label' : float(user_score_)/5})
user_score = np.array([item['user label'] for item in data])

# Drop rows whose user label is not positive. The SAME mask is applied to
# `data` so that index i still refers to the same row in `data`, `sf_score`
# and `user_score` (previously only the two arrays were filtered).
mask = user_score > 0
data = [item for item, keep in zip(data, mask) if keep]
sf_score = sf_score[mask]
user_score = user_score[mask]

In [None]:
np.corrcoef(sf_score, user_score)

In [None]:
# score range within [1,2,3,4,5]
# NOTE: despite the variable name `l2`, this is the mean absolute difference
# between the two score vectors, each multiplied by 5.
l2 = np.mean(np.abs(sf_score*5 - user_score*5))
l2

In [None]:
np.mean(user_score*5)
np.mean(user_score[sf_score>0.7])
np.mean(user_score[sf_score<=0.6])
np.mean(sf_score*5)

In [None]:
dist = np.abs(sf_score - user_score)
diff = []
for i in range(len(dist)):
    if dist[i] > 0.3:
        diff.append(data[i].copy())
        diff[-1]['score function label'] = sf_score[i]
diff = pd.DataFrame(diff)
len(diff)
diff.to_csv('diff.tsv', sep='\t', index=False)

## Generate training data (Below are not verified)

In [None]:
# [Load] Single sentence graph
single_sent_graph = my_read_pickle(single_sent_graph_file)
print('number of nodes:', single_sent_graph.number_of_nodes())
print('number of edges:', single_sent_graph.number_of_edges())

### Generate data of level 1

In [None]:
# Test
ent1 = 'Machine learning'
ent2 = 'Algorithm'
sample = generate_sample(single_sent_graph, ent1, ent2)

In [None]:
from statistics import mean

triple:list = sample['triple']
sources:List[str] = sample['source']
entity:List[str] = sample['entity']

# Order the paths by the mean score of their triples, best first.
avg_scores = [mean([tri['score'] for tri in path]) for path in triple]
sorted_list = sorted(zip(avg_scores, triple), key=lambda x: x[0], reverse=True)
triple = list(zip(*sorted_list))[1]

# Resolve entity / sentence indices into their text.
contexts = []
for path in triple:
    resolved = []
    for tri in path:
        resolved.append({'e1' : entity[tri['e1']],
                         'e2' : entity[tri['e2']],
                         'sent' : sources[tri['sent']],
                         'score' : tri['score']})
    contexts.append(resolved)

# Render the five best paths as "path: a; b; c sentence1: ... sentence2: ...".
ctxs = []
for ctx in contexts[:5]:
    path = '; '.join([ctx[0]['e1']] + [tri['e2'] for tri in ctx])
    sents = ' '.join(['sentence%d: %s' % (i+1, tri['sent']) for i, tri in enumerate(ctx)])
    ctxs.append('%s %s' % ('path: ' + path, sents))
for ctx in ctxs:
    print(ctx)

In [None]:
# Generate training data of level 1 [collect_sample_from_single_sent_graph] (5 min)

In [None]:
with open('dataset_explicit.json') as f_in:
    a = json.load(f_in)
    print(len(a))

In [None]:
b = [len(set([tri['sent'] for tri in path])) < len(path) for item in a for path in item['triple']]
sum(b)/len(b)

In [None]:
b = [any([len(set([tri['sent'] for tri in path])) < len(path) for path in item['triple']]) for item in a]
sum(b)/len(b)

In [None]:
plt.hist([len(item['triple']) for item in a])
plt.show()

In [None]:
sample_to_neo4j(a[5])

In [None]:
# Shuffle with a fixed seed and cut the list into train / dev / test slices.
random.seed(0)
random.shuffle(a)
train_ratio = 0.96
valid_ratio = 0.02
train_end = int(len(a)*train_ratio)
valid_end = int(len(a)*(train_ratio+valid_ratio))
training_data = a[:train_end]
valid_data = a[train_end:valid_end]
test_data = a[valid_end:]
for split_file, split_data in (('train.json', training_data),
                               ('dev.json', valid_data),
                               ('test.json', test_data)):
    with open(split_file, 'w') as f_out:
        json.dump(split_data, f_out)

In [None]:
with open('test.json') as f_in:
    test_samples = json.load(f_in)
head = [test_samples[i]['pair'] for i in range(5)]
head

### Evaluate dataset

In [None]:
with open('MyFiD/data/test.json') as f_in:
    data = json.load(f_in)
    eval_data = data[:100]

In [None]:
pd.DataFrame([{'ent1' : sample['pair'][0], 'ent2' : sample['pair'][1], 'sent' : sample['target']} for sample in eval_data]).to_csv('evaluation.csv', index=False)

In [None]:
ent1 = 'Shamsin'
ent2 = 'Homs Governorate'
# First evaluation sample whose pair contains both entities, or {} when there
# is none. The previous for/break loop overwrote the {} default with the loop
# variable, so a miss silently returned the LAST sample of eval_data.
sample = next(
    (item for item in eval_data if ent1 in item['pair'] and ent2 in item['pair']),
    {},
)

In [None]:
sample

### Generate data for level 1 with random path

In [None]:
entity_occur_from_cooccur = my_read_pickle(entity_occur_from_cooccur_file)

In [None]:
# Rewrite each split so that every triple's sentence is replaced by a randomly
# chosen sentence in which both of its entities occur.
split_files = [('MyFiD/data/train.json', 'train_random.json'), ('MyFiD/data/dev.json', 'dev_random.json'), ('MyFiD/data/test.json', 'test_random.json')]
for file_in, file_out in split_files:
    with open(file_in) as f_in:
        data = json.load(f_in)
    for item in tqdm.tqdm(data):
        for path in item['triple']:
            for tri in path:
                e1_notes = entity_occur_from_cooccur[item['entity'][tri['e1']]]
                e2_notes = entity_occur_from_cooccur[item['entity'][tri['e2']]]
                chosen_note = random.choice(list(e1_notes & e2_notes))
                item['source'][tri['sent']] = note2line(chosen_note).strip()
    with open(file_out, 'w') as f_out:
        json.dump(data, f_out)

In [None]:
data[0]

In [None]:
training_data[0]['source']

In [None]:
sent_len_count = []
for data in training_data:
    sents = data['source']
    sent_len_count.extend([len(sent.split()) for sent in sents])

In [None]:
import matplotlib.pyplot as plt

In [None]:
x = np.array(sent_len_count)

In [None]:
plt.hist(x[x<100])
plt.show()

In [None]:
len(sent_len_count)

In [None]:
sample_to_neo4j(items[5])

### Generate data of level 2 from level 1

In [None]:
with open('temp.json') as f_in:
    sample = json.load(f_in)

In [None]:
sample

In [None]:
second_level_sample = generate_second_level_sample(sample)

In [None]:
second_level_sample

In [None]:
from graph4nlp.pytorch.data import GraphData

In [None]:
dep_labels = list(nlp.get_pipe("parser").labels)
dep_labels.extend(['i_'+dep for dep in dep_labels])

In [None]:
# Build one GraphData object from all sources of `second_level_sample`.
# `is_rel` / `is_entity` are per-node 0/1 flags kept in node-index order.
g = GraphData()
is_rel = []
is_entity = []

# Node 0 is added first and is neither a relation nor an entity.
# NOTE(review): presumably a shared root node — confirm.
g.add_nodes(1)
is_rel.append(0)
is_entity.append(0)

for src in second_level_sample['sources']:
    pair = src['pair']
    sent_tokens = src['sent']
    
    # One node per token; the two positions in `pair` are flagged as entities.
    label_list = []
    label_list.extend(sent_tokens)
    token_num = len(sent_tokens)
    start_node = g.get_node_num()
    g.add_nodes(token_num)
    is_rel.extend([0]*token_num)
    is_entity.extend([0]*token_num)
    is_entity[pair[0]+start_node] = 1
    is_entity[pair[1]+start_node] = 1
    
    # Four relation nodes link node 0 with the two entity tokens:
    # 0 -> ROOT -> entity and entity -> i_ROOT -> 0.
    label_list.extend(['ROOT', 'ROOT', 'i_ROOT', 'i_ROOT'])
    rel_start_node = start_node + token_num
    g.add_nodes(4)
    is_rel.extend([1]*4)
    is_entity.extend([0]*4)
    g.add_edges([0, 0], [rel_start_node, rel_start_node+1])
    g.add_edges([rel_start_node, rel_start_node+1], [pair[0]+start_node, pair[1]+start_node])
    g.add_edges([pair[0]+start_node, pair[1]+start_node], [rel_start_node+2, rel_start_node+3])
    g.add_edges([rel_start_node+2, rel_start_node+3], [0, 0])
    
    # One relation node per (tok_1, tok_2, rel) triple: tok_1 -> rel -> tok_2.
    rel_start_node += 4
    triples = src['graph']
    rel_num = len(triples)
    is_rel.extend([1]*rel_num)
    is_entity.extend([0]*rel_num)
    g.add_nodes(rel_num)
    for rel_idx, (tok_1, tok_2, rel) in enumerate(triples):
        g.add_edges([tok_1+start_node, rel_idx+rel_start_node], [rel_idx+rel_start_node, tok_2+start_node])
        label_list.append(rel)
    # Labels are written for this source's nodes only (offset by start_node);
    # node 0 is never given a 'label' attribute here.
    for i, label in enumerate(label_list):
        g.node_attributes[i+start_node]['label'] = label
g.node_features['is_rel'] = torch.BoolTensor(is_rel)
g.node_features['is_entity'] = torch.BoolTensor(is_entity)

In [None]:
g.get_edge_num()

In [None]:
def find_triangle_with_node(graph:nx.Graph, first_node:str, second_node:str='', third_node:str=''):
    """List the sentence-annotated edges of the triangles around `first_node`.

    Triangles come from ``find_triangles(graph, first_node)`` and are processed
    in order of their second node. When `second_node` / `third_node` are
    non-empty, only triangles containing them among their other two nodes are kept.

    Returns:
        A list of ``(node_a, sentence, node_b, score)`` tuples. The edge from
        `first_node` to each other node is emitted once; the edge between the
        two other nodes is emitted for every kept triangle.
    """
    def edge_row(left, right, src, dst):
        # Tuple for the edge stored under (src, dst), reported as left -> right.
        edge = graph.get_edge_data(src, dst)
        return (left, note2line(edge['note']).strip(), right, edge['score'])

    required = [node for node in (second_node, third_node) if node]
    rows = []
    reported = set()
    for n1, n2, n3 in sorted(find_triangles(graph, first_node), key=lambda tri: tri[1]):
        if any(node != n2 and node != n3 for node in required):
            continue
        for other in (n2, n3):
            if other not in reported:
                reported.add(other)
                rows.append(edge_row(n1, other, n1, other))
        rows.append(edge_row(n2, n3, n3, n2))
    return rows


def isf(w:str, D:int, counters:List[Counter]):
    """Return the inverse sentence frequency of `w`.

    Computed as ``log(D / k)`` where ``k`` is the number of entries of
    `counters` that contain `w`.
    """
    containing = sum(w in sent for sent in counters)
    return math.log(float(D) / containing)


def do_pagerank(sents:List[str]):
    """Rank sentences with PageRank over a pairwise similarity graph.

    Each sentence is split on whitespace and tokens found in ``sw`` or
    ``self_define_stopwords`` are dropped. A word's weight in a sentence is its
    count times its inverse sentence frequency (``isf``); the similarity of two
    sentences is the cosine of their weight vectors.

    Args:
        sents: the sentences to rank.

    Returns:
        A pair ``(ranked_sents, ranked_scores)``, both ordered by descending
        PageRank score.
    """
    # Remove stop words
    clean_sents = [[token for token in sent.split() if token not in sw and token not in self_define_stopwords] for sent in sents]

    # Generate word counters
    counters = [Counter(sent) for sent in clean_sents]

    D = len(clean_sents)
    # isf depends only on the word, so compute it once per vocabulary entry
    # instead of once per word per sentence pair.
    isf_of = {w: isf(w, D, counters) for counter in counters for w in counter}

    # Norm of each sentence's weight vector
    part_list = [math.sqrt(sum([(sent[w] * isf_of[w]) ** 2 for w in sent])) for sent in counters]

    # Build similarity matrix (upper triangle, mirrored below)
    sim_matrix = np.zeros((D, D))
    for i in range(D - 1):
        for j in range(i + 1, D):
            denominator = part_list[i] * part_list[j]
            if denominator == 0:
                # A sentence with zero norm (empty after stop-word removal, or
                # made only of words that occur in every sentence so that
                # isf == log(1) == 0) has similarity 0 with everything; the
                # division used to raise ZeroDivisionError here.
                continue
            sent_1 = counters[i]
            sent_2 = counters[j]
            share_word_set = sent_1 & sent_2
            numerator = sum([(sent_1[w] * sent_2[w] * (isf_of[w] ** 2)) for w in share_word_set])
            sim_matrix[i, j] = numerator / denominator
    sim_matrix = sim_matrix + sim_matrix.T
    g = nx.from_numpy_array(sim_matrix)
    score = nx.pagerank(g)
    temp = sorted(score.items(), key=lambda x: x[1], reverse=True)
    idx = [item[0] for item in temp]
    return [sents[i] for i in idx], [score[i] for i in idx]

In [None]:
test_triangles = find_triangle_with_node(single_sent_graph, 'Machine learning', 'Artificial neural network', 'Deep learning')
test_triangles

In [None]:
sent_list = [triangle[1] for triangle in test_triangles]
sents, score = do_pagerank(sent_list)
list(zip(score, sents))

## Evaluate Generation Output

In [None]:
# Gather the generated sentence of every baseline into one table; the
# 'target' and 'pair' columns come from the last baseline file read.
baselines = ['baseline1', 'baseline2', 'baseline3', 'baseline4', 'baseline5']
data = {}
for baseline in baselines:
    pairs, gens, tars = [], [], []
    with open('MyFiD/checkpoint/'+baseline+'_test/final_output.tsv') as f_in:
        # every row must have exactly three tab-separated fields
        for pair, gen, tar in (line.strip().split('\t') for line in f_in):
            pairs.append(pair)
            gens.append(gen)
            tars.append(tar)
    data[baseline] = gens
    data['target'] = tars
    data['pair'] = pairs
evaluation_columns = ['pair', 'target']+baselines
pd.DataFrame(data)[:100].to_csv('evaluation.csv', index=False, columns=evaluation_columns)

## Generate Super Sub-graph

### Collect all sentences between two entities within one hop

In [None]:
# [Load] Single sentence graph
single_sent_graph = my_read_pickle(single_sent_graph_file)
edges = [edge for edge in tqdm.tqdm(single_sent_graph.edges) if single_sent_graph.get_edge_data(*edge)['score'] > 0.65]
filtered_graph = single_sent_graph.edge_subgraph(edges)
print('number of nodes:', filtered_graph.number_of_nodes())
print('number of edges:', filtered_graph.number_of_edges())

In [None]:
paths = find_path_between_pair(single_sent_graph, 'Artificial intelligence', 'Natural language processing', 1)

In [None]:
def build_subgraph(paths:list, single_sent_graph:nx.Graph):
    """Collect the distinct edges, with their sentence, used by multi-hop paths.

    Paths with two nodes or fewer are skipped. Every consecutive node pair of
    the remaining paths is emitted once, regardless of direction.

    Args:
        paths: sequences of node names.
        single_sent_graph: graph whose edge data holds a ``'note'`` entry.

    Returns:
        A list of ``[node_a, node_b, sentence]`` triples with the nodes in the
        order they appear in the first path that uses the edge.
    """
    pairs = set()
    triples = []
    for path in paths:
        if len(path) <= 2:
            continue
        for head, tail in zip(path, path[1:]):
            new_pair = frozenset((head, tail))
            if new_pair in pairs:
                continue
            pairs.add(new_pair)
            sent = note2line(single_sent_graph.get_edge_data(head, tail)['note']).strip()
            # Emit the nodes in path order. list(frozenset) depended on hash
            # order (not stable across runs for strings) and collapsed a
            # repeated node into a single element.
            triples.append([head, tail, sent])
    return triples

In [None]:
subgraph = build_subgraph(paths, single_sent_graph)

### Generate a graph for one sentence

In [None]:
d = my_read_pickle(entity_occur_from_cooccur_file)

In [None]:
i = 1520

In [None]:
all([a[i]['target'] ==b[i]['target'] for i in range(len(a))])

In [None]:
a[i]['source'][a[i]['triple'][0][1]['sent']]

In [None]:
a[i]['entity'][a[i]['triple'][0][0]['e2']]

In [None]:
b[i]['entity'][b[i]['triple'][0][0]['e2']]

In [None]:
b[i]['source'][b[i]['triple'][0][1]['sent']]

In [None]:
original_file = 'MyFiD/data/train.json'
random_file = 'MyFiD/data/random_train.json'
pair2sent = {}
with open(original_file) as f_in:
    original_samples = json.load(f_in)
with open(random_file) as f_in:
    random_samples = json.load(f_in)

In [None]:
sum([1 for sample in tqdm.tqdm(random_samples) if sample['target'] in sample['source']])

In [None]:
# [[for t in range(len(original_samples[i][]))] 
# Count the samples whose first path keeps at least one identical sentence in
# the original and the random dataset, remembering the first such entity pair.
count = 0
overlap_pairs = set()
for i, original_sample in enumerate(original_samples):
    random_sample = random_samples[i]
    for j, original_tri in enumerate(original_sample['triple'][0]):
        original_sent = original_sample['source'][original_tri['sent']]
        random_sent = random_sample['source'][random_sample['triple'][0][j]['sent']]
        if original_sent != random_sent:
            continue
        count += 1
        entity_names = original_sample['entity']
        overlap_pairs.add(frozenset((entity_names[original_tri['e1']], entity_names[original_tri['e2']])))
        break
print(count)

In [None]:
len(overlap_pairs)

In [None]:
overlap_occur = []
for pair in overlap_pairs:
    ent1, ent2 = pair
    overlap_occur.append(len(d[ent1] & d[ent2]))

In [None]:
c = Counter(overlap_occur)

In [None]:
c.most_common()

In [None]:
# For every triple, look up (and cache in `pair2sent`) up to two sentences in
# which both entities occur, and pick one that differs from the sample target.
# Stops at the first sample for which no such sentence exists.
# NOTE(review): `samples` is not assigned in any cell visible here — confirm
# which dataset it is meant to be before running.
for i, sample in enumerate(tqdm.tqdm(samples[6900:])):
    entity = sample['entity']
    stop = False
    sent_list = []
    for path in sample['triple']:
        for tri in path:
            ent1, ent2 = entity[tri['e1']], entity[tri['e2']]
            pair = frozenset((ent1, ent2))
            sents = pair2sent.get(pair)
            if not sents:
                # keep at most two candidate sentences per pair
                sent_candidates = list(d[ent1] & d[ent2])
                sents = [note2line(sent).strip() for sent in sent_candidates[:2]]
                pair2sent[pair] = sents
            try:
                sent = sents[0] if sents[0] != sample['target'] else sents[1]
            except IndexError:
                # Fewer candidates than needed. Only IndexError is handled so
                # that unrelated failures (e.g. a missing 'target' key) surface
                # instead of being reported as a missing sentence.
                print(i)
                print(pair)
                print(json.dumps(sample))
                stop = True
                break
            # if sent not in sent_list:
            #     sent_list.append(sent)
            # tri['sent'] = sent_list.index(sent)
        if stop:
            break
    if stop:
        break
    # sample['source'] = sent_list

## [Not necessary] Online operations

In [None]:
def collect_sents_from_wiki_page(page:wikipedia.WikipediaPage):
    """Return the sentences of a page's summary and of its content sections.

    Sections named in the skip list and sections without text are ignored.
    Text is lower-cased and whitespace-normalised before being passed to
    ``my_sentence_tokenize``.
    """
    skipped_sections = ('See also', 'References', 'Further reading', 'Sources', 'External links')
    texts = {sec : page.section(sec) for sec in page.sections}
    texts['summary'] = page.summary
    sents = []
    # Walk the sections from last to first.
    for section in reversed(list(texts)):
        if section in skipped_sections:
            continue
        section_text = texts[section]
        if not section_text:
            continue
        normalized_text = ' '.join(section_text.lower().split())
        sents += my_sentence_tokenize(normalized_text, True)
    return list(sents)

def collect_entity_from_wiki_page(page:wikipedia.WikipediaPage):
    """Return the lower-cased form of every entry in ``page.links``."""
    lowered_links = []
    for text in page.links:
        lowered_links.append(text.lower())
    return lowered_links

def collect_keyword_from_wiki_page(page:wikipedia.WikipediaPage):
    """Return the lower-cased anchor texts of the page's internal wiki links.

    Only ``<a>`` tags inside the ``div.mw-parser-output`` block whose markup
    starts with ``<a href="/wiki/`` are kept.

    Returns:
        A set of keyword strings; empty when the page HTML has no
        ``mw-parser-output`` block.
    """
    soup = BeautifulSoup(page.html(), 'html.parser')
    main_block = soup.find('div', class_='mw-parser-output')
    if main_block is None:
        # No such block: return no keywords instead of failing with
        # AttributeError on None.
        return set()
    # A plain prefix test is equivalent to the former anchored regex.
    keywords = {l.text.lower() for l in main_block.findAll('a') if str(l).startswith('<a href="/wiki/')}
    return keywords



In [None]:
# Fetch one page online and dump its sentences, its link keywords and the
# path-filtered relations (all rows, then only rows whose head and tail are
# both page keywords).
keyword = 'python'

# NOTE(review): this rebinds `p`, which is also a name imported from
# extract_wiki at the top of the notebook.
p = wikipedia.page(keyword)
# NOTE(review): presumably wikipedia.page raises on a missing page rather than
# returning None, which would make this check always true — confirm.
if p is not None:
    sents = collect_sents_from_wiki_page(p)
    keywords = collect_keyword_from_wiki_page(p)
    print('sentences collected')
    my_write('%s.txt' % keyword, sents)
    my_write('%s_kw.txt' % keyword, keywords)
    # NOTE(review): `filter_by_path` is neither imported nor defined in the
    # cells visible here — confirm where it comes from.
    df = filter_by_path(sents)
    df.to_csv('%s_out.tsv' % keyword, sep='\t', index=False)

    dff = df[df.apply(lambda x: str(x['head']) in keywords and str(x['tail']) in keywords, axis=1)]
    dff.to_csv('%s_out_f.tsv' % keyword, sep='\t', index=False)

# Test

In [None]:
graph = my_read_pickle(graph_file)

In [None]:
sig_sentence_graph = my_read_pickle('sentence_graph_significant_0.40.pickle')

In [None]:
exp_sentence_graph = my_read_pickle('sentence_graph_explicit_0.40.pickle')

In [None]:
scr_sentence_graph = my_read_pickle('sentence_graph_score_0.60.pickle')

In [None]:
len(scr_sentence_graph.edges)

In [None]:
rdn_sentence_graph = my_read_pickle('sentence_graph_random.pickle')

In [None]:
ent1 = 'Python (programming language)'
ent2 = 'Programming language'

In [None]:
data = sig_sentence_graph.get_edge_data(ent1, ent2)
data['data']

In [None]:
data = exp_sentence_graph.get_edge_data(ent1, ent2)
data['data']

In [None]:
with open('MyFiD/data/test.json') as f_in:
    test = json.load(f_in)

In [None]:
# Read the baseline generations through context managers so the file handles
# are closed (the bare open(...).readlines() calls never closed them).
with open('MyFiD/checkpoint/baseline1_test/final_output.tsv') as f_in:
    baseline1_output = f_in.readlines()
with open('MyFiD/checkpoint/baseline3_test/final_output.tsv') as f_in:
    baseline3_output = f_in.readlines()
with open('MyFiD/checkpoint/baseline5_test/final_output.tsv') as f_in:
    baseline5_output = f_in.readlines()

# One row per test pair that has at least 10 sentences in `graph`: the first
# sentence stored in each sentence graph, the three baseline generations, and
# empty score columns.
data = [{'pair' : 'ent1: %s ent2: %s' % tuple(item['pair']),
         'rdn' : note2line(rdn_sentence_graph.get_edge_data(*item['pair'])['data'][0]['note']).strip(),
         'rdn_score' : '',
         'exp' : note2line(exp_sentence_graph.get_edge_data(*item['pair'])['data'][0]['note']).strip(), 
         'exp_score' : '',
         'sig' : note2line(sig_sentence_graph.get_edge_data(*item['pair'])['data'][0]['note']).strip(), 
         'sig_score' : '',
         'scr' : note2line(scr_sentence_graph.get_edge_data(*item['pair'])['data'][0]['note']).strip(),
         'scr_score' : '',
         'baseline1' : baseline1_output[i].strip().split('\t')[1], 
         'baseline1_score' : '', 
         'baseline3' : baseline3_output[i].strip().split('\t')[1], 
         'baseline3_score' : '', 
         'baseline5' : baseline5_output[i].strip().split('\t')[1], 
         'baseline5_score' : '', 
         } for i, item in enumerate(tqdm.tqdm(test)) if len(graph.get_edge_data(*item['pair'])['data']) >= 10]
pd.DataFrame(data[:100]).to_csv('evaluation_all.csv', index=False)
print(len(data))

In [None]:
sum([(item['exp'] == item['scr']) and (item['sig'] == item['scr']) for item in data]) / len(data)

In [None]:
# Gather the generated sentence of every baseline into one table; the
# 'target' and 'pair' columns come from the last baseline file read.
baselines = ['baseline1', 'baseline3', 'baseline5']
data = {}
for baseline in baselines:
    pairs, gens, tars = [], [], []
    with open('MyFiD/checkpoint/'+baseline+'_test/final_output.tsv') as f_in:
        # every row must have exactly three tab-separated fields
        for pair, gen, tar in (line.strip().split('\t') for line in f_in):
            pairs.append(pair)
            gens.append(gen)
            tars.append(tar)
    data[baseline] = gens
    data['target'] = tars
    data['pair'] = pairs
evaluation_columns = ['pair', 'target']+baselines
pd.DataFrame(data)[:100].to_csv('evaluation.csv', index=False, columns=evaluation_columns)