# Extract Sentences from Wikipedia
+ This notebook collects sentences from Wikipedia that describe the relationship between two entities, selecting them with dependency-path patterns
+ **This notebook has been fully validated on the Owl3 machine (using the /scratch/data/wikipedia/full_text-2021-03-20 data)**

In [8]:
import re
from bs4 import BeautifulSoup
import pandas as pd
import sys
import wikipedia
import os
import pickle
from wikipedia2vec import Wikipedia2Vec
import bz2
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import tqdm

sys.path.append('..')

from tools.BasicUtils import my_read, my_write, MyMultiProcessing
from tools.TextProcessing import (
                normalize_text, remove_brackets, my_sentence_tokenize, build_word_tree_v2, 
                my_sentence_tokenize, filter_specific_keywords, find_dependency_path_from_tree, find_span, nlp, 
                sent_lemmatize
                )
from tools.DocProcessing import CoOccurrence

from extract_wiki import SentenceFilter

from extract_wiki import (
    wikipedia_dir, wikipedia_entity_file, wikipedia_entity_norm_file, 
    wikipedia_keyword_file, wikipedia_token_file, wikipedia_wordtree_file, wikipedia_keyword_filtered_file, keyword_npmi_graph_file_v2, 
    save_path, keyword_occur_file, keyword_connection_graph_file, w2vec_dump_file, w2vec_keyword_file, w2vec_wordtree_file, w2vec_token_file, w2vec_keyword2idx_file, 
    collect_wiki_entity, get_sentence, keyword_count_file, filter_keyword_by_freq, line2note, note2line
)

test_path = 'data/extract_wiki/wiki_sent_test'

# Generate the save dirs.
# makedirs(exist_ok=True) also creates missing parent folders (test_path is
# nested) and does not fail when the folder already exists.
os.makedirs(save_path, exist_ok=True)
os.makedirs(test_path, exist_ok=True)

# One output sub folder per wikipedia dump sub folder, in the same order
sub_folders = os.listdir(wikipedia_dir)
save_sub_folders = [os.path.join(save_path, sub) for sub in sub_folders]
wiki_sub_folders = [os.path.join(wikipedia_dir, sub) for sub in sub_folders]

for save_dir in save_sub_folders:
    os.makedirs(save_dir, exist_ok=True)

# Get all files under wikipedia/full_text-2021-03-20 and derive, for each of
# them, the sentence (.dat), co-occurrence (_co.dat) and selected (.tsv) paths.
# The four lists are index-aligned.
wiki_files = []
save_sent_files = []
save_cooccur_files = []
save_selected_files = []

for wiki_dir, save_dir in zip(wiki_sub_folders, save_sub_folders):
    files = os.listdir(wiki_dir)
    wiki_files += [os.path.join(wiki_dir, f) for f in files]
    save_sent_files += [os.path.join(save_dir, f+'.dat') for f in files]
    save_cooccur_files += [os.path.join(save_dir, f+'_co.dat') for f in files]
    save_selected_files += [os.path.join(save_dir, f+'.tsv') for f in files]

print('wiki sub folder example:', wiki_sub_folders[0])
print('save sub folder example:', save_sub_folders[0])
print('wiki file example:', wiki_files[0])
print('save sentence file example:', save_sent_files[0])
print('save cooccur file example:', save_cooccur_files[0])
print('save selected sentence file example:', save_selected_files[0])

wiki sub folder example: ../../data/wikipedia/full_text-2021-03-20/BE
save sub folder example: data/extract_wiki/wiki_sent_collect/BE
wiki file example: ../../data/wikipedia/full_text-2021-03-20/BE/wiki_00
save sentence file example: data/extract_wiki/wiki_sent_collect/BE/wiki_00.dat
save cooccur file example: data/extract_wiki/wiki_sent_collect/BE/wiki_00_co.dat
save selected sentence file example: data/extract_wiki/wiki_sent_collect/BE/wiki_00.tsv


## Collect wikipedia page titles as entities and generate keyword list

In [None]:
# Collect wikipedia entities: run collect_wiki_entity over every wiki file,
# flatten the per-file results into one list and write it to the entity file.
p = MyMultiProcessing(10)
output = p.run(collect_wiki_entity, wiki_files)
entity_list = [entity for per_file in output for entity in per_file]
my_write(wikipedia_entity_file, entity_list)

In [None]:
# Get normalized wikipedia entities
# Every line of the entity file holds two tab-separated fields (id, title);
# the id is kept and the title is passed through normalize_text.
normalized_entity = []
# Context manager so the entity file is closed once it has been read
with open(wikipedia_entity_file) as f_in:
    for line in f_in:
        eid, ent = line.split('\t')
        normalized_entity.append('%s\t%s' % (eid, normalize_text(ent)))
my_write(wikipedia_entity_norm_file, normalized_entity)

In [None]:
# Generate keyword list file
# Take the second tab-separated field of every normalized entity line and
# strip its brackets; the context manager closes the file after reading.
with open(wikipedia_entity_norm_file) as f_in:
    keywords = [remove_brackets(line.strip().split('\t')[1]) for line in f_in]
# Drop keywords that are empty or whitespace-only after bracket removal
keywords = [kw for kw in keywords if kw.split()]
keywords = filter_specific_keywords(keywords)
# De-duplicate (the resulting order is arbitrary)
keywords = list(set(keywords))
my_write(wikipedia_keyword_file, keywords)

In [None]:
# Build wordtree
build_word_tree_v2(wikipedia_keyword_file, wikipedia_wordtree_file, wikipedia_token_file)

## Collect sentences from wikipedia and select good sentences by path

In [None]:
# Process the wikipedia page files to sentence only file (12 min)
# Each task is a (wiki page file, sentence output file) pair; the two lists
# are built in lockstep, so they can be paired position by position.
p = MyMultiProcessing(10)
wiki_sent_pair = list(zip(wiki_files, save_sent_files))
output = p.run(get_sentence, wiki_sent_pair)

In [None]:
# Create the sentence filter
sf = SentenceFilter(wikipedia_wordtree_file, wikipedia_token_file)

In [None]:
# Create the sentence filter
sf = SentenceFilter(wikipedia_wordtree_file, wikipedia_token_file)
def collect_sents(save_sent_file:str, save_selected_file:str):
    """Filter one sentence file with the module-level ``sf`` and save the result.

    Args:
        save_sent_file: path of the sentence file, loaded with ``my_read``.
        save_selected_file: path of the tab-separated file to write.
    """
    selected = sf.list_operation(my_read(save_sent_file), use_id=True, keyword_only=True)
    selected.to_csv(save_selected_file, sep='\t', index=False)

In [None]:
# Test of selecting sentences by path ['collect_sents']
# Uses the first 20 sentence files; slicing (instead of indexing range(20))
# keeps the test runnable when fewer than 20 sentence files exist.
test_list = [(sent_file, '%d.tsv' % i) for i, sent_file in enumerate(save_sent_files[:20])]
test_output = p.run(collect_sents, test_list)

In [None]:
# Get the keyword occurrence (15 min) ['collect_kw_occur_from_selected']

In [None]:
# Build connected graph from selected sentences(18 min) ['build_graph_from_selected']

In [None]:
# Collect keyword co-occurrence from sentence files (2 hours) ['collect_kw_occur_from_sents']

In [None]:
# Filter the less frequent keywords using co-occurrence files (8 min)
keyword_count = filter_keyword_by_freq(save_cooccur_file_list=save_cooccur_files)
with open(keyword_count_file, 'wb') as f_out:
    pickle.dump(keyword_count, f_out)

In [None]:
f = {k : v for k, v in keyword_count.items() if v >= 300}
my_write(wikipedia_keyword_filtered_file, list(f.keys()))
print(len(f))

In [None]:
'python' in f

In [None]:
# Build connected graph from co-occurrence files ['build_graph_from_cooccur']

## Wikipedia2vec implementation

In [3]:
with bz2.open(w2vec_dump_file) as f_in:
    w2vec = Wikipedia2Vec.load(f_in)

  return next(self.gen)


In [49]:
# Map every normalized, bracket-free entity title to the indices of the
# wikipedia2vec entities carrying it; entities with count < 50 are ignored.
my_mention_dict = {}
for ent in w2vec.dictionary.entities():
    if ent.count >= 50:
        kw = remove_brackets(normalize_text(ent.title))
        my_mention_dict.setdefault(kw, []).append(ent.index)

# Filter the titles, save them as the keyword file and build its word tree
w2vec_kws = filter_specific_keywords(list(my_mention_dict))
my_write(w2vec_keyword_file, w2vec_kws)
build_word_tree_v2(w2vec_keyword_file, w2vec_wordtree_file, w2vec_token_file)

# Keep only the mentions that survived the keyword filter, then persist them
filter_keyword_from_w2vec = set(w2vec_kws)
my_mention_dict = {
    mention: indices
    for mention, indices in my_mention_dict.items()
    if mention in filter_keyword_from_w2vec
}
with open(w2vec_keyword2idx_file, 'wb') as f_out:
    pickle.dump(my_mention_dict, f_out)
len(my_mention_dict)

transform keywords into index


100%|██████████| 330958/330958 [00:00<00:00, 349055.91it/s]


start building wordtree


100%|██████████| 330958/330958 [00:01<00:00, 296610.97it/s]


Building word tree is accomplished with 330958 words added
Total time taken in :  build_word_tree_v2 2.7751083374023438


330958

In [50]:
co = CoOccurrence(w2vec_wordtree_file, w2vec_token_file)

In [110]:
co.line_operation(sent_lemmatize('anarchism is a political philosophy and political movement that is sceptical of authority and rejects all involuntary , coercive forms of hierarchy.'))

{'anarchism',
 'authority',
 'hierarchy',
 'political movement',
 'political philosophy'}

In [51]:
def _pair_record(kw1, ent1, kw2, ent2, sim, sent_note):
    """Build the record describing one scored keyword pair of a sentence."""
    return {'kw1':kw1, 'kw2':kw2, 'sim':float(sim), 'sent':sent_note,
            'kw1_ent':ent1.title,
            'kw2_ent':ent2.title}


def collect_test_paths_all(test_file:str):
    """Score co-occurring keyword pairs of a sentence file and attach dependency paths.

    For every line of ``test_file`` the co-occurring keywords are found with
    the module-level ``co``; lines with fewer than two keywords are skipped.
    Each keyword is looked up in ``my_mention_dict``: a keyword with exactly
    one entity index is "certain", any other keyword is "uncertain". Every
    keyword pair is scored with the cosine similarity of the wikipedia2vec
    vectors (``w2vec``); for an uncertain keyword the candidate entity giving
    the highest similarity is kept. Finally, for every remaining pair, the
    dependency path between each combination of spans found in the parsed
    sentence is recorded.

    Args:
        test_file: path of a sentence file, read line by line.

    Returns:
        A DataFrame with the columns 'sim', 'kw1', 'kw1_span', 'kw1_ent',
        'kw2', 'kw2_span', 'kw2_ent', 'sent' and 'path'; one row per
        (pair, kw1 span, kw2 span) combination.
    """
    columns = ['sim', 'kw1', 'kw1_span', 'kw1_ent', 'kw2', 'kw2_span', 'kw2_ent', 'sent', 'path']
    data = []
    with open(test_file) as f_in:
        for line_idx, line in enumerate(tqdm.tqdm(f_in.readlines())):
            sent_note = line2note(test_file, line_idx)
            line = line.strip()
            co_kws = list(co.line_operation(sent_lemmatize(line)))
            if len(co_kws) < 2:
                continue

            # Split the keywords by whether they resolve to exactly one entity
            certain_kws, certain_ents = [], []
            uncertain_kws, uncertain_ents = [], []
            for kw in co_kws:
                ents = [w2vec.dictionary.get_entity_by_index(idx) for idx in my_mention_dict[kw]]
                if len(ents) == 1:
                    certain_kws.append(kw)
                    certain_ents.append(ents[0])
                else:
                    uncertain_kws.append(kw)
                    uncertain_ents.append(ents)

            certain_vecs = np.array([w2vec.get_vector(ent) for ent in certain_ents])
            uncertain_vecs = [np.array([w2vec.get_vector(ent) for ent in ents]) for ents in uncertain_ents]
            certain_len = len(certain_ents)
            uncertain_len = len(uncertain_ents)

            pairs = []
            if certain_len >= 1:
                # Pairs between certain entities. Only the strict upper triangle
                # is read, so the diagonal needs no special treatment.
                sims = cosine_similarity(certain_vecs, certain_vecs)
                for i in range(certain_len):
                    for j in range(i+1, certain_len):
                        pairs.append(_pair_record(certain_kws[i], certain_ents[i],
                                                  certain_kws[j], certain_ents[j],
                                                  sims[i, j], sent_note))
                # Pairs between certain and uncertain entities: for each certain
                # entity keep the most similar candidate of the uncertain keyword
                for i in range(uncertain_len):
                    sims = cosine_similarity(certain_vecs, uncertain_vecs[i])
                    for j in range(certain_len):
                        best = int(np.argmax(sims[j]))
                        pairs.append(_pair_record(uncertain_kws[i], uncertain_ents[i][best],
                                                  certain_kws[j], certain_ents[j],
                                                  sims[j, best], sent_note))

            # Pairs between uncertain entities: keep the most similar candidate pair
            for i in range(uncertain_len):
                for j in range(i+1, uncertain_len):
                    sims = cosine_similarity(uncertain_vecs[i], uncertain_vecs[j])
                    # Decode the flat argmax with exact integer arithmetic
                    row, col = divmod(int(np.argmax(sims)), sims.shape[1])
                    pairs.append(_pair_record(uncertain_kws[i], uncertain_ents[i][row],
                                              uncertain_kws[j], uncertain_ents[j][col],
                                              sims[row, col], sent_note))

            # For each remaining pair, get the dependency path between every
            # combination of the spans found for the two keywords
            doc = nlp(line)
            for item in pairs:
                kw1_spans = find_span(doc, item['kw1'], True)
                kw2_spans = find_span(doc, item['kw2'], True)
                for kw1_span in kw1_spans:
                    for kw2_span in kw2_spans:
                        path = find_dependency_path_from_tree(doc, kw1_span, kw2_span)
                        data.append({**item,
                                     'kw1_span':(kw1_span[0].i, kw1_span[-1].i),
                                     'kw2_span':(kw2_span[0].i, kw2_span[-1].i),
                                     'path':path})

    return pd.DataFrame(data=data, columns=columns)

In [53]:
df = collect_test_paths_all(save_path+'/AA/wiki_00.dat')
df.to_csv(save_path + '/test.tsv', sep='\t', index=False)
print(len(df))

100%|██████████| 5797/5797 [01:23<00:00, 69.19it/s]


In [45]:
df.head(10)

Unnamed: 0,sim,kw1,kw1_span,kw1_ent,kw2,kw2_span,kw2_ent,sent,path
0,0.556184,political movement,"(6, 7)",Political movement,political philosophy,"(3, 4)",Political philosophy,AA:00:0,i_conj
1,0.42007,political movement,"(6, 7)",Political movement,authority,"(12, 12)",Authority,AA:00:0,i_conj relcl acomp prep pobj
2,0.437531,political movement,"(6, 7)",Political movement,hierarchy,"(21, 21)",Hierarchy,AA:00:0,i_conj relcl conj dobj prep pobj
3,0.533027,political movement,"(6, 7)",Political movement,anarchism,"(0, 0)",Anarchism,AA:00:0,i_conj i_attr nsubj
4,0.558961,political philosophy,"(3, 4)",Political philosophy,authority,"(12, 12)",Authority,AA:00:0,relcl acomp prep pobj
5,0.380747,political philosophy,"(3, 4)",Political philosophy,hierarchy,"(21, 21)",Hierarchy,AA:00:0,relcl conj dobj prep pobj
6,0.619777,political philosophy,"(3, 4)",Political philosophy,anarchism,"(0, 0)",Anarchism,AA:00:0,i_attr nsubj
7,0.576076,authority,"(12, 12)",Authority,hierarchy,"(21, 21)",Hierarchy,AA:00:0,i_pobj i_prep i_acomp conj dobj prep pobj
8,0.444457,authority,"(12, 12)",Authority,anarchism,"(0, 0)",Anarchism,AA:00:0,i_pobj i_prep i_acomp i_relcl i_attr nsubj
9,0.408588,hierarchy,"(21, 21)",Hierarchy,anarchism,"(0, 0)",Anarchism,AA:00:0,i_pobj i_prep i_dobj i_conj i_relcl i_attr nsubj


In [46]:
dff = df[df['sim'] < 0.]

In [47]:
dff.head()

Unnamed: 0,sim,kw1,kw1_span,kw1_ent,kw2,kw2_span,kw2_ent,sent,path
1612,-0.03817,france,"(1, 1)",France,the world,"(24, 25)",The World (radio program),AA:00:94,i_pobj i_prep nsubj prep pobj prep pobj conj p...
4339,-0.018074,1920,"(10, 10)",1920,current,"(2, 2)",Current (stream),AA:00:222,i_conj i_pobj i_prep i_conj i_relcl
4709,-0.041801,pupil,"(18, 18)",Pupil,attendance,"(28, 28)",Attendance,AA:00:239,i_dative dobj prep pobj prep pcomp dobj conj
4711,-0.016279,attendance,"(28, 28)",Attendance,autonomy,"(22, 22)",Autonomy,AA:00:239,i_conj i_dobj i_pcomp i_prep
5175,-0.039343,brighton,"(40, 40)",Brighton (UK Parliament constituency),name,"(2, 2)",Name,AA:00:251,i_pobj i_prep i_appos


In [57]:
'programming language' in my_mention_dict

True

In [58]:
for idx in my_mention_dict['programming language']:
    print(w2vec.dictionary.get_item_by_index(idx))

<Entity Programming language>


In [None]:
w2vec.most_similar_by_vector(w2vec.get_entity_vector('Python (programming language)'), 20)

In [121]:
cosine_similarity(w2vec.get_entity_vector('Political movement').reshape(1, -1), w2vec.get_entity_vector('Hierarchy').reshape(1, -1))

array([[0.43753132]], dtype=float32)

In [54]:
e = w2vec.get_entity('The World (radio program)')

In [55]:
print(e.count)
print(e.doc_count)

75
68


## Demo

In [None]:
# Load the keyword occurrence dict, which holds the occurrence records of all keywords in the selected sentences
with open(keyword_occur_file, 'rb') as f_in:
    keyword_occur = pickle.load(f_in)

In [None]:
# Load keyword connection graph in selected sentences
with open(keyword_connection_graph_file, 'rb') as f_in:
    keyword_connection_graph = pickle.load(f_in)

In [None]:
# Load keyword count file
with open(keyword_count_file, 'rb') as f_in:
    keyword_count = pickle.load(f_in)

In [14]:
# Load mention to index file
with open(w2vec_keyword2idx_file, 'rb') as f_in:
    my_mention_dict = pickle.load(f_in)

In [None]:
# Load keyword connection graph built from the co-occurrence files
with open(keyword_npmi_graph_file_v2, 'rb') as f_in:
    keyword_npmi_graph = pickle.load(f_in)
for k, v in keyword_npmi_graph[1].items():
    print(k)
    print(v)
    break

In [None]:
# Demo function: find all the sentences that two keywords co-occur in selected sentences
def find_sentences(keyword_dict:dict, kw1:str, kw2:str):
    """Find all the selected sentences in which two keywords co-occur.

    Args:
        keyword_dict: maps a keyword to the set of its occurrence notes. A
            note is a string of the form ``<sub_file>:<line_idx>`` where
            ``line_idx`` indexes a line of the ``.tsv`` record file.
        kw1: first keyword.
        kw2: second keyword.

    Returns:
        A DataFrame with the columns 'head', 'head_norm', 'head_span', 'tail',
        'tail_norm', 'tail_span', 'sent' and 'path'. It is empty when either
        keyword is unknown or the keywords never co-occur.
    """
    columns = ['head', 'head_norm', 'head_span', 'tail', 'tail_norm', 'tail_span', 'sent', 'path']
    kw1_occur = keyword_dict.get(kw1)
    kw2_occur = keyword_dict.get(kw2)
    if not kw1_occur or not kw2_occur:
        return pd.DataFrame(columns=columns)

    # Group the shared occurrences by file so that every file is read once
    file_dict = {}
    for occur in kw1_occur & kw2_occur:
        sub_file, line_idx = occur.rsplit(':', 1)
        file_dict.setdefault(sub_file, []).append(int(line_idx))

    # Rows are gathered in a list and turned into a DataFrame once:
    # DataFrame.append copied the frame on every call and no longer exists
    # in pandas >= 2.0.
    rows = []
    for sub_file, lines in file_dict.items():
        base_path = os.path.join(save_path, sub_file.replace(':', '/wiki_'))
        sentence_in_file = my_read(base_path+'.dat')
        records = my_read(base_path+'.tsv')
        for idx in lines:
            record = records[idx].split('\t')
            # Field 6 of a record is the index of its sentence in the .dat file
            rows.append({'head':record[0],
                         'head_norm':record[1],
                         'head_span':record[2],
                         'tail':record[3],
                         'tail_norm':record[4],
                         'tail_span':record[5],
                         'sent':sentence_in_file[int(record[6])],
                         'path':record[7]})
    return pd.DataFrame(rows, columns=columns)

In [None]:
df = find_sentences(keyword_occur, 'python', 'programming language')
df.to_csv('sents.tsv', sep='\t', index=False)

In [None]:
'decision tree' in keyword_occur

In [None]:
len(keyword_occur['machine learning'])

In [None]:
kw1 = 'data mining'
kw2 = 'machine learning'
doc = nlp('Data mining is a process of extracting and discovering patterns in large data sets involving methods at the intersection of machine learning, statistics, and database systems.'.lower())
kw1_span = find_span(doc, kw1)
kw2_span = find_span(doc, kw2)
find_dependency_path_from_tree(doc, kw1_span[0], kw2_span[0])
# print(len(kw1_span))
# print(len(kw2_span))

In [None]:
data = keyword_connection_graph.neighbors('decision tree')
my_write('neighbors.txt', list(data))

## Online operations

In [None]:
def collect_sents_from_wiki_page(page:wikipedia.WikipediaPage):
    """Collect the sentences of a wikipedia page, section by section.

    The text of every section plus the page summary is lower-cased, its
    whitespace is collapsed to single spaces, and it is split into sentences
    with ``my_sentence_tokenize``. Reference-like sections and sections
    without text are skipped.

    Args:
        page: the wikipedia page to read.

    Returns:
        A list with the sentences of all the kept sections.
    """
    skipped_sections = ('See also', 'References', 'Further reading', 'Sources', 'External links')
    text_by_section = {title: page.section(title) for title in page.sections}
    text_by_section['summary'] = page.summary

    collected = []
    # Sections are visited from the last inserted key to the first
    for title in reversed(list(text_by_section)):
        if title in skipped_sections:
            continue
        body = text_by_section[title]
        if not body:
            continue
        flattened = ' '.join(body.lower().split())
        collected += my_sentence_tokenize(flattened, True)
    return collected

def collect_entity_from_wiki_page(page:wikipedia.WikipediaPage):
    """Return the entries of ``page.links`` converted to lower case."""
    lowered_links = []
    for link_title in page.links:
        lowered_links.append(link_title.lower())
    return lowered_links

def collect_keyword_from_wiki_page(page:wikipedia.WikipediaPage):
    """Collect the lower-cased anchor texts of the internal links of a page.

    Args:
        page: the wikipedia page whose HTML is parsed.

    Returns:
        A set with the lower-cased text of every ``<a>`` tag inside the
        ``mw-parser-output`` div whose serialized form starts with
        ``<a href="/wiki/``. The set is empty when the page has no such div.
    """
    soup = BeautifulSoup(page.html(), 'html.parser')
    main_block = soup.find('div', class_='mw-parser-output')
    if main_block is None:
        # No main content block: nothing to collect (avoids an AttributeError)
        return set()
    # NOTE(review): the prefix test only matches anchors whose first serialized
    # attribute is href - confirm that skipping the other anchors is intended.
    return {
        anchor.text.lower()
        for anchor in main_block.find_all('a')
        if str(anchor).startswith('<a href="/wiki/')
    }



In [None]:
# Online demo: fetch the wikipedia page for `keyword`, then save its sentences,
# its keywords and the path-filtered sentence tables to files named after it.
keyword = 'python'

# NOTE(review): this rebinds `p`, which earlier cells bind to a MyMultiProcessing instance.
p = wikipedia.page(keyword)
if p is not None:
    sents = collect_sents_from_wiki_page(p)
    keywords = collect_keyword_from_wiki_page(p)
    print('sentences collected')
    my_write('%s.txt' % keyword, sents)
    my_write('%s_kw.txt' % keyword, keywords)
    # NOTE(review): `filter_by_path` is neither imported nor defined in the
    # visible part of this notebook - confirm where it comes from.
    df = filter_by_path(sents)
    df.to_csv('%s_out.tsv' % keyword, sep='\t', index=False)

    # Keep only the rows whose head and tail are both keywords of the page
    dff = df[df.apply(lambda x: str(x['head']) in keywords and str(x['tail']) in keywords, axis=1)]
    dff.to_csv('%s_out_f.tsv' % keyword, sep='\t', index=False)

In [None]:
df['wanted'] = df.apply(lambda x: str(x['head']) in keywords, axis=1)

In [None]:
df.head()

In [None]:
dff.head()

In [None]:
len(dff)