# Extract Sentences from Wikipedia
+ This notebook collects sentences from Wikipedia that express a relationship between two entities, selecting them with dependency path patterns
+ **This notebook is fully valid on the Owl3 machine (using the /scratch/data/wikipedia/full_text-2021-03-20 data)**

In [1]:
import re
from bs4 import BeautifulSoup
import pandas as pd
import sys
import wikipedia
import os
import pickle
sys.path.append('..')

from tools.BasicUtils import my_read, my_write, MyMultiProcessing
from tools.TextProcessing import (
                normalize_text, remove_brackets, my_sentence_tokenize, build_word_tree_v2, 
                my_sentence_tokenize, filter_specific_keywords, find_dependency_path_from_tree, find_span, nlp
                )
from tools.DocProcessing import SentenceFilter

from extract_wiki import (
    wikipedia_dir, wikipedia_entity_file, wikipedia_entity_norm_file, 
    wikipedia_keyword_file, wikipedia_token_file, wikipedia_wordtree_file, 
    save_path, keyword_occur_file, keyword_connection_graph_file,
    collect_wiki_entity, get_sentence
)


# Generate the save dir. makedirs also creates missing parent directories and
# does nothing when the directory already exists.
os.makedirs(save_path, exist_ok=True)

# Mirror every sub folder of the wikipedia dump under save_path
sub_folders = os.listdir(wikipedia_dir)
save_sub_folders = [os.path.join(save_path, sub) for sub in sub_folders]
wiki_sub_folders = [os.path.join(wikipedia_dir, sub) for sub in sub_folders]
for save_dir in save_sub_folders:
    os.makedirs(save_dir, exist_ok=True)

# Get all files under wikipedia/full_text-2021-03-20.
# The three lists are index-aligned: entry k of each list refers to the same
# wiki file (input path, '.dat' sentence output, '.tsv' selected output).
wiki_files = []
save_sent_files = []
save_selected_files = []
for wiki_dir, save_dir in zip(wiki_sub_folders, save_sub_folders):
    for f in os.listdir(wiki_dir):
        wiki_files.append(os.path.join(wiki_dir, f))
        save_sent_files.append(os.path.join(save_dir, f + '.dat'))
        save_selected_files.append(os.path.join(save_dir, f + '.tsv'))

print('wiki sub folder example:', wiki_sub_folders[0])
print('save sub folder example:', save_sub_folders[0])
print('wiki file example:', wiki_files[0])
print('save sentence file example:', save_sent_files[0])
print('save selected sentence file example:', save_selected_files[0])

wiki sub folder example: ../../data/wikipedia/full_text-2021-03-20/BE
save sub folder example: data/extract_wiki/wiki_sent_collect/BE
wiki file example: ../../data/wikipedia/full_text-2021-03-20/BE/wiki_00
save sentence file example: data/extract_wiki/wiki_sent_collect/BE/wiki_00.dat
save selected sentence file example: data/extract_wiki/wiki_sent_collect/BE/wiki_00.tsv


## Collect wikipedia page titles as entities and generate keyword list

In [None]:
# Collect wikipedia entities
# collect_wiki_entity is run over every wiki file; each element of the result
# is iterated and its items are flattened into one list before writing.
p = MyMultiProcessing(10)
output = p.run(collect_wiki_entity, wiki_files)
entity_list = [entity for file_entities in output for entity in file_entities]
my_write(wikipedia_entity_file, entity_list)

In [None]:
# Get normalized wikipedia entities
# Each line of the entity file is parsed as "<id>\t<entity>".
normalized_entity = []
with open(wikipedia_entity_file) as f_in:  # ensure the handle is closed
    for line in f_in:
        # Drop the line terminator so it is not passed on as part of the
        # entity, and split on the first tab only so a tab inside the entity
        # text does not break the two-value unpacking.
        eid, ent = line.rstrip('\n').split('\t', 1)
        normalized_entity.append('%s\t%s' % (eid, normalize_text(ent)))
my_write(wikipedia_entity_norm_file, normalized_entity)

In [None]:
# Generate keyword list file
keywords = []
with open(wikipedia_entity_norm_file) as f_in:  # ensure the handle is closed
    for line in f_in:
        fields = line.strip().split('\t')
        # A line whose normalized entity is empty loses its trailing tab to
        # strip() and yields a single field; skip it instead of raising
        # IndexError (empty keywords are discarded below anyway).
        if len(fields) < 2:
            continue
        keywords.append(remove_brackets(fields[1]))
# Drop keywords that are empty or whitespace-only
keywords = [kw for kw in keywords if kw.split()]
keywords = filter_specific_keywords(keywords)
my_write(wikipedia_keyword_file, keywords)

In [None]:
# Build wordtree
# NOTE(review): presumably reads the keyword file and writes the word-tree and
# token files that SentenceFilter is constructed from later — confirm in
# tools.TextProcessing.build_word_tree_v2
build_word_tree_v2(wikipedia_keyword_file, wikipedia_wordtree_file, wikipedia_token_file)

## Collect sentences from wikipedia and select good sentences by path

In [None]:
# Process the wikipedia page files to sentence only file (10 min)
p = MyMultiProcessing(10)
# Pair each wiki file with its sentence output path; the two lists are
# index-aligned because they are filled together when the files are listed.
wiki_sent_pair = list(zip(wiki_files, save_sent_files))
output = p.run(get_sentence, wiki_sent_pair)

In [None]:
# Create the sentence filter from the word-tree and token files
# (the same two files passed to build_word_tree_v2 above)
sf = SentenceFilter(wikipedia_wordtree_file, wikipedia_token_file)

In [None]:
# Create the sentence filter
sf = SentenceFilter(wikipedia_wordtree_file, wikipedia_token_file)


def collect_sents(save_sent_file:str, save_selected_file:str):
    """Run the sentence filter over one sentence file and save the result.

    Relies on the module-level ``sf`` created above.

    Args:
        save_sent_file: path of the sentence file, loaded with ``my_read``.
        save_selected_file: path the filter output is written to as a
            tab-separated file without the index column.
    """
    selected = sf.list_operation(my_read(save_sent_file), use_id=True, keyword_only=True)
    selected.to_csv(save_selected_file, sep='\t', index=False)

In [None]:
# Test of selecting sentences by path ['collect_sents']
# Only the first 20 sentence files are processed; results go to '<i>.tsv' in
# the working directory. Slicing (rather than indexing over range(20)) keeps
# the test usable when fewer than 20 sentence files exist.
test_list = [(sent_file, '%d.tsv' % i) for i, sent_file in enumerate(save_sent_files[:20])]
test_output = p.run(collect_sents, test_list)

In [None]:
# Get the keyword occurrence (15 min) ['collect_kw_occur_from_selected']

In [None]:
# Build connected graph (18 min) ['build_graph']

In [None]:
# from tools.DocProcessing import CoOccurrence
# co = CoOccurrence(wikipedia_wordtree_file, wikipedia_token_file)
# def collect_occur()

## Demo

In [None]:
# Load the keyword occurrence dict, which records the occurrences of all keywords in the selected sentences
# NOTE(review): pickle.load can execute arbitrary code — only load files produced by this pipeline
with open(keyword_occur_file, 'rb') as f_in:
    keyword_occur = pickle.load(f_in)

In [None]:
# Load the keyword connection graph built from the selected sentences
# NOTE(review): pickle.load can execute arbitrary code — only load files produced by this pipeline
with open(keyword_connection_graph_file, 'rb') as f_in:
    keyword_connection_graph = pickle.load(f_in)

In [None]:
# Demo function: find all the sentences that two keywords co-occur in selected sentences
def find_sentences(keyword_dict:dict, kw1:str, kw2:str):
    """Collect the selected sentences in which both keywords occur.

    Args:
        keyword_dict: maps a keyword to its occurrence ids. The ids of the two
            keywords are intersected with ``&`` and each id is split at its
            last ``':'`` into a file key and a line index (presumably sets of
            strings — confirm against the code that builds the occurrence
            file).
        kw1: first keyword to look up.
        kw2: second keyword to look up.

    Returns:
        A DataFrame with the columns ``head``, ``head_norm``, ``head_span``,
        ``tail``, ``tail_norm``, ``tail_span``, ``sent`` and ``path``, one row
        per shared occurrence. It is empty when either keyword is missing (or
        has no occurrences) or when the two never co-occur.
    """
    columns = ['head', 'head_norm', 'head_span', 'tail', 'tail_norm', 'tail_span', 'sent', 'path']
    kw1_occur = keyword_dict.get(kw1)
    kw2_occur = keyword_dict.get(kw2)
    if not kw1_occur or not kw2_occur:
        return pd.DataFrame(columns=columns)

    # Group the shared occurrence ids by file so each file is read only once
    lines_by_file = {}
    for occur in kw1_occur & kw2_occur:
        sub_file, line_idx = occur.rsplit(':', 1)
        lines_by_file.setdefault(sub_file, []).append(int(line_idx))

    # Rows are accumulated in a list and turned into a DataFrame once:
    # DataFrame.append was removed in pandas 2.0 and copied the frame per row.
    rows = []
    for sub_file, line_indices in lines_by_file.items():
        # The file key uses ':' where the on-disk path has '/wiki_'
        base_path = os.path.join(save_path, sub_file.replace(':', '/wiki_'))
        sentence_in_file = my_read(base_path + '.dat')
        records = my_read(base_path + '.tsv')
        for idx in line_indices:
            record = records[idx].split('\t')
            rows.append({
                'head': record[0],
                'head_norm': record[1],
                'head_span': record[2],
                'tail': record[3],
                'tail_norm': record[4],
                'tail_span': record[5],
                # Field 6 holds the index of the sentence in the '.dat' file
                'sent': sentence_in_file[int(record[6])],
                'path': record[7],
            })
    return pd.DataFrame(rows, columns=columns)

In [None]:
# Example query: sentences where 'python' and 'programming language' co-occur,
# saved as a tab-separated file in the working directory
df = find_sentences(keyword_occur, 'python', 'programming language')
df.to_csv('sents.tsv', sep='\t', index=False)

In [None]:
# Check whether the keyword has any recorded occurrence
'decision tree' in keyword_occur

In [None]:
# Number of recorded occurrences of the keyword (raises KeyError if it is absent)
len(keyword_occur['machine learning'])

In [None]:
# Worked example: dependency path between two keywords inside one sentence
kw1 = 'data mining'
kw2 = 'machine learning'
sentence = 'Data mining is a process of extracting and discovering patterns in large data sets involving methods at the intersection of machine learning, statistics, and database systems.'
doc = nlp(sentence.lower())
kw1_span = find_span(doc, kw1)
kw2_span = find_span(doc, kw2)
# The first span found for each keyword is used. The call stays the last
# expression of the cell so the notebook displays its result.
find_dependency_path_from_tree(doc, kw1_span[0], kw2_span[0])
# Debug aid: number of spans found for each keyword
# print(len(kw1_span))
# print(len(kw2_span))

In [None]:
# Dump the neighbors of 'decision tree' in the keyword connection graph to a text file
data = keyword_connection_graph.neighbors('decision tree')
my_write('neighbors.txt', list(data))

## Online operations

In [None]:
def collect_sents_from_wiki_page(page:wikipedia.WikipediaPage):
    """Tokenize the text of a wikipedia page into a list of sentences.

    The page summary is stored under the key ``'summary'`` next to the page
    sections. Entries are visited from last to first, so the summary normally
    comes first, followed by the sections in reverse order. Sections with a
    title in the skip list, and entries without text, are ignored.

    Args:
        page: the wikipedia page to read sections and summary from.

    Returns:
        A list with the sentences of all visited entries, in visiting order.
    """
    skipped_titles = ['See also', 'References', 'Further reading', 'Sources', 'External links']
    text_by_section = {title: page.section(title) for title in page.sections}
    text_by_section['summary'] = page.summary

    collected = []
    for title in reversed(list(text_by_section)):
        body = text_by_section[title]
        if title in skipped_titles or not body:
            continue
        # Lower-case the text and collapse every whitespace run to one space
        flattened = ' '.join(body.lower().split())
        collected.extend(my_sentence_tokenize(flattened, True))
    return collected

def collect_entity_from_wiki_page(page:wikipedia.WikipediaPage):
    """Return the entries of ``page.links``, each converted to lower case."""
    lowered_links = []
    for link_title in page.links:
        lowered_links.append(link_title.lower())
    return lowered_links

def collect_keyword_from_wiki_page(page:wikipedia.WikipediaPage):
    """Collect the lower-cased anchor texts of the wiki links on a page.

    Only anchors inside the ``mw-parser-output`` div are considered, and only
    those whose serialized tag starts with ``<a href="/wiki/`` (i.e. ``href``
    is the first attribute and points below ``/wiki/``).

    Args:
        page: the wikipedia page whose HTML is parsed.

    Returns:
        A set of lower-cased anchor texts; empty when the page HTML has no
        ``mw-parser-output`` div.
    """
    soup = BeautifulSoup(page.html(), 'html.parser')
    main_block = soup.find('div', class_='mw-parser-output')
    if main_block is None:
        # find() yields None when nothing matches; return an empty result
        # instead of failing with AttributeError on the lookup below.
        return set()
    return {
        link.text.lower()
        for link in main_block.find_all('a')
        if str(link).startswith('<a href="/wiki/')
    }



In [None]:
# Online demo: fetch one wikipedia page, dump its sentences and link keywords,
# then keep the extracted rows whose head and tail are both page keywords
keyword = 'python'

# NOTE(review): this rebinds `p`, which earlier cells use for the MyMultiProcessing pool
# NOTE(review): wikipedia.page presumably raises on a missing or ambiguous title
# rather than returning None — confirm; if so the guard below is always true
p = wikipedia.page(keyword)
if p is not None:
    sents = collect_sents_from_wiki_page(p)
    keywords = collect_keyword_from_wiki_page(p)
    print('sentences collected')
    my_write('%s.txt' % keyword, sents)
    my_write('%s_kw.txt' % keyword, keywords)
    # NOTE(review): filter_by_path is neither imported nor defined in the
    # visible part of this notebook — confirm where it comes from before
    # running this cell
    df = filter_by_path(sents)
    df.to_csv('%s_out.tsv' % keyword, sep='\t', index=False)

    # Keep only the rows whose head and tail are both in the page keywords
    dff = df[df.apply(lambda x: str(x['head']) in keywords and str(x['tail']) in keywords, axis=1)]
    dff.to_csv('%s_out_f.tsv' % keyword, sep='\t', index=False)

In [None]:
# Add a boolean column telling whether the row's head is in `keywords`
df['wanted'] = df.apply(lambda x: str(x['head']) in keywords, axis=1)

In [None]:
# Preview the first rows of the unfiltered result
df.head()

In [None]:
# Preview the first rows of the keyword-filtered result
dff.head()

In [None]:
# Number of rows that survived the keyword filter
len(dff)