# Extract Sentences from Wikipedia
+ This notebook collects sentences from Wikipedia that describe the relationship between two entities, selected using dependency-path patterns
+ **This notebook is fully valid on the Owl3 machine (using the /scratch/data/wikipedia/full_text-2021-03-20 data)**

In [1]:
import re
from bs4 import BeautifulSoup
import pandas as pd
import sys
import wikipedia
import os
import pickle
from wikipedia2vec import Wikipedia2Vec
from collections import Counter
import bz2
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import csv
import tqdm

sys.path.append('..')

from tools.BasicUtils import my_read, my_write
from tools.TextProcessing import (
                normalize_text, remove_brackets, my_sentence_tokenize, build_word_tree_v2, 
                my_sentence_tokenize, filter_specific_keywords, find_dependency_path_from_tree, find_span, nlp, 
                sent_lemmatize, exact_match
                )

from extract_wiki import (
    wikipedia_entity_file, wikipedia_entity_norm_file, 
    wikipedia_keyword_file, wikipedia_token_file, wikipedia_wordtree_file, 
    save_path, entity_occur_file, graph_file, 
    w2vec_dump_file, w2vec_keyword_file, w2vec_entity_file, w2vec_wordtree_file, w2vec_token_file, 
    w2vec_keyword2idx_file, 
    test_path, path_test_file, 
    path_pattern_count_file, 
    save_sub_folders, wiki_sub_folders, 
    wiki_files, save_sent_files, save_cooccur_files, save_selected_files, save_title_files, save_cooccur__files, 
    p, patterns, 
    collect_wiki_entity, note2line, line2note, process_file, filter_path_from_df, 
    feature_columns, feature_process, gen_pattern, gen_kw_from_wiki_ent, get_sentence
)

# Generate the save dirs.
# os.makedirs(..., exist_ok=True) also creates missing parent directories and
# does not fail when the directory already exists, unlike exists() + mkdir().
for save_dir in (save_path, test_path, *save_sub_folders):
    os.makedirs(save_dir, exist_ok=True)

# Get all files under wikipedia/full_text-2021-03-20

# Show the first entry of every path list as a quick check of the layout.
for label, paths in (
    ('wiki sub folder example:', wiki_sub_folders),
    ('save sub folder example:', save_sub_folders),
    ('wiki file example:', wiki_files),
    ('save sentence file example:', save_sent_files),
    ('save cooccur file example:', save_cooccur_files),
    ('save selected sentence file example:', save_selected_files),
):
    print(label, paths[0])

wiki sub folder example: ../../data/wikipedia/full_text-2021-03-20/BE
save sub folder example: data/extract_wiki/wiki_sent_collect/BE
wiki file example: ../../data/wikipedia/full_text-2021-03-20/BE/wiki_00
save sentence file example: data/extract_wiki/wiki_sent_collect/BE/wiki_00.dat
save cooccur file example: data/extract_wiki/wiki_sent_collect/BE/wiki_00_co.dat
save selected sentence file example: data/extract_wiki/wiki_sent_collect/BE/wiki_00_se.dat


## [Preparation] Collect sentences from wikipedia and select good sentences by path

In [8]:
# [Load] wikipedia2vec
# The dump is opened through bz2 (binary mode by default) and the stream is handed to Wikipedia2Vec.load.
with bz2.open(w2vec_dump_file) as f_in:
    w2vec = Wikipedia2Vec.load(f_in)

  return next(self.gen)


In [None]:
# [Test] wikipedia2vec

# Find similar words or entities
# ent1 = 'Python (programming language)'
# w2vec.most_similar_by_vector(w2vec.get_entity_vector(ent1), 20)

# Get similarity between two entities
# ent1 = 'Data mining'
# ent2 = 'Database system'
# cosine_similarity(w2vec.get_entity_vector(ent1).reshape(1, -1), w2vec.get_entity_vector(ent2).reshape(1, -1))

# Check the entity count and document count
ent1 = 'Hidden Markov model'
e = w2vec.get_entity(ent1)  # look the title up in the loaded wikipedia2vec model
print(e.count)
print(e.doc_count)

In [None]:
# [Create] my_mention_dict, mapping keyword mention to wikipedia2vec entities
# Keep entities counted at least 20 times whose title is already whitespace-normalised.
w2vec_entity = [
    entity for entity in w2vec.dictionary.entities()
    if entity.count >= 20 and ' '.join(entity.title.split()) == entity.title
]
w2vec_entity_title = [entity.title for entity in w2vec_entity]
my_write(w2vec_entity_file, w2vec_entity_title)

# Group the entity indices under the keyword generated from each title.
my_mention_dict = {}
for ent in w2vec_entity:
    my_mention_dict.setdefault(gen_kw_from_wiki_ent(ent.title), []).append(ent.index)

w2vec_kws = filter_specific_keywords(list(my_mention_dict.keys()))
my_write(w2vec_keyword_file, w2vec_kws)
build_word_tree_v2(w2vec_keyword_file, w2vec_wordtree_file, w2vec_token_file)

# Drop the mentions whose keyword did not survive the filtering step.
filter_keyword_from_w2vec = set(w2vec_kws)
my_mention_dict = {
    keyword: indices
    for keyword, indices in my_mention_dict.items()
    if keyword in filter_keyword_from_w2vec
}
# with open(w2vec_keyword2idx_file, 'wb') as f_out:
#     pickle.dump(my_mention_dict, f_out)
# len(my_mention_dict)

In [None]:
# Scan the wikipedia2vec entity dictionary and print True once the title is found.
target_title = 'Feature engineering'
for entity in w2vec.dictionary.entities():
    if entity.title != target_title:
        continue
    print(True)
    break

In [10]:
w2vec.get_entity('Feature engineering')

<Entity Feature engineering>

In [None]:
# [Load] my_mention_dict
# NOTE(review): pickle.load can execute arbitrary code from the file; only load dumps produced by this project.
with open(w2vec_keyword2idx_file, 'rb') as f_in:
    my_mention_dict = pickle.load(f_in)

In [None]:
# [Test] my_mention_dict
kw = 'feature engineering'
kw_in_mention = kw in my_mention_dict
print(kw_in_mention)
if kw_in_mention:
    # Print the wikipedia2vec dictionary item behind every index stored for this keyword.
    for idx in my_mention_dict[kw]:
        print(w2vec.dictionary.get_item_by_index(idx))

In [None]:
# [Create] Collect sentences from wiki files (8 hours) [collect_sent_and_cooccur]

In [None]:
# Modify cooccur files
w2vec_low2ori_file = 'data/extract_wiki/w2vec_low2ori.pickle'

# Map every lower-cased entity title to the list of original-cased titles sharing it.
ent_low2ori_map = {}
for ent_indices in my_mention_dict.values():
    for ent_index in ent_indices:
        ent = w2vec.dictionary.get_entity_by_index(ent_index).title
        ent_low2ori_map.setdefault(ent.lower(), []).append(ent)

with open(w2vec_low2ori_file, 'wb') as f_out:
    pickle.dump(ent_low2ori_map, f_out)

In [None]:
ent_low2ori_map['saipa']

In [None]:
# Rewrite every co-occurrence file with original-cased entity titles ('co.dat' -> 'co_.dat').
for file_idx in tqdm.tqdm(range(len(wiki_files))):
    with open(save_title_files[file_idx]) as f_in:
        titles = f_in.read().split('\n')
    new_ent_lines = []
    with open(save_cooccur_files[file_idx]) as f_in:
        for line_idx, line in enumerate(f_in):
            new_ents = []
            for ent in line.lower().strip().split('\t'):
                if ent not in ent_low2ori_map:
                    continue
                ent_list = ent_low2ori_map[ent]
                if len(ent_list) == 1:
                    new_ents.append(ent_list[0])
                    continue
                # Several titles share this lower-cased form: keep the candidate whose vector
                # is closest to the title found at the same line index of the title file,
                # provided that title is itself a known entity; otherwise the entity is dropped.
                title = titles[line_idx]
                if title not in ent_low2ori_map.get(title.lower(), ()):
                    continue
                title_vec = w2vec.get_entity_vector(title).reshape(1, -1)
                candidate_vecs = np.array([w2vec.get_entity_vector(cand) for cand in ent_list])
                best_idx = cosine_similarity(title_vec, candidate_vecs)[0].argmax()
                new_ents.append(ent_list[best_idx])
            new_ent_lines.append('\t'.join(new_ents))
    my_write(save_cooccur_files[file_idx].replace('co.dat', 'co_.dat'), new_ent_lines)

In [3]:
def get_entity_page(ent:str):
    """Collect the line notes of the page titled ``ent``.

    Every file in ``save_title_files`` is scanned line by line. Each line whose
    stripped text equals ``ent`` contributes ``line2note(f, line_idx, '_ti.dat')``.
    The scan stops at the first non-matching line after a match, so only the
    first contiguous run of matching lines is returned.

    Returns the list of notes, or an empty list when no line matches.
    """
    lines = []
    for f in save_title_files:
        with open(f) as f_in:
            for line_idx, line in enumerate(f_in):
                if line.strip() == ent:
                    lines.append(line2note(f, line_idx, '_ti.dat'))
                elif lines:
                    return lines
        # The run may extend to the very last line of the file; in that case the
        # loop above never meets a non-matching line, so return here instead of
        # falling through to the next file and eventually to the empty result.
        if lines:
            return lines
    return []

In [4]:
# Resolve the notes of the 'Pattern recognition' page three ways: with note2line's
# default suffix, with '_co_.dat' and with '_co.dat'.
lines = get_entity_page('Pattern recognition')
sents = [note2line(note) for note in lines]
occurs = [note2line(note, '_co_.dat') for note in lines]
ori_occurs = [note2line(note, '_co.dat') for note in lines]

In [None]:
# Dump the three lists to text files so they can be compared by hand.
my_write('sent_check.txt', sents)
my_write('occur_check.txt', occurs)
my_write('ori_occur_check.txt', ori_occurs)

In [5]:
lines[0]

'AJ:14:4760'

In [7]:
get_sentence('wiki_14', 'temp_sent.txt', 'temp_co.tsv', 'temp_ti.txt')

## [Preparation] Collect dataset

In [None]:
# [Create] Collect sample data using general wikipedia2vec keywords and wiki sent files
# Only the first 8 sentence / co-occurrence file pairs are processed for this sample.
sample_frames = []
for file_idx in range(8):
    file_records = process_file(save_sent_files[file_idx], save_cooccur__files[file_idx], feature_process, w2vec)
    sample_frames.append(pd.DataFrame(file_records))
wiki_path_test_df = pd.concat(sample_frames, ignore_index=True)
wiki_path_test_df.to_csv(path_test_file, sep='\t', columns=feature_columns, index=False)
print(len(wiki_path_test_df))

In [None]:
# [Load] Load path test data (pd.DataFrame)
# Pass the path itself so pandas opens and closes the file; the handle created
# by a bare open(...) call was never closed.
wiki_path_test_df = pd.read_csv(path_test_file, sep='\t')

In [None]:
# [Create] Pattern frequency generation

# Keep the pairs with similarity above 0.5 whose dependency path contains 'nsubj'.
sub_df = wiki_path_test_df[wiki_path_test_df['sim'] > 0.5]
# astype(bool) keeps the mask boolean even when no row is left to test.
sub_df = sub_df[sub_df['path'].map(lambda path: 'nsubj' in path).astype(bool)]

# assign() builds a new frame instead of writing a column into a filtered slice.
sub_df = sub_df.assign(pattern=sub_df['path'].map(gen_pattern))

c = Counter(sub_df['pattern'].to_list())

with open(path_pattern_count_file, 'wb') as f_out:
    pickle.dump(c, f_out)

In [None]:
# [Load] cal_freq function

with open(path_pattern_count_file, 'rb') as f_in:
    c = pickle.load(f_in)

# Count of the most frequent pattern; log(max_cnt + 1) is the divisor used by cal_freq below.
max_cnt = c.most_common(1)[0][1]
log_max_cnt = np.log(max_cnt+1)

def cal_freq(path:str):
    """Return the log-scaled frequency of ``path`` relative to the most common pattern.

    The count is read from the module-level counter ``c``; a missing or zero
    count is replaced by 0.5. One is added before taking the log, and the
    result is divided by ``log_max_cnt``.
    """
    seen = c.get(path)
    if not seen:
        seen = 0.5
    return np.log(seen + 1) / log_max_cnt

In [None]:
# [Test] cal_freq function
cal_freq('i_nsubj prep pobj prep pobj')

In [None]:
# [Test] Collect sample data using general wikipedia2vec keywords and wiki sent files
sub_df = filter_path_from_df(wiki_path_test_df, cal_freq)
# Replace each sentence note with the stripped text note2line returns for it.
sub_df = sub_df.assign(sent = sub_df.apply(lambda x: note2line(x['sent']).strip(), axis=1))
sub_df.to_csv('full_phrase_check.tsv', columns = ['sent', 'kw1', 'kw1_recall', 'kw1_full_span', 'kw2', 'kw2_recall', 'kw2_full_span'], sep='\t', index=False)

In [None]:
# [Create] collect dataset [collect_dataset]

## [Preparation] Generate Graph

In [None]:
# Generate the graph ['generate_graph']

## Examine Dataset

In [None]:
# Get the entity occurrence ['collect_ent_occur_from_selected']

In [None]:
# Demo check co-occur of two entities in selected sentences
def get_selected_record(entity_dict:dict, ent1:str, ent2:str):
    """Build a DataFrame of the selected-sentence records shared by two entities.

    ``entity_dict`` maps an entity to a set of occurrence notes. The notes
    common to ``ent1`` and ``ent2`` are resolved with ``note2line(..., '_se.dat')``,
    split on tabs and mapped positionally onto the feature names; the 'sent'
    field is then replaced by the stripped text its note resolves to.

    Returns None when either entity has no (or an empty) occurrence record,
    otherwise a DataFrame with one row per shared occurrence.
    """
    first_occur = entity_dict.get(ent1)
    second_occur = entity_dict.get(ent2)
    if not (first_occur and second_occur):
        return None

    features = ['sim', 'kw1', 'kw1_span', 'kw1_ent', 'kw2', 'kw2_span', 'kw2_ent', 'sent', 'path', 'kw1_full_span', 'kw1_recall', 'kw2_full_span', 'kw2_recall', 'coverage', 'pattern', 'pattern_freq', 'score']
    rows = []
    for occur in first_occur & second_occur:
        record = note2line(occur, '_se.dat').strip().split('\t')
        # Field 7 ('sent') holds a note; resolve it to the sentence text.
        sent_text = note2line(record[7]).strip()
        row = {features[pos] : value for pos, value in enumerate(record)}
        row['sent'] = sent_text
        rows.append(row)

    return pd.DataFrame(data = rows, columns=features)

In [None]:
# [Load] keyword occurrence dict, which holds the occurrence record of every keyword in the selected sentences
with open(entity_occur_file, 'rb') as f_in:
    entity_occur = pickle.load(f_in)

In [None]:
df = get_selected_record(entity_occur, 'Machine learning', 'Statistical model')
# get_selected_record returns None when either entity has no occurrence record.
if df is not None:
    print(len(df))
    df.to_csv('sents.tsv', columns=['score'] + feature_columns + ['pattern', 'pattern_freq'], sep='\t', index=False)

In [None]:
entity_occur['Data mining']

In [None]:
'Machine learning' in entity_occur

In [None]:
# [Load] Graph
# The graph object is unpickled from graph_file.
with open(graph_file, 'rb') as f_in:
    graph = pickle.load(f_in)

In [None]:
# Report the size of the loaded graph.
print('num of nodes:', len(graph.nodes))
print('num of edges:', len(graph.edges))

In [None]:
graph.edges['Database', 'Data mining']

In [None]:
# Number of neighbours of every node in the graph.
node2neig_cnt = {}
for node in graph.nodes.keys():
    node2neig_cnt[node] = sum(1 for _ in graph.neighbors(node))

In [None]:
# Histogram of neighbour counts, restricted to nodes with fewer than 20 neighbours.
neig_cnt = [v for v in node2neig_cnt.values() if v < 20]
plt.title('num of nodes vs num of neighbors each node')
plt.hist(neig_cnt)
plt.show()

In [None]:
# Per-node triangle counts computed by networkx, saved to a pickle file.
node2triangle_num = nx.triangles(graph)
with open('node2tri_num.pickle', 'wb') as f_out:
    pickle.dump(node2triangle_num, f_out)

In [None]:
# The per-node counts include each triangle once per corner, hence the division by 3.
print('num of triangles:', sum(node2triangle_num.values()) / 3)
plt.title('num of nodes vs num of triangles each node')
# Only nodes taking part in 1 to 9 triangles are plotted.
plt.hist([v for v in node2triangle_num.values() if v >= 1 and v < 10])
plt.show()

In [None]:
def find_triangles(graph:nx.Graph, node:str):
    """Return the set of triangles that contain ``node``.

    Each triangle is a tuple ``(node, a, b)`` where ``a`` and ``b`` are both
    neighbours of ``node`` and of each other; the two are ordered so that the
    smaller one (by ``<``) comes first, which makes each triangle appear once.
    """
    adjacent = set(graph.neighbors(node))
    found = set()
    for first in adjacent:
        # Neighbours of `first` that are also neighbours of `node` close a triangle.
        shared = adjacent.intersection(graph.neighbors(first))
        for second in shared:
            if first < second:
                found.add((node, first, second))
            else:
                found.add((node, second, first))
    return found

In [None]:
triangles = list(find_triangles(graph, 'Statistical model'))
second_node = ''
# second_node = 'Natural-language processing'
threshold = 0.5
third_node = ''
triangles.sort(key=lambda x: x[1])


def _edge_rows(src, dst):
    # One tab-separated row per sentence stored on the (src, dst) edge scoring above the threshold.
    return [
        '\t'.join((src, dst, note2line(sent).strip(), str(score)))
        for score, sent in graph.get_edge_data(src, dst)['data']
        if score > threshold
    ]


triangle_with_sents = []
n_seen = set()
for n1, n2, n3 in triangles:
    # An empty filter string disables the corresponding filter.
    if second_node and second_node not in (n2, n3):
        continue
    if third_node and third_node not in (n2, n3):
        continue
    # The edges from the centre node are emitted only the first time their other end is met.
    for other in (n2, n3):
        if other not in n_seen:
            n_seen.add(other)
            triangle_with_sents += _edge_rows(n1, other)
    triangle_with_sents += _edge_rows(n2, n3)
my_write('triangles.tsv', triangle_with_sents)

In [None]:
list(graph.neighbors('Hidden Markov model'))

## Demo

In [None]:
# Show sentence from file
note2line('BE:00:0')

In [None]:
# Analyze sentence
# `nlp` is the pipeline imported from tools.TextProcessing.
doc = nlp('sephardi were exempt from the ban , but it appears that few applied for a letter of free passage .')

# Check noun phrases in the sentences
print(list(doc.noun_chunks))

In [None]:
len(doc)

In [None]:
# Run feature_process on one parsed sentence with a single hand-written keyword pair.
doc = nlp('ada is a structured , statically typed , imperative , and object-oriented high-level programming language , extended from pascal and other language .')
pairs = [{'kw1' : 'ada', 'kw2' : 'programming language'}]
feature_process(doc, pairs)

## WOE re-write

In [None]:
kw1 = 'data mining'
kw2 = 'machine learning'
# The sentence is lower-cased before parsing; the keywords above are lower-case too.
doc = nlp('Data mining is a process of extracting and discovering patterns in large data sets involving methods at the intersection of machine learning, statistics, and database systems.'.lower())
kw1_span = find_span(doc, kw1)
kw2_span = find_span(doc, kw2)
# Dependency path between the first span found for each keyword.
find_dependency_path_from_tree(doc, kw1_span[0], kw2_span[0])
# print(len(kw1_span))
# print(len(kw2_span))

## Online operations

In [None]:
def collect_sents_from_wiki_page(page:wikipedia.WikipediaPage):
    """Return the sentences of a wikipedia page as a flat list.

    The text of every section plus the page summary is lower-cased,
    whitespace-normalised and split with ``my_sentence_tokenize(text, True)``.
    Sections named in ``remove_list`` and sections without text are skipped.
    The summary is processed first, followed by the sections in reverse order.
    """
    remove_list = ['See also', 'References', 'Further reading', 'Sources', 'External links']
    dic = {sec : page.section(sec) for sec in page.sections}
    dic['summary'] = page.summary
    sents = []
    # Walk the keys from last to first, i.e. the order obtained by popping from the end.
    for section in reversed(list(dic)):
        section_text = dic[section]
        if section in remove_list or not section_text:
            continue
        # processed_text = clean_text(section_text)
        processed_text = ' '.join(section_text.lower().split())
        sents.extend(my_sentence_tokenize(processed_text, True))
    return sents

def collect_entity_from_wiki_page(page:wikipedia.WikipediaPage):
    """Return the lower-cased text of every entry in ``page.links``, in order."""
    lowered = []
    for link_text in page.links:
        lowered.append(link_text.lower())
    return lowered

def collect_keyword_from_wiki_page(page:wikipedia.WikipediaPage):
    """Return the lower-cased anchor texts of the internal wiki links of a page.

    The page HTML is parsed and the ``div`` with class ``mw-parser-output`` is
    searched for ``<a>`` tags whose ``href`` starts with ``/wiki/``.

    Returns a set of strings; the set is empty when the ``div`` is not present.
    """
    soup = BeautifulSoup(page.html(), 'html.parser')
    main_block = soup.find('div', class_='mw-parser-output')
    if main_block is None:
        # No article body in the returned HTML: nothing to collect.
        return set()
    # Test the href attribute itself rather than the serialised tag, so the
    # result does not depend on href being the first attribute of the anchor.
    return {
        anchor.text.lower()
        for anchor in main_block.find_all('a', href=True)
        if anchor['href'].startswith('/wiki/')
    }



In [None]:
keyword = 'python'

# Bind the page to its own name: `p` is imported from extract_wiki and is still
# needed by the appendix cell that calls p.run(...).
try:
    page = wikipedia.page(keyword)
except wikipedia.exceptions.WikipediaException as err:
    # wikipedia.page raises (e.g. for a missing or ambiguous title) rather than returning None.
    print('could not load page for %r: %s' % (keyword, err))
    page = None
if page is not None:
    sents = collect_sents_from_wiki_page(page)
    keywords = collect_keyword_from_wiki_page(page)
    print('sentences collected')
    my_write('%s.txt' % keyword, sents)
    my_write('%s_kw.txt' % keyword, keywords)
    # NOTE(review): filter_by_path is neither defined nor imported in this notebook — confirm where it should come from.
    df = filter_by_path(sents)
    df.to_csv('%s_out.tsv' % keyword, sep='\t', index=False)

    # Keep only the rows whose head and tail are both among the page keywords.
    dff = df[df.apply(lambda x: str(x['head']) in keywords and str(x['tail']) in keywords, axis=1)]
    dff.to_csv('%s_out_f.tsv' % keyword, sep='\t', index=False)

In [None]:
# Flag the rows whose 'head' (as a string) is among the collected page keywords.
df['wanted'] = df.apply(lambda x: str(x['head']) in keywords, axis=1)

In [None]:
df.head()

In [None]:
dff.head()

In [None]:
len(dff)

# Appendix

## Collect wikipedia page titles as entities and generate keyword list

In [None]:
# Collect wikipedia entities and corresponding id
output = p.run(collect_wiki_entity, wiki_files)
# Flatten the per-file results into one list before writing it out.
entity_list = [entity for file_entities in output for entity in file_entities]
my_write(wikipedia_entity_file, entity_list)

In [None]:
# Get normalized wikipedia entities
normalized_entity = []
# Read through a context manager so the file handle is closed once the lines are consumed.
with open(wikipedia_entity_file) as f_in:
    for kw in f_in:
        eid, ent = kw.split('\t')
        normalized_entity.append('%s\t%s' % (eid, normalize_text(ent)))
my_write(wikipedia_entity_norm_file, normalized_entity)

In [None]:
# Generate keyword list file
# Read through a context manager so the file handle is closed after the list is built.
with open(wikipedia_entity_norm_file) as f_in:
    # The second tab-separated field of each line is the normalized entity.
    keywords = [remove_brackets(line.strip().split('\t')[1]) for line in f_in]
# Drop keywords that are empty or whitespace-only after bracket removal.
keywords = [kw for kw in keywords if kw.split()]
keywords = filter_specific_keywords(keywords)
keywords = list(set(keywords))
my_write(wikipedia_keyword_file, keywords)

In [None]:
# Build wordtree
build_word_tree_v2(wikipedia_keyword_file, wikipedia_wordtree_file, wikipedia_token_file)

## Process selected dataset

In [None]:
# Build connected graph from selected sentences(18 min) ['build_graph_from_selected']

In [None]:
# Build connected graph from co-occurrence files ['build_graph_from_cooccur']

## Hand-crafted analysis

In [None]:
# .copy() yields an independent frame, so adding the 'pattern' column below does
# not write into a filtered slice of wiki_path_test_df.
wiki_test_df = wiki_path_test_df[wiki_path_test_df['sim'] >= 0.0].copy()

def match_path_pattern(path:str):
    """Return the first entry of ``patterns`` for which ``exact_match(entry, path)`` is truthy, else ''."""
    for pp in patterns:
        if exact_match(pp, path):
            return pp
    return ''

# Series.map applies the matcher per path and also works on an empty frame.
wiki_test_df['pattern'] = wiki_test_df['path'].map(match_path_pattern)

In [None]:
def analysis_path_result_sim_based(df:pd.DataFrame, paths:list):
    """Summarise how often each pattern occurs in ``df`` and its average similarity.

    Args:
        df: frame with a 'pattern' column and a numeric 'sim' column.
        paths: pattern values to report, in output order.

    Returns:
        A DataFrame with columns 'path', 'cnt', 'ratio' and 'avg_sim': one row
        per entry of ``paths`` followed by a final 'general' row covering the
        whole frame. 'ratio' and 'avg_sim' are 0 when there is nothing to
        divide by.
    """
    total = len(df)

    def _summary_row(label, rows, ratio):
        # One output record for the given subset of rows.
        matched = len(rows)
        return {
            'path' : label,
            'cnt' : matched,
            'ratio' : ratio,
            'avg_sim' : sum(rows['sim']) / matched if matched else 0,
        }

    # Rows are collected in a list and the frame is built once:
    # DataFrame.append no longer exists in pandas 2.x.
    records = []
    for pp in paths:
        sub_df = df[df['pattern'] == pp]
        records.append(_summary_row(pp, sub_df, len(sub_df) / total if total else 0))
    records.append(_summary_row('general', df, 1))
    return pd.DataFrame(records, columns=['path', 'cnt', 'ratio', 'avg_sim'])

In [None]:
analysis_path_result_sim_based(wiki_test_df, patterns)

In [None]:
def collect_example_sent_for_pattern(df:pd.DataFrame, path:str, num:int=30, posfix:str='.dat'):
    """Return up to ``num`` rows of ``df`` whose 'pattern' equals ``path``.

    In the returned frame the 'sent' column is replaced by the stripped text
    that ``note2line(note, posfix=posfix)`` returns for each note. ``df`` itself
    is left unchanged. When no row matches, an empty frame with the same
    columns is returned.
    """
    # .copy() so the column below is replaced on an independent frame, not on a slice of df.
    sub_df = df[df['pattern'] == path][:num].copy()
    # Series.map (unlike a row-wise apply) also works when the selection is empty.
    sub_df['sent'] = sub_df['sent'].map(lambda note: note2line(note, posfix=posfix).strip())
    return sub_df

# Write the example sentences of every pattern to '<first 10 characters of the pattern>.tsv'.
# NOTE(review): patterns sharing their first 10 characters write to the same file — confirm this is intended.
for patt in patterns:
    examples = collect_example_sent_for_pattern(wiki_test_df, patt)
    examples.to_csv('%s.tsv' % patt[:10], index=False, sep='\t')