# Extract Sentences from Wikipedia
+ This notebook collects sentences from Wikipedia that describe the relationship between two entities, selecting them with dependency path patterns
+ **This notebook runs in full on the Owl3 machine (using the /scratch/data/wikipedia/full_text-2021-03-20 data)**

In [None]:
import re
from bs4 import BeautifulSoup
import pandas as pd
import sys
import wikipedia
import os
import pickle
from wikipedia2vec import Wikipedia2Vec
from collections import Counter
import bz2
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import csv
import tqdm

sys.path.append('..')

from tools.BasicUtils import my_read, my_write
from tools.TextProcessing import (
                my_sentence_tokenize, build_word_tree_v2, 
                my_sentence_tokenize, filter_specific_keywords, find_dependency_path_from_tree, find_span, nlp, 
                exact_match
                )

from extract_wiki import (
    wikipedia_entity_file, record_columns, 
    save_path, entity_occur_file, graph_file, single_sent_graph_file, 
    w2vec_dump_file, w2vec_keyword_file, w2vec_wordtree_file, w2vec_token_file, 
    w2vec_keyword2idx_file, 
    test_path, path_test_file, 
    path_pattern_count_file, 
    save_sub_folders, wiki_sub_folders, 
    wiki_files, save_sent_files, save_cooccur_files, save_selected_files, save_title_files, save_cooccur__files, 
    p, patterns, 
    note2line, line2note, process_file, filter_path_from_df, 
    feature_columns, feature_process, gen_pattern, gen_kw_from_wiki_ent, get_entity_page
)

# Generate the save dirs.
# os.makedirs(..., exist_ok=True) replaces the "check, then mkdir" pattern:
# it is safe to re-run, has no window between the existence check and the
# creation, and also creates missing parent directories. If one of the paths
# already exists as a regular file it raises instead of being silently skipped.
os.makedirs(save_path, exist_ok=True)
os.makedirs(test_path, exist_ok=True)

for save_dir in save_sub_folders:
    os.makedirs(save_dir, exist_ok=True)

# Show the first entry of each path list imported from extract_wiki as a
# quick sanity check of the configured locations.
print('wiki sub folder example:', wiki_sub_folders[0])
print('save sub folder example:', save_sub_folders[0])
print('wiki file example:', wiki_files[0])
print('save sentence file example:', save_sent_files[0])
print('save cooccur file example:', save_cooccur_files[0])
print('save selected sentence file example:', save_selected_files[0])

In [None]:
# [Load] wikipedia2vec
# Opens the bz2-compressed file at w2vec_dump_file and passes the open stream
# to Wikipedia2Vec.load; the loaded model is bound to `w2vec`, which later
# cells use for entity lookups and entity vectors.
with bz2.open(w2vec_dump_file) as f_in:
    w2vec = Wikipedia2Vec.load(f_in)

In [None]:
# [Test] wikipedia2vec

# Find similar words or entities
# ent1 = 'Python (programming language)'
# w2vec.most_similar_by_vector(w2vec.get_entity_vector(ent1), 20)

# Get similarity between two entities
# ent1 = 'Data mining'
# ent2 = 'Database system'
# cosine_similarity(w2vec.get_entity_vector(ent1).reshape(1, -1), w2vec.get_entity_vector(ent2).reshape(1, -1))[0, 0]

# Check the entity count and document count
# NOTE(review): a later cell checks `w2vec.get_entity(...) is None`, so the
# lookup can apparently fail; for an unknown title the prints below would then
# raise AttributeError.
ent1 = 'Hidden Markov model'
e = w2vec.get_entity(ent1)
print(e.count)
print(e.doc_count)

## [Preparation] Collect sentences, entities, entity co-occurrences, and titles from the wikipedia dump

In [None]:
# python extract_wiki.py collect_sent_and_cooccur (8 hours)

In [None]:
# Collect wikipedia entities
# Each title file is read whole and split on '\n'; the union of all resulting
# strings forms the entity set, which is then written to wikipedia_entity_file.
# NOTE(review): splitting on '\n' yields an empty string for a trailing
# newline or blank line, so '' may end up in the set — confirm that is intended.
wikipedia_entity = set()
for f in tqdm.tqdm(save_title_files):
    with open(f) as f_in:
        wikipedia_entity.update(f_in.read().split('\n'))
print(len(wikipedia_entity))
my_write(wikipedia_entity_file, list(wikipedia_entity))
        

## [Preparation] Correct entity mapping in co-occurrence files

In [None]:
# Build a lookup from each lower-cased entity title to the list of
# original-cased titles that share it, then pickle the lookup to disk.
wikipedia_entity_low2orig_map_file = 'data/extract_wiki/wikipedia_entity_low2orig_map.pickle'

with open(wikipedia_entity_file) as f_in:
    wikipedia_entity = set(f_in.read().split('\n'))

wikipedia_entity_low2orig_map = {}
for ent in tqdm.tqdm(wikipedia_entity):
    ent_low = ent.lower()
    # setdefault creates the bucket the first time a lower-cased title is seen
    wikipedia_entity_low2orig_map.setdefault(ent_low, []).append(ent)

with open(wikipedia_entity_low2orig_map_file, 'wb') as f_out:
    pickle.dump(wikipedia_entity_low2orig_map, f_out)

In [None]:
# Rewrite every co-occurrence file so that each tab-separated mention is
# replaced by a title present in `wikipedia_entity`:
#   * a mention that is already a known title is kept as-is;
#   * otherwise its lower-cased form is looked up in the low->orig map;
#     a single candidate is taken directly;
#   * with several candidates, the one whose wikipedia2vec vector is most
#     cosine-similar to the vector of the page the line came from is taken;
#   * mentions that cannot be resolved are dropped from the line.
# The corrected lines go to the parallel file in `save_cooccur__files`.
#
# NOTE: pickle.load executes arbitrary code from the file; this is only safe
# because the pickle is produced by this notebook itself.
with open(wikipedia_entity_low2orig_map_file, 'rb') as f_in:
    wikipedia_entity_low2orig_map = pickle.load(f_in)

with open(wikipedia_entity_file) as f_in:
    wikipedia_entity = set(f_in.read().split('\n'))

for i in tqdm.tqdm(range(len(save_cooccur_files))):
    new_file_lines = []
    with open(save_cooccur_files[i]) as f_in:
        for line_idx, line in enumerate(f_in):
            entities = line.strip().split('\t')
            new_entities = []
            for ent in entities:
                if ent in wikipedia_entity:
                    new_entities.append(ent)
                    continue

                candidates = wikipedia_entity_low2orig_map.get(ent.lower())
                if not candidates:
                    # No title matches even case-insensitively: drop the mention.
                    continue
                if len(candidates) == 1:
                    new_entities.append(candidates[0])
                    continue

                # Several titles differ only by case: disambiguate using the
                # title of the page this co-occurrence line belongs to.
                note = line2note(save_cooccur_files[i], line_idx, '_co.dat')
                page_title = note2line(note, '_ti.dat').strip()
                try:
                    page_ent_vec = w2vec.get_entity_vector(page_title)
                except KeyError:
                    # The page itself has no embedding, so there is nothing to
                    # compare against; drop the mention. Only KeyError (unknown
                    # entity) is caught so that real failures and Ctrl-C surface.
                    continue

                most_similar_idx, most_similar_val = -1, -1
                for candidate_idx, candidate_ent in enumerate(candidates):
                    try:
                        candidate_vec = w2vec.get_entity_vector(candidate_ent)
                    except KeyError:
                        # Candidate has no embedding; try the next one.
                        continue
                    similar_val = cosine_similarity(page_ent_vec.reshape(1, -1), candidate_vec.reshape(1, -1))[0, 0]
                    if similar_val > most_similar_val:
                        most_similar_val = similar_val
                        most_similar_idx = candidate_idx
                if most_similar_idx >= 0:
                    new_entities.append(candidates[most_similar_idx])
            new_file_lines.append('\t'.join(new_entities))
    my_write(save_cooccur__files[i], new_file_lines)

In [None]:
# [Test]
# For every note returned for the 'Machine learning' page, look up the
# matching line in the default file, the corrected co-occurrence file
# ('_co_.dat') and the original co-occurrence file ('_co.dat'), and dump the
# three lists to text files for side-by-side inspection.
lines = get_entity_page('Machine learning')
sents = [note2line(note) for note in lines]
occurs = [note2line(note, '_co_.dat') for note in lines]
ori_occurs = [note2line(note, '_co.dat') for note in lines]
my_write('sent_check.txt', sents)
my_write('occur_check.txt', occurs)
my_write('ori_occur_check.txt', ori_occurs)
# Displays the first note; raises IndexError if no note was returned.
lines[0]

## [Preparation] Mapping keyword mention to wikipedia2vec entities

In [None]:
# Map every keyword derived from a wikipedia title to the wikipedia2vec
# indices of the entities that produce it, keep only the keywords that survive
# filter_specific_keywords, and persist the keyword list, the word tree and
# the keyword->indices mapping.
with open(wikipedia_entity_file) as f_in:
    wikipedia_entity = set(f_in.read().split('\n'))

w2vec_keyword2idx = {}
for entity in tqdm.tqdm(wikipedia_entity):
    w2vec_entity = w2vec.get_entity(entity)
    if w2vec_entity is not None:
        kw = gen_kw_from_wiki_ent(entity)
        # One list of indices per keyword, without duplicates.
        known_indices = w2vec_keyword2idx.setdefault(kw, [])
        if w2vec_entity.index not in known_indices:
            known_indices.append(w2vec_entity.index)

w2vec_kws = filter_specific_keywords(list(w2vec_keyword2idx))
my_write(w2vec_keyword_file, w2vec_kws)
build_word_tree_v2(w2vec_keyword_file, w2vec_wordtree_file, w2vec_token_file)

# Drop the mapping entries whose keyword was filtered out above.
filter_keyword_from_w2vec = set(w2vec_kws)
w2vec_keyword2idx = {
    keyword: indices
    for keyword, indices in w2vec_keyword2idx.items()
    if keyword in filter_keyword_from_w2vec
}
with open(w2vec_keyword2idx_file, 'wb') as f_out:
    pickle.dump(w2vec_keyword2idx, f_out)
len(w2vec_keyword2idx)

In [None]:
# [Load] w2vec_keyword2idx
# Restores the keyword -> list-of-entity-indices mapping pickled by the
# preceding cell. pickle.load runs arbitrary code from the file, so only load
# a file produced by this notebook.
with open(w2vec_keyword2idx_file, 'rb') as f_in:
    w2vec_keyword2idx = pickle.load(f_in)

In [None]:
# [Test] w2vec_keyword2idx
# Prints whether the keyword is present in the mapping and, if so, the
# wikipedia2vec dictionary item behind each index stored for it.
kw = 'feature engineering'
kw_in_mention = kw in w2vec_keyword2idx
print(kw_in_mention)
if kw_in_mention:
    for idx in w2vec_keyword2idx[kw]:
        print(w2vec.dictionary.get_item_by_index(idx))

## [Preparation] Collect dataset

In [None]:
# [Create] Collect sample data using general wikipedia2vec keywords and wiki sent files
# Runs process_file on the first 8 (sentence file, corrected co-occurrence
# file) pairs, stacks the results into one frame and saves the feature columns
# as a tab-separated file.
# NOTE(review): the 8 is a hard-coded sample size; this raises IndexError if
# fewer than 8 files exist — confirm it is meant as a fixed sample.
wiki_path_test_df = pd.concat([pd.DataFrame(process_file(save_sent_files[file_idx], save_cooccur__files[file_idx], w2vec)) for file_idx in range(8)], ignore_index=True)
wiki_path_test_df.to_csv(path_test_file, sep='\t', columns=feature_columns, index=False)
print(len(wiki_path_test_df))

In [None]:
# [Load] Load path test data (pd.DataFrame)
# The file is opened in a `with` block so the handle is closed once the frame
# is parsed; passing a bare open(...) to read_csv left it open until garbage
# collection.
with open(path_test_file) as f_in:
    wiki_path_test_df = pd.read_csv(f_in, sep='\t')

In [None]:
# [Create] Pattern frequency generation
# Counts how often each generated pattern occurs among the rows whose 'sim'
# exceeds 0.5 and whose dependency path contains 'nsubj', then pickles the
# Counter to path_pattern_count_file.

sub_df = wiki_path_test_df[wiki_path_test_df['sim'] > 0.5]

# Column-wise map instead of a row-wise apply: same per-row result, and it
# still returns a Series when the selection is empty.
sub_df = sub_df.assign(pick = sub_df['dep_path'].map(lambda dep_path: 1 if 'nsubj' in dep_path else 0))
sub_df = sub_df[sub_df['pick'] > 0]

# assign() builds a new frame, so no value is written into a filtered slice
# (the previous `sub_df['pattern'] = ...` triggered SettingWithCopyWarning).
sub_df = sub_df.assign(pattern = sub_df['dep_path'].map(gen_pattern))

c = Counter(sub_df['pattern'].to_list())

with open(path_pattern_count_file, 'wb') as f_out:
    pickle.dump(c, f_out)

In [None]:
# Display the ten most frequent patterns with their counts.
c.most_common(10)

In [None]:
# [Load] cal_freq function
# Reload the pattern Counter from disk and precompute the normaliser used by
# cal_freq: log(count of the most frequent pattern + 1).

with open(path_pattern_count_file, 'rb') as f_in:
    c = pickle.load(f_in)

# most_common(1)[0] raises IndexError if the Counter is empty.
max_cnt = c.most_common(1)[0][1]
log_max_cnt = np.log(max_cnt+1)

def cal_freq(path:str):
    """Return the log-scaled relative frequency of a dependency-path pattern.

    The count of ``path`` in the module-level Counter ``c`` is used; a pattern
    that is missing (or has a zero count) is treated as having count 0.5. The
    result is ``log(count + 1) / log_max_cnt``.
    """
    raw_count = c.get(path) or 0.5
    return np.log(raw_count + 1) / log_max_cnt

In [None]:
# [Test] cal_freq function
# Displays the normalised frequency score of one example pattern.
cal_freq('i_nsubj prep pobj prep pobj')

In [None]:
# [Test] Collect sample data using general wikipedia2vec keywords and wiki sent files
# Filters the sample frame with filter_path_from_df (scored by cal_freq),
# replaces each sentence note by the stripped sentence text it points to, and
# writes the keyword/recall/span columns to a TSV for manual checking.
sub_df = filter_path_from_df(wiki_path_test_df, cal_freq)
sub_df = sub_df.assign(sent = sub_df.apply(lambda x: note2line(x['sent']).strip(), axis=1))
sub_df.to_csv('full_phrase_check.tsv', columns = ['sent', 'kw1', 'kw1_recall', 'kw1_full_span', 'kw2', 'kw2_recall', 'kw2_full_span'], sep='\t', index=False)

In [None]:
# [Create] collect dataset [collect_dataset]

## [Preparation] Generate Graph

In [None]:
# Generate the graph ['generate_graph']

## Examine Dataset

In [None]:
# Get the entity occurrence ['collect_ent_occur_from_selected']

In [None]:
# [Load] entity occur dict which has occurrence records for all entities in selected sentences
# pickle.load runs arbitrary code from the file, so only load a file produced
# by this project.
with open(entity_occur_file, 'rb') as f_in:
    entity_occur = pickle.load(f_in)

In [None]:
# Demo check co-occur of two entities in selected sentences
def get_selected_record(entity_dict:dict, ent1:str, ent2:str):
    """Collect the selected-sentence records in which both entities occur.

    Args:
        entity_dict: mapping from entity to its occurrence notes; the values
            are intersected with ``&`` (presumably sets of notes — verify
            against the code that builds the dict).
        ent1: first entity.
        ent2: second entity.

    Returns:
        ``None`` when either entity is missing from ``entity_dict`` or has no
        occurrences; otherwise a DataFrame with one row per shared note. Each
        row holds the tab-separated fields of the '_se.dat' record, keyed by
        ``record_columns``, with 'sent' replaced by the sentence text that
        field 7 of the record points to.
    """
    first_notes = entity_dict.get(ent1)
    second_notes = entity_dict.get(ent2)
    if not (first_notes and second_notes):
        return None

    rows = []
    for shared_note in first_notes & second_notes:
        fields = note2line(shared_note, '_se.dat').strip().split('\t')
        # Field 7 holds the note of the sentence itself; resolve it to text.
        sentence = note2line(fields[7]).strip()
        row = {record_columns[pos] : fields[pos] for pos in range(len(fields))}
        row['sent'] = sentence
        rows.append(row)

    return pd.DataFrame(data = rows)

In [None]:
# [Test] entity occur dict

ent1 = 'Data mining'
# Check the existence of an entity
# (this expression is not the last one in the cell, so its value is not shown)
ent1 in entity_occur

# Check the sentences where an entity appear
# entity_occur[ent1] raises KeyError when the entity is absent.
for note in entity_occur[ent1]:
    print(note2line(note2line(note, '_se.dat').split('\t')[7]).strip())

# Check the records of two entities
ent2 = 'Decision tree algorithm'
df = get_selected_record(entity_occur, ent1, ent2)
if df is not None:
    print(len(df))
    df.to_csv('sents.tsv', columns=['score'] + feature_columns + ['pattern', 'pattern_freq'], sep='\t', index=False)

In [None]:
# [Load] Graph
# Restores the pickled graph and reports its size. pickle.load runs arbitrary
# code from the file, so only load a file produced by this project.
with open(graph_file, 'rb') as f_in:
    graph = pickle.load(f_in)

print('num of nodes:', len(graph.nodes))
print('num of edges:', len(graph.edges))

In [None]:
# Peek at a single edge and the payload stored under its 'data' attribute;
# the break stops after the first edge.
for edge in graph.edges:
    print(edge)
    print(graph.get_edge_data(*edge)['data'])
    break

In [None]:
# [Test] graph
ent1 = 'Data mining'
# Check the neighbours of an entity
# list(graph.neighbors(ent1))

# Check the edges of two entities
# Displays the attribute dict of the edge between the two entities.
graph.edges[ent1, 'Data fusion']

In [None]:
# Show sentence from file
# Resolves a note string to the line it refers to.
# NOTE(review): the note presumably encodes folder:file:line — confirm against
# note2line in extract_wiki.
note2line('AB:56:160')

In [None]:
# Number of neighbours for every node in the graph.
node2neig_cnt = {node : len(list(graph.neighbors(node))) for node in graph.nodes.keys()}

In [None]:
# Histogram of neighbour counts, restricted to nodes with fewer than 20
# neighbours.
neig_cnt = [v for v in node2neig_cnt.values() if v < 20]
plt.title('num of nodes vs num of neighbors each node')
plt.hist(neig_cnt)
plt.show()

In [None]:
# Per-node triangle counts from networkx, pickled to 'node2tri_num.pickle'
# in the current working directory.
node2triangle_num = nx.triangles(graph)
with open('node2tri_num.pickle', 'wb') as f_out:
    pickle.dump(node2triangle_num, f_out)

In [None]:
# Each triangle is counted once at each of its three nodes, hence the / 3.
print('num of triangles:', sum(node2triangle_num.values()) / 3)
plt.title('num of nodes vs num of triangles each node')
# Only nodes that take part in 1 to 9 triangles are plotted.
plt.hist([v for v in node2triangle_num.values() if v >= 1 and v < 10])
plt.show()

In [None]:
def find_triangles(graph:nx.Graph, node:str):
    """Return every triangle of ``graph`` that has ``node`` as a corner.

    A triangle is reported as ``(node, a, b)`` where ``a`` and ``b`` are both
    neighbours of ``node`` and of each other; the two are ordered so that the
    smaller one comes first, which makes each triangle appear once in the
    returned set.
    """
    direct = set(graph.neighbors(node))
    found = set()
    for first in direct:
        # Nodes adjacent to both `node` and `first` close a triangle.
        shared = direct.intersection(graph.neighbors(first))
        for second in shared:
            if first < second:
                found.add((node, first, second))
            else:
                found.add((node, second, first))
    return found

In [None]:
# Write the sentences that support every triangle around 'Statistical model'
# to 'triangles.tsv', one "ent_a<TAB>ent_b<TAB>sentence<TAB>score" row per
# (score, sentence) pair stored on an edge whose score exceeds `threshold`.
triangles = list(find_triangles(graph, 'Statistical model'))
second_node = ''
# second_node = 'Natural-language processing'
threshold = 0.5
third_node = ''


def _edge_sentence_rows(src, dst):
    """Format the above-threshold (score, sentence-note) pairs of edge src-dst."""
    return ['\t'.join((src, dst, note2line(sent).strip(), str(score))) for score, sent in graph.get_edge_data(src, dst)['data'] if score > threshold]


triangles.sort(key=lambda x: x[1])
triangle_with_sents = []
# Neighbours whose edge to the centre node has already been written, so that
# edge is emitted only once even when the neighbour is in several triangles.
n_seen = set()
for n1, n2, n3 in triangles:
    # Optional filters: when set, keep only triangles containing that node.
    if second_node and n2 != second_node and n3 != second_node:
        continue
    if third_node and n2 != third_node and n3 != third_node:
        continue
    if n2 not in n_seen:
        n_seen.add(n2)
        triangle_with_sents += _edge_sentence_rows(n1, n2)
    if n3 not in n_seen:
        n_seen.add(n3)
        triangle_with_sents += _edge_sentence_rows(n1, n3)
    triangle_with_sents += _edge_sentence_rows(n2, n3)
my_write('triangles.tsv', triangle_with_sents)

In [None]:
# Display all neighbours of one entity in the graph.
list(graph.neighbors('Hidden Markov model'))

## Demo

In [None]:
# Analyze sentence
# `nlp` is imported from tools.TextProcessing; the input is already
# lower-cased and tokenised with spaces around punctuation.
doc = nlp('sephardi were exempt from the ban , but it appears that few applied for a letter of free passage .')

# Check noun phrases in the sentences
print(list(doc.noun_chunks))

In [None]:
# Length of the parsed document from the previous cell.
len(doc)

In [None]:
# Run feature_process on one parsed sentence for a single keyword pair and
# display the result.
doc = nlp('ada is a structured , statically typed , imperative , and object-oriented high-level programming language , extended from pascal and other language .')
pairs = [{'kw1' : 'ada', 'kw2' : 'programming language'}]
feature_process(doc, pairs)

## WOE re-write

In [None]:
# Find the dependency path between the first span found for each keyword in
# a lower-cased example sentence.
kw1 = 'data mining'
kw2 = 'machine learning'
doc = nlp('Data mining is a process of extracting and discovering patterns in large data sets involving methods at the intersection of machine learning, statistics, and database systems.'.lower())
kw1_span = find_span(doc, kw1)
kw2_span = find_span(doc, kw2)
# Indexing with [0] raises IndexError if find_span returns an empty sequence
# — presumably it returns a list of spans; verify in tools.TextProcessing.
find_dependency_path_from_tree(doc, kw1_span[0], kw2_span[0])
# print(len(kw1_span))
# print(len(kw2_span))

## Online operations

In [None]:
def collect_sents_from_wiki_page(page:wikipedia.WikipediaPage):
    """Tokenize the text of a wikipedia page into sentences.

    The page summary and every section are processed, except the sections
    named in the skip list below and sections without text. Each text is
    lower-cased, has its whitespace collapsed to single spaces, and is split
    with ``my_sentence_tokenize``. Texts are visited in reverse order: the
    summary first, then the sections from last to first.

    Returns:
        A list with the sentences of all processed texts.
    """
    skipped_sections = ('See also', 'References', 'Further reading', 'Sources', 'External links')
    text_by_section = {title : page.section(title) for title in page.sections}
    text_by_section['summary'] = page.summary

    collected = []
    for title in reversed(list(text_by_section)):
        if title in skipped_sections:
            continue
        raw_text = text_by_section[title]
        if not raw_text:
            continue
        normalized = ' '.join(raw_text.lower().split())
        collected += my_sentence_tokenize(normalized, True)
    return list(collected)

def collect_entity_from_wiki_page(page:wikipedia.WikipediaPage):
    """Return the lower-cased form of every entry in ``page.links``, in order."""
    lowered_links = []
    for link_text in page.links:
        lowered_links.append(link_text.lower())
    return lowered_links

def collect_keyword_from_wiki_page(page:wikipedia.WikipediaPage):
    """Collect the lower-cased anchor texts of internal wiki links on a page.

    The page HTML is parsed and only anchors inside the
    ``<div class="mw-parser-output">`` block are considered. An anchor is kept
    when its serialised tag starts with ``<a href="/wiki/``, i.e. ``href`` is
    its first attribute and points to another wiki page.

    Returns:
        A set of lower-cased anchor texts; an empty set when the page has no
        ``mw-parser-output`` block (previously this raised AttributeError).
    """
    soup = BeautifulSoup(page.html(), 'html.parser')
    main_block = soup.find('div', class_='mw-parser-output')
    if main_block is None:
        return set()
    # find_all is the current name of the deprecated findAll alias.
    return {
        link.text.lower()
        for link in main_block.find_all('a')
        if re.match(r'^(<a href="/wiki/)', str(link))
    }



In [None]:
# Fetch one wikipedia page online, dump its sentences and link keywords, run
# the path filter over the sentences and keep the rows whose head and tail are
# both link keywords of the page.
keyword = 'python'

# NOTE(review): this rebinds `p`, which the first cell imports from
# extract_wiki; any later use of the imported `p` sees the page object instead.
# NOTE(review): confirm wikipedia.page can actually return None — the library
# may raise for missing/ambiguous titles instead, making the check below moot.
p = wikipedia.page(keyword)
if p is not None:
    sents = collect_sents_from_wiki_page(p)
    keywords = collect_keyword_from_wiki_page(p)
    print('sentences collected')
    my_write('%s.txt' % keyword, sents)
    # `keywords` is a set here, whereas the other my_write calls pass lists.
    my_write('%s_kw.txt' % keyword, keywords)
    # FIXME(review): filter_by_path is neither defined nor imported anywhere in
    # the visible notebook, so this line raises NameError on a fresh kernel.
    df = filter_by_path(sents)
    df.to_csv('%s_out.tsv' % keyword, sep='\t', index=False)

    dff = df[df.apply(lambda x: str(x['head']) in keywords and str(x['tail']) in keywords, axis=1)]
    dff.to_csv('%s_out_f.tsv' % keyword, sep='\t', index=False)

# Appendix

## Hand-crafted analysis

In [None]:
# Keep the rows of the sample frame whose 'sim' is non-negative (rows with a
# missing 'sim' do not satisfy the comparison and are dropped as well).
wiki_test_df = wiki_path_test_df[wiki_path_test_df['sim'] >= 0.0]

def match_path_pattern(path:str):
    """Return the first entry of ``patterns`` that ``exact_match`` accepts for
    ``path``, or an empty string when none does."""
    return next((candidate for candidate in patterns if exact_match(candidate, path)), '')

# Label every row with the hand-crafted pattern its path matches ('' if none).
# assign() returns a new frame, so nothing is written into the filtered slice
# of wiki_path_test_df (the in-place column assignment raised
# SettingWithCopyWarning); mapping the column also behaves on an empty frame.
# NOTE(review): this reads column 'path' while the pattern-frequency cell
# reads 'dep_path' — confirm which column the TSV actually contains.
wiki_test_df = wiki_test_df.assign(pattern=wiki_test_df['path'].map(match_path_pattern))

In [None]:
def analysis_path_result_sim_based(df:pd.DataFrame, paths:list):
    """Summarise how many rows match each pattern and their mean similarity.

    Args:
        df: frame with a 'pattern' column (matched pattern per row) and a
            numeric 'sim' column.
        paths: patterns to report, in output order.

    Returns:
        A DataFrame with columns 'path', 'cnt', 'ratio', 'avg_sim': one row
        per entry of ``paths`` followed by a final 'general' row covering the
        whole frame (ratio 1). 'ratio' is the share of rows of ``df`` that
        match the pattern; it and 'avg_sim' are 0 when there is nothing to
        divide by, so an empty ``df`` no longer raises ZeroDivisionError.
    """
    total = len(df)

    def _summary_row(label, rows, ratio):
        """Build one summary record for the given subset of rows."""
        count = len(rows)
        return {
            'path' : label,
            'cnt' : count,
            'ratio' : ratio,
            'avg_sim' : sum(rows['sim']) / count if count else 0,
        }

    # Collect plain records and build the frame once: DataFrame.append was
    # removed in pandas 2.0 and was quadratic when called in a loop.
    records = []
    for pp in paths:
        sub_df = df[df['pattern'] == pp]
        records.append(_summary_row(pp, sub_df, len(sub_df) / total if total else 0))
    records.append(_summary_row('general', df, 1))
    return pd.DataFrame(records, columns=['path', 'cnt', 'ratio', 'avg_sim'])

In [None]:
# Display the per-pattern summary table for the hand-crafted patterns.
analysis_path_result_sim_based(wiki_test_df, patterns)

In [None]:
def collect_example_sent_for_pattern(df:pd.DataFrame, path:str, num:int=30, posfix:str='.dat'):
    """Return up to ``num`` example rows of ``df`` that match a pattern.

    Args:
        df: frame with a 'pattern' column and a 'sent' column holding
            sentence notes.
        path: pattern whose rows are selected.
        num: maximum number of rows to return (the first ones are kept).
        posfix: file suffix forwarded to ``note2line`` when resolving a note.

    Returns:
        A new DataFrame, independent of ``df``, in which 'sent' holds the
        stripped text that each note resolves to. It is empty when no row
        matches ``path``.
    """
    # .copy() detaches the selection from `df`, so the column assignment below
    # neither warns about writing to a slice nor risks touching the caller's
    # frame.
    sub_df = df[df['pattern'] == path].head(num).copy()
    # Mapping the column (rather than a row-wise apply) keeps the result a
    # Series even when the selection is empty.
    sub_df['sent'] = sub_df['sent'].map(lambda note: note2line(note, posfix=posfix).strip())
    return sub_df

# Write up to 30 example rows (the function's default `num`) for every
# hand-crafted pattern to its own TSV file.
# NOTE(review): the file name is only the first 10 characters of the pattern,
# so patterns sharing a 10-character prefix overwrite each other's file —
# confirm whether that is acceptable.
for patt in patterns:
    temp_df = collect_example_sent_for_pattern(wiki_test_df, patt)
    temp_df.to_csv('%s.tsv' % (patt[:10] if len(patt) >= 10 else patt), index=False, sep='\t')

## Calculate sentence overlapping