In [1]:
import json
import string
import re
from progressbar import progressbar as pb
import pandas as pd
import wikipedia
from wikipedia.exceptions import DisambiguationError

import sys
sys.path.append("../../../experiments/") # use utils

from utils import normalize_text

## Obtain wiki paragraphs

In [2]:
TOP_K_PER_QUERY = 10 # maximum number of paragraphs kept for a single query
TOP_K_PER_ARTICLE = 2 # maximum number of paragraphs taken from a single wiki article

def get_suitable_paragraphs(query, tokens_to_find, content, serp_pos, title, title_common_tokens, top_k_per_article=TOP_K_PER_ARTICLE):
    """Collect the paragraphs of one article that share tokens with the query.

    The article text is split on newlines. Empty lines and lines ending in
    "==" (presumably wiki section headings -- confirm against the wikipedia
    package output) are skipped. Every remaining paragraph is normalized with
    `normalize_text`, whitespace-tokenized and intersected with
    `tokens_to_find`; paragraphs with a non-empty intersection are kept.

    Args:
        query: query string, stored verbatim in the "query_processed" column.
        tokens_to_find: set of tokens looked for in each paragraph.
        content: full article text.
        serp_pos: value stored in the "serp_position" column.
        title: article title, stored in the "article_title" column.
        title_common_tokens: collection of tokens shared by query and title.
        top_k_per_article: accepted for interface compatibility; not used in
            the body of this function.

    Returns:
        dict mapping column name -> list, one list entry per kept paragraph
        (all lists are empty when nothing matched).
    """
    columns = (
        "query_processed",
        "article_title", "paragraph", "serp_position",
        "title_common_tokens", "num_title_common_tokens",
        "tokens_to_find",
        "paragraph_common_tokens", "num_paragraph_common_tokens",
        "paragraph_number_in_article",
    )

    matches = []
    for index, text in enumerate(content.split("\n")):
        # Skip blank lines and lines that end with "==".
        if not text or text.endswith("=="):
            continue

        overlap = tokens_to_find & set(normalize_text(text).split())
        if not overlap:
            continue

        matches.append({
            "query_processed": query,
            "article_title": title,
            "paragraph": text,
            "serp_position": serp_pos,
            # Fresh list per row so rows never share a mutable object.
            "title_common_tokens": list(title_common_tokens),
            "num_title_common_tokens": len(title_common_tokens),
            "tokens_to_find": list(tokens_to_find),
            "paragraph_common_tokens": list(overlap),
            "num_paragraph_common_tokens": len(overlap),
            # Index within the newline-split content, skipped lines included.
            "paragraph_number_in_article": index,
        })

    # Transpose the row records into the column-oriented dict callers expect.
    return {column: [row[column] for row in matches] for column in columns}


def get_paragraphs(query, top_k_per_query=TOP_K_PER_QUERY, top_k_per_article=TOP_K_PER_ARTICLE):
    """Search Wikipedia for `query` and return the best-matching paragraphs.

    The query is normalized with `normalize_text` and sent to
    `wikipedia.search` (up to 10 article names). For each article whose title
    shares at least one token with the query, paragraphs containing the
    remaining query tokens are collected via `get_suitable_paragraphs`, ranked,
    and truncated to `top_k_per_article`. The per-article results are then
    combined, ranked again and truncated to `top_k_per_query`.

    Ranking order: serp_position (ascending), num_title_common_tokens
    (descending), num_paragraph_common_tokens (descending),
    paragraph_number_in_article (ascending).

    Args:
        query: raw query string.
        top_k_per_query: maximum number of rows in the returned frame.
        top_k_per_article: maximum number of paragraphs kept per article.

    Returns:
        pandas.DataFrame with one row per selected paragraph; an empty
        DataFrame when nothing matched.
    """
    sort_columns = ["serp_position", "num_title_common_tokens", "num_paragraph_common_tokens", "paragraph_number_in_article"]
    sort_ascending = [True, False, False, True]

    normed_query = normalize_text(query)
    query_tokens = set(normed_query.split())

    wiki_article_names = wikipedia.search(normed_query, results=10)

    # Collect per-article frames and concatenate once at the end instead of
    # growing a DataFrame inside the loop.
    article_frames = []

    for serp_pos, article_name in enumerate(wiki_article_names, start=1):
        try:
            wiki_page = wikipedia.page(article_name, auto_suggest=False)
        except DisambiguationError:
            print("Disambiguation error:", article_name)
            # Skip this article: without `continue` the code below would use
            # an unbound `wiki_page` (NameError) or the page left over from
            # the previous iteration (duplicated paragraphs).
            continue

        title = wiki_page.title
        content = wiki_page.content

        title_tokens = set(normalize_text(title).split())

        tokens_to_find = query_tokens - title_tokens
        title_common_tokens = query_tokens & title_tokens

        if not title_common_tokens:  # ignore if no query tokens are present in the title
            continue

        result = get_suitable_paragraphs(
            query, tokens_to_find, content, serp_pos, title, title_common_tokens,
            top_k_per_article=top_k_per_article,
        )

        result_df = (
            pd.DataFrame(result)
            .sort_values(by=sort_columns, ascending=sort_ascending)
            .head(top_k_per_article)
        )
        # Empty frames add nothing and only trigger pandas concat warnings.
        if len(result_df):
            article_frames.append(result_df)

    if not article_frames:
        return pd.DataFrame()

    return (
        pd.concat(article_frames)
        .sort_values(by=sort_columns, ascending=sort_ascending)
        .head(top_k_per_query)
    )

## Keywords

In [1]:
# Load the table of queries to process (path is relative to the working directory).
df = pd.read_csv("../../../data/data_to_process.csv")
# Preview the first three rows.
df.head(3)

Unnamed: 0,data_source,query_id,description,query,label
0,2019,1,Can cranberries prevent urinary tract infections?,cranberries urinary tract infections,0.0
1,2019,3,Can acupuncture be effective for people with e...,acupuncture epilepsy,0.0
2,2019,5,Can acupuncture prevent migraines?,acupuncture migraine,1.0


In [None]:
# Retrieve Wikipedia paragraphs for every keyword query (`query` column) and
# tag each result row with the metadata of the query it came from.
keyword_frames = []

for _, info in pb(df.iterrows(), max_value=len(df)):
    # Key access avoids clashes between column names and Series attributes.
    res_df = get_paragraphs(info["query"])

    if len(res_df):
        # `.assign` returns a new frame, so the frame returned by
        # get_paragraphs is not mutated in place.
        keyword_frames.append(
            res_df.assign(
                query_id=info["query_id"],
                data_source=info["data_source"],
                label=info["label"],
            )
        )

# Concatenate once at the end; repeated concat inside the loop is quadratic.
final_result_query = pd.concat(keyword_frames) if keyword_frames else pd.DataFrame()

In [6]:
# Persist the keyword-query results (row index is not written).
final_result_query.to_csv("../../../data/wikipedia_articles_keywords.csv", index=False)

## Question

In [None]:
# Retrieve Wikipedia paragraphs for every full-text question (`description`
# column) and tag each result row with the metadata of its source row.
description_frames = []

for _, info in pb(df.iterrows(), max_value=len(df)):
    # Key access avoids clashes between column names and Series attributes.
    res_df = get_paragraphs(info["description"])

    if len(res_df):
        # `.assign` returns a new frame, so the frame returned by
        # get_paragraphs is not mutated in place.
        description_frames.append(
            res_df.assign(
                query_id=info["query_id"],
                data_source=info["data_source"],
                label=info["label"],
            )
        )

# Concatenate once at the end; repeated concat inside the loop is quadratic.
final_result_description = pd.concat(description_frames) if description_frames else pd.DataFrame()

In [9]:
# Persist the question-based results. This must write
# `final_result_description`; writing `final_result_query` here would store a
# copy of the keyword results under the question file name.
final_result_description.to_csv("../../../data/wikipedia_articles_question.csv", index=False)