In [8]:
import pandas as pd
import wikipedia
from wikipedia.exceptions import DisambiguationError, PageError

import warnings
warnings.filterwarnings("ignore")

import sys
sys.path.append("../") # use utils

from utils import normalize_text

In [2]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('max_colwidth', 400)

init_data = pd.read_csv("../data_to_process.csv").query("data_source != 'wh_topics'")
init_data.head(3)

Unnamed: 0,data_source,query_id,description,query,label
0,2019,1,Can cranberries prevent urinary tract infections?,cranberries urinary tract infections,0.0
1,2019,3,Can acupuncture be effective for people with epilepsy?,acupuncture epilepsy,0.0
2,2019,5,Can acupuncture prevent migraines?,acupuncture migraine,1.0


In [3]:
df = pd.concat((
    pd.read_csv("google_search_wiki/google_search_wiki_query_part_1.csv"),
    pd.read_csv("google_search_wiki/google_search_wiki_query_part_2.csv")
)).reset_index(drop=True)
print(len(df))
df.head(5)

1113


Unnamed: 0,query_id,data_source,pubmed_id,description,query,label
0,1,2019,https://en.wikipedia.org/wiki/Cranberry,Can cranberries prevent urinary tract infections?,cranberries urinary tract infections,0.0
1,1,2019,https://en.wikipedia.org/wiki/Cranberry_juice,Can cranberries prevent urinary tract infections?,cranberries urinary tract infections,0.0
2,1,2019,https://en.wikipedia.org/wiki/Urinary_tract_infection,Can cranberries prevent urinary tract infections?,cranberries urinary tract infections,0.0
3,1,2019,https://en.wikipedia.org/wiki/Proanthocyanidin,Can cranberries prevent urinary tract infections?,cranberries urinary tract infections,0.0
4,1,2019,https://en.wikipedia.org/wiki/Talk%3ACranberry,Can cranberries prevent urinary tract infections?,cranberries urinary tract infections,0.0


In [4]:
def process_wiki_url(url):
    last = url.split("/")[-1]
    if "Talk%" in url or url.endswith(".jpg") or \
    ("en.wikipedia.org" not in url and "simple.wikipedia.org" not in url) \
    or "Wikipedia:Reference_desk" in url:
        return ""
    
    last = re.sub("\%[A-Z0-9]{2}", " ", last)
    last = re.sub(" +", " ", last)
    
    return last.replace("_", " ").strip().strip("-")

In [5]:
df["processed_url"] = df["pubmed_id"].apply(process_wiki_url)
df = df.query("processed_url != ''").reset_index(drop=True)
print(len(df))
df.head(3)

1032


Unnamed: 0,query_id,data_source,pubmed_id,description,query,label,processed_url
0,1,2019,https://en.wikipedia.org/wiki/Cranberry,Can cranberries prevent urinary tract infections?,cranberries urinary tract infections,0.0,Cranberry
1,1,2019,https://en.wikipedia.org/wiki/Cranberry_juice,Can cranberries prevent urinary tract infections?,cranberries urinary tract infections,0.0,Cranberry juice
2,1,2019,https://en.wikipedia.org/wiki/Urinary_tract_infection,Can cranberries prevent urinary tract infections?,cranberries urinary tract infections,0.0,Urinary tract infection


## Obtain wiki paragraphs

In [9]:
TOP_K_PER_QUERY = 10 # сколько макимально параграфов ищем для одного запроса
TOP_K_PER_ARTICLE = 2 # сколько максимально параграфов берем из одной статьи

def get_suitable_paragraphs(query, tokens_to_find, content, serp_pos, title, title_common_tokens, top_k_per_article=TOP_K_PER_ARTICLE):
    
    paragraphs = content.split("\n")
    
    result = {key: [] for key in [
        "query_processed",
        "article_title", "paragraph", "serp_position", 
        "title_common_tokens", "num_title_common_tokens", 
        "tokens_to_find", 
        "paragraph_common_tokens", "num_paragraph_common_tokens",
        "paragraph_number_in_article"
    ]}
    
    for i, p in enumerate(paragraphs):
        
        if not p or p.endswith("=="):
            continue
        
        normed_p = normalize_text(p)
        p_tokens = set(normed_p.split())
        
        paragraph_common_tokens = tokens_to_find & p_tokens
        
        if paragraph_common_tokens:
            result["query_processed"].append(query)
            result["article_title"].append(title)
            result["paragraph"].append(p)
            result["serp_position"].append(serp_pos)
            result["title_common_tokens"].append(list(title_common_tokens))
            result["num_title_common_tokens"].append(len(title_common_tokens))
            result["tokens_to_find"].append(list(tokens_to_find))
            result["paragraph_common_tokens"].append(list(paragraph_common_tokens))
            result["num_paragraph_common_tokens"].append(len(paragraph_common_tokens))
            result["paragraph_number_in_article"].append(i)

    return result


def get_paragraphs(query, wiki_article_names, top_k_per_query=TOP_K_PER_QUERY, top_k_per_article=TOP_K_PER_ARTICLE):
    
    normed_query = normalize_text(query)
    query_tokens = normed_query.split()
    
    fin_df = pd.DataFrame()
    
    for serp_pos, article_name in enumerate(wiki_article_names):
        try:
            wiki_page = wikipedia.page(article_name, auto_suggest=False)
        except DisambiguationError as e:
            print("Disambiguation error:", article_name)
        except PageError as e:
            print("PageError error:", article_name)
            continue
            
        title = wiki_page.title
        content = wiki_page.content
        
        normed_title = normalize_text(title)
        title_tokens = normed_title.split()

        tokens_to_find = set(query_tokens) - set(title_tokens)
        title_common_tokens = set(query_tokens) & set(title_tokens)

        if len(title_common_tokens) == 0: # ignore if no query tokens are presented in title
            continue
        
        result = get_suitable_paragraphs(query, tokens_to_find, content, serp_pos + 1, title, title_common_tokens)       

        result_df = (
            pd.DataFrame(result)
            .sort_values(
                by=["serp_position", "num_title_common_tokens", "num_paragraph_common_tokens", "paragraph_number_in_article"],
                ascending=[True, False, False, True]
            )
            .head(top_k_per_article)
        )
        fin_df = pd.concat((fin_df, result_df))
        
    if len(fin_df):
        fin_df = (
                pd.DataFrame(fin_df)
                .sort_values(
                    by=["serp_position", "num_title_common_tokens", "num_paragraph_common_tokens", "paragraph_number_in_article"],
                    ascending=[True, False, False, True]
                )
                .head(top_k_per_query)
            )

    return fin_df

## Use query

In [11]:
final_result_query = pd.DataFrame()

for _, group in pb(df.groupby(["query_id", "data_source"]), max_value=len(init_data)):
    q = group["query"].tolist()[0]
    articles = group.processed_url.tolist()
    res_df = get_paragraphs(q, articles)
    
    if len(res_df):
        res_df['query_id'] = group.query_id.tolist()[0]
        res_df['data_source'] = group.data_source.tolist()[0]
        res_df['label'] = group.label.tolist()[0]

        final_result_query = pd.concat((final_result_query, res_df))

  1% (2 of 113) |                        | Elapsed Time: 0:00:16 ETA:   0:14:05

Disambiguation error: Articles


  3% (4 of 113) |                        | Elapsed Time: 0:00:28 ETA:   0:21:20

Disambiguation error: DTOT


  4% (5 of 113) |#                       | Elapsed Time: 0:00:34 ETA:   0:10:10

Disambiguation error: Occlusion training


 18% (21 of 113) |####                   | Elapsed Time: 0:03:03 ETA:   0:10:44

PageError error: Special:Search?title=John Ebnezar&mobileaction=toggle view desktop


 27% (31 of 113) |######                 | Elapsed Time: 0:04:27 ETA:   0:11:22

PageError error: List of Reviews used in Wikipedia articles


 28% (32 of 113) |######                 | Elapsed Time: 0:04:33 ETA:   0:09:08

Disambiguation error: all


 29% (33 of 113) |######                 | Elapsed Time: 0:04:40 ETA:   0:08:24

PageError error: List of all articles


 30% (34 of 113) |######                 | Elapsed Time: 0:04:52 ETA:   0:16:30

PageError error: 5 -Reductase inhibitor


 31% (36 of 113) |#######                | Elapsed Time: 0:05:13 ETA:   0:12:39

Disambiguation error: Hoxsey
PageError error: File:FDAHoxsey.JPG


 35% (40 of 113) |########               | Elapsed Time: 0:05:48 ETA:   0:10:12

PageError error: List of Procter   Gamble brands


 41% (47 of 113) |#########              | Elapsed Time: 0:06:50 ETA:   0:09:57

PageError error: Athlete s foot


 42% (48 of 113) |#########              | Elapsed Time: 0:06:59 ETA:   0:09:35

PageError error: List of Guy s Grocery Games episodes


 45% (51 of 113) |##########             | Elapsed Time: 0:07:34 ETA:   0:12:45

PageError error: Music therapy for Alzheimer s disease


 48% (55 of 113) |###########            | Elapsed Time: 0:08:10 ETA:   0:10:53

PageError error: Lay s WOW chips


 54% (62 of 113) |############           | Elapsed Time: 0:09:28 ETA:   0:11:01

PageError error: Luden s


 56% (64 of 113) |#############          | Elapsed Time: 0:09:48 ETA:   0:09:05

PageError error: 3 -Methoxypregnenolone
PageError error: Acute and Emergency Care
PageError error: Brown-S quard syndrome


 58% (66 of 113) |#############          | Elapsed Time: 0:10:06 ETA:   0:06:57

PageError error: Recognized content


 59% (67 of 113) |#############          | Elapsed Time: 0:10:17 ETA:   0:08:45

PageError error: List of all articles


 61% (69 of 113) |##############         | Elapsed Time: 0:10:37 ETA:   0:06:36

PageError error: August-2012
PageError error: Hare   la royale


 66% (75 of 113) |###############        | Elapsed Time: 0:11:37 ETA:   0:05:24

PageError error: Fisherman s Friend


 72% (82 of 113) |################       | Elapsed Time: 0:12:44 ETA:   0:04:42

PageError error: Bordet Gengou agar


 84% (95 of 113) |###################    | Elapsed Time: 0:14:46 ETA:   0:02:44

PageError error: Kindling (sedative hypnotic withdrawal)


 91% (103 of 113) |####################  | Elapsed Time: 0:15:58 ETA:   0:01:31

PageError error: American Society for Metabolic   Bariatric Surgery


 92% (104 of 113) |####################  | Elapsed Time: 0:16:08 ETA:   0:01:27

PageError error: De Longhi


 96% (109 of 113) |##################### | Elapsed Time: 0:16:55 ETA:   0:00:36

PageError error: Unnatural Causes: Is Inequality Making Us Sick


 97% (110 of 113) |##################### | Elapsed Time: 0:17:04 ETA:   0:00:26

PageError error: medication)


100% (113 of 113) |######################| Elapsed Time: 0:17:31 Time:  0:17:31


In [14]:
final_result_query.to_csv("wiki_articles_from_google_query.csv", index=False)

## Use description

In [15]:
final_result_query = pd.DataFrame()

for _, group in pb(df.groupby(["query_id", "data_source"]), max_value=len(init_data)):
    q = group["description"].tolist()[0]
    articles = group.processed_url.tolist()
    res_df = get_paragraphs(q, articles)
    
    if len(res_df):
        res_df['query_id'] = group.query_id.tolist()[0]
        res_df['data_source'] = group.data_source.tolist()[0]
        res_df['label'] = group.label.tolist()[0]

        final_result_query = pd.concat((final_result_query, res_df))

  1% (2 of 113) |                        | Elapsed Time: 0:00:16 ETA:   0:13:56

Disambiguation error: Articles


  3% (4 of 113) |                        | Elapsed Time: 0:00:27 ETA:   0:17:55

Disambiguation error: DTOT


  4% (5 of 113) |#                       | Elapsed Time: 0:00:32 ETA:   0:10:03

Disambiguation error: Occlusion training


 18% (21 of 113) |####                   | Elapsed Time: 0:02:58 ETA:   0:10:39

PageError error: Special:Search?title=John Ebnezar&mobileaction=toggle view desktop


 27% (31 of 113) |######                 | Elapsed Time: 0:04:19 ETA:   0:10:43

PageError error: List of Reviews used in Wikipedia articles


 28% (32 of 113) |######                 | Elapsed Time: 0:04:26 ETA:   0:08:56

Disambiguation error: all


 29% (33 of 113) |######                 | Elapsed Time: 0:04:31 ETA:   0:07:35

PageError error: List of all articles


 30% (34 of 113) |######                 | Elapsed Time: 0:04:40 ETA:   0:11:53

PageError error: 5 -Reductase inhibitor


 31% (36 of 113) |#######                | Elapsed Time: 0:05:01 ETA:   0:14:49

Disambiguation error: Hoxsey
PageError error: File:FDAHoxsey.JPG


 35% (40 of 113) |########               | Elapsed Time: 0:05:35 ETA:   0:10:13

PageError error: List of Procter   Gamble brands


 41% (47 of 113) |#########              | Elapsed Time: 0:06:34 ETA:   0:09:05

PageError error: Athlete s foot


 42% (48 of 113) |#########              | Elapsed Time: 0:06:42 ETA:   0:09:17

PageError error: List of Guy s Grocery Games episodes


 45% (51 of 113) |##########             | Elapsed Time: 0:07:13 ETA:   0:10:15

PageError error: Music therapy for Alzheimer s disease


 48% (55 of 113) |###########            | Elapsed Time: 0:07:49 ETA:   0:10:05

PageError error: Lay s WOW chips


 54% (62 of 113) |############           | Elapsed Time: 0:08:58 ETA:   0:08:46

PageError error: Luden s


 56% (64 of 113) |#############          | Elapsed Time: 0:09:17 ETA:   0:08:27

PageError error: 3 -Methoxypregnenolone
PageError error: Acute and Emergency Care
PageError error: Brown-S quard syndrome


 58% (66 of 113) |#############          | Elapsed Time: 0:09:33 ETA:   0:05:35

PageError error: Recognized content


 59% (67 of 113) |#############          | Elapsed Time: 0:09:40 ETA:   0:05:35

PageError error: List of all articles


 61% (69 of 113) |##############         | Elapsed Time: 0:10:00 ETA:   0:07:11

PageError error: August-2012
PageError error: Hare   la royale


 66% (75 of 113) |###############        | Elapsed Time: 0:10:57 ETA:   0:05:17

PageError error: Fisherman s Friend


 72% (82 of 113) |################       | Elapsed Time: 0:11:59 ETA:   0:04:31

PageError error: Bordet Gengou agar


 84% (95 of 113) |###################    | Elapsed Time: 0:14:00 ETA:   0:02:32

PageError error: Kindling (sedative hypnotic withdrawal)


 91% (103 of 113) |####################  | Elapsed Time: 0:15:13 ETA:   0:01:41

PageError error: American Society for Metabolic   Bariatric Surgery


 92% (104 of 113) |####################  | Elapsed Time: 0:15:22 ETA:   0:01:21

PageError error: De Longhi


 96% (109 of 113) |##################### | Elapsed Time: 0:16:05 ETA:   0:00:35

PageError error: Unnatural Causes: Is Inequality Making Us Sick


 97% (110 of 113) |##################### | Elapsed Time: 0:16:15 ETA:   0:00:30

PageError error: medication)


100% (113 of 113) |######################| Elapsed Time: 0:16:43 Time:  0:16:43


In [16]:
final_result_query.to_csv("wiki_articles_from_google_description.csv", index=False)