## Set up the environment and prepare the data

### Install the necessary python packages

In [None]:
%pip install pandas
%pip install numpy
%pip install whoosh

### Imports

In [41]:
import os
import pandas as pd
import re
from tqdm.notebook import tqdm

### Parse the wiki files into a dataframe and save it

In [20]:
def parse_wiki_file(content):
    titles, text = [], []
    start = -1
    content = re.sub("\[\[File: .*\]\]", "[[[File: \1]]]", content) # wrap file links in triple brackets to avoid parsing
    for match in re.finditer("^\[\[(.*)\]\]\n\n", content, re.MULTILINE):
        titles.append(match.group(1))
        if start > -1:
            text.append(re.sub("\[\[\[File: (.*)\]\]\]", "[[File: \1]]", content[start:match.start()]))
        start = match.end()
    text.append(content[start:])
    assert len(titles) == len(text)
    return list(zip(titles, text))

In [33]:
wiki_dir = "./data/wiki"
data = {"file_id": [], "title": [], "text": []}
redirects = {"file_id": [], "title": [], "text": [], "redirect": []}
for fname in [f for f in os.listdir(wiki_dir) if not f.startswith("._")]:
    with open(os.path.join(wiki_dir, fname), "r", encoding="utf-8") as file:
        content = file.read()
    
    fid = fname.split("-")[-1].replace(".txt", "")
    for title, text in parse_wiki_file(content):
        
        if text.startswith("#REDIRECT"):
            redirect = re.sub("\[tpl\].*\[/tpl\]", "", text.replace("#REDIRECT", "")).strip()
            redirects["file_id"].append(fid)
            redirects["title"].append(title)
            redirects["text"].append(text)
            redirects["redirect"].append(redirect)
        else:
            data["file_id"].append(fid)
            data["title"].append(title)
            data["text"].append(text)
wiki_df = pd.DataFrame(data)
wiki_redirects_df = pd.DataFrame(redirects)
wiki_df.head()

Unnamed: 0,file_id,title,text
0,5,BBS,BBS may refer to:\n\n==Technologies==\n\n==Org...
1,5,British Standards,CATEGORIES: 1901 establishments in the United ...
2,5,Building society,"CATEGORIES: Building societies, Cooperatives, ..."
3,5,Blue Steel (missile),CATEGORIES: Cold War air-to-surface missiles o...
4,5,Branch Davidians,"CATEGORIES: Branch Davidianism, Adventism, Chr..."


In [20]:
# manually fix small inconsistencies in the data
# wiki_df.loc[wiki_df["title"] == "Bell Curve", "redirect"] = "Bell curve"

In [42]:
# for each redirect, find the index of the redirected page for faster lookup
def find_redirect_index(redirect):
    redirect_row = wiki_df[wiki_df["title"] == redirect]
    if redirect_row.empty:
        return -1
    if redirect_row.shape[0] > 1:
        raise ValueError(f"Multiple redirects found for {redirect}")
    
    redirect_row = redirect_row.iloc[0]
    return redirect_row.index[0]
    
redirect_indexes = []
no_matches = []
for redirect in tqdm(wiki_redirects_df["redirect"]):
    index = find_redirect_index(redirect)
    if index == -1:
        no_matches.append(redirect)
    redirect_indexes.append(index)

  0%|          | 0/127899 [00:00<?, ?it/s]

In [4]:
wiki_df.to_pickle("./data/wiki.pkl")
len(wiki_df.index)

'n = 295581'

In [None]:
wiki_redirects_df.to_pickle("./data/wiki_redirects.pkl")
len(wiki_redirects_df.index)

### Parse the [questions.txt](./data/questions.txt) file into a dataframe and save it

In [None]:
with open("./data/questions.txt", "r") as file:
    questions = file.read().split("\n\n")
data = {"category": [], "question": []}
for question in questions:
    question = question.split("\n")
    data["category"].append(question[0])
    data["question"].append(" ".join(question[1:]))
questions_df = pd.DataFrame(data)
questions_df.head()

In [None]:
questions_df.to_pickle("./data/questions.pkl")
len(questions_df.index)