## Set up the environment and prepare the data

### Install the necessary Python packages

In [1]:
%pip install pandas
%pip install numpy
%pip install whoosh
%pip install nltk
%pip install sklearn

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.
Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.
Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.
Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.
Defaulting to user installation because normal site-packages is not writeable
Collecting sklearn
  Using cached sklearn-0.0.post12.tar.gz (2.6 kB)
  Preparing metadata (setup.py) ... [?25lerror
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py egg_info[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m [31m[15 lines of output][0m
  [31m   [0m

In [2]:
import nltk
# Download the WordNet corpus via NLTK's downloader (NLTK reports when the
# local copy is already up to date instead of fetching it again).
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/oberon/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

### Imports

In [3]:
import os
import pandas as pd
import re

### Parse the wiki files into a dataframe and save it

In [4]:
def parse_wiki_file(content):
    """Split the raw text of one wiki dump file into ``(title, text)`` pairs.

    A page heading is a line of the form ``[[Title]]`` that starts at the
    beginning of a line and is immediately followed by an empty line.  Lines
    that contain a ``[[File:`` or ``[[Image:`` link are embedded media, not
    headings, and are left inside the text of the page they belong to.

    Parameters
    ----------
    content : str
        Full text of a wiki dump file.

    Returns
    -------
    list[tuple[str, str]]
        One ``(title, text)`` tuple per heading, in file order.  ``text`` is
        the unmodified slice of ``content`` between the end of the heading
        (including its trailing blank line) and the start of the next heading
        (or the end of the file).  Anything before the first heading is
        discarded.  Returns an empty list when no heading is found.
    """
    # Raw string so that "\[" reaches the regex engine as written instead of
    # relying on Python passing an invalid string escape through unchanged.
    heading_re = re.compile(r"^\[\[(.*)\]\]\n\n", re.MULTILINE)
    media_markers = ("[[File:", "[[Image:")

    # Filter media lines out of the heading candidates rather than masking and
    # later un-masking them in the text: every page (including the last one)
    # keeps its original text byte-for-byte.
    headings = [
        m for m in heading_re.finditer(content)
        if not any(marker in m.group(0) for marker in media_markers)
    ]

    pages = []
    for pos, heading in enumerate(headings):
        # A page runs until the next heading, or to the end of the file.
        if pos + 1 < len(headings):
            stop = headings[pos + 1].start()
        else:
            stop = len(content)
        pages.append((heading.group(1), content[heading.end():stop]))
    return pages

In [5]:
# Read every wiki dump file, split it into pages, and sort the pages into
# regular articles (`data`) and redirect stubs (`redirects`).
wiki_dir = "./data/wiki"
data = {"file_id": [], "title": [], "text": []}
redirects = {"file_id": [], "title": [], "text": [], "redirect": []}

# Raw string: "\[" inside a plain literal is an invalid escape sequence.
tpl_pattern = re.compile(r"\[tpl\].*\[/tpl\]")

# os.listdir() returns entries in arbitrary order; sorting keeps the row order
# (and therefore which duplicate drop_duplicates retains) stable across runs.
# Entries starting with "._" are skipped (presumably macOS sidecar files).
wiki_files = sorted(f for f in os.listdir(wiki_dir) if not f.startswith("._"))

for fname in wiki_files:
    with open(os.path.join(wiki_dir, fname), "r", encoding="utf-8") as file:
        content = file.read()

    # File id = last "-"-separated token of the file name, without ".txt".
    fid = fname.split("-")[-1].replace(".txt", "")
    for title, text in parse_wiki_file(content):
        if text.startswith("#REDIRECT"):
            # Redirect target = the text after "#REDIRECT" with any
            # [tpl]...[/tpl] span removed and surrounding whitespace stripped.
            redirect = tpl_pattern.sub("", text.replace("#REDIRECT", "")).strip()
            redirects["file_id"].append(fid)
            redirects["title"].append(title.title())
            redirects["text"].append(text)
            redirects["redirect"].append(redirect)
        else:
            data["file_id"].append(fid)
            data["title"].append(title.title())
            data["text"].append(text)

# Titles are stored title-cased; keep the first occurrence of each.
wiki_df = pd.DataFrame(data).drop_duplicates(subset="title")
# A list (not a tuple) is the unambiguous way to name several subset columns.
wiki_redirects_df = pd.DataFrame(redirects).drop_duplicates(subset=["title", "redirect"])
wiki_df.head()

Unnamed: 0,file_id,title,text
0,14,Howard Hughes,"CATEGORIES: Howard Hughes, 1905 births, 1976 d..."
1,14,Hook Of Holland,"CATEGORIES: Rotterdam, Boroughs of Rotterdam, ..."
2,14,Hugh Binning,"CATEGORIES: 1627 births, 1653 deaths, Scottish..."
3,14,"Henry Home, Lord Kames","CATEGORIES: 1696 births, 1782 deaths, 18th-cen..."
4,14,Harwich,"CATEGORIES: Harwich, Port cities and towns in ..."


In [6]:
# Map each page title to its row label in wiki_df.
wiki_lookups = dict(zip(wiki_df["title"], wiki_df.index))
# Every title must have produced its own key (no title overwrote another).
assert len(wiki_lookups) == len(wiki_df)

# for each redirect, find the index of the redirected page for faster lookup
def find_redirect_index(redirect, lookups=None):
    """Return the row label of the page a redirect points to, or -1.

    Parameters
    ----------
    redirect : str
        Redirect target exactly as extracted from the ``#REDIRECT`` line.
    lookups : dict, optional
        Mapping of page title -> row label.  Defaults to the notebook-level
        ``wiki_lookups`` dictionary.

    Returns
    -------
    The row label stored for the target page, or ``-1`` when the target is
    not present in ``lookups``.

    Notes
    -----
    Page titles are stored title-cased (``str.title()``) while redirect
    targets keep their original casing, so a target such as
    ``"List of horse breeds"`` is retried in its title-cased form when the
    exact spelling is not found.
    """
    if lookups is None:
        lookups = wiki_lookups
    if redirect in lookups:
        return lookups[redirect]
    # Fall back to the same normalization that was applied to the page titles.
    # TODO consider searching redirect on wikipedia to find real title
    return lookups.get(redirect.title(), -1)
    
# Resolve every redirect target and store the result next to the redirect rows
# (-1 marks targets that could not be resolved).
redirect_indexes = list(map(find_redirect_index, wiki_redirects_df["redirect"]))
wiki_redirects_df["redirect_index"] = redirect_indexes
wiki_redirects_df.head()

Unnamed: 0,file_id,title,text,redirect,redirect_index
0,14,Hydroelectric Plant,#REDIRECT Hydroelectricity\n\n\n,Hydroelectricity,-1
1,14,Horse Breed,#REDIRECT List of horse breeds\n\n\n,List of horse breeds,-1
2,14,Horse Breeds,#REDIRECT list of horse breeds\n\n\n,list of horse breeds,-1
3,14,Melody Dominated Homophony,#REDIRECT Homophony\n\n\n,Homophony,-1
4,14,Historic List Of Cities Of Europe,#REDIRECT List of largest European cities in h...,List of largest European cities in history,-1


In [10]:
# Write both frames to disk as pickles.
for frame, target in (
    (wiki_df, "./data/wiki.pkl"),
    (wiki_redirects_df, "./data/wiki_redirects.pkl"),
):
    frame.to_pickle(target)

# Row counts: (pages, redirects)
(len(wiki_df), len(wiki_redirects_df))

(151589, 126231)

### Parse the [questions.txt](./data/questions.txt) file into a dataframe and save it

In [8]:
# Read the questions file. Records are separated by a blank line; within a
# record, line 0 is the category, line 1 the question and line 2 the answer.
# The encoding is given explicitly (as for the wiki files) so that decoding
# does not depend on the platform's default locale.
with open("./data/questions.txt", "r", encoding="utf-8") as file:
    questions = file.read().rstrip("\n").split("\n\n")

data = {"category": [], "question": [], "answer": []}
for question in questions:
    lines = question.split("\n")
    data["category"].append(lines[0])
    data["question"].append(lines[1])
    # Answers are title-cased, the same normalization applied to wiki titles.
    data["answer"].append(lines[2].title())
questions_df = pd.DataFrame(data)
questions_df.head()

Unnamed: 0,category,question,answer
0,NEWSPAPERS,"The dominant paper in our nation's capital, it...",The Washington Post
1,OLD YEAR'S RESOLUTIONS,The practice of pre-authorizing presidential u...,Taiwan
2,NEWSPAPERS,Daniel Hertzberg & James B. Stewart of this pa...,The Wall Street Journal
3,BROADWAY LYRICS,"Song that says, ""you make me smile with my hea...",My Funny Valentine
4,POTPOURRI,In 2011 bell ringers for this charity started ...,The Salvation Army|Salvation Army


In [9]:
# Write the questions dataframe to disk in pickle format.
questions_df.to_pickle("./data/questions.pkl")