## Set up the environment and prepare the data

### Install the necessary Python packages

In [1]:
# Install all dependencies with one pip invocation so they are resolved together
# (a single resolver pass instead of five independent ones).
%pip install pandas numpy whoosh nltk scikit-learn

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [2]:
import nltk

# Fetch the WordNet corpus; the call is a no-op when the package is already up to date.
nltk.download("wordnet")

[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

### Imports

In [3]:
from env import env

import os
import pandas as pd
import re
import pickle
import numpy as np

from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer

### Parse the wiki files into a dataframe and save it

In [4]:
def parse_wiki_file(content):
    """Split the raw text of one wiki file into ``(title, text)`` pairs.

    An article starts at a line that consists solely of ``[[Title]]`` and is
    followed by a blank line. Its text is everything up to the next such line,
    or up to the end of ``content`` for the last article. Anything that precedes
    the first title line is discarded.

    Args:
        content: Full text of one wiki file.

    Returns:
        A list of ``(title, text)`` tuples in order of appearance. The list is
        empty when ``content`` contains no title line.
    """
    # Temporarily rewrite [[File:...]] / [[Image:...]] links as ---File:...--- /
    # ---Image:...--- so a link sitting on its own line is not parsed as a title.
    content = re.sub(r"\[\[File:(.*)\]\]", r"---File:\1---", content)
    content = re.sub(r"\[\[Image:(.*)\]\]", r"---Image:\1---", content)

    def restore_links(segment):
        # Undo the masking applied above.
        segment = re.sub(r"---File:(.*)---", r"[[File:\1]]", segment)
        return re.sub(r"---Image:(.*)---", r"[[Image:\1]]", segment)

    titles, text = [], []
    start = -1
    for match in re.finditer(r"^\[\[(.*)\]\]\n\n", content, re.MULTILINE):
        titles.append(match.group(1))
        if start > -1:
            # Everything between the previous title and this one belongs to the previous article.
            text.append(restore_links(content[start:match.start()]))
        start = match.end()

    if not titles:
        # Without a title line there is no article to return (and start is still -1,
        # so slicing from it would produce a bogus one-character "text").
        return []

    # The last article runs to the end of the file; its links must be restored as well.
    text.append(restore_links(content[start:]))
    assert len(titles) == len(text)
    return list(zip(titles, text))

In [5]:
wiki_dir = "./raw_data/wiki" # this directory is only stored locally
data = {"file_id": [], "title": [], "text": []}
redirects = {"file_id": [], "title": [], "text": [], "redirect": []}

# os.listdir returns entries in arbitrary order; sort them so that the row order, and
# therefore the row drop_duplicates keeps for a repeated title, is the same on every run.
# Names starting with "._" are skipped (presumably macOS resource-fork files - confirm).
for fname in sorted(f for f in os.listdir(wiki_dir) if not f.startswith("._")):
    with open(os.path.join(wiki_dir, fname), "r", encoding="utf-8") as file:
        content = file.read()

    # The file id is the part after the last "-" in the file name, without ".txt".
    fid = fname.split("-")[-1].replace(".txt", "")
    for title, text in parse_wiki_file(content):
        if text.startswith("#REDIRECT"):
            # The redirect target is what remains after dropping the marker and any [tpl]...[/tpl] span.
            redirect = re.sub(r"\[tpl\].*\[/tpl\]", "", text.replace("#REDIRECT", "")).strip()
            redirects["file_id"].append(fid)
            redirects["title"].append(title.title())
            redirects["text"].append(text)
            redirects["redirect"].append(redirect)
        else:
            data["file_id"].append(fid)
            data["title"].append(title.title())
            data["text"].append(text)

wiki_df = pd.DataFrame(data).drop_duplicates(subset="title")
wiki_redirects_df = pd.DataFrame(redirects).drop_duplicates(subset=["title", "redirect"])
wiki_df.head()

Unnamed: 0,file_id,title,text
0,6,Continuum Hypothesis,"CATEGORIES: Forcing (mathematics), Independenc..."
1,6,Çevik Bir,"CATEGORIES: 1939 births, People from Izmir, Li..."
2,6,Collectivism,"CATEGORIES: Collectivism, Collaboration, Corpo..."
3,6,Nepeta,"CATEGORIES: Lamiaceae, Flora of Africa, Flora ..."
4,6,Cumin,"CATEGORIES: Edible Apiaceae, Medicinal plants ..."


In [6]:
# Map every article title to its row label in wiki_df.
wiki_lookups = dict(zip(wiki_df["title"], wiki_df.index))
# Every row must have produced its own key, i.e. no title overwrote another.
assert len(wiki_df.index) == len(wiki_lookups)

# for each redirect, find the index of the redirected page for faster lookup
def find_redirect_index(redirect, lookups=None):
    """Return the index of the page that ``redirect`` points to, or -1 if unknown.

    Args:
        redirect: Title of the redirect target as written in the redirect page.
        lookups: Optional mapping of title -> index. Defaults to the notebook-level
            ``wiki_lookups``.

    Returns:
        The index stored for the target title, or -1 when no page matches.
    """
    if lookups is None:
        lookups = wiki_lookups
    if redirect in lookups:
        return lookups[redirect]
    # Page titles are stored title-cased (str.title()), while redirect targets keep
    # their original casing ("Monetary policy"), so retry with the same normalization.
    # Use -1 as the sentinel rather than a falsy check: index 0 is a valid result.
    return lookups.get(redirect.title(), -1) # TODO consider searching redirect on wikipedia to find real title
    
# Resolve every redirect target once and keep the result as a column for fast lookup later.
redirect_indexes = list(map(find_redirect_index, wiki_redirects_df["redirect"]))
wiki_redirects_df["redirect_index"] = redirect_indexes
wiki_redirects_df.head()

Unnamed: 0,file_id,title,text,redirect,redirect_index
0,6,Capitalist,#REDIRECT Capitalism\n\n\n,Capitalism,-1
1,6,Cross Cutting,#REDIRECT Cross-cutting\n\n\n,Cross-cutting,-1
2,6,Monetary Policy Of Central Banks,#REDIRECT Monetary policy\n\n\n,Monetary policy,-1
3,6,Cryptography/Hashfunction,#REDIRECT Hash function\n\n\n,Hash function,-1
4,6,Cryptography/Key,#REDIRECT Key (cryptography)\n\n\n,Key (cryptography),-1


In [7]:
# Persist both frames under env.data_dir.
wiki_pickle_path = f"{env.data_dir}/wiki.pkl"
wiki_redirects_pickle_path = f"{env.data_dir}/wiki_redirects.pkl"
wiki_df.to_pickle(wiki_pickle_path)
wiki_redirects_df.to_pickle(wiki_redirects_pickle_path)

# Row counts of the saved frames: (pages, redirects).
wiki_df.shape[0], wiki_redirects_df.shape[0]

(64211, 53084)

### Parse the [questions.txt](./data/questions.txt) file into a dataframe and save it

In [8]:
# Read as UTF-8 explicitly, like the wiki files, so the result does not depend on the
# platform's default encoding.
with open("./raw_data/questions.txt", "r", encoding="utf-8") as file:
    # Entries are separated by one blank line; trailing newlines are stripped first so
    # the last entry does not pick up empty lines.
    questions = file.read().rstrip("\n").split("\n\n")

data = {"category": [], "question": [], "answer": []}
for entry in questions:
    # Each entry holds three lines: category, question text, answer.
    entry_lines = entry.split("\n")
    data["category"].append(entry_lines[0])
    data["question"].append(entry_lines[1])
    # Answers are title-cased, the same normalization the wiki page titles receive.
    data["answer"].append(entry_lines[2].title())
questions_df = pd.DataFrame(data)
questions_df.head()

Unnamed: 0,category,question,answer
0,NEWSPAPERS,"The dominant paper in our nation's capital, it...",The Washington Post
1,OLD YEAR'S RESOLUTIONS,The practice of pre-authorizing presidential u...,Taiwan
2,NEWSPAPERS,Daniel Hertzberg & James B. Stewart of this pa...,The Wall Street Journal
3,BROADWAY LYRICS,"Song that says, ""you make me smile with my hea...",My Funny Valentine
4,POTPOURRI,In 2011 bell ringers for this charity started ...,The Salvation Army|Salvation Army


In [9]:
# Persist the parsed questions under env.data_dir.
questions_pickle_path = f"{env.data_dir}/questions.pkl"
questions_df.to_pickle(questions_pickle_path)

### Count the frequency of each term in the dataset

In [10]:
lemmatizer = WordNetLemmatizer()
count_vectorizer = CountVectorizer()

# Document-term counts over all article texts, then summed per vocabulary term.
count_matrix = count_vectorizer.fit_transform(wiki_df.text)
terms = count_vectorizer.get_feature_names_out()
total_counts = np.asarray(count_matrix.sum(axis=0)).ravel()

# Several vocabulary terms can share one lemma, so accumulate their counts under it.
term_counts = {}
for vocab_term, corpus_count in zip(terms, total_counts):
    lemma = lemmatizer.lemmatize(vocab_term.lower())
    term_counts[lemma] = term_counts.get(lemma, 0) + corpus_count

# Drop the large intermediates; only term_counts is needed from here on.
del count_matrix, total_counts, terms, count_vectorizer

with open(f"{env.data_dir}/term_counts.pkl", "wb") as out_file:
    pickle.dump(term_counts, out_file)