In [51]:
# Setup elasticsearch and llama-index
import os
from dotenv import load_dotenv

from llama_index.core import VectorStoreIndex
from llama_index.vector_stores.elasticsearch import ElasticsearchStore

# Read the connection settings from the environment (after loading .env)
load_dotenv()
ES_URL = os.environ.get("ES_URL")
ES_USER = os.environ.get("ES_USER")
ES_PASSWORD = os.environ.get("ES_PASSWORD")
ES_CLOUD_ID = os.environ.get("ES_CLOUD_ID")
ES_API_KEY = os.environ.get("ES_API_KEY")
# Falls back to "matrixfilms" when ES_INDEX_NAME is not set
ES_INDEX_NAME = os.environ.get("ES_INDEX_NAME", "matrixfilms")

# Vector store bound to the configured index; every connection option is
# forwarded as-is (unset options are passed as None)
vector_store = ElasticsearchStore(
    es_url=ES_URL,
    es_cloud_id=ES_CLOUD_ID,
    es_api_key=ES_API_KEY,
    es_user=ES_USER,
    es_password=ES_PASSWORD,
    index_name=ES_INDEX_NAME,
)

# Build the LlamaIndex index on top of that store
index = VectorStoreIndex.from_vector_store(vector_store=vector_store)

In [52]:
# Download the test data

from pathlib import Path
from py7zr import SevenZipFile
import requests

def download_wiki(dump_file, timeout=60):
    """Download a wikia XML dump archive and extract it into the current directory.

    The archive is fetched from
    ``https://s3.amazonaws.com/wikia_xml_dumps/<c>/<cc>/<dump_file>.7z`` where
    ``<c>`` and ``<cc>`` are the first one and two characters of ``dump_file``.
    It is saved as ``<dump_file>.7z`` in the working directory and then
    unpacked with ``extractall(path=".")``.

    Args:
        dump_file: Name of the dump, e.g. ``"matrixfilms_pages_current.xml"``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the notebook forever.

    Raises:
        RuntimeError: If the server answers with a status code other than 200.
        requests.exceptions.RequestException: On connection errors or timeouts.
    """
    dump_url = f"https://s3.amazonaws.com/wikia_xml_dumps/{dump_file[:1]}/{dump_file[:2]}/{dump_file}.7z"
    dump_7z_file_path = f"{dump_file}.7z"

    # stream=True avoids holding the whole archive in memory; the context
    # manager releases the connection even if writing fails midway.
    with requests.get(dump_url, stream=True, timeout=timeout) as response:
        if response.status_code != 200:
            raise RuntimeError(f"Failed to download the file. HTTP Status Code: {response.status_code}")
        with open(dump_7z_file_path, "wb") as file:
            for chunk in response.iter_content(chunk_size=1024 * 1024):
                file.write(chunk)

    with SevenZipFile(dump_7z_file_path, mode="r") as archive:
        archive.extractall(path=".")

dump_file = "matrixfilms_pages_current.xml"

# Only fetch and unpack the archive when the dump is not already on disk
already_downloaded = Path(dump_file).exists()
if not already_downloaded:
    download_wiki(dump_file)

In [55]:
# Parse wiki documents

import wikitextparser as wtp
import xmltodict

def parse_wiki_xml(file_path, limit=None):
    """Parse a MediaWiki XML dump into a list of plain-text documents.

    The dump is processed with xmltodict in streaming mode (``item_depth=2``),
    so the callback sees one second-level element at a time (``siteinfo``,
    then each ``page``).

    Args:
        file_path: Path to the XML dump, read as UTF-8.
        limit: Maximum number of documents to collect. ``None`` (or 0) means
            no limit.

    Returns:
        A list of dicts shaped like
        ``{"content": <plain text>, "meta": {"title": <title>, "id": <int>}}``
        where ``id`` is the zero-based position of the document in the list.
        Pages whose title starts with ``"<namespace>:"`` for any namespace
        declared in ``siteinfo`` are skipped, as are pages without any text.
    """
    docs = []
    namespace_prefixes = []

    def extract_wikitext(revision):
        """Return the raw wikitext of a revision, or None when it has none."""
        text_elem = revision.get("text")
        if isinstance(text_elem, dict):
            # An element carrying attributes is represented as a dict whose
            # character data sits under "#text"; the key is absent when the
            # page body is empty.
            return text_elem.get("#text")
        return text_elem

    def handle_content(address, content):
        name = address[1][0]
        if name == "siteinfo":
            # We collect a set of namespaces that indicate special purpose wiki pages that we will ignore
            namespace_elems = content["namespaces"]["namespace"]
            if not isinstance(namespace_elems, list):
                # A lone child element is not wrapped in a list by xmltodict
                namespace_elems = [namespace_elems]
            for namespace_elem in namespace_elems:
                namespace = namespace_elem.get("#text")
                if namespace:
                    namespace_prefixes.append(namespace + ":")
        elif name == "page":
            title = content["title"]
            # Ignore special pages
            if title.startswith(tuple(namespace_prefixes)):
                return True
            revision = content.get("revision")
            if revision:
                text = extract_wikitext(revision)
                # Pages with no text cannot be parsed and carry nothing worth indexing
                if text:
                    # Use wikitextparser to extract the plain text of the page
                    plain_text = wtp.parse(text).plain_text()
                    docs.append({"content": plain_text, "meta": {"title": title, "id": len(docs)}})
        # Returning False makes xmltodict raise ParsingInterrupted, which stops the parse early
        return not limit or len(docs) < limit

    with open(file_path, "r", encoding="utf-8") as f:
        try:
            xmltodict.parse(f.read(), item_depth=2, item_callback=handle_content)
        except xmltodict.ParsingInterrupted:
            print("ParsingInterrupted... stopping...")
    return docs

# Parse the whole dump (no page limit)
docs = parse_wiki_xml(file_path=dump_file)

In [10]:
# Index documents in Elasticsearch
from llama_index.core import Document

# This takes about 10 minutes and will cost you about $0.07
for doc in docs:
    # The parser assigns integer ids, but Document ids are strings, so convert
    # explicitly instead of relying on implicit coercion.
    index.insert(Document(text=doc["content"], doc_id=str(doc["meta"]["id"]), extra_info=doc["meta"]))

In [56]:
# With the documents indexed, ask a question through a query engine
import textwrap

query_engine = index.as_query_engine()

# Wrap the answer text at 80 columns so it reads well in the cell output
battle_answer = query_engine.query("Tell me about battle of Zion")
print(textwrap.fill(battle_answer.response, width=80))


The Battle of Zion was a significant event that marked the climax of the
perennial First Machine war. It involved the Machines attempting to destroy
Zion, the last human city. The battle saw intense fighting as the Machines sent
a massive army of Sentinels and Diggers to breach Zion's defenses. Despite
initial setbacks, including the loss of key defensive hardware due to an EMP
blast, the humans managed to rally and make a last stand. Ultimately, the tide
turned when Kid opened Gate 3 just in time for the Mjolnir to enter and activate
an EMP, annihilating the first wave of Sentinels. This turn of events led to a
peace treaty between the humans and the Machines, ending the war and allowing
both Zion and the Matrix to coexist with certain agreements in place.


In [50]:
# Point a new vector store at the "<ES_INDEX_NAME>_v2" index, reusing the
# same connection settings as before
vector_store = ElasticsearchStore(
    es_url=ES_URL,
    es_cloud_id=ES_CLOUD_ID,
    es_api_key=ES_API_KEY,
    es_user=ES_USER,
    es_password=ES_PASSWORD,
    index_name=f"{ES_INDEX_NAME}_v2",
)

# Rebuild the index and query engine on top of the new store
index = VectorStoreIndex.from_vector_store(vector_store=vector_store)
query_engine = index.as_query_engine()

# Ask a question and print the answer wrapped at 80 columns
neo_answer = query_engine.query("Who is Neo?")
print(textwrap.fill(neo_answer.response, width=80))

Neo is a main protagonist in The Matrix franchise who was born as Thomas A.
Anderson. He was a former bluepill who was rescued by Morpheus and the crew of
the Nebuchadnezzar, becoming a redpill. Neo was prophesied by The Oracle to be
The One, tasked with freeing humanity from the Matrix and ending the Machine
War. Throughout the series, Neo displays exceptional combat abilities, a direct
connection to the Source, and the power to affect everything connected to it.
His true nature and powers gradually return to him over time, showcasing his
unique abilities and significance in the story.
