In [1]:
import pandas as pd
import json
import rich
from scrapy import Selector
import requests
from transformers import AutoTokenizer

In [5]:
tokenizer = AutoTokenizer.from_pretrained('meta-llama/Meta-Llama-3-8B')

def count_tokens(text):
    tokens = tokenizer.encode(text, add_special_tokens=False)
    return len(tokens)

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

In [20]:
def create_all_collection(url, api_key, knowledge_list):
    # Check if collections already exists
    full_url = url + ":8080/api/v1/knowledge/list"
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json"
    }
    response = requests.get(url=full_url, headers=headers)
    # Exit early if api call fails
    if response.status_code not in range(200,299):
        error_message = f"Recieved Non-Successful Status Code({response.status_code}), and message :{response.text}"
        print(error_message)
        return error_message

    # Parse response for list of exiting knowledges. See which are missing
    response_json = response.json()
    confirmed_knowledges = []
    for knowledge in response_json:
        confirmed_knowledges.append(knowledge["name"])
    missing_knowledges = list(set(knowledge_list) - set(confirmed_knowledges))
    print(f"Aleady existing knowledges: {confirmed_knowledges}")
    if missing_knowledges:
        print(f"Missing knowledges to create: {', '.join(missing_knowledges)}")
    else:
        print("There are no knowledges to create")
        return

    # Send Create Knowledge API call for each missing knowledge
    for missing_knowledge in missing_knowledges:
        create_knowledge_url = url + ":8080/api/v1/knowledge/create"
        data = {
            "name": missing_knowledge,
            "description": f"Create fextralife's '{missing_knowledge}' knowledge partition",
            "access_control": {
                "public": True,
            },
        }
        print(f"Attempting to create knowledge: {missing_knowledge}")
        create_response = requests.post(url=create_knowledge_url, json=data, headers=headers)
        if create_response.status_code not in range(200,299):
            error_message = f"Recieved Non-Successful Status Code({create_response.status_code}), and message :{create_response.text}"
            print(error_message)
            return error_message

        print(f"Creation Succeeded for knowledge: {missing_knowledge}")
        print(f"Confirmation response: {json.dumps(create_response.json(), indent=2)}")

url = 'http://localhost'
api_key = '<put your open web ui api key here>'
knowledge_list = ['Weapons', 'Armor', 'Items', 'Decorations', 'Misc']
create_all_collection(url, api_key, knowledge_list)

Aleady existing knowledges: []
Missing knowledges to create: Armor, Items, Misc, Decorations, Weapons
Attempting to create knowledge: Armor
Creation Succeeded for knowledge: Armor
Confirmation response: {
  "id": "0f5ef98b-5843-4d4e-8cf4-40455c76cea9",
  "user_id": "ff9229f1-f2ad-4cae-9855-9a68b3aad54c",
  "name": "Armor",
  "description": "Create fextralife's 'Armor' knowledge partition",
  "data": null,
  "meta": null,
  "access_control": {
    "public": true
  },
  "created_at": 1754400965,
  "updated_at": 1754400965,
  "files": null
}
Attempting to create knowledge: Items
Creation Succeeded for knowledge: Items
Confirmation response: {
  "id": "e99ec94f-5248-4f24-a6ac-493da8c4061a",
  "user_id": "ff9229f1-f2ad-4cae-9855-9a68b3aad54c",
  "name": "Items",
  "description": "Create fextralife's 'Items' knowledge partition",
  "data": null,
  "meta": null,
  "access_control": {
    "public": true
  },
  "created_at": 1754400967,
  "updated_at": 1754400967,
  "files": null
}
Attempting t

In [7]:
with open('./wikiproject/output/fextralife-monsterhunterwildswiki.jsonl') as f:
    data = [json.loads(line) for line in f]
    
df = pd.DataFrame(data)

In [56]:
def safe_list_get (l, idx, default):
  try:
    return l[idx]
  except IndexError:
    return default
      
df['first_breadcrumb'] = df['breadcrumb'].apply(lambda x: safe_list_get(x.split('/'), 1, 0)) 
df['second_breadcrumb'] = df['breadcrumb'].apply(lambda x: safe_list_get(x.split('/'), 2, 0)).astype(str)
df['combined_breadcrumb'] = df['first_breadcrumb'] + df['second_breadcrumb']

def count_tokens_in_wiki_content(series):
    return count_tokens(series.str.cat())

In [71]:
first_breadcrumb_list = df['first_breadcrumb'].unique().tolist()
second_breadcrumb_list = df['second_breadcrumb'].unique().tolist()

first_breadcrumb_count = dict.fromkeys(first_breadcrumb_list, 0)
second_breadcrumb_count = dict.fromkeys(second_breadcrumb_list, 0)
for index, row in df.iterrows():
    first_breadcrumb_count[row['first_breadcrumb']] += count_tokens(row['wiki_content'])
    second_breadcrumb_count[row['second_breadcrumb']] += count_tokens(row['wiki_content'])

print(json.dumps(first_breadcrumb_count, indent=2))
print(json.dumps(second_breadcrumb_count, indent=2))

{
  "World Information": 2182352,
  "": 30525,
  "Equipment": 21610541,
  "Character Information": 727701,
  "Challenge Quests": 1249,
  "Arena Quests": 1473,
  "General Information": 59870,
  "Guides & Walkthroughs": 70424
}
{
  "0": 317448,
  "Aquatic Life": 30837,
  "Monsters": 185677,
  "Weapons": 4634615,
  "Items": 5608612,
  "NPCs": 1201969,
  "Locations": 17932,
  "Quests": 513445,
  "Endemic Life": 166709,
  "Skills": 976120,
  "Armor": 4511913,
  "Buddies": 633307,
  "Event Quests": 14272,
  "Palico Weapons": 165357,
  "Status Effects": 39028,
  "Items & Equipment Loadout": 3456,
  "Weapon Mechanics": 8516,
  "Materials": 15226,
  "Talismans": 824130,
  "Decorations": 4664310,
  "Combat": 1952,
  "Pendants": 128655,
  "DLC": 12154,
  "Crafting": 3940,
  "Character Creation": 4555
}


In [74]:
for idx, val in first_breadcrumb_count.items():
    first_breadcrumb_count[idx] = first_breadcrumb_count[idx] / 24679933.0 * 100
    
for idx, val in second_breadcrumb_count.items():
    second_breadcrumb_count[idx] = second_breadcrumb_count[idx] / 24679933.0 * 100
print(json.dumps(first_breadcrumb_count, indent=2))
print(json.dumps(second_breadcrumb_count, indent=2))

{
  "World Information": 8.842617198353011,
  "": 0.12368348001593034,
  "Equipment": 87.5632077283192,
  "Character Information": 2.9485533854569215,
  "Challenge Quests": 0.005060791696638723,
  "Arena Quests": 0.005968411664650792,
  "General Information": 0.24258574770036856,
  "Guides & Walkthroughs": 0.2853492349432229
}
{
  "0": 1.2862595696673893,
  "Aquatic Life": 0.12494766497137574,
  "Monsters": 0.7523399678597182,
  "Weapons": 18.77887999128685,
  "Items": 22.7253939465719,
  "NPCs": 4.870227970229903,
  "Locations": 0.07265821993925185,
  "Quests": 2.0804148860533775,
  "Endemic Life": 0.6754840055684106,
  "Skills": 3.9551160856068774,
  "Armor": 18.281706842559096,
  "Buddies": 2.566080710186693,
  "Event Quests": 0.05782835796191181,
  "Palico Weapons": 0.6700058707614807,
  "Status Effects": 0.1581365719266742,
  "Items & Equipment Loadout": 0.014003279506471917,
  "Weapon Mechanics": 0.0345057662838874,
  "Materials": 0.061693846575677495,
  "Talismans": 3.3392716260

In [66]:
df['first_breadcrumb']

0       World Information
1       World Information
2       World Information
3       World Information
4       World Information
              ...        
4198            Equipment
4199            Equipment
4200            Equipment
4201            Equipment
4202                     
Name: first_breadcrumb, Length: 4203, dtype: object

In [30]:
df.head()

Unnamed: 0,url,title,breadcrumb,breadcrumb&title,wiki_content,updatedAt,first_breadcrumb,second_breadcrumb
0,https://monsterhunterwilds.wiki.fextralife.com...,Lore,/World Information,/World Information/Lore,\nIf the user's answer is answered by informat...,,,1.594099
1,https://monsterhunterwilds.wiki.fextralife.com...,Item+Trades,/World Information,/World Information/Item+Trades,\nIf the user's answer is answered by informat...,,,
2,https://monsterhunterwilds.wiki.fextralife.com...,NPCs,/World Information,/World Information/NPCs,\nIf the user's answer is answered by informat...,,,
3,https://monsterhunterwilds.wiki.fextralife.com...,Pop-up+Camps,/World Information,/World Information/Pop-up+Camps,\nIf the user's answer is answered by informat...,,,
4,https://monsterhunterwilds.wiki.fextralife.com...,Locations,/World Information,/World Information/Locations,\nIf the user's answer is answered by informat...,,,


In [12]:
count_tokens(df['wiki_content'].str.cat())

24679933

In [4]:
def get_breadcrumb(url, html_content):
    sel = Selector(text=html_content)
    url_end_route = url.split("/")[-1]
    breadcrumb_tags = "/" + "/".join([x for x in sel.css('div.breadcrumb-wrapper a::text').getall() if x != '+']) + "/" + url_end_route
    if breadcrumb_tags == "":
        return "/" + url_end_route
    return breadcrumb_tags

get_breadcrumb(df.url[10], df.content[10])
print(df.url[10])

https://monsterhunterwilds.wiki.fextralife.com/Mizutsune


In [5]:
from scrapy import Selector

sel = Selector(text=df.content[11])
html_node = sel.css('html')

wiki_tables = html_node.xpath('//table[@class="wiki_table"]').getall()

wiki_tables

['<table class="wiki_table"> \n   <tbody> \n    <tr> \n     <th colspan="2"> <h2>Zoh Shia</h2> </th> \n    </tr> \n    <tr> \n     <td style="text-align: center;" colspan="2"><img style=" float: ;width: 300px; height: 168px;" title="1000006916" src="/file/Monster-Hunter-Wilds/mhwilds-zoh_shia_render_001.png" alt="mhwilds zoh shia render 001" width="300" height="168"></td> \n    </tr> \n    <tr> \n     <td>Enemy Type</td> \n     <td><a class="wiki_link" title="Monster Hunter Wilds Large Monsters" href="/Large+Monsters" target="">Large Monster</a></td> \n    </tr> \n    <tr> \n     <td>Species</td> \n     <td><a class="wiki_link" title="Monster Hunter Wilds Constructs" href="/Constructs" target=""><img title="zoh_shia_monsters_mhwilds_wiki_guide24px" src="/file/Monster-Hunter-Wilds/zoh_shia_monsters_mhwilds_wiki_guide24px.png" alt="zoh shia monsters mhwilds wiki guide24px" width="24" height="24">Construct</a></td> \n    </tr> \n    <tr> \n     <td>Elements</td> \n     <td><img title="fir

In [6]:
def parse_table_with_selector(html):
    sel = Selector(text=html)
    tables = sel.xpath('//table[@class="wiki_table"]').getall()

    normalized_data = []

    for table_html in tables:
        table_sel = Selector(text=table_html)
        
        # Extract headers from <thead> if present, else first <tr>
        headers = []
        thead = table_sel.xpath('./thead')
        if thead:
            headers = thead.xpath('.//th//text()').getall()
            headers = [h.strip() for h in headers if h.strip()]
        else:
            first_tr = table_sel.xpath('.//tr')[0]
            headers = first_tr.xpath('./th//text() | ./td//text()').getall()
            headers = [h.strip() for h in headers if h.strip()]

        # Extract ros, skipping header row if no thead
        if thead:
            rows = table_sel.xpath('./tbody/tr')
        else:
            rows = table_sel.xpath('.//tr')[1:] # skip first header row

        data = []
        max_len = len(headers)

        for row in rows:
            cells = row.xpath('./th | ./td')
            row_data = []
            for cell in cells:
                # Check for nested table inside cell
                nested_table = cell.xpath('.//table')
                if nested_table:
                    nested_html = nested_table.get()
                    nested_html = parser_table_with_selector(nested_html)
                    row_data.append(nested_data)
                else: 
                    # Prefer alt or title if image present
                    img = cell.xpath('.//img')
                    if img:
                        alt = img.xpath('./@alt').get()
                        title = img.xpath('./@title').get()
                        text = alt or title or cell.xpath('string(.)').get()
                    else:
                        text = cell.xpath('string(.)').get()
                    row_data.append(text.strip() if text else '')

            max_len = max(max_len, len(row_data))
            data.append(row_data)

        # Pad headers or rows to max_len
        if len(headers) < max_len:
            headers += [f"Extra_{i}" for i in range(max_len - len(headers))]

        for r in data:
            r += [''] * (max_len - len(r))
            normalized_data.append(dict(zip(headers, r)))

    return normalized_data


# Example usage:
table_json = parse_table_with_selector(df.content[11])
import json
print(type(json.dumps(table_json,indent=2)))

<class 'str'>


In [7]:
def get_wiki_content(html_content):
    sel = Selector(text=html_content)
    wikicontent = (" ".join([x.strip() for x in sel.xpath('//div[@id="wiki-content-block"]//text()').getall()])).replace('\xa0', ' ')
    return wikicontent

get_wiki_content(df.content[10])

"       Mizutsune        Enemy Type  Large Monster    Species  Leviathans    Elements  Water    Ailments  Waterblight  Bubbleblight    Weakness  Thunder Dragon Ice    Resistances  Water Fire    Location(s)  Scarlet Forest Ruins of Wyveria       Mizutsune is a Large Monster in Monster Hunter Wilds (MHWilds) , added with First Free Title Update (April 3, 2025). Mizutsune is a Leviathan type monster and is weak to Thunder, Dragon, and Ice damage whilst resistant to Water and Fire. Large Monsters such as Mizutsune are hostile and are usually the primary objective of Hunts. They provide valuable Materials when defeated, that allow Hunters to craft gear.  Mizutsune Monster Guide: Characteristics, Weaknesses, Drops, Locations, Weapons & Armor, Strategies, Tips & Tricks, and more to help you defeat Mizutsune in MH Wilds.     Light blue bubbles Reflecting the moonlight. Fragile and ephemeral Like flowers blooming at night Traces of a creature Graceful in its precision Noble, yet beguiling An en

In [20]:
from rich.tree import Tree
from rich import print

def createPrintableTree(df):
    tree = Tree("monsterhunterwilds.wiki.fextralife.com:root")
    nodes = {}
    
    for index, row in df.iterrows():
        path = get_breadcrumb(row['url'], row['content']).replace('\xa0', ' ')
        parts = [p for p in path.split('/') if p]
        parent = tree
        partial = ""
        for part in parts:
            partial += "/" + part
            if partial not in nodes:
                nodes[partial] = parent.add(part)
            parent = nodes[partial]
    
    return tree

printableTree = createPrintableTree(df)

In [24]:
from rich.console import Console
console = Console(record=True)
console.print(printableTree)
output = console.export_text()
print(output)

In [9]:
df['breadcrumb&title'] = ""
for index, row in df.iterrows():
    path = get_breadcrumb(row['url'], row['content']).replace('\xa0', ' ')
    df.at[index, 'breadcrumb&title'] = path

df['breadcrumb&title'].head(30)

0                           //Monster+Hunter+Wilds+Wiki
1                           //Monster+Hunter+Wilds+Wiki
2     /World Information/Quests/Arena Quests/Demonic...
3     /World Information/Quests/Side Missions/Ultima...
4               /World Information/Quests/Side+Missions
5     /General Information/DLC/MHWilds Collaboration...
6     /Seasonal Events/Festival+of+Accord:+Blossomdance
7                /World Information/Locations/Grand+Hub
8            /World Information/Quests/Challenge+Quests
9                /World Information/Quests/Arena+Quests
10    /World Information/Monsters/Large Monsters/Miz...
11    /World Information/Monsters/Large Monsters/Zoh...
12              /General Information/DLC/Title+Update+1
13                         /World Information/NPCs/Alma
14                            /Equipment/Palico+Weapons
15    /Character Information/Buddies/Palico Equipmen...
16                               /Equipment/Weapons/Bow
17                      /Equipment/Weapons/Charg

In [10]:
df.url.value_counts()

url
https://monsterhunterwilds.wiki.fextralife.com/Monster+Hunter+Wilds+Wiki           2
https://monsterhunterwilds.wiki.fextralife.com/Demonic+Strength                    1
https://monsterhunterwilds.wiki.fextralife.com/Ultimate+Strength                   1
https://monsterhunterwilds.wiki.fextralife.com/Side+Missions                       1
https://monsterhunterwilds.wiki.fextralife.com/Street+Fighter+Collaboration        1
https://monsterhunterwilds.wiki.fextralife.com/Festival+of+Accord:+Blossomdance    1
https://monsterhunterwilds.wiki.fextralife.com/Grand+Hub                           1
https://monsterhunterwilds.wiki.fextralife.com/Challenge+Quests                    1
https://monsterhunterwilds.wiki.fextralife.com/Arena+Quests                        1
https://monsterhunterwilds.wiki.fextralife.com/Mizutsune                           1
https://monsterhunterwilds.wiki.fextralife.com/Zoh+Shia                            1
https://monsterhunterwilds.wiki.fextralife.com/Title+Update+1

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28 entries, 0 to 27
Data columns (total 3 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   url               28 non-null     object
 1   content           28 non-null     object
 2   breadcrumb&title  28 non-null     object
dtypes: object(3)
memory usage: 804.0+ bytes
