In [None]:
import pandas as pd
import json
import rich
from scrapy import Selector
import requests
from transformers import AutoTokenizer

In [None]:
# Tokenizer of the 'meta-llama/Meta-Llama-3-8B' checkpoint; every token count in
# this notebook goes through it.
tokenizer = AutoTokenizer.from_pretrained('meta-llama/Meta-Llama-3-8B')

def count_tokens(text):
    """Return the number of tokens `text` encodes to, special tokens excluded."""
    return len(tokenizer.encode(text, add_special_tokens=False))

In [None]:
def create_all_collection(url, api_key, knowledge_list):
    """Create every knowledge named in `knowledge_list` that the server lacks.

    Lists the existing knowledges with ``GET {url}:8080/api/v1/knowledge/list``
    and then sends one ``POST {url}:8080/api/v1/knowledge/create`` per missing
    name. Progress and server responses are printed.

    Args:
        url: Scheme and host without the port, e.g. ``'http://localhost'``.
        api_key: Token sent as ``Authorization: Bearer <api_key>``.
        knowledge_list: Names of the knowledges that should exist.

    Returns:
        The error message string as soon as any API call answers with a
        non-2xx status (remaining knowledges are not attempted); otherwise
        ``None``.
    """
    # Check if collections already exist
    full_url = url + ":8080/api/v1/knowledge/list"
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json"
    }
    response = requests.get(url=full_url, headers=headers)
    # Exit early if the api call fails. Explicit bounds cover the whole 2xx
    # range; `range(200, 299)` treated status 299 as a failure.
    if not 200 <= response.status_code < 300:
        error_message = f"Recieved Non-Successful Status Code({response.status_code}), and message :{response.text}"
        print(error_message)
        return error_message

    # Parse response for the list of existing knowledges. See which are missing
    response_json = response.json()
    print(json.dumps(response_json, indent=2))
    confirmed_knowledges = [knowledge["name"] for knowledge in response_json]
    existing_names = set(confirmed_knowledges)
    # dict.fromkeys drops duplicate names while keeping the caller's order, so
    # knowledges are created in a deterministic order (a set difference is not).
    missing_knowledges = [
        name for name in dict.fromkeys(knowledge_list) if name not in existing_names
    ]
    print(f"Aleady existing knowledges: {confirmed_knowledges}")
    if missing_knowledges:
        print(f"Missing knowledges to create: {', '.join(missing_knowledges)}")
    else:
        print("There are no knowledges to create")
        return

    # Send Create Knowledge API call for each missing knowledge
    create_knowledge_url = url + ":8080/api/v1/knowledge/create"
    for missing_knowledge in missing_knowledges:
        data = {
            "name": missing_knowledge,
            "description": f"Create fextralife's '{missing_knowledge}' knowledge partition",
            "access_control": {
                "public": True,
            },
        }
        print(f"Attempting to create knowledge: {missing_knowledge}")
        create_response = requests.post(url=create_knowledge_url, json=data, headers=headers)
        if not 200 <= create_response.status_code < 300:
            error_message = f"Recieved Non-Successful Status Code({create_response.status_code}), and message :{create_response.text}"
            print(error_message)
            return error_message

        print(f"Creation Succeeded for knowledge: {missing_knowledge}")
        print(f"Confirmation response: {json.dumps(create_response.json(), indent=2)}")

# Server base URL; create_all_collection appends ":8080" and the API path itself.
url = 'http://localhost'
# Placeholder only: substitute a real API key before running, and do not commit
# the real key with the notebook.
api_key = '<pagste your openw web ui api key here>'
# Names of the knowledges that should exist on the server.
knowledge_list = ['Weapons', 'Armor', 'Items', 'Decorations', 'Misc']
create_all_collection(url, api_key, knowledge_list)

In [None]:
# Load the JSON Lines export: one JSON object per line, one DataFrame row each.
# The encoding is passed explicitly so decoding does not depend on the platform
# default, and blank lines are skipped instead of crashing json.loads.
with open('./wikiproject/output/fextralife-monsterhunterwildswiki.jsonl', encoding='utf-8') as f:
    data = [json.loads(line) for line in f if line.strip()]

df = pd.DataFrame(data)

In [None]:
# Derive a document path per row from its breadcrumb and title: every "/" is
# turned into "-" and leading/trailing "-" are trimmed from the file stem.
df['doc_filepath'] = (
    "./wikiproject/output/documents/"
    + (
        df['breadcrumb'].str.replace("/", "-")
        + "-"
        + df['title']
    ).str.replace("/", "-").str.strip("-")
    + ".txt"
)
# Path stored under index label 100, kept as a sample for the upload cell.
example_filepath = df['doc_filepath'][100]

In [None]:
def upload_file_to_openwebui_knowledge(url, api_key, filepath, knowledge_id=None):
    """Upload a local file to Open WebUI and optionally attach it to a knowledge.

    Args:
        url: Scheme and host without the port, e.g. ``'http://localhost'``.
        api_key: Token sent as ``Authorization: Bearer <api_key>``.
        filepath: Path of the file to upload.
        knowledge_id: Optional id of the knowledge the uploaded file should be
            added to. When omitted, the file is only uploaded.

    Returns:
        None. Server responses and failures are reported with ``print``.
    """
    full_url = url + ":8080/api/v1/files/"
    headers = {
        'Authorization': f'Bearer {api_key}',
        'Accept': 'application/json'
    }

    # The context manager closes the file handle even when the request raises;
    # previously the handle was opened inline and never closed.
    with open(filepath, 'rb') as file_handle:
        upload_response = requests.post(url=full_url, headers=headers, files={'file': file_handle})
    # Explicit bounds cover the whole 2xx range (range(200, 299) excluded 299).
    if not 200 <= upload_response.status_code < 300:
        print("Oopsies, the upload failed!")
        print(upload_response.text)
        return
    upload_response_json = upload_response.json()
    print(json.dumps(upload_response_json, indent=2))

    if knowledge_id is None:
        return

    # NOTE(review): endpoint and payload follow the Open WebUI API docs
    # (POST /api/v1/knowledge/{id}/file/add with {"file_id": ...}); the uploaded
    # file's id is assumed to be under "id" in the upload response. Confirm
    # against the server version in use.
    add_to_knowledge_url = f"{url}:8080/api/v1/knowledge/{knowledge_id}/file/add"
    add_to_knowledge_response = requests.post(
        url=add_to_knowledge_url,
        headers=headers,
        json={"file_id": upload_response_json["id"]},
    )
    if not 200 <= add_to_knowledge_response.status_code < 300:
        print("Oopsies, adding the file to the knowledge failed!")
        print(add_to_knowledge_response.text)
        return
    print(json.dumps(add_to_knowledge_response.json(), indent=2))
    
# Upload the sample document. The key below is a placeholder: substitute a real
# API key before running, and do not commit the real key with the notebook.
url = 'http://localhost'
api_key = "<paste your open web ui api key here>"
upload_file_to_openwebui_knowledge("http://localhost", api_key, example_filepath)

In [None]:
def get_files_to_openwebui(url, api_key):
    """Fetch the server's file list and print it as indented JSON.

    Args:
        url: Scheme and host without the port, e.g. ``'http://localhost'``.
        api_key: Token sent as ``Authorization: Bearer <api_key>``.

    Returns:
        None. The listing, or the failure text, is printed.
    """
    full_url = url + ":8080/api/v1/files/"
    headers = {
        'Authorization': f'Bearer {api_key}',
        'Accept': 'application/json'
    }

    filelist_response = requests.get(url=full_url, headers=headers)
    # Explicit bounds cover the whole 2xx range (range(200, 299) excluded 299).
    if not 200 <= filelist_response.status_code < 300:
        # This call lists files; the old message wrongly reported a failed upload.
        print("Oopsies, listing the files failed!")
        print(filelist_response.text)
        return
    print(json.dumps(filelist_response.json(), indent=2))
    
# List the files known to the server. The key below is a placeholder: substitute
# a real API key before running, and do not commit the real key with the notebook.
url = 'http://localhost'
api_key = "<paste your open web ui api key here>"
get_files_to_openwebui("http://localhost", api_key)

In [None]:
def safe_list_get(l, idx, default):
    """Return ``l[idx]``, or `default` when that index raises IndexError."""
    try:
        item = l[idx]
    except IndexError:
        item = default
    return item
      
# Parts 1 and 2 of each breadcrumb split on "/" (part 0 is the text before the
# first "/"), falling back to 0 when the breadcrumb has too few parts.
# Both columns are cast to str: without the cast a fallback 0 stays an int in
# `first_breadcrumb`, and the string concatenation below raises TypeError.
df['first_breadcrumb'] = df['breadcrumb'].apply(lambda x: safe_list_get(x.split('/'), 1, 0)).astype(str)
df['second_breadcrumb'] = df['breadcrumb'].apply(lambda x: safe_list_get(x.split('/'), 2, 0)).astype(str)
df['combined_breadcrumb'] = df['first_breadcrumb'] + df['second_breadcrumb']

def count_tokens_in_wiki_content(series):
    """Count the tokens of all strings in `series` joined into one text without a separator."""
    combined_text = series.str.cat()
    return count_tokens(combined_text)

In [None]:
# Total token count of `wiki_content` per first-level and per second-level
# breadcrumb value.
first_breadcrumb_list = df['first_breadcrumb'].unique().tolist()
second_breadcrumb_list = df['second_breadcrumb'].unique().tolist()

first_breadcrumb_count = dict.fromkeys(first_breadcrumb_list, 0)
second_breadcrumb_count = dict.fromkeys(second_breadcrumb_list, 0)
for index, row in df.iterrows():
    # Tokenize each page once and reuse the count for both tallies; the page
    # was previously tokenized twice per row.
    row_token_count = count_tokens(row['wiki_content'])
    first_breadcrumb_count[row['first_breadcrumb']] += row_token_count
    second_breadcrumb_count[row['second_breadcrumb']] += row_token_count

print(json.dumps(first_breadcrumb_count, indent=2))
print(json.dumps(second_breadcrumb_count, indent=2))

In [None]:
# Rescale both tallies in place to a percentage of 24679933.0.
# NOTE(review): 24679933.0 is presumably the corpus-wide token total - confirm.
# Because the dicts are overwritten, running this cell twice rescales them twice.
for counts in (first_breadcrumb_count, second_breadcrumb_count):
    for idx in counts:
        counts[idx] = counts[idx] / 24679933.0 * 100

print(json.dumps(first_breadcrumb_count, indent=2))
print(json.dumps(second_breadcrumb_count, indent=2))

In [None]:
# Display the `first_breadcrumb` column.
df['first_breadcrumb']

In [None]:
# Preview the first rows of df.
df.head()

In [None]:
# Token count of every `wiki_content` value joined into a single string.
count_tokens(df['wiki_content'].str.cat())

In [None]:
def get_breadcrumb(url, html_content):
    """Build a "/"-separated breadcrumb path for a page.

    The path consists of the link texts found under ``div.breadcrumb-wrapper``
    (entries that are exactly ``'+'`` are skipped) followed by the last
    "/"-separated segment of `url`.

    Args:
        url: Page URL; only its last "/"-separated segment is used.
        html_content: HTML text of the page.

    Returns:
        A string such as ``"/A/B/page"``, or ``"/page"`` when the page has no
        breadcrumb links.
    """
    sel = Selector(text=html_content)
    url_end_route = url.split("/")[-1]
    crumbs = [x for x in sel.css('div.breadcrumb-wrapper a::text').getall() if x != '+']
    # Test the crumb list itself: the assembled string always starts with "/",
    # so comparing it to "" could never detect a page without breadcrumbs and
    # such pages came out as "//page".
    if not crumbs:
        return "/" + url_end_route
    return "/" + "/".join(crumbs) + "/" + url_end_route

# Try get_breadcrumb on one page. NOTE: the returned breadcrumb is not shown,
# because the call is not the last expression of the cell; only the URL is printed.
get_breadcrumb(df.url[10], df.content[10])
print(df.url[10])

In [None]:
from scrapy import Selector

# Inspect the raw HTML of every <table class="wiki_table"> on one page.
sel = Selector(text=df.content[11])
html_node = sel.css('html')

# The XPath starts with "//", so it searches the whole document.
wiki_tables = html_node.xpath('//table[@class="wiki_table"]').getall()

# Last expression of the cell: displays the list of table HTML strings.
wiki_tables

In [None]:
def parse_table_with_selector(html):
    """Parse every ``<table class="wiki_table">`` in `html` into row dictionaries.

    Args:
        html: HTML text to search.

    Returns:
        A list with one dict per data row of every matched table, in document
        order. Keys are the table's header texts; when a row has more cells
        than there are headers, the missing names are ``Extra_<i>``. Rows
        shorter than the widest row are padded with ``''``. A cell holding an
        image yields the image's ``alt``, else its ``title``, else the cell
        text. A cell holding a nested table yields the list returned by this
        function for that nested table's HTML (so ``[]`` unless the nested
        table itself has class ``wiki_table``).
    """
    sel = Selector(text=html)
    normalized_data = []

    # Iterate the matched <table> elements directly so that the relative paths
    # "./thead" and "./tbody/tr" resolve against the table. Re-parsing each
    # table's HTML put the document root in its place, so "./thead" never
    # matched and the <thead> branch was dead code.
    for table_sel in sel.xpath('//table[@class="wiki_table"]'):
        # NOTE: ".//tr" also reaches rows of tables nested inside this one.
        all_rows = table_sel.xpath('.//tr')

        # Extract headers from <thead> if present, else from the first <tr>
        thead = table_sel.xpath('./thead')
        if thead:
            header_texts = thead.xpath('.//th//text()').getall()
            # Accept rows placed directly under <table> as well as under <tbody>.
            rows = table_sel.xpath('./tbody/tr | ./tr')
        elif all_rows:
            header_texts = all_rows[0].xpath('./th//text() | ./td//text()').getall()
            rows = all_rows[1:]  # skip the first (header) row
        else:
            # A table without any row has nothing to report (this used to
            # raise IndexError).
            continue
        headers = [h.strip() for h in header_texts if h.strip()]

        data = []
        max_len = len(headers)

        for row in rows:
            row_data = []
            for cell in row.xpath('./th | ./td'):
                # Check for a nested table inside the cell
                nested_table = cell.xpath('.//table')
                if nested_table:
                    # Fixed: the recursive call misspelled this function's name
                    # and appended a variable that was never assigned.
                    nested_data = parse_table_with_selector(nested_table.get())
                    row_data.append(nested_data)
                else:
                    # Prefer alt or title if an image is present
                    img = cell.xpath('.//img')
                    if img:
                        alt = img.xpath('./@alt').get()
                        title = img.xpath('./@title').get()
                        text = alt or title or cell.xpath('string(.)').get()
                    else:
                        text = cell.xpath('string(.)').get()
                    row_data.append(text.strip() if text else '')

            max_len = max(max_len, len(row_data))
            data.append(row_data)

        # Pad headers and rows to max_len so every row dict has the same keys
        if len(headers) < max_len:
            headers += [f"Extra_{i}" for i in range(max_len - len(headers))]

        for r in data:
            r += [''] * (max_len - len(r))
            normalized_data.append(dict(zip(headers, r)))

    return normalized_data


# Example usage: parse the wiki tables of one page.
table_json = parse_table_with_selector(df.content[11])
import json
# Prints the type of the serialized result (a str), not the JSON text itself.
print(type(json.dumps(table_json,indent=2)))

In [None]:
def get_wiki_content(html_content):
    """Return the text found under ``div#wiki-content-block`` as one string.

    Each text node is stripped, the nodes are joined with single spaces, and
    non-breaking spaces (``\\xa0``) are then replaced by regular spaces.
    """
    text_nodes = Selector(text=html_content).xpath('//div[@id="wiki-content-block"]//text()').getall()
    joined = " ".join(node.strip() for node in text_nodes)
    return joined.replace('\xa0', ' ')

# Try the extractor on one page; the result is displayed as the cell output.
get_wiki_content(df.content[10])

In [None]:
from rich.tree import Tree
from rich import print

def createPrintableTree(df):
    """Build a rich ``Tree`` with one node per distinct breadcrumb path prefix.

    For every row, the breadcrumb path is computed from the row's ``url`` and
    ``content`` values, and one node is added per path segment under its
    parent; a prefix seen before reuses its existing node.
    """
    root = Tree("monsterhunterwilds.wiki.fextralife.com:root")
    node_by_prefix = {}

    for _, page in df.iterrows():
        breadcrumb = get_breadcrumb(page['url'], page['content']).replace('\xa0', ' ')
        cursor = root
        prefix = ""
        for segment in breadcrumb.split('/'):
            # Empty segments come from leading or doubled slashes.
            if not segment:
                continue
            prefix = prefix + "/" + segment
            node = node_by_prefix.get(prefix)
            if node is None:
                node = cursor.add(segment)
                node_by_prefix[prefix] = node
            cursor = node

    return root

# Build the tree for every row of df.
printableTree = createPrintableTree(df)

In [None]:
from rich.console import Console
# record=True makes the console keep what it prints so it can be exported below.
console = Console(record=True)
console.print(printableTree)
# Plain-text copy of what the console printed; printing it shows the tree a second time.
output = console.export_text()
print(output)

In [None]:
# Breadcrumb path of every page, with non-breaking spaces replaced by regular
# spaces. The column is built in one pass and assigned as a whole: writing it
# cell by cell with `df.at` inside `iterrows` is slow and overwrites every row
# that shares an index label.
df['breadcrumb&title'] = [
    get_breadcrumb(page_url, page_html).replace('\xa0', ' ')
    for page_url, page_html in zip(df['url'], df['content'])
]

df['breadcrumb&title'].head(30)

In [None]:
# Number of rows per URL; a count above 1 means the URL appears more than once.
df.url.value_counts()

In [None]:
# Summary of df: columns, non-null counts and dtypes.
df.info()