In [None]:
'''
 * Copyright 2023 LLM-Info (?????????????????????)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 '''

# Generating Consistent and High Quality Infoboxes with LLMs

In [18]:
# imports
import ast  # for converting embeddings saved as strings back to arrays
import openai  # for calling the OpenAI API
import pandas as pd  # for storing text and embeddings data
from scipy import spatial  # for calculating vector similarities for search
from bs4 import BeautifulSoup
import requests

# models
# Name of the OpenAI embedding model; in this notebook it is only referenced
# from the disabled code kept inside fake_vdb.
EMBEDDING_MODEL = "text-embedding-ada-002"
# Default chat model used by api_call() and ask().
GPT_MODEL = "gpt-3.5-turbo"

# NOTE(review): the wildcard import hides which names `config` provides.
# OPENAI_API_KEY below presumably comes from it — confirm, and consider
# importing it by name.
from config import *
openai.api_key = OPENAI_API_KEY

# Vector Database

Given raw Wikipedia text as input, this step returns the most similar infobox template.

NOTE: Open question — should the template be represented as a plain list of field names, or in Wikipedia's proper infobox format?

In [None]:
# search function - replace with vdb function
def fake_vdb(
    query: str,
    templates: list,
    relatedness_fn=lambda x, y: 1 - spatial.distance.cosine(x, y),
    top_n: int = 1
):
    """Placeholder for the vector-database lookup: return the first template.

    No similarity search is performed. The intended implementation (embed
    ``query`` with ``EMBEDDING_MODEL``, score every candidate with
    ``relatedness_fn``, and keep the ``top_n`` best) is not wired in yet, so
    ``query``, ``relatedness_fn`` and ``top_n`` are accepted only to keep the
    signature stable and are currently ignored.

    Args:
        query: Text to find a template for (ignored).
        templates: Candidate templates; must contain at least one element.
        relatedness_fn: Similarity function over two embeddings (ignored).
        top_n: Number of results wanted (ignored).

    Returns:
        ``templates[0]``.

    Raises:
        IndexError: If ``templates`` is empty.
    """
    # Same exception type as indexing an empty sequence, but with a message
    # that points at the actual problem.
    if len(templates) == 0:
        raise IndexError("fake_vdb needs at least one template to return")
    return templates[0]


In [2]:
import re
import os
import chromadb
import numpy as np
from tqdm import tqdm
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.document_loaders import TextLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.docstore.document import Document

In [3]:
# Open the persistent Chroma client rooted at ./QueryPipeline/ (path is
# relative to the notebook's working directory).
client = chromadb.PersistentClient(path="./QueryPipeline/")

# Handle to the "infoboxes" collection. It is not used again in the cells
# below; presumably this doubles as a check that the collection exists —
# TODO confirm.
collection = client.get_collection(name="infoboxes")

# LangChain wrapper over the same client and collection; this is the object
# retrieve_template_from_vdb() queries.
# NOTE(review): the embedding model here presumably has to match the one used
# when the collection was populated — confirm.
langchain_chroma = Chroma(
    client=client,
    collection_name="infoboxes",
    embedding_function=SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2"),
)
print("Success!")

  from .autonotebook import tqdm as notebook_tqdm


Success!


In [4]:
def retrieve_template_from_vdb(query):
    """Look up infobox templates similar to ``query`` in the Chroma store.

    Delegates to ``langchain_chroma.similarity_search_with_score`` and returns
    its result unchanged. In the recorded outputs this is a list of
    ``(Document, score)`` pairs; the scores there increase down the list, so
    lower presumably means "closer" — TODO confirm.
    """
    return langchain_chroma.similarity_search_with_score(query)

In [5]:
def retrieve_template_from_object(template):
    """Build an empty Wikipedia-style infobox from one search result.

    Args:
        template: A sequence whose first element exposes
            ``metadata['source']`` (e.g. one ``(Document, score)`` pair from
            ``retrieve_template_from_vdb``). The source text is expected to
            contain ``"Infobox_<name>"`` followed by whitespace-separated
            field names.

    Returns:
        A string of the form::

            {{Infobox_<name>
            | field_1 = 
            | field_2 = 
            }}

    Raises:
        ValueError: If ``"Infobox_"`` does not occur in the source text.
    """
    source = template[0].metadata['source']  # extract the infobox template
    source = source[source.index("Infobox_"):]  # drop the repeated title text before the template name

    # Split on any whitespace run rather than a single space: the source text
    # ends with a newline, and splitting on ' ' left that newline glued to the
    # last field name, producing a broken "| field\n = " line.
    name, *fields = source.split()

    # construct template consistent with wikipedia's format
    lines = ["{{" + name]
    lines.extend(f'| {field} = ' for field in fields)
    lines.append("}}")
    return "\n".join(lines)

In [15]:
# Example
# A long sample article is assigned to `query` first, but it is overwritten
# by the short query a few lines below, so only 'character' is searched.
query = """ taxobox taxobox
The Rhacophoridae are a family of frogs that occur in tropical sub-Saharan Africa, South India and Sri Lanka, Japan, northeastern India to eastern China and Taiwan, south through the Philippines and Greater Sundas, and Sulawesi. They are commonly known as shrub frogs, or more ambiguously as "moss frogs" or "bush frogs". Some Rhacophoridae are called "tree frogs". Among the most spectacular members of this family are numerous "flying frogs".

Although a few groups are primarily terrestrial, rhacophorids are predominantly arboreal treefrogs. Mating frogs, while in amplexus, hold on to a branch, and beat their legs to form a foam. The eggs are laid in the foam and covered with seminal fluid before the foam hardens into a protective casing. In some species, this is done in a large group. The foam is laid above a water source so the tadpoles fall into the water once they hatch.[1]

The species within this family vary in size from 1.5 to 12 cm (0.59 to 4.72 in).[1] Like other arboreal frogs, they have toe discs, and those of the genus Chiromantis have two opposable fingers on each hand. This family also contains the Old World flying frogs, including Wallace's flying frog (Rhacophorus nigropalmatus). These frogs have extensive webbing between their fore and hind limbs, allowing them to glide through the air.[2]

Taxonomy
Evolution
The Rhacophoridae are the sister group to the Mantellidae, a family of frogs restricted to Madagascar. Both families are thought to have diverged during the Paleocene, although previous studies estimated a Cretaceous divergence. Two different hypotheses for this divergence have been proposed: one that the Mantellidae and Rhacophoridae diverged when Insular India broke from Madagascar, with the Rhacophoridae colonizing the rest of Asia following the collision of India with Asia, and the other proposing that the common ancestors of both families inhabited Asia, with the ancestral Mantellidae colonizing Madagascar from India via long-distance dispersal, using India as a stepping stone.[3][4]
"""
# Overrides the article text above; this is the query actually sent.
query = 'character'
# Last expression of the cell, so the raw search results are displayed.
retrieve_template_from_vdb(query)

[(Document(page_content='Infobox_comics_character', metadata={'source': 'Infobox comics character Infobox comics character Infobox comics character Infobox comics character Infobox comics character  Infobox_comics_character character_name image imagesize image_size alt caption publisher debut creators voiced_by first_series first_episode first_comic real_name alter alter_ego full_name full species homeworld alliances affiliations supports aliases powers partners IOM_alter_ego IOM_full_name IOM_alliances IOM_partners IOM_aliases IOM_powers sortkey subcat cat hero villain altcat addcharcat1 addcharcat2 addcharcat3 addcharcat4 addcharcat5 addcharcat6 noimage converted\n'}),
  0.9493862390518188),
 (Document(page_content='Infobox_comics_character_and_title', metadata={'source': 'Infobox comics character and title Infobox comics character and title Infobox comics character and title Infobox comics character and title Infobox comics character and title  Infobox_comics_character_and_title cha

# LLM Querying

As input, we provide the raw Wikipedia text and the infobox template. As output, we receive the generated infobox.

In [17]:
def formulate_query(
    article: str,
    infobox_template: str
) -> str:
    """Assemble the LLM prompt for filling out an infobox.

    The prompt is three pieces concatenated in order: a task introduction,
    the article text and the infobox template (each under its own label), and
    a closing instruction.

    Args:
        article: Context text the infobox should be filled from.
        infobox_template: Empty infobox template to be completed.

    Returns:
        The full prompt string.
    """
    prompt_parts = (
        "Your task is to fill out a Wikipedia infobox. Below, you are given some context text and the infobox template.\n",
        f"\n\n Context text: {article}\n\n Infobox Template: {infobox_template} \n\n",
        'Fill out the Wikipedia infobox. Feel free to add any fields that you think would be important to know. Remember, keep the infobox concise, accurate, and of good quality.',
    )
    return "".join(prompt_parts)

def api_call(message, model: str = GPT_MODEL):
    """Send ``message`` as a single user turn and return the reply text.

    Args:
        message: Prompt text, sent as the only message with role "user".
        model: Chat model name passed to the API; defaults to ``GPT_MODEL``.

    Returns:
        The ``content`` of the first choice in the response.
    """
    completion = openai.ChatCompletion.create(
        model=model,
        messages=[{"role": "user", "content": message}],
        temperature=0
    )
    first_choice = completion["choices"][0]
    return first_choice["message"]["content"]

def ask(
    article: str,
    infobox_template: str,
    model: str = GPT_MODEL,
    token_budget: int = 4096 - 500,
    print_message: bool = False,
) -> str:
    """Build the infobox prompt and return the (currently stubbed) reply.

    Args:
        article: Context text for the infobox.
        infobox_template: Empty infobox template to fill.
        model: Chat model name; unused while the API call is disabled.
        token_budget: Accepted but not used by this function.
        print_message: When true, print the assembled prompt.

    Returns:
        The placeholder string ``'lol'``; no model is queried.
    """
    prompt = formulate_query(article, infobox_template)
    if print_message:
        print(prompt)

    # TODO: the real call, api_call(prompt, model), is disabled; a fixed
    # placeholder is returned in its place.
    return 'lol'

# Pipeline

In [55]:
# "What links here" listing for Template:Infobox comics character.
link = 'https://en.wikipedia.org/wiki/Special:WhatLinksHere?target=Template%3AInfobox+comics+character&namespace='
# Bound the request so a stalled connection cannot hang the kernel, and fail
# loudly on an HTTP error instead of silently parsing an error page.
response = requests.get(link, timeout=30)
response.raise_for_status()
page = response.text
soup = BeautifulSoup(page, 'html.parser')

In [71]:
# Collect the hrefs that appear between two marker links on the page:
# collection starts after the first href containing '&limit=500' and stops at
# the first later href containing 'limit=50&dir=next'.
links = []
start = False
for anchor in soup.find_all('a'):
    href = anchor.get('href')
    # Anchors without an href attribute yield None; skip them explicitly
    # rather than relying on a bare except to absorb the resulting TypeError
    # (which also hid every other kind of error).
    if href is None:
        continue
    if start and 'limit=50&dir=next' in href:
        break
    elif start:
        links.append(href)
    elif '&limit=500' in href:
        start = True

None


In [66]:
# Display the collected hrefs as the cell's output.
links

['#bodyContent',
 '/wiki/Main_Page',
 '/wiki/Wikipedia:Contents',
 '/wiki/Portal:Current_events',
 '/wiki/Special:Random',
 '/wiki/Wikipedia:About',
 '//en.wikipedia.org/wiki/Wikipedia:Contact_us',
 'https://donate.wikimedia.org/wiki/Special:FundraiserRedirector?utm_source=donate&utm_medium=sidebar&utm_campaign=C13_en.wikipedia.org&uselang=en',
 '/wiki/Help:Contents',
 '/wiki/Help:Introduction',
 '/wiki/Wikipedia:Community_portal',
 '/wiki/Special:RecentChanges',
 '/wiki/Wikipedia:File_upload_wizard',
 '/wiki/Main_Page',
 '/wiki/Special:Search',
 '/w/index.php?title=Special:CreateAccount&returnto=Special%3AWhatLinksHere&returntoquery=namespace%3D%26target%3DTemplate%253AInfobox%2Bcomics%2Bcharacter',
 '/w/index.php?title=Special:UserLogin&returnto=Special%3AWhatLinksHere&returntoquery=namespace%3D%26target%3DTemplate%253AInfobox%2Bcomics%2Bcharacter',
 '/w/index.php?title=Special:CreateAccount&returnto=Special%3AWhatLinksHere&returntoquery=namespace%3D%26target%3DTemplate%253AInfobox%2

In [57]:
# NOTE(review): looks like leftover debugging — this queries 'href' on the
# whole parsed document, and no output was recorded for this cell.
soup.get('href')

In [26]:
# Search queries for the pipeline loop; each entry is paired by position with
# an entry of `articles` and passed to retrieve_template_from_vdb().
fake = ['mythical creature']

In [27]:
# Article texts for the pipeline loop, paired by position with `fake`.
# Currently a single placeholder entry containing only a newline.
articles = ["""
"""]

In [28]:
# End-to-end run: for each (query, article) pair, show the candidate
# templates, let the user pick one by index, then build the prompt.
for search_term, article in zip(fake, articles):
    # classification: list every candidate returned by the vector store
    candidates = retrieve_template_from_vdb(search_term)
    for candidate in candidates:
        print(candidate)

    # manual selection: the user types the index of the candidate to use
    num = int(input('which one?'))
    template = retrieve_template_from_object(candidates[num])

    # generation
    infobox = ask(article, template, print_message=True)
    

(Document(page_content='Infobox_mythical_creature', metadata={'source': 'Infobox mythical creature Infobox mythical creature Infobox mythical creature Infobox mythical creature Infobox mythical creature  Infobox_mythical_creature name image image_size image_upright caption Grouping Sub_Grouping Similar_entities Family Folklore First_Attested AKA Country Region Habitat Details\n'}), 0.5487587451934814)
(Document(page_content='Infobox_Primeval_creature', metadata={'source': 'Infobox Primeval creature Infobox Primeval creature Infobox Primeval creature Infobox Primeval creature Infobox Primeval creature  Infobox_Primeval_creature name image species period appeared first_primeval last_primeval first_new_world last_new_world number humans_killed returned\n'}), 1.1872961521148682)
(Document(page_content='Infobox_monster_truck', metadata={'source': 'Infobox monster truck Infobox monster truck Infobox monster truck Infobox monster truck Infobox monster truck  Infobox_monster_truck name image i

Your task is to fill out a Wikipedia infobox. Below, you are given some context text and the infobox template.


 Context text: 


 Infobox Template: {{Infobox_mythical_creature
| name = 
| image = 
| image_size = 
| image_upright = 
| caption = 
| Grouping = 
| Sub_Grouping = 
| Similar_entities = 
| Family = 
| Folklore = 
| First_Attested = 
| AKA = 
| Country = 
| Region = 
| Habitat = 
| Details
 = 
}} 

Fill out the Wikipedia infobox. Feel free to add any fields that you think would be important to know. Remember, keep the infobox concise, accurate, and of good quality.
