In [None]:
'''
 * Copyright 2023 LLM-Info (?????????????????????)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 '''

# Generating Consistent and High Quality Infoboxes with LLMs

In [None]:
# imports
import ast  # for converting embeddings saved as strings back to arrays
import openai  # for calling the OpenAI API
import pandas as pd  # for storing text and embeddings data
from scipy import spatial  # for calculating vector similarities for search


# models
# Embedding model name; not referenced by any active code in the cells shown here.
EMBEDDING_MODEL = "text-embedding-ada-002"
# Default value of the `model` argument of api_call() and ask().
GPT_MODEL = "gpt-3.5-turbo"

# NOTE(review): the wildcard import hides where OPENAI_API_KEY comes from and can
# silently overwrite names defined above (it runs after the constants). Presumably
# config.py only provides the key -- confirm, then prefer
# `from config import OPENAI_API_KEY`.
from config import *
openai.api_key = OPENAI_API_KEY

# Vector Database

As input, we get raw Wikipedia text. As output, we will get the most similar infobox template.

NOTE (open question): should the retrieved template be represented as a flat list of field names, or as proper Wikipedia infobox markup (`{{Infobox_name | field = ...}}`)?

In [None]:
# search function - replace with vdb function
def fake_vdb(
    query: str,
    templates: list,
    relatedness_fn=lambda x, y: 1 - spatial.distance.cosine(x, y),
    top_n: int = 1
):
    """Placeholder for the vector-database lookup: return the first template.

    No embedding or similarity search is performed. ``query``,
    ``relatedness_fn`` and ``top_n`` are accepted so the signature can stay
    the same once a real search is plugged in, but they are ignored.

    Args:
        query: Text to find a template for (ignored).
        templates: Candidate templates; must contain at least one item.
        relatedness_fn: Similarity function (ignored).
        top_n: Number of results requested (ignored).

    Returns:
        ``templates[0]``, whatever type the caller stored there.

    Raises:
        IndexError: If ``templates`` is empty.
    """
    try:
        return templates[0]
    except IndexError as exc:
        # Same exception type as a bare index lookup, with an actionable message.
        raise IndexError(
            "fake_vdb needs at least one template to choose from"
        ) from exc


In [None]:
import re
import os
import chromadb
import numpy as np
from tqdm import tqdm
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.document_loaders import TextLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.docstore.document import Document

In [None]:
# Open the on-disk Chroma database under ./QueryPipeline/ (a path relative to the
# current working directory).
client = chromadb.PersistentClient(path="./QueryPipeline/")

# Handle to the "infoboxes" collection. It is not referenced again in the cells
# shown here; presumably kept as a check that the collection exists -- TODO confirm.
collection = client.get_collection(name="infoboxes")

# LangChain wrapper over the same client and collection, configured with the
# all-MiniLM-L6-v2 sentence-transformer embedding function.
# NOTE(review): assumes the collection was populated with this same embedding
# model -- confirm against the indexing code.
langchain_chroma = Chroma(
    client=client,
    collection_name="infoboxes",
    embedding_function=SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2"),
)
print("Success!")

In [None]:
def retrieve_template_from_vdb(query):
    """Run a similarity search for ``query`` against the infobox vector store.

    Delegates to ``langchain_chroma.similarity_search_with_score`` with that
    method's default settings and hands back its result unchanged.

    Args:
        query: Text to search with (the pipeline passes a whole article).

    Returns:
        The search result exactly as returned by the vector store.
    """
    return langchain_chroma.similarity_search_with_score(query)

In [None]:
def retrieve_template_from_object(template):
    """Build an empty Wikipedia-style infobox skeleton from one search hit.

    Args:
        template: An indexable search hit whose first element exposes
            ``metadata['source']``: a string that contains ``"Infobox_<name>"``
            followed by space-separated field names.

    Returns:
        A string of the form::

            {{Infobox_<name>
            | field_1 = 
            | field_2 = 
            }}

    Raises:
        ValueError: If the source string contains no ``"Infobox_"`` marker.
    """
    source = template[0].metadata['source']  # extract the infobox template

    marker = "Infobox_"
    start = source.find(marker)
    if start == -1:
        raise ValueError(
            f"no {marker!r} marker found in template source: {source!r}"
        )

    # Text before the marker is unrelated (SEO boilerplate) and is dropped.
    # Splitting on single spaces yields empty tokens for repeated or trailing
    # spaces; skip them so they do not become nameless "|  = " rows.
    name, *fields = [token for token in source[start:].split(' ') if token]

    # construct template consistent with wikipedia's format
    lines = ["{{" + name]
    lines.extend(f'| {field} = ' for field in fields)
    return "\n".join(lines) + "\n}}"

In [None]:
# Example: look up infobox template candidates for a sample passage about
# Star Wars comics. The call is the last expression in the cell, so its return
# value is what the notebook displays.
query = """
Star Wars comics have been produced by various comic book publishers since the debut of the 1977 film Star Wars.[a] Marvel Comics launched its original series in 1977, beginning with a six-issue comic adaptation of the film and running for 107 issues, including an adaptation of The Empire Strikes Back. Marvel also released an adaptation of Return of the Jedi and spin-offs based on Droids and Ewoks. A self-titled comic strip ran in American newspapers between 1979 and 1984. Blackthorne Publishing released a three-issue run of 3-D comics from 1987 to 1988.

Dark Horse Comics published the limited series Dark Empire in 1991, and ultimately produced over 100 Star Wars titles, including Tales of the Jedi (1993–1998), X-wing: Rogue Squadron (1995–1998), Republic (1998–2006), Tales (1999–2005), Empire (2002–2006), Knights of the Old Republic (2006–2010), and Legacy (2006–2010), as well as manga adaptations of the original film trilogy and the 1999 prequel The Phantom Menace.

The Walt Disney Company acquired Marvel in 2009 and Lucasfilm in 2012, and the Star Wars comics license returned to Marvel in 2015. Several new series were launched, including Star Wars, Star Wars: Darth Vader, and Doctor Aphra. In 2017, IDW Publishing launched the anthology series Star Wars Adventures. In 2022, Dark Horse resumed publishing new Star Wars comics and graphic novels.
"""
retrieve_template_from_vdb(query)

# LLM Querying

As input, we provide the raw Wikipedia text and the infobox template. As output, we receive the generated infobox.

In [None]:
def formulate_query(article: str, infobox_template: str) -> str:
    """Assemble the prompt that asks the model to fill in an infobox.

    The prompt has three parts, concatenated in order: a task introduction,
    the article text together with the empty template, and the closing
    instruction.

    Args:
        article: Context text the infobox should be filled from.
        infobox_template: Empty infobox template to complete.

    Returns:
        The full prompt as a single string.
    """
    preamble = "Your task is to fill out a Wikipedia infobox. Below, you are given some context text and the infobox template.\n"
    context_block = f"\n\n Context text: {article}\n\n Infobox Template: {infobox_template} \n\n"
    closing_instruction = 'Fill out the Wikipedia infobox. Feel free to add any fields that you think would be important to know. Remember, keep the infobox concise, accurate, and of good quality.'
    return "".join((preamble, context_block, closing_instruction))

def api_call(message, model: str = GPT_MODEL):
    """Send ``message`` as a single user turn and return the model's reply text.

    Args:
        message: Prompt text, sent with role ``"user"``.
        model: Chat model name passed to ``openai.ChatCompletion.create``.

    Returns:
        The ``content`` of the first choice in the response.
    """
    completion = openai.ChatCompletion.create(
        model=model,
        messages=[{"role": "user", "content": message}],
        temperature=0,
    )
    first_choice = completion["choices"][0]
    return first_choice["message"]["content"]

def ask(
    article: str,
    infobox_template: str,
    model: str = GPT_MODEL,
    token_budget: int = 4096 - 500,
    print_message: bool = False,
) -> str:
    """Generate a filled-in infobox for ``article`` with the chat model.

    Builds the prompt with ``formulate_query`` and sends it through
    ``api_call``.

    Args:
        article: Context text to fill the infobox from.
        infobox_template: Empty infobox template to complete.
        model: Chat model name forwarded to ``api_call``.
        token_budget: Currently unused; the prompt is not truncated. Kept so
            existing callers that pass it keep working.
        print_message: If true, print the full prompt before sending it.

    Returns:
        The model's reply text.
    """
    message = formulate_query(article, infobox_template)
    if print_message:
        print(message)

    # Query the model; returning a fixed placeholder here would leave the
    # pipeline without any generated infobox.
    return api_call(message, model)

# Pipeline

In [None]:
# Input for the pipeline: a list of raw article texts, one string per article.
# Currently holds a single sample passage about Halley's Comet.
articles = ["""Halley's Comet, Comet Halley, or sometimes simply Halley, officially designated 1P/Halley, is a short-period comet visible from Earth every 75–79 years.[1] Halley is the only known short-period comet that is regularly visible to the naked eye from Earth, and thus the only naked-eye comet that can appear twice in a human lifetime.[15] It last appeared in the inner parts of the Solar System in 1986 and will next appear in mid-2061.

Halley's periodic returns to the inner Solar System have been observed and recorded by astronomers around the world since at least 240 BC. But it was not until 1705 that the English astronomer Edmond Halley understood that these appearances were re-appearances of the same comet. As a result of this discovery, the comet is named after Edmond Halley.[16]

During its 1986 visit to the inner Solar System, Halley's Comet became the first comet to be observed in detail by spacecraft, providing the first observational data on the structure of a comet nucleus and the mechanism of coma and tail formation.[17][18] These observations supported a number of longstanding hypotheses about comet construction, particularly Fred Whipple's "dirty snowball" model, which correctly predicted that Halley would be composed of a mixture of volatile ices—such as water, carbon dioxide, ammonia, and dust. The missions also provided data that substantially reformed and reconfigured these ideas; for instance, it is now understood that the surface of Halley is largely composed of dusty, non-volatile materials, and that only a small portion of it is icy."""]

In [None]:
# Generated infoboxes, one per article, in the same order as `articles`.
infoboxes = []
for article in articles:
    # classification: fetch the candidate templates and list each one with its
    # position, so the prompt below can be answered with a number
    candidates = retrieve_template_from_vdb(article)
    if not candidates:
        raise IndexError("no infobox template candidates were returned for this article")
    for position, candidate in enumerate(candidates):
        print(position, candidate)

    # Ask again until the answer is an integer that indexes `candidates`,
    # rather than aborting the whole run on a typo.
    while True:
        try:
            num = int(input('which one?'))
            chosen = candidates[num]
        except (ValueError, IndexError):
            print(f"Please enter a whole number from 0 to {len(candidates) - 1}.")
        else:
            break
    template = retrieve_template_from_object(chosen)

    # generation
    infobox = ask(article, template, print_message=True)
    infoboxes.append(infobox)  # keep every result, not only the last one
    