In [32]:
# imports
import ast  # for converting embeddings saved as strings back to arrays
import os  # for reading the optional embeddings-path override from the environment
import typing

import openai  # for calling the OpenAI API
import pandas as pd  # for storing text and embeddings data
import tiktoken  # for counting tokens
from scipy import spatial  # for calculating vector similarities for search

from keys import API_KEY


# models
EMBEDDING_MODEL = "text-embedding-ada-002"  # used to embed search queries
GPT_MODEL = "gpt-4"  # default chat model; also the default for token counting
openai.api_key = API_KEY  # key comes from the local `keys` module imported above


In [33]:
# load pre-chunked text and pre-computed embeddings from a local CSV file
# (nothing is downloaded here; the file is reportedly ~200 MB, so parsing it may take a moment)
# The location can be overridden with the PETREL_EMBEDDINGS_PATH environment
# variable; when it is unset, the original machine-specific path is used.
embeddings_path = os.environ.get(
    "PETREL_EMBEDDINGS_PATH",
    r"C:\Users\aliyu\OneDrive\Documents\AI\Projects\petrobot\petrel_manual.csv",
)

# usecols skips parsing any other column in the file (e.g. a saved index);
# the trailing selection pins the column order to text, embedding.
df = pd.read_csv(embeddings_path, usecols=['text', 'embedding'])[['text', 'embedding']]

In [34]:
# preview the first rows of the text/embedding frame
df.head()

Unnamed: 0,text,embedding
0,\nWWeellccoommee ttoo tthhee PPeettrreell** hh...,"[-0.0035548934247344732, -0.00918952003121376,..."
1,Petrel Exploration Geophysics \nInterpret regi...,"[-0.024511124938726425, 0.005252866540104151, ..."
2,Imaging | Petrel Fault Analysis | Petrel Well ...,"[-0.02430061623454094, -0.005952354986220598, ..."
3,Framework \nPlug-ins for Petrel\nLeverage pred...,"[0.005327986553311348, -0.0005350260762497783,..."
4,Petrel Geophysics \nPerform rapid 2D and 3D se...,"[-0.03281170502305031, 0.0006662339437752962, ..."


In [36]:
# convert embeddings from CSV str type back to list type
# Only string cells are parsed, so re-running this cell after the column has
# already been converted is a no-op instead of raising inside ast.literal_eval.
df['embedding'] = df['embedding'].apply(
    lambda value: ast.literal_eval(value) if isinstance(value, str) else value
)

In [37]:
# search function
def strings_ranked_by_relatedness(
    query: str,
    df: pd.DataFrame,
    relatedness_fn=lambda x, y: 1 - spatial.distance.cosine(x, y),
    top_n: int = 100) -> typing.Tuple[typing.Tuple[str, ...], typing.Tuple[float, ...]]:
    """Rank the texts in `df` by relatedness to `query`, most related first.

    Args:
        query: Text to embed with EMBEDDING_MODEL and compare against each row.
        df: Frame with a "text" column and an "embedding" column; each
            embedding must be something `relatedness_fn` accepts.
        relatedness_fn: Callable taking (query_embedding, row_embedding) and
            returning a score where larger means more related. Defaults to
            one minus the cosine distance.
        top_n: Maximum number of results to return.

    Returns:
        A pair of equal-length tuples `(strings, relatednesses)` sorted by
        descending relatedness. Both are empty when `df` has no rows or when
        `top_n` selects nothing.
    """
    # Nothing to rank: skip the embedding API call, and avoid unpacking
    # zip(*[]) below, which would raise ValueError.
    if len(df) == 0:
        return (), ()
    query_embedding_response = openai.Embedding.create(
        model=EMBEDDING_MODEL,
        input=query,
    )
    query_embedding = query_embedding_response["data"][0]["embedding"]
    # Iterate the two columns directly; iterrows would build a Series per row.
    strings_and_relatednesses = [
        (text, relatedness_fn(query_embedding, embedding))
        for text, embedding in zip(df["text"], df["embedding"])
    ]
    strings_and_relatednesses.sort(key=lambda pair: pair[1], reverse=True)
    # Slice before unzipping so only the requested rows are transposed.
    top_pairs = strings_and_relatednesses[:top_n]
    if not top_pairs:
        return (), ()
    strings, relatednesses = zip(*top_pairs)
    return strings, relatednesses

In [38]:
# example: retrieve the 10 chunks most related to the query "dynamic model"
strings, relatednesses = strings_ranked_by_relatedness("dynamic model", df, top_n=10)
for string, relatedness in zip(strings, relatednesses):
    print(f"{relatedness=:.3f}")  # the `=` specifier prints "relatedness=<value>"
    display(string)  # show the chunk text via the notebook's display hook

relatedness=0.806


'2. Surfaces \n3. Property models'

relatedness=0.800


'Figure 5 \nThe impact on the model becomes much clearer when a larger range is used for the modeling'

relatedness=0.793


'Velocity Modeling (Domain Conversion) \nWell log upscaling'

relatedness=0.789


'Type \nChoose how to model porosity heterogeneity in the simulation. You can choose from single or\ndual porosity, or dual permeability models.'

relatedness=0.788


'Data Management \nSave all the detailed analysis for each property for use later in the modeling process or when you\nare updating your models at some later time.'

relatedness=0.787


'(Zoom) \nDragging the cap towards you makes the model come closer\nPushing the cap from you makes the model move away'

relatedness=0.782


'Facies Modeling Petrophysical Modeling Data Analysis Uncertainty Analysis Optimization \nWorkflow editor'

relatedness=0.782


'History Matching. \nWell design in 3D. Digitizing, editing and visualizing of well trajectories based on the\ngenerated geological models. Output spread sheets with detailed well report and synthetic\nwell logs.\nWell Optimizer to create a series of cost-dependant realizations based on Target points and\ncost model.\nImproved documentation and reporting of the project work through tight integration with\ndesktop tools like PowerPoint, Word and Excel.'

relatedness=0.781


"Workflow Editor \nThe Workflow editor has several functions. Two of the most important to allow rapid updates of\nmodels and to perform batch operations on input data. A workflow for rebuilding the model can be\ngenerated at the push of a button and edited as required before running, recreating the model in\na single operation. Any changes in the input data will be taken into account. Batch operations on\ninput data are created intuitively using an object orientated programming language based on\nPetrel's user interface. See Workflow editor for details."

relatedness=0.781


'Interactive Facies Modeling \nIn interactive facies modeling, discrete 3D properties can be edited or made from scratch\ninteractively using various tools. It works almost like a drawing tool, where you can switch\nbetween different drawing styles like pencil, brush or airbrush and fill the facies bodies directly\ninto the 3D grid. In this way, a completely new facies property can be made and used to condition\nthe petrophysical modeling.\nIn the image above is shown an example of a property grid of facies model with channels.\nTools for interactive facies modeling\nWhen the Facies modeling is active, a set of icons becomes available in the Function bar. These\nare divided into property tools and property actions. Note that there is no undo option for these\ntools and that it is wise to make a copy of the property before starting.\nThe Facies modeling process dialog has a tab (Edit hints tab) with some hints concerning\nsome of these tools.\nFacies modeling Tools bar\nView Mode - Curs

In [None]:
def num_tokens(text: str, model: str = GPT_MODEL) -> int:
    """Count the tokens `text` encodes to under the tiktoken encoding for `model`."""
    return len(tiktoken.encoding_for_model(model).encode(text))


def query_message(
    query: str,
    df: pd.DataFrame,
    model: str,
    token_budget: int
) -> str:
    """Build the user message for GPT: instructions, relevant excerpts, then the question.

    Excerpts come from `strings_ranked_by_relatedness(query, df)` and are
    appended in ranked order until adding the next one would push the whole
    message (including the trailing question) over `token_budget` tokens, as
    counted by `num_tokens` for `model`.

    Args:
        query: The user's question.
        df: Frame of texts and embeddings to search.
        model: Model name used for token counting.
        token_budget: Maximum number of tokens the returned message may use.

    Returns:
        The assembled message string, always ending with the question.
    """
    strings, relatednesses = strings_ranked_by_relatedness(query, df)
    # The prompt names the actual corpus (the Petrel manual) so the model is
    # not told it is reading articles about an unrelated topic.
    introduction = 'Use the below excerpts from the Petrel manual to answer the subsequent question. If the answer cannot be found in the excerpts, write "I could not find an answer."'
    question = f"\n\nQuestion: {query}"
    message = introduction
    for string in strings:
        next_article = f'\n\nPetrel manual section:\n"""\n{string}\n"""'
        # Stop at the first excerpt that does not fit; later, less related
        # excerpts are not tried.
        if (
            num_tokens(message + next_article + question, model=model)
            > token_budget
        ):
            break
        else:
            message += next_article
    return message + question


def ask(
    query: str,
    df: pd.DataFrame = df,
    model: str = GPT_MODEL,
    token_budget: int = 4096 - 500,
    print_message: bool = False,
) -> str:
    """Answer `query` with GPT, grounded in the most relevant texts from `df`.

    Args:
        query: The user's question.
        df: Frame of texts and embeddings to search. The default is the
            module-level `df` as it exists when this cell is executed; if
            `df` is later rebound, re-run this cell or pass it explicitly.
        model: Chat model to call; also used for token counting.
        token_budget: Token limit for the assembled prompt message. The
            default leaves 500 tokens of a 4096-token window unused by the
            prompt.
        print_message: When True, print the assembled prompt before sending.

    Returns:
        The content of the first choice in the chat completion response.
    """
    message = query_message(query, df, model=model, token_budget=token_budget)
    if print_message:
        print(message)
    messages = [
        # System prompt matches the corpus being searched (the Petrel manual).
        {"role": "system", "content": "You answer questions about the Petrel manual."},
        {"role": "user", "content": message},
    ]
    response = openai.ChatCompletion.create(
        model=model,
        messages=messages,
        temperature=0
    )
    response_message = response["choices"][0]["message"]["content"]
    return response_message