In [30]:
# imports
import ast  # for converting embeddings saved as strings back to arrays
import os  # for reading the optional embeddings-path override from the environment
import typing

import openai  # for calling the OpenAI API
import pandas as pd  # for storing text and embeddings data
import tiktoken  # for counting tokens
from scipy import spatial  # for calculating vector similarities for search

from keys import API_KEY


# models
# Embedding model used to embed search queries (see strings_ranked_by_relatedness).
# NOTE(review): presumably the same model that produced the pre-computed embeddings
# in the CSV; query and stored vectors are only comparable if so. Confirm.
EMBEDDING_MODEL = "text-embedding-ada-002"
# Chat model used as the default for token counting (num_tokens) and answering (ask).
GPT_MODEL = "gpt-4"
# Configure the openai client with the key imported from the local `keys` module.
openai.api_key = API_KEY


In [8]:
# load pre-chunked text and pre-computed embeddings from a local CSV
# this file is ~200 MB, so reading it may take a moment
# The location can be overridden with the PETREL_EMBEDDINGS_PATH environment
# variable; when it is unset, the original author's local path is used.
embeddings_path = os.environ.get(
    "PETREL_EMBEDDINGS_PATH",
    r"C:\Users\aliyu\OneDrive\Documents\AI\Projects\petrobot\petrel_manual.csv",
)

df = pd.read_csv(embeddings_path)
# keep only the two columns the search and prompt-building code reads
df = df[['text','embedding']]

In [11]:
# preview the first rows to check that the text and embedding columns loaded
df.head()

Unnamed: 0,text,embedding
0,\nWWeellccoommee ttoo tthhee PPeettrreell** hh...,"[-0.0036606036592274904, -0.009110068902373314..."
1,Petrel Exploration Geophysics \nInterpret regi...,"[-0.024511124938726425, 0.005252866540104151, ..."
2,Imaging | Petrel Fault Analysis | Petrel Well ...,"[-0.024689018726348877, -0.005831622518599033,..."
3,Framework \nPlug-ins for Petrel\nLeverage pred...,"[0.005327986553311348, -0.0005350260762497783,..."
4,Petrel Geophysics \nPerform rapid 2D and 3D se...,"[-0.03281170502305031, 0.0006662339437752962, ..."


In [27]:
# convert embeddings from CSV str type back to list type
# Only values that are still strings are parsed, so re-running this cell after the
# column has already been converted is a no-op instead of raising inside
# ast.literal_eval (which rejects values that are already lists).
df['embedding'] = df['embedding'].apply(
    lambda value: ast.literal_eval(value) if isinstance(value, str) else value
)

In [28]:
# search function
def strings_ranked_by_relatedness(
    query: str,
    df: pd.DataFrame,
    relatedness_fn=lambda x, y: 1 - spatial.distance.cosine(x, y),
    top_n: int = 100,
) -> typing.Tuple[typing.Sequence[str], typing.Sequence[float]]:
    """Rank the texts in `df` by how related their embedding is to `query`.

    The query is embedded with `EMBEDDING_MODEL` through the OpenAI API, then
    scored against every row's stored embedding with `relatedness_fn`.

    Args:
        query: Text to search for.
        df: DataFrame with a "text" column and an "embedding" column; each
            embedding must be something `relatedness_fn` accepts.
        relatedness_fn: Callable `(query_embedding, row_embedding) -> score`,
            where a higher score means more related. Defaults to cosine
            similarity (1 - cosine distance).
        top_n: Maximum number of results to return.

    Returns:
        A pair `(strings, relatednesses)` of equal-length tuples, sorted from
        most related to least. Both are empty tuples when there is nothing to
        return (empty `df` or `top_n == 0`).
    """
    query_embedding_response = openai.Embedding.create(
        model=EMBEDDING_MODEL,
        input=query,
    )
    query_embedding = query_embedding_response["data"][0]["embedding"]
    # Iterate over the two columns directly: same values as iterrows(), without
    # building a Series object for every row.
    scored = [
        (text, relatedness_fn(query_embedding, embedding))
        for text, embedding in zip(df["text"], df["embedding"])
    ]
    # list.sort is stable, so rows with equal scores keep their original order.
    scored.sort(key=lambda pair: pair[1], reverse=True)
    top = scored[:top_n]
    if not top:
        # zip(*[]) yields nothing, so unpacking it into two names would raise.
        return (), ()
    strings, relatednesses = zip(*top)
    return strings, relatednesses

In [31]:
# examples
# Rank the loaded chunks against a sample query and show the 50 best matches.
strings, relatednesses = strings_ranked_by_relatedness("dynamic model", df, top_n=50)
for string, relatedness in zip(strings, relatednesses):
    # the `=` specifier prints the variable name as well, e.g. "relatedness=0.806"
    print(f"{relatedness=:.3f}")
    # display() shows the string's repr, so embedded newlines appear as "\n"
    display(string)

relatedness=0.806


'2. Surfaces \n3. Property models'

relatedness=0.800


'Figure 5 \nThe impact on the model becomes much clearer when a larger range is used for the modeling'

relatedness=0.793


'Velocity Modeling (Domain Conversion) \nWell log upscaling'

relatedness=0.789


'Type \nChoose how to model porosity heterogeneity in the simulation. You can choose from single or\ndual porosity, or dual permeability models.'

relatedness=0.788


'Data Management \nSave all the detailed analysis for each property for use later in the modeling process or when you\nare updating your models at some later time.'

relatedness=0.787


'(Zoom) \nDragging the cap towards you makes the model come closer\nPushing the cap from you makes the model move away'

relatedness=0.782


'Facies Modeling Petrophysical Modeling Data Analysis Uncertainty Analysis Optimization \nWorkflow editor'

relatedness=0.782


'History Matching. \nWell design in 3D. Digitizing, editing and visualizing of well trajectories based on the\ngenerated geological models. Output spread sheets with detailed well report and synthetic\nwell logs.\nWell Optimizer to create a series of cost-dependant realizations based on Target points and\ncost model.\nImproved documentation and reporting of the project work through tight integration with\ndesktop tools like PowerPoint, Word and Excel.'

relatedness=0.781


"Workflow Editor \nThe Workflow editor has several functions. Two of the most important to allow rapid updates of\nmodels and to perform batch operations on input data. A workflow for rebuilding the model can be\ngenerated at the push of a button and edited as required before running, recreating the model in\na single operation. Any changes in the input data will be taken into account. Batch operations on\ninput data are created intuitively using an object orientated programming language based on\nPetrel's user interface. See Workflow editor for details."

relatedness=0.781


'Interactive Facies Modeling \nIn interactive facies modeling, discrete 3D properties can be edited or made from scratch\ninteractively using various tools. It works almost like a drawing tool, where you can switch\nbetween different drawing styles like pencil, brush or airbrush and fill the facies bodies directly\ninto the 3D grid. In this way, a completely new facies property can be made and used to condition\nthe petrophysical modeling.\nIn the image above is shown an example of a property grid of facies model with channels.\nTools for interactive facies modeling\nWhen the Facies modeling is active, a set of icons becomes available in the Function bar. These\nare divided into property tools and property actions. Note that there is no undo option for these\ntools and that it is wise to make a copy of the property before starting.\nThe Facies modeling process dialog has a tab (Edit hints tab) with some hints concerning\nsome of these tools.\nFacies modeling Tools bar\nView Mode - Curs

relatedness=0.780


'CURVE \nKeyword above'

relatedness=0.780


'Well Model Options \nSpecify options for the well definition.'

relatedness=0.778


'Fault Modeling \nThe process of Fault Modeling defines the faults in the geological model which will form the basis\nfor generating the 3D grid. These faults will define breaks in the grid; lines along which the\nhorizons inserted can be offset later. The offset which occurs is entirely dependant upon the input\ndata, so modeling reverse faults is just as easy as modeling normal faults. All pillars in the 3D grid\nwill be extended to meet the top and base of the horizons defining your grid (in Make Horizon and\nMake Zones processes), so make sure all of your fault models are modeled above the top and\nbase horizon. A modeled fault must never cross another fault without being connected.\nFigure c (left) and d. In figure c the horizons has been inserted into the 3D grid. Note that the\npillars in the grid have been cut by the top and base horizon in the grid (compare the\nintersections with figure b above). Figure d shows the result after zones have been inserted into\nthe model. The 3D

relatedness=0.778


'Fluid Description \nThe fluid description will impact the CPU time. The fastest model will be a low compressible two-\nphase model (oil/water). The low compressibility will cause the pressure solution to be easier to\nsolve and the requirement for time steps for accuracy is low. In addition, if the mobility and\ndensities of the fluids are close, one time step might still result in a reasonably accurate (also\nfrom engineering perspective, that is, standard simulation) simulation result. For a one million-\ngrid cell model, one time step might require 3 minutes of computer time with a state-of-the-art'

relatedness=0.778


'Moving Average \nOther algorithms available are Assign values , Neural net and User defined algorithm .\nDetailed information Petrophysical Modeling'

relatedness=0.777


'Object Modeling \nTruncated Gaussian with trends\nTruncated Gaussian simulation\nSequential Indicator simulation\nMulti-point facies simulation\nThe only deterministic algorithm for Facies modeling is Indicator kriging .\nOther algorithms available are Assign values , Neural net and User defined algorithm .\nDetailed information Facies Modeling\nPetrophysical Modeling algorithms\nThe Stochastic algorithms include:\nSequential Gaussian simulation\nGaussian random function simulation\nAlthough there are a number of options allowing the user to alter how they are done, such as using\nbivariate transforms, conditioning via collocated co-kriging, locally varying mean and using trends\nby pre/post processing.\nThe Deterministic algorithms for Estimation include:\nKriging interpolation'

relatedness=0.777


'Framework Petrel Facies Modeling \nNew methods to model complex, geological features and connectivity\nModel your pixel- or object-based stochastic facies using deterministic techniques. Condition the\nfacies to a seismic property or trend surfaces with the data analysis process, or use objects\nsampled directly from seismic with the volume extraction tool.'

relatedness=0.777


'Petrophysical Modeling \nPetrophysical modeling is the interpolation or simualtion of continuous data (e.g. porosity,\npermeability, etc.) throughout the model grid. In Petrel Deterministic (estimation or interpolation)\nand Stochastic methods are available for modeling the distribution of continuous properties in a\nreservoir model.\nWell data, facies realization, variograms, a secondary variable and/or trend data can be used as\ninput and various user settings are available. Usually, upscaled well logs with continuous\nproperties is the dataset available in the model grid. Filters and settings can be used to model\ndifferent parts of the grid separately (e.g. filter on facies, values, index, zones and segments and\nLocal model updated settings).\nSee General information on property modeling for information on what to consider when\nperforming property modeling. Which Modeling Algorithm provides some information on the\navailable algorithms. Details and examples of the various algori

relatedness=0.777


'Stochastic \nObject modeling allows you to populate a discrete facies model with different bodies of\nvarious geometry, facies code and fraction. Object Modeling\nSequential indicator simulation allows a stochastic distribution of the property, using\nthe pre-defined histogram. Directional settings, such as variogram and extensional trends,\nare also honored. Sequential indicator simulation\nTruncated Gaussian Simulation is a fast modeling technique for discrete properties,\nwhere the facies are known to be sequential and the variograms defining each of the facies\nare the same. It is most commonly used in carbonate environments, and the method is\npixel based; hence, it can easily deal with large amounts of input data, specified global\nfractions, 1D, 2D or 3D trends. Truncated Gaussian Simulation\nTruncated Gaussian with trends allows a stochastic distribution of the facies based on a\ngiven transition between facies and a trend direction. These trends are then converted into\nproba

relatedness=0.777


'Streamline Time Dependency \nEven though a streamline simulation can be significantly faster than standard simulations, it can\nstill be a time consuming affair if we want to model in detail all physical properties available in\nFrontSim. Some of the most important time dependencies are discussed below.\nNumber of Cells\nThe size of the model represented by the number of cells, will impact the time it will take to\ncompute the pressure and move the fluids along. For example, for a model with 1 million grid cells\nit might take up to several minutes to compute the pressure and move the fluids one time step,\ndepending somewhat on the complexity of the fluid description and the computational speed of\nthe processor of your PC. Also, the memory requirement might be in the order of one gigabyte.'

relatedness=0.776


'Interactive \nAllows the user to paint facies directly on the 3D model. The Facies painting tools are stored in\nthe Function bar for Facies moldeing (see Interactive Facies Modeling).'

relatedness=0.776


'Tracking \nDirect data manipulation'

relatedness=0.775


"Object Modeling \nObject modeling allows you to populate a discrete facies model with objects which are\ngenerated and distributed stochastically. All geometrical inputs controlling the body shape\n(width/thickness, etc.) can either be defined deterministically, follow a defined statistical\ndistribution or be assigned using a trend map.\nThe background can be assigned a given facies code or an existing facies model. Different\nerosion/replacement rules can be applied to different bodies. Vertical and areal trends can be\nused as options for defining the spatial distribution.\nA trial-and-error approach is probably the best way to begin Object modeling. Start with a\nsingle object and deterministic parameter settings to get a feel for sizes and shape. Then\ngradually build in more variation and complexity.\nHow object modeling works\nFor each body in the list of objects, the body will first be matched according to the wells, then if\nthe global fraction is less than requested, additio

relatedness=0.775


"Facies Modeling (Petrel Workflow) \nYou can perform general stochastic object modeling such as, 'Sequential Indicator Simulation',\n'Object modeling' (including fluvial channels and adaptive channels), 'Truncated Gaussian\nSimulation' or 'users own algorithm' to assign values. In addition a more complex facies model\ncan be built using the 'Multi-Point Facies Simulation' method. The petrophysical properties can be\nconditioned later on to the facies model. You can also condition a facies model to a previously\ngenerated facies model (hierarchical modeling). Petrel also supports the ability to manually draw\nand edit facies shapes using standard drawing tools. This makes it easy to put your ideas into a\n3D model. Details of stochastic and deterministic modeling can be found in Facies Modeling and"

relatedness=0.775




relatedness=0.775


'Deterministic \nall channels will have the same value. However, adaptive channels will naturally drift around\nthis value, so that a single deterministic value does not mean that the channel will be\nperfectly regular.'

relatedness=0.774


'~Curve \nKeyword below'

relatedness=0.774


'Geobody Modeling \nAssign between surfaces and polygons\nCells cut by surface\nConstant value\nConstant or surface in segment and zones\nNormal distributed random values\nUniform distributed random values\nAbove contact\nCell angle\nCell height\nAbsolute or relative depth\nDistance to an object\nCell inside out\nSeismic resampling\nCell volume'

relatedness=0.773


'Benefits \nEfficiently rank, screen and visualize multiple sensitivity runs, combining static and dynamic\ninformation to create more reliable models in less time.\nOptimize well placements by combining streamline analysis with the detailed static model to\nenhance sweep efficiency.\nValidate upscaled reservoir models with dynamic data by understanding grid orientation\nissues, thereby improving the quality of the model used for reservoir simulation.\nIdentify flow patterns\nWhen heterogeneity and reservoir uncertainties are dominating the fluid flow behavior in your\nreservoir, stochastic modeling techniques are used to create multiple views of your fine scale\ngeological model. With FrontSim, you can dynamically identify the tortuous flow paths by visually\ndepicting the injector-to-producer streamline bundles and make ranking decisions based on\nproduction history, not static methods alone.\nImprove your reservoir management\nIdentifying optimal drilling locations is not only based

relatedness=0.773


"Compressibility \nAnother factor is the compressibility introduced when using a three-phase oil/water/gas model. To\nachieve close to engineering accuracy, we will have to enable the gravity segregation and use\nmany time steps. In addition, the non-linearity of the model will cause more computational effort\nper time step. Note that FrontSim's streamline concept will allow the user to use only one long\ntime step (possibly years), even for this type of model, but the engineering accuracy defined by\nstandard simulation will be degraded.\nThe resulting set of partial differential equations cannot be solved by any analytical means due to\nits typical complexity in geometry, rock property and fluid description. Instead, a so called\nnumerical approximation is used. Many types of numerical methods are available to solve these\nequations. Most use some form of a finite difference/volume method that divides the geometry\ninto many small subsections called cells containing rock and fluid pr

relatedness=0.772




relatedness=0.772


'Common Features \nAll of the calculators, with the exception of the Dynamic Data calculator, behave in more or less\nthe same way and follow the same basic rules and syntax.'

relatedness=0.772


'Petrel Petrophysical Modeling \nAssign petrophysical values to cells in a 3D grid; use a number of different deterministic and\nstochastic modeling techniques.'

relatedness=0.772


'Simplex Non-Linear Optimizer \nSimplex non-linear is an enhanced version of the Simplex optimizer which additionally\nsupports non-linear constraints among the control variables and model responses.'

relatedness=0.771


'Black Oil Fluid Model Workflow Compositional Fluid Model Workflow Thermal Fluid Model Workflow Black Oil Fluid Model Workflow \nYou can add new black oil fluid models, and update or delete existing fluid models. Once created\nthe fluid model will appear in the Fluids folder on the Input pane. See Fluid Model (Settings)\nfor details of the settings page.\nHow to Make a New Black Oil Fluid Model from'

relatedness=0.771


'Geophysics. \nThe objects to be depth converted are dropped into the dialog and the process automatically\ndetects their native domain. At the same time, it determines the direction of the conversion based\non the selected velocity model. Clicking Apply or Ok will calculate a new virtual seismic attribute\nfor each of the chosen objects.\nDomain convert by active velocity model:\nSeismic data (3D cube or 2D line) can be domain converted individually by right-clicking on the\nobject and selecting Domain convert by active velocity model while the velocity model is\nactive.'

relatedness=0.771


'Exponential \nThis model reaches its sill (c) asymptotically and the effective range (a) is defined as the distance\nat which (h) = 0.95c.\nc = Sill - Nugget.'

relatedness=0.771


'Figure 4 \nThe first model shows that a constant value is the norm, while the second has the trend as a\nbackground with the data points modeled as anomalies on that. If the trend had been perfect, the\nproperty would have been completely smooth with no anomalies at the data points.\nThis also applies when using a stochastic algorithm, although the differences are more difficult to\nsee. Again, if the trend had been perfect, then the property on the right would have been\ncompletely smooth with no anomalies anywhere (Figure 5).'

relatedness=0.771


'Dynamic Data Calculator (Summary Vectors) \nThe Dynamic data calculator is used to perform operations on summary vectors that are on\nresults from simulation cases. You can access this calculator by right-clicking the Dynamic data\nsubject in the Views folder on the Results pane, see 2D Summary Results Calculator .\nPractical Use of the Calculator\nThe calculator has three modes that are used depending on the type of calculation performed:\nNormal calculator mode\nSingle number mode\nMultiple number mode\nFunctions and surfaces (as 2D functions) can also be used in the calculators, and for repetitive\noperations, the input text for the file can be saved to create a macro which can be re run at any\ntime.\nNote that when using the logical statements And and Or, remember to use a space both\nbefore and after the statement.'

relatedness=0.771


'Petrel Fracture Modeling \nVisualize and analyze fractured reservoirs\nModeling flow in fractured reservoirs is difficult. The challenge requires a software solution that\nsupports tight integration between the static and dynamic reservoir modeling disciplines and\nprovides a way to visualize and analyze many data types that may be direct or indirect indicators\nof fractures.\nOne of the difficulties with a traditional discrete fracture modeling workflow is that the number of\nfractures to be modeled in the field can be extremely large. Trying to represent all of them\nexplicitly in the model is often hampered by system memory limitations. Even if this is achieved,\ncalibration of fracture modeling parameters to flow simulation results demand iterative steps in a\nworkflow, and then computational performance becomes a limiting factor.\nPetrel 2010.1 proposes an original numerical representation of the fracture networks so that an\naccurate calculation of the contribution to the fluid 

relatedness=0.770


'Expressions \nTo create new data, the Dynamic Data Calculator needs some input data and the relationship\nbetween this data. This is broken down into two parts. First, a text expression describes the\nformula to be used to relate a number of variables and is entered into the expression text box.\nThis expression is parsed to extract the variables and then these variable names are bound to\nparticular items of data. Unlike the other calculators the name of the variables is not important as\nthey do not relate directly to the data, but have to be bound to data in a separate step.\nFor example, a simple expression would be "LHS = a+b" where the output will be stored in the\nvariable "LHS" and will equal the sum of the data in the input "a" and "b". Once the text of the\nexpression has been entered, hit return to parse the text or click the Parse Expression button.\nIt is possible for the calculator to return a single number, for instance finding the maximum value\nin a vector, in which c

relatedness=0.770


'Geometrical Modeling \nGeometrical modeling is the process where properties can be generated by using pre-defined\nsystem variables, such as cell volume, seismic resampling, zone index, etc. Each cell will get a\nnumerical value corresponding to the selected system variable. These properties can be important\nin processes such as volume calculations and mathematical operations between petrophysical\nproperties. Geometrical modeling is not restricted to simple geometrical properties, it also covers\nmore complex property distributions such as:\nRandom/Normal distribution\nZones/segments\nFaults/segments'

relatedness=0.770


'2. Hints \nHints for using the interactive facies editing tools and the property player.\nMake model tab\nAt the top of the Make model tab are the basic settings for facies modeling:'

relatedness=0.770


'View Mode - Flying \nThe fly function is also borrowed from Inside Reality. This feature allows the user to travel\nthrough the graphical scene as if they were Flying through it. This operation is typically mapped to\nthe joystick of the input device, see appendix 8.'

relatedness=0.770


'Deterministic \nAll bodies will have the same thickness'

relatedness=0.770


'Deterministic \nAll bodies will have the same thickness'

relatedness=0.770


'Thermal Fluid Model Workflow \nThe Petrel workflow supports thermal liveoil fluids. A thermal fluid is created in the same way as a\ncompositional fluid, either by importing keywords from in ECLIPSE fluid model (Keywords)\nformat, or from the fluid process.\nHow to Review and Edit a Thermal Fluid Model\n1. Select the thermal live oil model from Edit existing.\n2. From the General tab, select if the model has water. Select how the specific heat of\ncomponents should be supplied. This will determine the columns of the Components tab.\n3. Fill out the Components tab, defining parameters such as reference density, along with K-\nvalue coefficients for each component. If you want to define K values using either KVWI or\nKVTABTn keywords then the K-values in the component tab should be left blank at\npresent, and the necessary keywords inserted via the editor.\n4. Fill out the Samples tab as described in Compositional fluid model workflow.\n5. The Viscosity tab allows temperature dependent 

relatedness=0.769


'Optimization \nRapidly update reservoir models, better manage risk, and easily share\nknowledge\nThe Workflow Editor, an integral part of Petrel software, captures data parameters and\nrelationships that enable rapid updating of reservoir models as information from new wells\narrives.\nGeoscientists and engineers can create multiple model realizations to assess the impact on\nreserve volumetrics or to cost well placement. Engineers can run all the possible scenarios in\nECLIPSE directly, without leaving Petrel.\nIn addition to understanding uncertainty and risk management, the Workflow Editor empowers\nknowledge sharing, allowing best practices and workflows to be easily shared across your\norganization.\nWith Petrel, asset teams can reduce project cycle time and maximize productivity.\nUncertainty and Optimization\nThe Uncertainty and Optimization process allows you to to sensitivity analysis. You can create\nproxies for volumetric or simulation cases using experimental designs for f

relatedness=0.769


'Views \nthe views in here allow you to choose different ways to visualize the simulation data.\nDynamic data allows you to plot simulation data against time or other simulation data. Case\nvariables allows you to plot simulation data at a given time against values of case variables.\nDynamic data\nThis is where you select simulation results to be displayed. This sub-tree is used by both of\nthe view types.\nIdentifier, Source data type, Fluid identifiers, Aquifer type etc\nfolders of filters used to qualify the data to be visualized by one of the view types held in the\nViews folder. These sub-trees are used by both of the view types.\nCase variables\nthis folder is used only by the Case variables view, to select a variable to be displayed on X-\naxis.'

relatedness=0.769


'Depth Conversion Process \nOnce a velocity model has been created, it can be used to depth convert objects. Objects which\ncan be depth converted include:'

In [None]:
def num_tokens(text: str, model: str = GPT_MODEL) -> int:
    """Count the tokens `text` encodes to under the tokenizer tiktoken selects for `model`."""
    token_ids = tiktoken.encoding_for_model(model).encode(text)
    return len(token_ids)


def query_message(
    query: str,
    df: pd.DataFrame,
    model: str,
    token_budget: int
) -> str:
    """Build the user prompt for GPT: instructions, manual excerpts, then the question.

    Excerpts come from `strings_ranked_by_relatedness(query, df)` and are
    appended most-related first. Appending stops at the first excerpt that
    would push the whole prompt (instructions + excerpts + question) past
    `token_budget` tokens, as counted by `num_tokens` for `model`.

    Args:
        query: The user's question.
        df: DataFrame of texts and embeddings passed through to the search.
        model: Model name used for token counting.
        token_budget: Maximum number of tokens the excerpts may bring the
            prompt to. The instructions and question are always included,
            even if they alone exceed the budget.

    Returns:
        The assembled prompt string, ending with the question.
    """
    strings, _ = strings_ranked_by_relatedness(query, df)
    # The source texts are chunks of the Petrel manual, so the prompt must say so
    # for the model to treat them as the right kind of reference material.
    introduction = 'Use the below excerpts from the Petrel manual to answer the subsequent question. If the answer cannot be found in the excerpts, write "I could not find an answer."'
    question = f"\n\nQuestion: {query}"
    message = introduction
    for string in strings:
        next_excerpt = f'\n\nPetrel manual excerpt:\n"""\n{string}\n"""'
        if num_tokens(message + next_excerpt + question, model=model) > token_budget:
            # Stop at the first excerpt that does not fit; later (less related)
            # excerpts are not tried even if they are shorter.
            break
        message += next_excerpt
    return message + question


def ask(
    query: str,
    df: pd.DataFrame = df,
    model: str = GPT_MODEL,
    token_budget: int = 4096 - 500,
    print_message: bool = False,
) -> str:
    """Answer `query` with GPT, using the most related texts in `df` as context.

    Args:
        query: The user's question.
        df: DataFrame of texts and embeddings to search. The default is the
            module-level `df` as it existed when this function was defined;
            after reloading the data, re-run this cell or pass `df` explicitly.
        model: Chat model used both for token counting and for the completion.
        token_budget: Token limit handed to `query_message` for the prompt.
            NOTE(review): the default presumably reserves about 500 tokens of a
            4096-token window for the reply; confirm against `model`'s context size.
        print_message: When True, print the assembled prompt before sending it.

    Returns:
        The text content of the first choice in the chat completion response.
    """
    message = query_message(query, df, model=model, token_budget=token_budget)
    if print_message:
        print(message)
    messages = [
        # The corpus is the Petrel manual, so the system role must describe that domain.
        {"role": "system", "content": "You answer questions about the Petrel software manual."},
        {"role": "user", "content": message},
    ]
    response = openai.ChatCompletion.create(
        model=model,
        messages=messages,
        temperature=0
    )
    return response["choices"][0]["message"]["content"]