In [2]:
import os, sys, json
from dotenv import load_dotenv
import numpy as np, pandas as pd
from langchain_openai import AzureOpenAIEmbeddings, AzureChatOpenAI

from langchain_core.messages import SystemMessage
from langchain_core.prompts import HumanMessagePromptTemplate, ChatPromptTemplate

import faiss

from langchain_community.vectorstores import FAISS
from langchain_community.docstore.in_memory import InMemoryDocstore

from langchain_core.documents import Document

from uuid import uuid4

LLM config

In [3]:
load_dotenv()

AZURE_DEPLOYMENT = "gpt-4o"
#API_VERSION = "2023-06-01-preview"
API_VERSION = "2024-08-01-preview"


In [4]:
llm = AzureChatOpenAI(
    azure_deployment=AZURE_DEPLOYMENT,  # or your deployment
    api_version=API_VERSION,  # or your api version
    temperature=0,
    max_tokens=None,
    timeout=None,
    max_retries=2,
    # handle_parsing_errors=True,
)

In [5]:
embeddings = AzureOpenAIEmbeddings()

llm.invoke('Hi')

Data

In [6]:
DATA_PATH = os.path.join(os.path.abspath(""), ".." ,"data")

In [7]:
outfile1 = os.path.join(DATA_PATH, "G929147A_LP_Sales.txt")
outfile2 = os.path.join(DATA_PATH, "G1320123A_wellhead.txt")
outfile3 = os.path.join(DATA_PATH, "G1321243A_wellhead.txt")

# Extract compositions through prompt

In [8]:
with open(outfile1, encoding='utf-8') as file:
    text = file.read()

In [9]:
text

'Power Service Inc., Gas Measurement Division\n2289 Renauna Bldg. 1, Casper, WY 82601; Ph: 307-472-7722 ext. 1305\n\nSample Name\nCompany\nCounty/State\nOperator\nMeter # / Well Name\n\nWell Temp.\n\nWeil Pressure\n\nFlowCal From\n\nFlowCal To\n\nH2S PPM\n\n*H2S Results provided by CHK\nMethod Name\n\nInjection Date\n\nReport Date\n\nEZReporter Configuration File\nNGA Phys. Property Data Source\nData Source\n\nComponent | Norm Mol% |\n\nName 7 Dry\nNitrogen\nMethane\nCarbon Dioxide\nEthane\nPropane\n\n0.8715\n72.6064\n0.6396\n11.2958\n8.2430\n1.0074\n3.0569\n0.7111\n0.8357\n0.7326\n0.0000\n100.0000\n\ni-Butane\nn-Butane\ni-Pentane\nn-Pentane\nHexanes Plus\nWater\n\nTotal:\n\nSample Information\n\n{Sample Information\n\nNW Fetter 28 USA APK 7H LPSales\nChesapeake\n\nConverse, Wyoming\n\nSTAN MCLEAN\n\n1364570903\n\n93\n\n43\n\n10/01/2016 8:00\n\n1\n\n6-27-16\n\nC6+Low p.met\n\n2016-09-20 18:19:34\n\n2016-09-20 12:23:49\n\nPSI 14.73 flowcal 07112016.cfg\nGPA Standard 2145-09 (FPS)\nEZChr

In [31]:
system_msg = """
You are a chemical engineer looking at PVT reports.
You extract sampling conditions information from the report, such as Sample Pressure and Sample Temperature.
You extract component and composition from the report, such as Methane 90% mole, Ethane 10% mole, etc.
You convert the information extracted into a JSON format under the keys "Conditions" and "Components".
Output the information strictly in the JSON format without any extra characters or markdown (such as json or backticks).

- For the Conditions key:
Each entry within the "Conditions" array should be a dictionary representing the Sample Pressure or Sample Temperature.
The Sample Pressure and Sample Temperature must be reported along with their respective units.
If the value is unavailable, put N/A.

- For the "Components key:
Each entry within the "Components" array should be a dictionary representing a single component and its corresponding Mole% value from the document.
Format it such that each dictionary has a key as the component name and value as the "Mole%".
If the component "%" is not available put N/A.
Make sure to include the word "Plus" or the character "+" in the component name when it is present.

- Example of input and expected output:

Input:
PROJECT NO. COMI'ANY NAME ACCGUNT NO. - PRODUCER LEASE NO.  NAME. DESCRIP ***FIELD DATA®** SAMPI ED BY: SAMPLE PRFS. : COMMENTS      COMPONENT HELIU! HYDROGEN OXYGEN/ARGON NITROGEN  o2  METHANE ETHANE PROPANE I-BUTANE N-BUT/NE I-PENTANE PEN"ANT: HEXANES PLUS TOTALS               BTEX COMPONENTS MOLE%  BENZE        TOTAL BTEX  (CALC: GASTD 214594 & TF- “DHA (DETAILED HYDROCARBON AN:  ASTH DCI0 THIS DAT:      EMPACT ANALYTICAL SYSTEMS, INC  365  SOUTH MAIN STREET  BRIGHTON, CO 80601  EXTENDED  (303) 637-0150  NATURAL GAS ANALYSIS (*DHA)                 0312028 ANALYSISNO.: 03 COMPLIANCE PARTNERS ANALYSIS DATE:  DECEMBER 7, 2003 SAMPLEDATE :  DECEMBER 4, 2003 WESTERN GAS TO: CYLINDERNO.: 0299 MDU #6 P SCHLAGEL AMBIENT TEMP.: 800 SAMPLE TEMP. 80 GRAVITY NO PROBE GPM@ GPM@ MOLE % MASS % 14.696 14.73 0.004 G001 — = 0.000 0.000 - - 0.000 0.000 - - o7 0184 = = 227 5619 - - 971 83.565 - - 2852 4805 0.7610 07628 0.910 2248 0.2502 02508 019 0637 0.0640 00641 0228 0.741 00717 00719 0.093 0375 0.0339 0.0340 6.065 0.261 00235 0.0236 0285 1564 01187 0.1187 100.000 100.000 13230 13259 WT% BTU@ 14.696 1473 0.008 0.038 NET DRY REAL: 947.15 sef 949.34 fsef 0.001 0.008 LOW NET WET RFAL: 93064 Jscf 932.83 fsef 0011 0058 GROSS DRY REAL 1049.5 fscf 1052.02 fsef 0.005 0032 HIGH ~ GROSS WET REAL 1031.30 /sef 1033.73 fsef 0026 0136 NET DRY REAL: 20142 My GROSS DRY REAL : 22321 b DENSITY (ATR=1) 06172 169 & 605 COMPRESSIBILITY FACTOR 099776        BEEN ACQUIRED THROUGH APPLICATION OF CURRENT STATE-OF - THE-ART ANALYTICAL TECHNIQUES  THE USE (.F T7IIS INFORMATION IS THE RESPONSIBLITY OF THE USER. EMPACT ANALYTICAL SYSTEMS, ASSUMES NO  RESPON:     7Y FOR ACCURACY OF THE REFORTED INFORMATION NOR ANY CONSEQUENCES OF IT'S APPLICATION  59-013 -0/23  SW NW Frement (wnf)/  Madilen Feld . UniotFm,  | -3%-90   EMPACT ANALYTICAL SYSTEMS, INC 365 SOUTH MAIN STREET  BRIGHTON, CO 80601 303) 637-0150  E & P /GlyCalc Information        PROJECTNO. 0312028 ANALYSISNO.: 03 COMPANY NAME:  COMPLIANCE PARTNERS ANALYSIS DATE:  DECEMBER 7, 2003 ACCOUNTNO. : SAMPLE DATE DECEMBER 4, 2003 PRODUCER WESTERN GAS TO:  LEASE NO. CYLINDER NO. 0299 NAMEDESCRIP:  MDU #6  *+++FIELD DATA***  SAMPLED BY: P SCHLAGEL AMBIENT TEMP. SAMPLE PRES. 800 GRAVITY  : COMMENTS NO PROBE SAMPLE TEMP 50 Comporenet Mole % Wi %  Helium 0.004 0.001 Hydrogen 0.000 0.000 Methano 0.000 0.000 Carbon Nioxide 2279 5610 Nitrager 0117 0.184 Methane 92,971 83.565 Ethane 2.852 4.805 Propane 0910 2248 Isobutan: 0.19 06.637 n-Butane 0.228 0.741 Isopenanc 0.093 0375 n-Pentanc 0.065 0.261 Cyclopen-ane 0.006 0023 n-Hexane 0.026 0123 Cyclohexane 0.021 0.100 Other Heanes 0.066 0314 Heptanes 0.045 0258 Methycyclohexane 0037 0.205 2,24 Trimethylpentane 6.000 0.000 Benzene 0.009 0038 Toluene 001 0.058 Fihylbenzene 0.001 0.008 Xylenes 0005 0.032 C8+ Heavies 0.05% 0.405 Subtotal 106,000 100.000 Oxygen 0.000 0.000  Total 100.000 100.000

Output:
{  "Conditions": [{"Sample Pressure": "800 psia", "Sample Temperature": "80 degF"}],  "Components": [     {"Helium": "0.004", "Hydrogen": "0.000", "Oxygen/Argon": "0.000", "Nitrogen": "0.117", "CO2": "2.279", "Methane": "92.971", "Ethane": "2.852", "Propane": "0.910", "i-Butane": "0.196", "n-Butane": "0.228", "i-Pentane": "0.093", "n-Pentane": "0.065", "Hexanes Plus": "0.285"}   ] }
"""

In [32]:
chat_template = ChatPromptTemplate.from_messages(
    [
        SystemMessage(
            content=(
                system_msg
            )
        ),
        HumanMessagePromptTemplate.from_template("{Input}"),
    ]
)

In [33]:
chain = chat_template | llm

In [34]:
response = chain.invoke({"Input": text})
print(response.content)

{
  "Conditions": [
    {
      "Sample Pressure": "93 psia",
      "Sample Temperature": "43 degF"
    }
  ],
  "Components": [
    {
      "Nitrogen": "0.8715",
      "Methane": "72.6064",
      "Carbon Dioxide": "0.6396",
      "Ethane": "11.2958",
      "Propane": "8.2430",
      "i-Butane": "1.0074",
      "n-Butane": "3.0569",
      "i-Pentane": "0.7111",
      "n-Pentane": "0.8357",
      "Hexanes Plus": "0.7326",
      "Water": "0.0000"
    }
  ]
}


In [35]:
file_response = json.loads(response.content)

In [36]:
file_response['Conditions']

[{'Sample Pressure': '93 psia', 'Sample Temperature': '43 degF'}]

In [37]:
file_response['Components']

[{'Nitrogen': '0.8715',
  'Methane': '72.6064',
  'Carbon Dioxide': '0.6396',
  'Ethane': '11.2958',
  'Propane': '8.2430',
  'i-Butane': '1.0074',
  'n-Butane': '3.0569',
  'i-Pentane': '0.7111',
  'n-Pentane': '0.8357',
  'Hexanes Plus': '0.7326',
  'Water': '0.0000'}]

In [38]:
flat_components = file_response['Components'][0]

In [48]:
import math
df = pd.DataFrame({"D": [10,20,30], "p": [20, 30, 10]})
def EOQ(row,ck,ch):
    D = row['D']
    p = row['p']
    Q = math.sqrt((2*D*ck)/(ch*p))
    row['Q'] = Q
    row['abc'] = 'hello'
    return row
ch=0.2
ck=5
df = df.apply(lambda row: EOQ(row, ck, ch), axis=1)
df

Unnamed: 0,D,p,Q,abc
0,10.0,20.0,5.0,hello
1,20.0,30.0,5.773503,hello
2,30.0,10.0,12.247449,hello


# Use semantic similarity to lump pseudo-component

Get main natural gas components, lump remaining into hexanes plus pseudo-component.

In [39]:
# get each component with its own index
texts = []
for comp in list(flat_components.keys()):
    texts.append(Document(page_content=comp))
texts

[Document(metadata={}, page_content='Nitrogen'),
 Document(metadata={}, page_content='Methane'),
 Document(metadata={}, page_content='Carbon Dioxide'),
 Document(metadata={}, page_content='Ethane'),
 Document(metadata={}, page_content='Propane'),
 Document(metadata={}, page_content='i-Butane'),
 Document(metadata={}, page_content='n-Butane'),
 Document(metadata={}, page_content='i-Pentane'),
 Document(metadata={}, page_content='n-Pentane'),
 Document(metadata={}, page_content='Hexanes Plus'),
 Document(metadata={}, page_content='Water')]

In [19]:
# Initialization

index = faiss.IndexFlatL2(len(embeddings.embed_query("hello world")))

vector_store = FAISS(
    embedding_function=embeddings,
    index=index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
)

In [20]:
# Symmetry components library
components_sym = ["METHANE", "ETHANE", "PROPANE", "ISOBUTANE", "n-BUTANE", \
                  "ISOPENTANE", "n-PENTANE", "NITROGEN", "CARBON_DIOXIDE"]

In [21]:
# add components to vector store
uuids = [str(uuid4()) for _ in range(len(texts))]
vector_store.add_documents(documents=texts, ids=uuids)

['28dee1f4-c1e2-4a91-a1b9-912050e66490',
 'cd5010c9-9db7-4647-812d-ddadc6c58552',
 'dbe8989c-a870-4f35-8b5f-49c3a071f906',
 'a34e7772-76b1-491c-acc3-097426950c7b',
 'be291194-c217-4631-b11c-b7774db0d407',
 '9287c204-afff-4e9a-ac47-5989746d2167',
 '56680718-fc6e-4d27-b3fc-226cf97b8ffc',
 'd6dd4ae3-e81d-4109-ac5c-8b585143a7c7',
 '75b001e8-02c4-4de8-ad72-b5e8fedeab49',
 '98eaa696-a8f8-4c77-8b47-9a1921415032',
 '5c7a7ade-628b-4fed-b364-eca9cba3341e']

In [28]:
sym_component = {}
for comp_sym in components_sym:
    resp = vector_store.similarity_search_with_relevance_scores(comp_sym, k=1)
    pdf_component = resp[0][0].page_content
    try:
        var_val = np.float32(flat_components[pdf_component])
    except:
        var_val = 0.0
    sym_component[comp_sym] = var_val
val_sum = np.sum(np.fromiter(sym_component.values(), dtype=float))
c6_plus = 100.-val_sum
sym_component['C6+'] = c6_plus
if c6_plus < 0.: sym_component['C6+'] = 0.



In [29]:
sym_component.values()

dict_values([72.6064, 11.2958, 8.243, 1.0074, 3.0569, 0.7111, 0.8357, 0.8715, 0.6396, 0.732600212097168])

In [30]:
normalized_values = np.fromiter(sym_component.values(), dtype=float)/np.sum(np.fromiter(sym_component.values(), dtype=float))
sym_component_normalized = dict(zip(sym_component, normalized_values))
sym_component_normalized

{'METHANE': 0.7260639953613282,
 'ETHANE': 0.1129580020904541,
 'PROPANE': 0.08243000030517578,
 'ISOBUTANE': 0.010074000358581543,
 'n-BUTANE': 0.030569000244140623,
 'ISOPENTANE': 0.007110999822616577,
 'n-PENTANE': 0.0083569997549057,
 'NITROGEN': 0.00871500015258789,
 'CARBON_DIOXIDE': 0.0063959997892379765,
 'C6+': 0.00732600212097168}