In [1]:
import os
import openai
 
# Read the key a single time. Indexing os.environ raises KeyError right away
# when OPENAI_API_KEY is not set, before either name below is assigned.
openai_api_key = os.environ["OPENAI_API_KEY"]

# Configure the openai module itself; `openai_api_key` is what the LangChain
# wrappers in later cells receive explicitly.
openai.api_key = openai_api_key

In [3]:
#pip install pdf2image
#pip install pytesseract
#pip install langchain
#pip install openai

from langchain import OpenAI
from langchain.chains.summarize import load_summarize_chain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain import LLMChain
from langchain.chat_models import ChatOpenAI
from langchain.chains import create_extraction_chain


# gpt-3.5-turbo is a chat model, so it is wrapped with ChatOpenAI (imported
# above) instead of the completion-style OpenAI class.
llm = ChatOpenAI(openai_api_key=openai_api_key, temperature=0, model="gpt-3.5-turbo")

# Version note: the schema-based extraction cells were run with openai==0.28.1
# and langchain==0.0.316. The JSON-output section further down calls
# openai.chat.completions.create, so upgrade openai (and langchain) to the
# latest versions before running it.


In [4]:
from packaging import version

# Parse the installed openai release into a comparable Version object and
# leave it as the cell's last expression so the notebook displays it.
installed_openai_version = version.parse(openai.__version__)
installed_openai_version

<Version('0.28.1')>

In [5]:
# Disabled guard for the JSON-output section. It would reject the
# openai==0.28.1 install that the schema-based cells were run with, so it is
# left commented out; enable it after upgrading openai.
# import openai
# from packaging import version

# required_version = version.parse("1.1.1")
# current_version = version.parse(openai.__version__)

# if current_version < required_version:
#     raise ValueError(f"Error: OpenAI version {openai.__version__}"
#                      " is less than the required version 1.1.1")
# else:
#     print("OpenAI version is compatible.")

In [6]:

from pdf2image import convert_from_path

def convert_pdf_to_images(pdf_path):
    """Render the PDF at ``pdf_path`` to images.

    Thin wrapper around ``pdf2image.convert_from_path`` called with its
    default settings; whatever that call returns is passed straight back.
    """
    rendered_pages = convert_from_path(pdf_path)
    return rendered_pages


In [7]:
import pytesseract
from PIL import Image

def extract_text_from_images(images):
    """OCR every image with pytesseract and return the combined text.

    The per-image strings are concatenated in iteration order with nothing
    inserted between them. An empty ``images`` yields an empty string.
    """
    return ''.join(pytesseract.image_to_string(page) for page in images)


In [8]:
# Ground-truth datasheets, listed in the order their OCR text is bound to
# text1, text2 and text3 below.
DATASHEET_PATHS = [
    "GroundTruthDocuments/RK73H.pdf",
    "GroundTruthDocuments/151002387394-1222329.pdf",
    "GroundTruthDocuments/KGM15CR51E106K-DATA.pdf",
]

# Run the same PDF -> images -> OCR text pipeline on every datasheet instead of
# repeating it per file. `path` and `image` are kept as the loop names so they
# remain bound (to the last datasheet) exactly as they were before.
datasheet_texts = []
for path in DATASHEET_PATHS:
    image = convert_pdf_to_images(path)  # the rendered images for this PDF
    datasheet_texts.append(extract_text_from_images(image))

# text1 is fed to the resistor extraction, text2 and text3 to the capacitor one.
text1, text2, text3 = datasheet_texts

# Extraction using a schema

In [9]:
# Capacitor attributes to extract, in the order they appear in the schema.
_CAPACITOR_FIELDS = (
    "Capacitance",
    "Capacitance Unit",
    "Voltage (V)",
    "Tolerance",
    "Dielectric Type",
    "Package Style",
    "SM Package Size (EIA)",
    "Size",
    "Diameter",
    "Diameter Units",
    "Axial Length",
    "Axial Length Units",
    "Radial Lead Spacing",
    "Radial Lead Spacing Units",
    "Max Height",
    "Max Height Units",
    "ESR",
    "ESR Units",
    "RF-Frequency Min",
    "RF-Frequency Max",
    "RF-Frequency Units",
    "Life (hrs @ temp)",
    "Op Temp Max (C)",
    "Op Temp Min (C)",
    "Recommended CCAP",
)

# Extraction schema: every capacitor attribute is declared as a string property.
schema_capacitors = {
    "properties": {field: {"type": "string"} for field in _CAPACITOR_FIELDS}
}

# Chat model used for schema-guided extraction, with temperature pinned to 0.
llm = ChatOpenAI(
    model="gpt-4-1106-preview",
    temperature=0,
    openai_api_key=openai_api_key,
)

# Chain that extracts the attributes described by schema_capacitors.
capacitors_chain = create_extraction_chain(schema_capacitors, llm)

# Backward-compatible alias: the original identifier was misspelled and is
# still the name the capacitor cells call, so it must keep resolving.
capacacitors_chain = capacitors_chain

In [10]:
# Resistor attributes to extract, in the order they appear in the schema.
_RESISTOR_FIELDS = (
    "Composition",
    "Resistance",
    "Resistance Units",
    "Power Rating (W)",
    "Tolerance (%)",
    "Package Style",
    "SM Package Size (EIA)",
    "Size",
    "Axial Length",
    "Axial Length Units",
    "Radial Lead Spacing",
    "Radial Lead Spacing Units",
    "Diameter",
    "Diameter Units",
    "Max Height",
    "Max Height Units",
    "Temp Coefficient (ppm)",
    "Parasitic Inductance Max",
    "Parasitic Inductance Max Units",
    "Recommended CCAP",
)

# Extraction schema: every resistor attribute is declared as a string property.
schema_resistors = {
    "properties": {field: {"type": "string"} for field in _RESISTOR_FIELDS}
}

# Chat model used for schema-guided extraction, with temperature pinned to 0.
llm = ChatOpenAI(
    model="gpt-4-1106-preview",
    temperature=0,
    openai_api_key=openai_api_key,
)

# Chain that extracts the attributes described by schema_resistors.
resistors_chain = create_extraction_chain(schema_resistors, llm)

### Capacitors

In [11]:
# Capacitor extraction on text2, the OCR text of 151002387394-1222329.pdf.
capacacitors_chain.run(text2)


[{'Op Temp Max (C)': '105',
  'Op Temp Min (C)': '-55',
  'Tolerance': '+20%',
  'Life (hrs @ temp)': '2,000 ~ 5,000 hours at 105°C'}]

In [12]:
# Capacitor extraction on text3, the OCR text of KGM15CR51E106K-DATA.pdf.
capacacitors_chain.run(text3)


[{'Capacitance': '10uF',
  'Capacitance Unit': 'uF',
  'Voltage (V)': '25',
  'Tolerance': '+/-10%',
  'Op Temp Max (C)': '85',
  'Op Temp Min (C)': '-55',
  'SM Package Size (EIA)': '0603'}]

### Resistors

In [13]:
# Resistor extraction on text1, the OCR text of RK73H.pdf.
resistors_chain.run(text1)

[{'Tolerance (%)': '0.5',
  'Power Rating (W)': '0.03 to 4.0',
  'Resistance': '1 ohm to 10 megaohms',
  'Package Style': 'thick film chip resistor',
  'SM Package Size (EIA)': '0201, 0402, 0603, 0805, 1206, 1210, 2010, 2512'}]

# Extraction using JSON output

In [18]:
# Attribute names requested from capacitor datasheets in the JSON-output approach.
capacitor_attributes = [
    "Capacitance",
    "Capacitance Unit",
    "Voltage (V)",
    "Tolerance",
    "Dielectric Type",
    "Package Style",
    "SM Package Size (EIA)",
    "Size",
    "Diameter",
    "Diameter Units",
    "Axial Length",
    "Axial Length Units",
    "Radial Lead Spacing",
    "Radial Lead Spacing Units",
    "Max Height",
    "Max Height Units",
    "ESR",
    "ESR Units",
    "RF-Frequency Min",
    "RF-Frequency Max",
    "RF-Frequency Units",
    "Life (hrs @ temp)",
    "Op Temp Max (C)",
    "Op Temp Min (C)",
    "Recommended CCAP",
]

# Attribute names requested from resistor datasheets in the JSON-output approach.
resistor_attributes = [
    "Composition",
    "Resistance",
    "Resistance Units",
    "Power Rating (W)",
    "Tolerance (%)",
    "Package Style",
    "SM Package Size (EIA)",
    "Size",
    "Axial Length",
    "Axial Length Units",
    "Radial Lead Spacing",
    "Radial Lead Spacing Units",
    "Diameter",
    "Diameter Units",
    "Max Height",
    "Max Height Units",
    "Temp Coefficient (ppm)",
    "Parasitic Inductance Max",
    "Parasitic Inductance Max Units",
    "Recommended CCAP",
]


In [19]:
def extract_attributes(document, attributes, model="gpt-4-1106-preview", temperature=0):
    """Ask an OpenAI chat model to extract ``attributes`` from ``document`` as JSON.

    Parameters
    ----------
    document : str
        Text to search; in this notebook, the OCR output of a datasheet.
    attributes : list of str
        Attribute names to extract. The list is interpolated into the prompt
        as-is (i.e. using its ``str()`` form).
    model : str, optional
        Chat model to call. Defaults to the model this function originally
        hard-coded.
    temperature : float, optional
        Sampling temperature. Defaults to 0, the same setting the ChatOpenAI
        models in this notebook are created with, so repeated runs are as
        comparable as possible.

    Returns
    -------
    str
        The message content of the first returned choice, requested in
        JSON-object response mode. If anything raises, the exception's message
        is returned instead.

    Notes
    -----
    This is best-effort on purpose: no exception propagates to the caller, so
    an error message comes back through the same channel as a result. Callers
    that parse the return value should be prepared for text that is not JSON.
    Requires an ``openai`` package that exposes ``openai.chat.completions``.
    """
    # The prompt must mention JSON explicitly for the JSON response mode.
    prompt = (
        f"Extract the following attributes:\n{attributes}\n"
        f" from the text:\n{document}\n"
        " Output should be in JSON format."
    )
    try:
        completion = openai.chat.completions.create(
            model=model,
            temperature=temperature,
            messages=[{"role": "user", "content": prompt}],
            response_format={"type": "json_object"},
        )
        return completion.choices[0].message.content
    except Exception as e:
        # Deliberate best-effort: report the failure as text rather than raising.
        return str(e)

In [20]:
# JSON-mode extraction of the resistor attributes from text1 (RK73H.pdf).
# print() shows the returned string with its line breaks instead of its repr.
print(extract_attributes(text1,resistor_attributes))

{
  "Composition": "Thick film",
  "Resistance": "1Ω - 10MΩ",
  "Resistance Units": "Ohms",
  "Power Rating (W)": {
    "0201 size": "0.03W",
    "0402 size": "0.1W",
    "0603 size": "0.1W",
    "0805 size": "0.125W",
    "1206 size": "0.25W",
    "2010 size": "0.75W",
    "2512 size": "1.0W - 4.0W"
  },
  "Tolerance (%)": "±0.5%, ±1%",
  "Package Style": "Thick film chip resistor",
  "SM Package Size (EIA)": ["0201", "0402", "0603", "0805", "1206", "2010", "2512"],
  "Size": {
    "01005": "0.016x0.008 inches",
    "0201": "0.024x0.012 inches",
    "0402": "0.04x0.02 inches",
    "0603": "0.063x0.031 inches",
    "0805": "0.079x0.049 inches",
    "1206": "0.126x0.063 inches",
    "2010": "0.198x0.098 inches",
    "2512": "0.252x0.126 inches"
  },
  "Axial Length": "n/a",
  "Axial Length Units": "n/a",
  "Radial Lead Spacing": "n/a",
  "Radial Lead Spacing Units": "n/a",
  "Diameter": "n/a",
  "Diameter Units": "n/a",
  "Max Height": {
    "01005": ".005 inches",
    "0201": ".009 inc

In [21]:
# JSON-mode extraction of the capacitor attributes from text2
# (151002387394-1222329.pdf).
print(extract_attributes(text2,capacitor_attributes))

{
  "Capacitance": "Values not explicitly provided in the provided text",
  "Capacitance Unit": "uF (microfarads)",
  "Voltage (V)": {
    "Rated Voltage Range": "6.3V to 100V"
  },
  "Tolerance": "+20%",
  "Dielectric Type": "Aluminum Electrolytic",
  "Package Style": "SMD (Surface Mount Device)",
  "SM Package Size (EIA)": "Not explicitly provided in the provided text",
  "Size": {
    "Diameter": "From 4mm to 18mm or more",
    "Length": "From 5.7mm to 21.5mm or more"
  },
  "Diameter": "Values ranging from 4mm to 18mm",
  "Diameter Units": "mm (millimeters)",
  "Axial Length": "Values ranging from 5.7mm to 21.5mm",
  "Axial Length Units": "mm (millimeters)",
  "Radial Lead Spacing": "Data provided is for 'Lead Spacing and Diameter'",
  "Radial Lead Spacing Units": "mm (millimeters)",
  "Max Height": "Values provided as part of the 'Size' attribute",
  "Max Height Units": "mm (millimeters)",
  "ESR": "Not explicitly provided in the provided text",
  "ESR Units": "Ω (ohms)",
  "RF-Fr

In [22]:
# JSON-mode extraction of the capacitor attributes from text3
# (KGM15CR51E106K-DATA.pdf).
print(extract_attributes(text3,capacitor_attributes))

{
  "Capacitance": "10uF",
  "Capacitance Unit": "uF",
  "Voltage (V)": "25Vdc",
  "Tolerance": "+/-10%",
  "Dielectric Type": "Not explicitly stated",
  "Package Style": "Not explicitly stated",
  "SM Package Size (EIA)": "0603",
  "Size": "Not explicitly stated",
  "Diameter": "Not explicitly stated",
  "Diameter Units": "Not explicitly stated",
  "Axial Length": "Not explicitly stated",
  "Axial Length Units": "Not explicitly stated",
  "Radial Lead Spacing": "Not explicitly stated",
  "Radial Lead Spacing Units": "Not explicitly stated",
  "Max Height": "Not explicitly stated",
  "Max Height Units": "Not explicitly stated",
  "ESR": "Not explicitly stated",
  "ESR Units": "Not explicitly stated",
  "RF-Frequency Min": "Not explicitly stated",
  "RF-Frequency Max": "Not explicitly stated",
  "RF-Frequency Units": "MHz - Not stated but inferred from context",
  "Life (hrs @ temp)": "Not explicitly stated",
  "Op Temp Max (C)": "85",
  "Op Temp Min (C)": "-55",
  "Recommended CCAP": "