# GOAL

- Extract the EETT data from a PDF file
- Save the data in a CSV file

In [4]:
from langchain_community.document_loaders import PyPDFLoader
from dotenv import load_dotenv

load_dotenv()

file_path = (
    "/Users/josecarlos/repos/practicos-rag/EETT Infranormativo JI Los Palomos_rev02_normativo-print.pdf"
)
loader = PyPDFLoader(file_path)
docs = loader.load()

In [3]:
len(docs)

50

In [7]:
docs[27]

Document(metadata={'source': '/Users/josecarlos/repos/practicos-rag/EETT Infranormativo JI Los Palomos_rev02_normativo-print.pdf', 'page': 27}, page_content='ESPECIFICACIONES TECNICAS\nES.\n4.2 MOBILIARIOS FIJOS\n4.2.1 ACCESORIOS SALAS DE HABITOS HIGIENICOS\n4.2.1.1 Mudador\nDimensiones O.8O x 0.90 x O.8O m\nESTRUCTURA GENERAL \nSe consulta Mueble Mudador según planos adjuntos. Estructura en placa de Melamina\nde 24mm, MDF o calidad similar o superior en características, con enchape y tapacantos\ncolor BLANCO. Las uniones se afianzarán mediante tornillos soberbios. La cabeza de los\ntornillos se cubrirá con Tapa Soberbio color blanco. \nBASE MUDADOR Y BANDEJA PARA COLCHONETAS \nSe consultan en placa de Melamina de 24mm, MDF o calidad similar o superior, con\nenchape y tapacantos color BLANCO. Considera bordes redondeados en parte frontal.\nDimensiones de acuerdo a plano de detalles. La base del mudador deberá incorporar\nhuinchas de VELCRO color blanco, 5 cm de espesor y 40cms de largo

In [24]:
from langchain_ollama import OllamaLLM
from langchain_openai import ChatOpenAI
#!ollama pull llama3.1
llama_model = OllamaLLM(model="llama3.2")
gpt_model = ChatOpenAI(model="gpt-4o-mini") 

In [30]:
from pydantic import BaseModel, Field

class Item(BaseModel):
    name: str = Field(description="Name of the item")
    brand: str = Field(description="Brand of the item")
    specifications: str = Field(description="A brief description of the specifications of the item")
    measurements: dict = Field(
        description="Dimensions of the item, where the first value is the length, the second is the width and the third is the height",
        default={
            "length": 0.0,
            "width": 0.0,
            "height": 0.0
        }
    )
    quantity: int = Field(description="Quantity of items", ge=0)

In [40]:
from langchain.prompts import ChatPromptTemplate
from langchain.schema.runnable import RunnablePassthrough

extraction_prompt = ChatPromptTemplate.from_template("""
You are a expert architec working in extracting structured data from a EETT, your task
is to identify items such as beds, tables, chairs, etc.

Each of these items has a description, quantity and unit of measurements,
keep close attention to the measurements, they can come in the following formats:
- 1.20 x 0.80 x 0.20
- 1.20 x 0.80
- 1.20
- 1.20 x 0.80 x 0.20 m
- 1.20 x 0.80 m
- 1.20 m
and can also be named as dimensions, size, etc. where the first value is the length, the second is the width and the third is the height.

You will extract all the data related to each item and return it in a structured format.

Here are some examples of the data you will extract:

sample text: En baños de acceso universal se ejecutará espejo de referencia Espejo
modelo Cherry-N Con Bisel: 8 mm, Espesor: 5 mm, Fijación: Incluida, Instalación: Para
ambos sentidosEsquinas Rectas, medida de 60x100 marca KLIPEN de MK o similar.

expected output:
{{
    "name": "Espejo Cherry-N Con Bisel",
    "brand": "KLIPEN",
    "specifications": "8 mm, Espesor: 5 mm, Fijación: Incluida, Instalación: Para ambos sentidosEsquinas Rectas, medida de 60x100",
    "measurements": {{
        "length": 60.0,
        "width": 100.0,
        "height": 0.0
    }},
    "quantity": 1
}}

Now here is the context.

Context: {context}
""")

# Crear una cadena para crear una pregunta
question_chain = (
    {"context": RunnablePassthrough()}
    | extraction_prompt
    | gpt_model.with_structured_output(Item)
)

sample_results = question_chain.invoke(docs[25:35])

In [42]:
import tqdm

docs_sublist = docs[25:35]

sample_results_list = []
for doc in tqdm.tqdm(docs_sublist):
    sample_results = question_chain.invoke(doc)
    sample_results_list.append(sample_results)

100%|██████████| 10/10 [00:20<00:00,  2.09s/it]


In [43]:
sample_results_list

[Item(name='Pintura exterior de muros y aleros', brand='Ceresita', specifications='Esmalte al agua, dos manos como mínimo, calidad de las pinturas deberá responder a las máximas exigencias de durabilidad y aspecto.', measurements={'length': 0.0, 'width': 0.0, 'height': 0.0}, quantity=1),
 Item(name='Espejo Cherry-N Con Bisel', brand='KLIPEN', specifications='8 mm, Espesor: 5 mm, Fijación: Incluida, Instalación: Para ambos sentidosEsquinas Rectas, medida de 60x100', measurements={'length': 0.0, 'width': 0.0, 'height': 0.0}, quantity=1),
 Item(name='Mudador', brand='No especificado', specifications='Estructura en placa de Melamina de 24mm, MDF o calidad similar o superior en características, con enchape y tapacantos color BLANCO. Las uniones se afianzarán mediante tornillos soberbios. La cabeza de los tornillos se cubrirá con Tapa Soberbio color blanco. La base del mudador deberá incorporar huinchas de VELCRO color blanco, 5 cm de espesor y 40cms de largo.', measurements={'length': 0.0, 

In [39]:
sample_results.measurements

{'length': 0.0, 'width': 0.0, 'height': 0.0}

In [46]:
import pandas as pd

df = pd.DataFrame([{
    'name': result.name,
    'brand': result.brand, 
    'specifications': result.specifications,
    'length': result.measurements['length'],
    'width': result.measurements['width'], 
    'height': result.measurements['height'],
    'quantity': result.quantity
} for result in sample_results_list])

In [47]:
df

Unnamed: 0,name,brand,specifications,length,width,height,quantity
0,Pintura exterior de muros y aleros,Ceresita,"Esmalte al agua, dos manos como mínimo, calida...",0.0,0.0,0.0,1
1,Espejo Cherry-N Con Bisel,KLIPEN,"8 mm, Espesor: 5 mm, Fijación: Incluida, Insta...",0.0,0.0,0.0,1
2,Mudador,No especificado,"Estructura en placa de Melamina de 24mm, MDF o...",0.0,0.0,0.0,1
3,Repisa,Masisa,Conformada por bastidores de perfiles metálico...,0.0,0.0,0.0,1
4,Campana Industrial Mural,Biggi,"Formato C-150M, formatos 100x93 cm, 150x93 cm ...",0.0,0.0,0.0,1
5,Tineta SINGLE-N,KLIPEN,"Color: Blanco, Diámetro Desagüe: 50 mm, Instal...",0.0,0.0,0.0,1
6,Lavamanos Comfort,KLIPEN,"Loza vitrificada blanca, Incluye rebalse, Inst...",0.0,0.0,0.0,1
7,Lavadero de 18 Litros modelo 18LT-N,Incepa,"loza vitrificada (capacidad máxima 31 litros),...",0.0,0.0,0.0,2
8,Gabinete Red Húmeda,,70x70x35cm,0.0,0.0,0.0,1
9,Manguera Contra Incendio,ANGUS,carrete automático porta manguera tipo ANGUS M...,0.0,0.0,0.0,1
