Probando otra funcion de resumen que devuelva fechas de origen de la data.

In [4]:
from langchain.document_loaders import UnstructuredFileLoader
loader = UnstructuredFileLoader("../../data/mastercard/s3.pdf")
docs = loader.load()
context = docs[0].page_content

Aca pruebo una funcion nueva en la que en vez de un resumen pruebo con una lista de oraciones.


In [30]:
import openai
import os
import json

openai.api_key = os.environ["OPENAI_API_KEY"]
MODEL = "gpt-4-1106-preview"


def clean_data(text: str):
    """Transform text using OpenAI's API."""
    function_description = f"""
    The user will give you a credit card consumption report with some meaningful data surrounded by meaningless data.
    Extract all meaningful information from the document and create a list of sentences, put those in the 'sentences' field.
    Then create a summary of the consumptions and due payments, put that summary in the summary field.
    Use this format:
    'The user [did/bought/used/paid/...] [something] on [date] at [place/store] for [amount of money].'
    Then extract the name of the user and put it in the user field.
    Then extract the date of the document and put it in the date field.
    No important details should be left out.
    This is very important information for the user, so please be sure to include all the relevant information.
    """
    tools = [
        {"type": "function",
         "function":{
            "name": "default",
            "description": function_description,
            "parameters": {
                "type": "object",
                "properties": {
                    "summary": {
                        "type": "string",
                        "description": "The summary of the document according to the function description.",
                    },
                    "sentences": {
                        "type": "string",
                        "description": "The sentences extracted of the document according to the function description.",
                    },
                    "user": {
                        "type": "string",
                        "description": "[User's first name] [User's last name]",
                    },
                    "date": {
                        "type": "string",
                        "description": "Date in the format YYYY-MM-DD",
                    },
                },
            },
        }
        }
    ]
    messages = [
        {"role": "user", "content": text},
    ]
    try:
        completion = openai.ChatCompletion.create(
            model=MODEL,
            messages=messages,
            tools=tools,
            tool_choice={"type":"function", "function": {"name": "default"}},
        )
        print(completion)
        arguments = json.loads(
            str(completion.choices[0])
        )

        return arguments

    except openai.error.InvalidRequestError:
        print("OpenAI request failed")
        return None

In [31]:
summary = clean_data(text=context)

{
  "id": "chatcmpl-8IcFnieEyYLFfDkY9PAs5Dy0j9mwp",
  "object": "chat.completion",
  "created": 1699447643,
  "model": "gpt-4-1106-preview",
  "choices": [
    {
      "index": 0,
      "message": {
        "role": "assistant",
        "content": null,
        "tool_calls": [
          {
            "id": "call_Foh5CuSDOrXTCYwha5iAammL",
            "type": "function",
            "function": {
              "name": "default",
              "arguments": "{\n  \"summary\": \"Alexander Ditzend used his Mastercard Platinum credit card for multiple transactions between September and October 2023. The card was charged for purchases at Scape Park, Autopistas, Don Web, Adobe, and various other vendors.\",\n  \"sentences\": \"The user bought access to Scape Park on 24-Sep-23 for ARS 2,603.00. The user paid toll charges on 08-Sep-23 at Autopistas del 10/18. The user made payments to DON WEB on 14-Sep-23 for undisclosed amounts. The user made a purchase at PPRO *Adobe on 22-Sep-23. The user made

In [21]:
type(summary)

dict

In [22]:
summary

{'summary': 'The user Alexander Ditzend had a total payment due of ARS 102,041.04 for his Mastercard Platinum credit card, with transactions spanning from 24-Sep-23 to 04-Oct-23, including purchases at Scape Park, toll payments, web services from Don Web, Adobe subscription, transactions at Flybondi.com and with Claro.',
 'sentences': "The user Alexander Ditzend paid ARS 102,041.04 on various dates for services which include: 'Scape Park' on 24-Sep-23, toll on 08-Sep-23 and 22-Sep-23, 'Don Web' services on 14-Sep-23, 'Adobe' subscription on 22-Sep-23, transaction at 'Flybondi.com' on 03-Oct-23, and 'Claro' service on 04-Oct-23.",
 'user': 'Alexander Ditzend',
 'date': '2023-09-24'}

In [10]:
summary['date']


'2023-09-07'

In [19]:
summary['sentences']

KeyError: 'sentences'

In [33]:
summary['user']

'ALEXANDER DITZEND'

Paso siguiente: guardar el resumen en un indice junto con 'user' y 'date' como metadata ademas de 'source'. Que eso este disponible para hacer QA y generar preguntas de prueba con gpt4 para evaluar la capacidad.