# Chart to Table

Code from here: https://github.com/jxnl/instructor/blob/main/examples/vision/run_table.py

In [7]:
import sys
# Make modules that live in the parent directory importable from this notebook.
sys.path.append('..')
import os
from dotenv import load_dotenv
# Load variables from a .env file into the process environment
# (presumably the API key picked up by OpenAI() further down — confirm).
load_dotenv()

True

In [8]:
from io import StringIO
from typing import Annotated, Any, Iterable
from openai import OpenAI
from pydantic import (
    BaseModel,
    BeforeValidator,
    PlainSerializer,
    InstanceOf,
    WithJsonSchema,
)
import pandas as pd
import instructor


# Wrap the OpenAI client with instructor; extract_table relies on this wrapped
# client when it passes `response_model` to chat.completions.create.
client = instructor.patch(
    OpenAI(),
    mode=instructor.function_calls.Mode.MD_JSON,
)


def to_markdown(df: pd.DataFrame) -> str:
    """Serialize *df* by delegating to its ``to_markdown`` method."""
    rendered = df.to_markdown()
    return rendered


def md_to_df(data: Any) -> Any:
    """Parse a pipe-delimited markdown table string into a DataFrame.

    Non-string input is returned unchanged, so a value that is already a
    DataFrame passes straight through.

    The text is read as "|"-separated CSV: the first line supplies the column
    labels, the table's first column becomes the index, columns that are
    entirely empty (the fields outside the leading/trailing pipes) are
    dropped, and the first data row (the ``---`` separator line of a markdown
    table) is discarded. String cells are stripped of surrounding whitespace.
    Missing cells (NaN) are left untouched instead of raising
    ``AttributeError`` from ``.strip()``. Column labels and index values are
    not stripped.

    Args:
        data: Markdown table text, or any other object.

    Returns:
        A DataFrame of stripped string cells when *data* is a ``str``;
        otherwise *data* itself.
    """
    if not isinstance(data, str):
        return data

    def _strip_cell(cell: Any) -> Any:
        # An empty markdown cell ("||") is parsed as NaN (a float), which has
        # no .strip(); only genuine strings are trimmed.
        return cell.strip() if isinstance(cell, str) else cell

    table = (
        pd.read_csv(
            StringIO(data),
            sep="|",
            index_col=1,  # field 0 is the empty text before the leading "|"
        )
        .dropna(axis=1, how="all")  # drop the empty edge columns
        .iloc[1:]  # skip the |---|---| separator row
    )
    # Map column by column with Series.map so this does not depend on
    # DataFrame.map, which is only available in newer pandas releases.
    return table.apply(lambda column: column.map(_strip_cell))


# Field type for a DataFrame that is exchanged as markdown text.
# The base type is InstanceOf[pd.DataFrame]; the attached metadata is:
#   - BeforeValidator(md_to_df): md_to_df is applied to the raw input first,
#     so a markdown string is converted before the instance check.
#   - PlainSerializer(to_markdown): the DataFrame is serialized via to_markdown.
#   - WithJsonSchema: the field is declared as a "string" with the description
#     below in the JSON schema.
# The description text is kept exactly as written (including its whitespace).
MarkdownDataFrame = Annotated[
    InstanceOf[pd.DataFrame],
    BeforeValidator(md_to_df),
    PlainSerializer(to_markdown),
    WithJsonSchema(
        {
            "type": "string",
            "description": """
                The markdown representation of the table, 
                each one should be tidy, do not try to join tables
                that should be seperate""",
        }
    ),
]


# One extracted table: a caption plus the table body.
# NOTE(review): documented with `#` comments rather than a class docstring,
# because pydantic uses a model's docstring as its JSON schema description —
# adding one would alter the schema built from this model. Confirm before
# converting to a docstring.
class Table(BaseModel):
    caption: str
    # Table body; typed as MarkdownDataFrame (see that alias for how the
    # value is validated and serialized).
    dataframe: MarkdownDataFrame

# Instruction text used as the "text" part of the user message in extract_table.
# NOTE(review): the third sentence has a wording slip ("For example is a
# value ..."); it is left untouched here because the string is passed
# verbatim at runtime.
TEXT_PROMPT = """
Extract the data from the image, and describe it as a table. 
Each row is shown with a center point, which value you want to estimate from the position in the graph. 
For example is a value is almost 0.2, so 0.196 would be a good estimate.
The order of the values is the most important aspect to get right. 
Also provide the upper and lower bounds for each bar.
"""

def extract_table(
    url: str,
    *,
    model: str = "gpt-4-vision-preview",
    max_tokens: int = 1800,
) -> Iterable[Table]:
    """Request a tabular transcription of the chart image at *url*.

    A single user message is sent whose content has two parts: the
    ``TEXT_PROMPT`` instructions and the image URL. The call is made through
    the module-level ``client`` with ``response_model=Iterable[Table]``.

    Args:
        url: Location of the chart image, passed as the ``image_url`` part.
        model: Name of the chat model to call. Defaults to the value that
            was previously hard-coded, so existing calls behave the same.
        max_tokens: Completion token limit forwarded to the API. Defaults to
            the previously hard-coded value.

    Returns:
        Whatever ``client.chat.completions.create`` returns for
        ``response_model=Iterable[Table]``.
    """
    prompt_part = {"type": "text", "text": TEXT_PROMPT}
    image_part = {"type": "image_url", "image_url": {"url": url}}
    return client.chat.completions.create(
        model=model,
        response_model=Iterable[Table],
        max_tokens=max_tokens,
        temperature=0.0,
        messages=[{"role": "user", "content": [prompt_part, image_part]}],
    )

<img src="https://files.gitbook.com/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FRPAuLvXI0lCcTz39j3BR%2Fuploads%2FXWcseqee9uSvfbwjlD9O%2F0.png?alt=media" alt="First input chart for table extraction" width="800"/>

In [9]:
# Run the extraction on the first chart image (same URL as the <img> above).
url = "https://files.gitbook.com/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FRPAuLvXI0lCcTz39j3BR%2Fuploads%2FXWcseqee9uSvfbwjlD9O%2F0.png?alt=media"
tables = extract_table(url)

In [10]:
# Number of Table objects returned for this image.
len(tables)

1

In [11]:
# Display the DataFrame of the first returned table.
tables[0].dataframe

Unnamed: 0_level_0,Category,Estimate,Lower Bound,Upper Bound
Attribute,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Gender,female,0.02,-0.02,0.06
Gender,male,0.0,-0.04,0.04
Education,no formal,-0.18,-0.22,-0.14
Education,4th grade,-0.14,-0.18,-0.1
Education,8th grade,-0.1,-0.14,-0.06
Education,high school,-0.06,-0.1,-0.02
Education,two-year college,-0.02,-0.06,0.02
Education,college degree,0.02,-0.02,0.06
Education,graduate degree,0.08,0.04,0.12
Language,fluent English,0.07,0.03,0.11


<img src="https://thomasleeper.com/cregg/reference/amce-1.png" alt="Second input chart for table extraction (amce-1.png from the cregg reference)" width="800"/>

In [12]:
# Run the extraction on the second chart image (same URL as the <img> above).
url = "https://thomasleeper.com/cregg/reference/amce-1.png"
tables = extract_table(url)

In [13]:
# Number of Table objects returned for this image.
len(tables)

1

In [14]:
# Display the DataFrame of the first returned table.
tables[0].dataframe

Unnamed: 0_level_0,Estimated AMCE,Lower Bound,Upper Bound
Feature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Male (Gender),0.2,0.1,0.3
Female (Gender),-0.2,-0.3,-0.1
Graduate Degree,3.6,3.3,3.9
College Degree,2.8,2.5,3.1
Two-Year College,2.0,1.7,2.3
High School,1.2,0.9,1.5
8th Grade,0.4,0.1,0.7
4th Grade,-0.4,-0.7,-0.1
No Formal,-1.2,-1.5,-0.9
Used Interpreter,-3.2,-3.5,-2.9
