In [123]:
from openai import OpenAI
import openai
import base64
import os
import requests
import pandas as pd
import io
import glob
import time
import re
from pathlib import Path
from pdf2image import convert_from_path
# API key taken from the environment (None when the variable is unset);
# it is interpolated into the "Authorization" request headers further down.
api_key = os.environ.get('OPENAI_API_KEY')
# Client object used for the chat-completion calls; no arguments are passed here.
client = OpenAI()

# An Example Page

A demonstration of parsing a single page.

In [99]:
# Function to encode the image
def encode_image(image_path):
  """Return the file at ``image_path`` as a base64-encoded ``str``.

  The file is read in binary mode, base64-encoded, and the resulting bytes
  are decoded as UTF-8 text.
  """
  with open(image_path, "rb") as image_file:
    raw_bytes = image_file.read()
  encoded_bytes = base64.b64encode(raw_bytes)
  return encoded_bytes.decode('utf-8')

# Path to the page image used for the single-page demonstration below
image_path = "../../data/primary_sources/Paris_1878_Italy_7.jpg"

In [100]:
# Base64 text of the demo page image
base64_image = encode_image(image_path)

# JSON content type plus a bearer token built from the API key
headers = dict([
  ("Content-Type", "application/json"),
  ("Authorization", f"Bearer {api_key}"),
])

In [101]:
# One user turn: the parsing instruction followed by the page image,
# embedded as a base64 data URL.
messages = [
    dict(
        role="user",
        content=[
            dict(
                type="text",
                text="Parse this image to .csv with \t as a delimiter and the variables Name, Place, Description. Delimit the csv file by ```csv at the beginning and ``` at the end. In an empty cell, please put the string 'NA'.",
            ),
            dict(
                type="image_url",
                image_url=dict(url=f"data:image/jpeg;base64,{base64_image}"),
            ),
        ],
    )
]

# Send the request through the client created in the setup cell
response = client.chat.completions.create(
    messages=messages,
    model="gpt-4o",
    max_tokens=2500,
)

In [102]:
# Text of the first completion choice
csv_content = response.choices[0].message.content

# Pull the CSV payload out of the fenced block in the reply. The prompt asks
# for a ```csv fence; a plain ``` fence is accepted as a fallback so the cell
# does not have to be edited by hand when the model omits the language tag.
for fence in ('```csv\n', '```\n'):
    if fence in csv_content:
        csv_string = csv_content.split(fence, 1)[1].split('```', 1)[0]
        break
else:
    raise ValueError("No ```csv or ``` fenced block found in the model response")

In [103]:
# Show the extracted tab-separated text
csv_string

"Name\tPlace\tDescription\nDel Maino (A.)\tPlaisance\tLivres et spécimens typographiques.\nDumouard frères\tMilan\tLivres; bibliothèque scientifique internationale, avec illustrations.\nElzeviriana (Typographie)Pateras (A.)\tRome\tLivres.\nGiannini (Chevalier F.)\tNaples\tLivres.\nHeçeli (U.)\tMilan\tPublications diverses.\nLucea (F.)\tMilan\tÉditions de musique de luxe et ordinaire.\nMaisner (V.)\tMilan\tVoyage autour du monde de la corvette à vapeur italienne Magenta; relation scientifique descriptive de H. H. Giglioli; un volume illustré de cartes géographiques, photographies, phototypies, chromolithographies ectylographies.\nMaison de reclusion militaire (Typographie de la)\tSavone\tSpécimens de travaux typographiques en couleur.\nMarietti (Chevalier F.)\tTurin\tOuvrages divers.\nMekitaristi Arméniens (Congrégration des RR. PP. de Saint Lazare)\tVenise\tOuvrages.\nMinelli (A.)\tRovigo\tSpécimens de livres.\nMorano (A.)\tNaples\tOuvrages divers.\nMorano (D.)\tNaples\tLivres.\nMorett

In [104]:
# Read the tab-separated text into a DataFrame
df = pd.read_csv(io.StringIO(csv_string), sep='\t')

In [105]:
# Store the source image's file name in a 'Pdf' column (assigned onto df in place)
df['Pdf'] = os.path.basename(image_path)

In [107]:
# Write the frame, without its index, to a tab-separated file named after the source image
df.to_csv(
    f"../../data/gpt_ocr/{os.path.basename(image_path)}.csv",
    index=False,
    sep="\t",
)

In [106]:
# Display the parsed frame
df

Unnamed: 0,Name,Place,Description,Pdf
0,Del Maino (A.),Plaisance,Livres et spécimens typographiques.,Paris_1878_Italy_7.jpg
1,Dumouard frères,Milan,Livres; bibliothèque scientifique internationa...,Paris_1878_Italy_7.jpg
2,Elzeviriana (Typographie)Pateras (A.),Rome,Livres.,Paris_1878_Italy_7.jpg
3,Giannini (Chevalier F.),Naples,Livres.,Paris_1878_Italy_7.jpg
4,Heçeli (U.),Milan,Publications diverses.,Paris_1878_Italy_7.jpg
5,Lucea (F.),Milan,Éditions de musique de luxe et ordinaire.,Paris_1878_Italy_7.jpg
6,Maisner (V.),Milan,Voyage autour du monde de la corvette à vapeur...,Paris_1878_Italy_7.jpg
7,Maison de reclusion militaire (Typographie de la),Savone,Spécimens de travaux typographiques en couleur.,Paris_1878_Italy_7.jpg
8,Marietti (Chevalier F.),Turin,Ouvrages divers.,Paris_1878_Italy_7.jpg
9,Mekitaristi Arméniens (Congrégration des RR. P...,Venise,Ouvrages.,Paris_1878_Italy_7.jpg


# A function

A function that parses a single page image and exports a ready-made CSV for that page; it is then applied in a loop over all files. Note that the code below does not export the raw outputs as JSON.

In [64]:
def chatgpt_csv(file_path):
    """Parse one page image into a tab-separated CSV file via the OpenAI REST endpoint.

    The image is base64-encoded and posted to the chat completions endpoint
    together with the parsing prompt. The fenced CSV block in the reply is read
    into a DataFrame, a ``Pdf`` column holding the image's base name is added,
    and the frame is written to ``../../data/gpt_ocr/<base name>.csv``.

    Args:
        file_path: Path of the page image to parse.

    Raises:
        requests.HTTPError: If the API answers with an error status.
        ValueError: If the reply contains no fenced block to extract.
    """
    # Base64 text of the image, embedded below as a data URL
    base64_image = encode_image(file_path)

    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }
    # Request body: one user turn with the instruction text and the image
    payload = {
        "model": "gpt-4o",
        "messages": [
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": "Parse this image to .csv with \t as a delimiter and the variables Name, Place, Description. Delimit the csv file by ```csv at the beginning and ``` at the end. In an empty cell, please put the string 'NA'."
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/jpeg;base64,{base64_image}"
                        }
                    }
                ]
            }
        ],
        "max_tokens": 2500
    }
    response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload)
    # Pause for two seconds after each request (presumably to throttle the call rate)
    time.sleep(2)
    # Surface API errors as the HTTP error itself instead of a KeyError on 'choices'
    response.raise_for_status()
    csv_content = response.json()['choices'][0]['message']['content']
    # Extract the fenced CSV text: prefer the ```csv fence requested in the
    # prompt, fall back to a plain ``` fence.
    for fence in ('```csv\n', '```\n'):
        if fence in csv_content:
            csv_string = csv_content.split(fence, 1)[1].split('```', 1)[0]
            break
    else:
        raise ValueError(
            f"No fenced CSV block in the response for {os.path.basename(file_path)}"
        )
    # Parse the tab-separated text
    df = pd.read_csv(io.StringIO(csv_string), delimiter='\t')
    # Record which image the rows came from
    df['Pdf'] = os.path.basename(file_path)
    # Export next to the other per-page results
    df.to_csv("../../data/gpt_ocr/" + os.path.basename(file_path) + ".csv",
              sep="\t",
              index=False)
    
    
    

In [117]:
# try this approach then..
def chatgpt_csv(file_path):
    """Parse one page image into a DataFrame and a per-page tab-separated file.

    The image is base64-encoded and sent, with the parsing prompt, through the
    module-level ``client``. The fenced CSV block in the reply is read into a
    DataFrame, a ``Pdf`` column holding the image's base name is added, and the
    frame is written to ``../../data/gpt_ocr/<base name>.csv``.

    Args:
        file_path: Path of the page image to parse.

    Returns:
        The parsed DataFrame, including the added ``Pdf`` column.

    Raises:
        ValueError: If the reply contains no fenced block to extract.
    """
    base64_image = encode_image(file_path)
    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "Parse this image to .csv with \t as a delimiter and the variables Name, Place, Description. Delimit the csv file by ```csv at the beginning and ``` at the end. In an empty cell, please put the string 'NA'."
                },
                {
                    "type": "image_url",
                    "image_url": {
                        "url": f"data:image/jpeg;base64,{base64_image}"
                    }
                }
            ]
        }
    ]
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=messages,
        max_tokens=2500
    )
    csv_content = response.choices[0].message.content
    # Prefer the ```csv fence requested in the prompt; accept a plain ``` fence
    # as a fallback instead of failing with an opaque IndexError.
    for fence in ('```csv\n', '```\n'):
        if fence in csv_content:
            csv_string = csv_content.split(fence, 1)[1].split('```', 1)[0]
            break
    else:
        raise ValueError(
            f"No fenced CSV block in the response for {os.path.basename(file_path)}"
        )
    df = pd.read_csv(io.StringIO(csv_string), delimiter='\t')
    # Record which image the rows came from
    df['Pdf'] = os.path.basename(file_path)
    output_path = os.path.join("../../data/gpt_ocr/", os.path.basename(file_path) + ".csv")
    df.to_csv(output_path, sep="\t", index=False)
    return df

In [109]:
# Try the function on a single 1878 page; the returned frame is shown as the cell output
chatgpt_csv("../../data/primary_sources/Paris_1878_Italy_8.jpg")

Unnamed: 0,Name,Place,Description,Pdf
0,Vigo (F.),à Livourne,Livres ; éditions de luxe et ordinaires.,Paris_1878_Italy_8.jpg
1,Zanichelli (N.),à Bologne,Livres.,Paris_1878_Italy_8.jpg
2,Almana (F.),à Turin,Toiles préparées pour la peinture.,Paris_1878_Italy_8.jpg
3,Binda (A. et C.),à Milan,Spécimens de papier à la mécanique.,Paris_1878_Italy_8.jpg
4,Binetti (F. et C.),à Milan,"Enveloppes de lettres, papiers colorés et regi...",Paris_1878_Italy_8.jpg
5,Bussano (J.),à Turin,Cire à cacheter et encres.,Paris_1878_Italy_8.jpg
6,Cantileana (G.),à Majori (Salerne),"Diverses qualités de papier à la main, à écrir...",Paris_1878_Italy_8.jpg
7,Cecchi (G.),à Florence,Deux albums pour photographies.,Paris_1878_Italy_8.jpg
8,Civelli (J.),à Milan,Papiers à la mécanique et à la main de divers ...,Paris_1878_Italy_8.jpg
9,De Luecchi (G.) et Cie,à Florence,Encres et vernis pour imprimerie et lithograph...,Paris_1878_Italy_8.jpg


In [110]:
directory = '../../data/primary_sources/'

# Every .jpg in the source directory
matching_files = glob.glob(os.path.join(directory, '*.jpg'))

# Captures the page number from the 1878 file names
pattern = re.compile(r'Paris_1878_Italy_(\d+)\.jpg')

# Keep the 1878 pages numbered 9 or higher, leaving out pages 33, 65, 76 and 79.
# The result is sorted as plain strings, so "..._10.jpg" comes before "..._9.jpg".
filtered_files = sorted(
    path
    for path, found in ((p, pattern.search(p)) for p in matching_files)
    if found
    and int(found.group(1)) >= 9
    and int(found.group(1)) not in [33, 65, 76, 79]
)

In [111]:
# Parse every selected page; a failure on one page is reported and the loop moves on
for file in filtered_files:
    try:
        result = chatgpt_csv(file)
    except Exception as err:
        print(f"Failed to process file: {file} with error: {err}")
    else:
        print(f"Processed file: {file}")


Processed file: ../../data/primary_sources/Paris_1878_Italy_10.jpg
Processed file: ../../data/primary_sources/Paris_1878_Italy_11.jpg
Processed file: ../../data/primary_sources/Paris_1878_Italy_12.jpg
Processed file: ../../data/primary_sources/Paris_1878_Italy_13.jpg
Processed file: ../../data/primary_sources/Paris_1878_Italy_14.jpg
Processed file: ../../data/primary_sources/Paris_1878_Italy_15.jpg
Processed file: ../../data/primary_sources/Paris_1878_Italy_16.jpg
Processed file: ../../data/primary_sources/Paris_1878_Italy_17.jpg
Processed file: ../../data/primary_sources/Paris_1878_Italy_18.jpg
Processed file: ../../data/primary_sources/Paris_1878_Italy_19.jpg
Processed file: ../../data/primary_sources/Paris_1878_Italy_20.jpg
Processed file: ../../data/primary_sources/Paris_1878_Italy_21.jpg
Processed file: ../../data/primary_sources/Paris_1878_Italy_22.jpg
Processed file: ../../data/primary_sources/Paris_1878_Italy_23.jpg
Processed file: ../../data/primary_sources/Paris_1878_Italy_24

In [121]:
# Parse one individual 1878 page; the returned frame is shown as the cell output
chatgpt_csv("../../data/primary_sources/Paris_1878_Italy_2.jpg")

Unnamed: 0,Name,Place,Description,Pdf
0,Podesta (E.),"professeur, à Arezzo",Traité élémentaire d'agriculture; premiers élé...,Paris_1878_Italy_2.jpg
1,Rosalba (C.),"architecte, à Avellino",Projet d'un édifice scolaire municipal constru...,Paris_1878_Italy_2.jpg
2,Santi (A.),à Murano (Venise),Relations sur l'instruction et l'éducation dan...,Paris_1878_Italy_2.jpg
3,Scaraviglia (T.),"professeur, à Gualdo-Tadino (Pérouse)",Dessins à l'aquarelle par les élèves de l'écol...,Paris_1878_Italy_2.jpg
4,Société centrale ouvrière napolitaine,à Naples,"Dessins d'ornements, de figures, de géométrie,...",Paris_1878_Italy_2.jpg
5,Soli (P.),"architecte, à Milan",Projets d'écoles élémentaires.,Paris_1878_Italy_2.jpg
6,Sotis (Dr J.),à Fondi (Caserte),Conférences sur l'hygiène de l'habitant de la ...,Paris_1878_Italy_2.jpg
7,Thevenet (J.),à Milan,Cours complet d'écriture (5 modèles).,Paris_1878_Italy_2.jpg
8,Zanetti (Abbé V.),directeur du Musée verrier et de l'école de de...,Photographies des meilleurs types classiques q...,Paris_1878_Italy_2.jpg
9,Zucchetti (A.),"professeur, à Todi (Pérouse)",Dessins des élèves de l'école technique de Tod...,Paris_1878_Italy_2.jpg


## Now the 1867 file

We use the same `chatgpt_csv()` function. But first, the pages of the .pdf file have to be converted to .jpg images:


In [125]:
# Load the pages of the 1867 source PDF
images = convert_from_path('../../data/primary_sources/PARIS_1867_Italy.pdf')

# Save each page as a JPEG; numbering comes from enumerate() and so starts at 0
for count, image in enumerate(images):
    image.save(
        f'../../data/primary_sources/Paris_1867_Italy_{count}.jpg',
        'JPEG',
    )

In [126]:
# try this approach then..
def chatgpt_csv(file_path):
    """Send one page image to the model and save the parsed table.

    Steps: base64-encode the image, request a tab-separated transcription with
    the columns Name, Place and Description through the module-level
    ``client``, extract the fenced CSV text from the reply, load it into a
    DataFrame, tag every row with the image's base name in a ``Pdf`` column,
    and write the result to ``../../data/gpt_ocr/<base name>.csv``.

    Args:
        file_path: Path of the page image to parse.

    Returns:
        The parsed DataFrame, including the added ``Pdf`` column.

    Raises:
        ValueError: If the reply contains no fenced block to extract.
    """
    base64_image = encode_image(file_path)
    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "Parse this image to .csv with \t as a delimiter and the variables Name, Place, Description. Delimit the csv file by ```csv at the beginning and ``` at the end. In an empty cell, please put the string 'NA'."
                },
                {
                    "type": "image_url",
                    "image_url": {
                        "url": f"data:image/jpeg;base64,{base64_image}"
                    }
                }
            ]
        }
    ]
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=messages,
        max_tokens=2500
    )
    csv_content = response.choices[0].message.content
    # The prompt asks for a ```csv fence; a plain ``` fence is also accepted so
    # that a reply without the language tag is still parsed.
    for fence in ('```csv\n', '```\n'):
        if fence in csv_content:
            csv_string = csv_content.split(fence, 1)[1].split('```', 1)[0]
            break
    else:
        raise ValueError(
            f"No fenced CSV block in the response for {os.path.basename(file_path)}"
        )
    df = pd.read_csv(io.StringIO(csv_string), delimiter='\t')
    # Record which image the rows came from
    df['Pdf'] = os.path.basename(file_path)
    output_path = os.path.join("../../data/gpt_ocr/", os.path.basename(file_path) + ".csv")
    df.to_csv(output_path, sep="\t", index=False)
    return df

In [130]:
directory = '../../data/primary_sources/'

# Every .jpg in the source directory
matching_files = glob.glob(os.path.join(directory, '*.jpg'))

# Matches the 1867 page images and captures their page number
pattern = re.compile(r'Paris_1867_Italy_(\d+)\.jpg')

# Keep all 1867 pages (no page-number filter is applied here), sorted as plain
# strings, so "..._10.jpg" comes before "..._2.jpg".
filtered_files = sorted(path for path in matching_files if pattern.search(path))

In [134]:
# Parse the 1867 pages, skipping the first ten entries of the sorted list;
# a failure on one page is reported and the loop moves on.
for file in filtered_files[10:]:
    try:
        result = chatgpt_csv(file)
    except Exception as err:
        print(f"Failed to process file: {file} with error: {err}")
    else:
        print(f"Processed file: {file}")


Processed file: ../../data/primary_sources/Paris_1867_Italy_107.jpg
Failed to process file: ../../data/primary_sources/Paris_1867_Italy_108.jpg with error: Error tokenizing data. C error: Expected 3 fields in line 16, saw 4

Processed file: ../../data/primary_sources/Paris_1867_Italy_109.jpg
Processed file: ../../data/primary_sources/Paris_1867_Italy_11.jpg
Processed file: ../../data/primary_sources/Paris_1867_Italy_110.jpg
Processed file: ../../data/primary_sources/Paris_1867_Italy_111.jpg
Processed file: ../../data/primary_sources/Paris_1867_Italy_112.jpg
Processed file: ../../data/primary_sources/Paris_1867_Italy_113.jpg
Processed file: ../../data/primary_sources/Paris_1867_Italy_114.jpg
Processed file: ../../data/primary_sources/Paris_1867_Italy_115.jpg
Processed file: ../../data/primary_sources/Paris_1867_Italy_116.jpg
Processed file: ../../data/primary_sources/Paris_1867_Italy_12.jpg
Processed file: ../../data/primary_sources/Paris_1867_Italy_13.jpg
Processed file: ../../data/pri

In [138]:
#chatgpt_csv("../../data/primary_sources/Paris_1867_Italy_108.jpg")
#chatgpt_csv("../../data/primary_sources/Paris_1867_Italy_21.jpg")
#chatgpt_csv("../../data/primary_sources/Paris_1867_Italy_30.jpg")

# Next

Next up is the 1889 file. As before, first convert every page of the .pdf to a .jpg image.

In [143]:
# Load the pages of the 1889 source PDF
images = convert_from_path('../../data/primary_sources/PARIS_1889_Italy.pdf')

# Save each page as a JPEG; numbering comes from enumerate() and so starts at 0
for count, image in enumerate(images):
    image.save(
        f'../../data/primary_sources/Paris_1889_Italy_{count}.jpg',
        'JPEG',
    )

In [144]:
directory = '../../data/primary_sources/'

# Every .jpg in the source directory
matching_files = glob.glob(os.path.join(directory, '*.jpg'))

# Matches the 1889 page images and captures their page number
pattern = re.compile(r'Paris_1889_Italy_(\d+)\.jpg')

# Keep all 1889 pages (no page-number filter is applied here), sorted as plain
# strings, so "..._10.jpg" comes before "..._2.jpg".
filtered_files = sorted(path for path in matching_files if pattern.search(path))

In [146]:
# try this approach then..
def chatgpt_csv(file_path):
    """Parse the Italian entries of one page image into a DataFrame and a per-page file.

    The prompt used here asks the model to transcribe only the observations
    corresponding to Italy, and to treat entries with no clear country as
    Italian when Italy is not mentioned on the page. The image is base64-encoded
    and sent through the module-level ``client``; the fenced CSV block in the
    reply is read into a DataFrame, a ``Pdf`` column holding the image's base
    name is added, and the frame is written to
    ``../../data/gpt_ocr/<base name>.csv``.

    Args:
        file_path: Path of the page image to parse.

    Returns:
        The parsed DataFrame, including the added ``Pdf`` column.

    Raises:
        ValueError: If the reply contains no fenced block to extract.
    """
    base64_image = encode_image(file_path)
    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "Parse this image to .csv with \t as a delimiter and the variables Name, Place, Description. Delimit the csv file by ```csv at the beginning and ``` at the end. Parse only the observations corresponding to Italy. If Italy is not mentioned in the image, assume that the entries not clearly corresponding to a country correspond to Italy. In an empty cell, please put the string 'NA'."
                },
                {
                    "type": "image_url",
                    "image_url": {
                        "url": f"data:image/jpeg;base64,{base64_image}"
                    }
                }
            ]
        }
    ]
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=messages,
        max_tokens=2500
    )
    csv_content = response.choices[0].message.content
    # Prefer the ```csv fence requested in the prompt; accept a plain ``` fence
    # as a fallback instead of failing with an opaque IndexError.
    for fence in ('```csv\n', '```\n'):
        if fence in csv_content:
            csv_string = csv_content.split(fence, 1)[1].split('```', 1)[0]
            break
    else:
        raise ValueError(
            f"No fenced CSV block in the response for {os.path.basename(file_path)}"
        )
    df = pd.read_csv(io.StringIO(csv_string), delimiter='\t')
    # Record which image the rows came from
    df['Pdf'] = os.path.basename(file_path)
    output_path = os.path.join("../../data/gpt_ocr/", os.path.basename(file_path) + ".csv")
    df.to_csv(output_path, sep="\t", index=False)
    return df

In [149]:
# Parse the 1889 pages, skipping the first entry of the sorted list;
# a failure on one page is reported and the loop moves on.
for file in filtered_files[1:]:
    try:
        result = chatgpt_csv(file)
    except Exception as err:
        print(f"Failed to process file: {file} with error: {err}")
    else:
        print(f"Processed file: {file}")


Processed file: ../../data/primary_sources/Paris_1889_Italy_1.jpg
Processed file: ../../data/primary_sources/Paris_1889_Italy_10.jpg
Processed file: ../../data/primary_sources/Paris_1889_Italy_11.jpg
Processed file: ../../data/primary_sources/Paris_1889_Italy_12.jpg
Processed file: ../../data/primary_sources/Paris_1889_Italy_13.jpg
Processed file: ../../data/primary_sources/Paris_1889_Italy_14.jpg
Processed file: ../../data/primary_sources/Paris_1889_Italy_15.jpg
Processed file: ../../data/primary_sources/Paris_1889_Italy_16.jpg
Processed file: ../../data/primary_sources/Paris_1889_Italy_17.jpg
Processed file: ../../data/primary_sources/Paris_1889_Italy_18.jpg
Processed file: ../../data/primary_sources/Paris_1889_Italy_19.jpg
Processed file: ../../data/primary_sources/Paris_1889_Italy_2.jpg
Processed file: ../../data/primary_sources/Paris_1889_Italy_20.jpg
Processed file: ../../data/primary_sources/Paris_1889_Italy_21.jpg
Processed file: ../../data/primary_sources/Paris_1889_Italy_22.j

In [150]:
# Parse one individual 1889 page; the returned frame is shown as the cell output
chatgpt_csv("../../data/primary_sources/Paris_1889_Italy_59.jpg")

Unnamed: 0,Name,Place,Description,Pdf
0,CINZANO (François) & Cie,Turin,Vermouth et vins,Paris_1889_Italy_59.jpg
1,CITO (François et Frère),Naples,Vins et liqueurs,Paris_1889_Italy_59.jpg
2,COMIN (Émile),Pavie,Liqueurs diverses,Paris_1889_Italy_59.jpg
3,Compagnie vinicole Sicilienne,Paris,Vins de Marsala et de Syracuse,Paris_1889_Italy_59.jpg
4,CONTÌ (Mathieu),Palermo,Vins ordinaires. Etna blanc. Etna rouge. Lique...,Paris_1889_Italy_59.jpg
5,CORA Frères (Joseph et Louis),Turin,Vermouth. Vins blancs et vins rouges. Liqueurs.,Paris_1889_Italy_59.jpg
6,CURTOPASSI (Marquis Joseph),Bisceglie (Bari),Vin rouge ordinaire. « Sorgente » 1887–1888. S...,Paris_1889_Italy_59.jpg
7,DEMACCA (Émihie),Gênes,Vins divers,Paris_1889_Italy_59.jpg
8,D'EMARESE (E.),Turin,"Vins, liqueurs, sirops",Paris_1889_Italy_59.jpg
9,Établissement oenologique,Acquara,Vins,Paris_1889_Italy_59.jpg
