In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import requests
import json
import pycountry
import re
import time
import random
import warnings
import PyPDF2
from dotenv import load_dotenv
import google.generativeai as genai
from typing import List, Dict
import json
import PyPDF2
from tqdm import tqdm
from bs4 import BeautifulSoup

# PDF Parser

In [13]:
import PyPDF2

def extract_text_from_pdf(pdf_path):
    """
    Extract the text of every page of a PDF file.
    
    Args:
        pdf_path (str): Path to the PDF file
        
    Returns:
        dict | None: Mapping of 1-based page number to the text extracted from
        that page (an empty string when nothing could be extracted), or None
        when the file is missing, is not a readable PDF, or any other error
        occurs. A message is printed in each failure case.
    """
    # Dictionary to store text from each page
    text_by_page = {}
    
    try:
        # Open the PDF file in binary read mode
        with open(pdf_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            
            # Page numbers are reported and stored 1-based
            for page_num, page in enumerate(pdf_reader.pages, start=1):
                print("ON PAGE", page_num)
                
                # Normalise a missing result to "" so callers can always
                # treat the stored values as strings
                text_by_page[page_num] = page.extract_text() or ""
                
        return text_by_page
    
    # Error Messaging
    except FileNotFoundError:
        print(f"Error: The file {pdf_path} was not found.")
        return None
    # The exception class lives in PyPDF2.errors; looking it up on the
    # top-level package raises AttributeError while handling the real error
    except PyPDF2.errors.PdfReadError:
        print("Error: Invalid or corrupted PDF file.")
        return None
    except Exception as e:
        print(f"An unexpected error occurred: {str(e)}")
        return None

# Smoke-test the extractor on a sample menu and show the page -> text mapping
print(extract_text_from_pdf(pdf_path="menus/rake-wine.pdf"))

ON PAGE 1
ON PAGE 2
ON PAGE 3
ON PAGE 4
ON PAGE 5
ON PAGE 6
ON PAGE 7
ON PAGE 8
ON PAGE 9
{1: 'White\nChëpìka, Petnat Catawba 2021 Finger Lakes, NY 64\nLa Ferme du Vert, "L\' Angelou Blanc Bulle" Mauzac Blanc 2021 Southwest, FRA 56\nGuiborat, "Prisme - Blanc de Blancs", Grand Cru Extra Brut Chardonnay NV Champagne, FRA 144\nPierre Moncuit, "Delos Blanc de Blancs" Grand Cru Brut Chardonnay NV Champagne, FRA 117\nRobert Moncuit, "Millésime Blanc de Blancs", Grand Cru Brut Chardonnay 2013 Champagne, FRA 196\nAzienda Agricola Monban, "Questo Neanche", Col Fondo Glera 2021 Veneto, ITA 45\nDiletta Tonello, "Marachelle", Frizzante Durella blend NV Veneto, ITA 53\nAldo Viola, "Brutto" Catarratto 2022 Sicily, ITA 49\nJoao Pato, "Ducking" Petnat Sercialinho 2022 Bairrada, PRT 61\nFamilie Bauer, "Unsprung" Petnat Gruner + Roter V. + Riesling 2021 Wagram, AUT 52\nRosé\nAnnesanti, "Raspato", Frizzante Rosato Sangiovese + Aleatico 2020 Umbria, ITA 57\nLaherte Frères, "Les Beaudiers", Rosé de Saignée

# Zero-Shot Parser with Gemini

In [3]:
from dotenv import load_dotenv
import os

# Load variables from the local config.env file into the process environment
load_dotenv(dotenv_path='config.env')
google_key = os.getenv('GOOGLE_KEY')

# Treat an empty value like a missing one: either would only surface later,
# at the first Gemini API call
if not google_key:
    raise ValueError("GOOGLE_KEY is not set; add it to config.env or the environment")

In [25]:
import google.generativeai as genai
from typing import List, Dict
import json
import PyPDF2

# Made with Claude 3.5

class GeminiWineParser:
    """Zero-shot wine-list parser that asks a Gemini model for structured JSON.

    All public methods are best-effort: failures are printed and an empty
    list is returned instead of raising.
    """

    def __init__(self, api_key: str, model_name: str = 'gemini-1.5-pro'):
        """
        Initialize the Gemini parser with API key

        Args:
            api_key (str): Google Generative AI API key
            model_name (str): Gemini model identifier (default: 'gemini-1.5-pro')
        """
        genai.configure(api_key=api_key)
        self.model = genai.GenerativeModel(model_name)

    @staticmethod
    def _extract_json_text(response_text: str) -> str:
        """Return the JSON payload of a model reply, without any Markdown code fence."""
        stripped = response_text.strip()
        if '```' not in stripped:
            return stripped
        # Take what follows the first fence, up to the closing fence if there is one
        fenced = stripped.split('```', 2)[1]
        # Drop the optional language tag of a ```json fence
        if fenced[:4].lower() == 'json':
            fenced = fenced[4:]
        return fenced.strip()
        
    def parse_wine_list(self, text: str) -> List[Dict]:
        """
        Parse wine list text using the configured Gemini model
        
        Args:
            text (str): The wine list text to parse
            
        Returns:
            List[Dict]: List of parsed wine entries; empty when the text is
            blank or when the request or the JSON decoding fails (the error
            is printed)
        """
        # Nothing to parse: skip the API round-trip entirely
        if not text or not text.strip():
            return []

        prompt = f"""Extract wine information from the text below into a structured format.
        For each wine entry, extract:
        - ID number
        - Producer
        - Wine name
        - Type (e.g., NON-VINTAGE, BLANC DE BLANCS)
        - Main Type (e.g., SPARKLING, WHITE, RED, ROSE)
        - Region
        - Country
        - Vintage (if available)
        - Price
        - Size (glass, bottle, half bottle, magnum)
        
        Format as JSON with missing fields as null but get as many wines as possible even if some fields are missing. Take your time, we want everything in the text.
        
        Text to parse:
        {text}
        
        Respond with only valid JSON in this exact format:
        {{
            "wines": [
                {{
                    "id": "1234",
                    "producer": "Producer Name",
                    "name": "Wine Name",
                    "type": "Wine Type",
                    "main_type": "Main Type",
                    "region": "Region",
                    "country": "Country",
                    "vintage": "2020",
                    "price": "123",
                    "size": "bottle"
                }}
            ]
        }}"""

        try:
            response = self.model.generate_content(
                prompt,
                generation_config={
                    'temperature': 0.0,  # Use deterministic output
                    'top_p': 1.0,
                    'top_k': 1
                }
            )
            
            payload = json.loads(self._extract_json_text(response.text))
            # Accept a bare JSON array as well as the requested {"wines": [...]}
            wines = payload["wines"] if isinstance(payload, dict) else payload
            if not isinstance(wines, list):
                raise ValueError("model response does not contain a list of wines")
            return wines
            
        except Exception as e:
            print(f"Error parsing wine list: {str(e)}")
            return []

    def parse_pdf_and_wine_list(self, pdf_path: str, page_number: int = 1) -> List[Dict]:
        """
        Extract text from PDF and parse wine list
        
        Args:
            pdf_path (str): Path to PDF file
            page_number (int): 1-based page number to parse (default: 1)
            
        Returns:
            List[Dict]: List of parsed wine entries; empty when the page
            number is out of range or the PDF cannot be read (the error is
            printed)
        """
        try:
            with open(pdf_path, 'rb') as file:
                reader = PyPDF2.PdfReader(file)
                total_pages = len(reader.pages)
                # Without the lower bound, 0 or a negative number would be
                # used as an index from the end and parse the wrong page
                if page_number < 1:
                    raise ValueError(f"page_number must be >= 1, got {page_number}")
                if page_number > total_pages:
                    raise ValueError(f"PDF has only {total_pages} pages")
                text = reader.pages[page_number - 1].extract_text()
            return self.parse_wine_list(text)
        except Exception as e:
            print(f"Error processing PDF: {str(e)}")
            return []
    
    def save_to_json(self, wines: List[Dict], output_file: str):
        """Save parsed wines to a JSON file as {"wines": [...]}, printing the outcome"""
        try:
            # Explicit encoding so the file does not depend on the platform default
            with open(output_file, 'w', encoding='utf-8') as f:
                json.dump({'wines': wines}, f, indent=2)
            print(f"Successfully saved to {output_file}")
        except Exception as e:
            print(f"Error saving to file: {str(e)}")

# Build a parser from the key loaded earlier
parser = GeminiWineParser(api_key=google_key)

# Reference snippet of a numbered Champagne list (not used by the call below)
sample_text = """SPARKLING  |  FRANCE  |  CHAMPAGNE
NON-VINTAGE
1148 Alexandre Bonnet / 'La Geande' 7 Cepages' / Extra Brut NV 310
1147 Alexandre Filaine / 'Cuvée Spéciale' / Brut NV 235
BLANC DE BLANCS
1100 Pascal Agrapart / 'Minèral' / Extra Brut 2018 350"""

# Parse the second page of the menu PDF
results = parser.parse_pdf_and_wine_list('menus/rake-wine.pdf', 2)

# Pretty-print the parsed entries
print(json.dumps(results, indent=2))

[
  {
    "id": null,
    "producer": "Escala Humana Wines",
    "name": "Livver\u00e1",
    "type": "Tupungato Malvasia",
    "main_type": "WHITE",
    "region": "Mendoza",
    "country": "Argentina",
    "vintage": "2020",
    "price": "48",
    "size": "bottle"
  },
  {
    "id": null,
    "producer": "Gernot & Heike Heinrich",
    "name": "Naked White",
    "type": "PB + Chardonnay + PG",
    "main_type": "WHITE",
    "region": "Burgenland",
    "country": "Austria",
    "vintage": "2021",
    "price": "46",
    "size": "bottle"
  },
  {
    "id": null,
    "producer": "Matthias Warnung",
    "name": "Basis",
    "type": "Gruner Veltliner",
    "main_type": "WHITE",
    "region": "Niederosterreich",
    "country": "Austria",
    "vintage": null,
    "price": "49",
    "size": "bottle"
  },
  {
    "id": null,
    "producer": "Hager Matthias",
    "name": "Mollands",
    "type": "Gruner Veltliner",
    "main_type": "WHITE",
    "region": "Niederosterreich",
    "country": "Austria",

In [28]:
def create_csv_menu(pdf_path, page_nums=0, editor=False):
    """
    Parse a PDF wine menu into a DataFrame, optionally correct it by hand,
    and save it as CSV.
    
    Args:
        pdf_path (str): Path to PDF file
        page_nums (int): Number of leading pages to parse (default: 0 for all
            pages); values beyond the end of the document are clamped
        editor (bool): If True, prompt on stdin for manual cell corrections
            before saving (default: False)
        
    Returns:
        pandas.DataFrame: The parsed (and possibly corrected) wine entries.
        The same data is written to the PDF's path with a .csv extension.

    Raises:
        ValueError: If no text could be extracted from the PDF
    """
    print("INITIALIZING")
    # Initialize parser
    parser = GeminiWineParser(google_key)

    # Parse PDF
    print("EXTRACTING TEXT")
    pages = extract_text_from_pdf(pdf_path)
    # The extractor returns None on failure; an empty dict means no pages
    if not pages:
        raise ValueError(f"Could not extract any text from {pdf_path}")
    print(pages[1])
    print("DONE EXTRACTING TEXT")

    max_pages = max(pages)
    # Clamp so that asking for more pages than exist cannot raise KeyError below
    pages_to_process = min(page_nums, max_pages) if page_nums > 0 else max_pages

    # Process each page individually
    print("PARSING WINE LIST")
    all_results = []
    for page_num in range(1, pages_to_process + 1):
        print(f"\nProcessing page {page_num}/{pages_to_process}")
        page_text = pages.get(page_num) or ""
        print(f"Page {page_num} text length: {len(page_text)} characters")
        
        # Skip empty pages
        if not page_text.strip():
            print(f"Skipping page {page_num} - empty text")
            continue
            
        # Parse the page
        try:
            page_results = parser.parse_wine_list(page_text)
            print(f"Found {len(page_results)} wines on page {page_num}")
            all_results.extend(page_results)
        except Exception as e:
            print(f"Error parsing page {page_num}: {str(e)}")
            continue

    print("DONE PARSING WINE LIST")
    # Convert to DataFrame
    df = pd.DataFrame(all_results)
    print("TOTAL WINES: ", len(df))
    if editor:
        # Display DataFrame for review
        print("\nPlease review the parsed data:")
        print(df)
        
        while True:
            edit = input("\nWould you like to make any corrections? (yes/no): ").strip().lower()
            if edit == 'no':
                break
            elif edit == 'yes':
                try:
                    print("\nCurrent columns:", df.columns.tolist())
                    col = input("Enter column name to edit: ")
                    row = int(input("Enter row number to edit (0-based index): "))
                    # df.at would silently create a new row/column for unknown
                    # labels, so reject them before assigning
                    if col not in df.columns:
                        raise ValueError(f"unknown column {col!r}")
                    if row not in df.index:
                        raise ValueError(f"row {row} is out of range")
                    new_value = input("Enter new value: ")
                    df.at[row, col] = new_value
                    print("\nUpdated DataFrame:")
                    print(df)
                except Exception as e:
                    print(f"Error making edit: {str(e)}")
            else:
                print("Please enter 'yes' or 'no'")
    
    # Save to CSV; splitext only looks at the file name, so a dot in a
    # directory name cannot truncate the path
    csv_path = os.path.splitext(pdf_path)[0] + '.csv'
    df.to_csv(csv_path, index=False)
    print(f"\nSaved corrected data to: {csv_path}")
    return df

# Parse the first nine pages of the menu without the interactive editor
df = create_csv_menu(pdf_path='menus/rake-wine.pdf', page_nums=9, editor=False)

INITIALIZING
EXTRACTING TEXT
ON PAGE 1
ON PAGE 2
ON PAGE 3
ON PAGE 4
ON PAGE 5
ON PAGE 6
ON PAGE 7
ON PAGE 8
ON PAGE 9
White
Chëpìka, Petnat Catawba 2021 Finger Lakes, NY 64
La Ferme du Vert, "L' Angelou Blanc Bulle" Mauzac Blanc 2021 Southwest, FRA 56
Guiborat, "Prisme - Blanc de Blancs", Grand Cru Extra Brut Chardonnay NV Champagne, FRA 144
Pierre Moncuit, "Delos Blanc de Blancs" Grand Cru Brut Chardonnay NV Champagne, FRA 117
Robert Moncuit, "Millésime Blanc de Blancs", Grand Cru Brut Chardonnay 2013 Champagne, FRA 196
Azienda Agricola Monban, "Questo Neanche", Col Fondo Glera 2021 Veneto, ITA 45
Diletta Tonello, "Marachelle", Frizzante Durella blend NV Veneto, ITA 53
Aldo Viola, "Brutto" Catarratto 2022 Sicily, ITA 49
Joao Pato, "Ducking" Petnat Sercialinho 2022 Bairrada, PRT 61
Familie Bauer, "Unsprung" Petnat Gruner + Roter V. + Riesling 2021 Wagram, AUT 52
Rosé
Annesanti, "Raspato", Frizzante Rosato Sangiovese + Aleatico 2020 Umbria, ITA 57
Laherte Frères, "Les Beaudiers", Rosé 