In [29]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import requests
import json

In [30]:
r = requests.get(
    "https://www.vivino.com/api/explore/explore",
    params = {
        "country_code": "FR",
        "country_codes[]":"pt",
        "currency_code":"EUR",
        "grape_filter":"varietal",
        "min_rating":"1",
        "order_by":"price",
        "order":"asc",
        "page": 1,
        "price_range_max":"500",
        "price_range_min":"0",
        "wine_type_ids[]":"1"
    },
    headers= {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0"
    }
)
results = [
    (
        t["vintage"]["wine"]["winery"]["name"], 
        f'{t["vintage"]["wine"]["name"]} {t["vintage"]["year"]}',
        t["vintage"]["statistics"]["ratings_average"],
        t["vintage"]["statistics"]["ratings_count"],
    )
    for t in r.json()["explore_vintage"]["matches"]
]
dataframe = pd.DataFrame(results,columns=['Winery','Wine','Rating','num_review'])

print(dataframe)


                     Winery                                  Wine  Rating  \
0                  Graça 28                          Reserva 2022     4.1   
1                 Vila Real                    Vale do Corgo 2021     3.8   
2    Casa Ermelinda Freitas      Sandstone Castelão - Shiraz 2023     4.0   
3                   Cartuxa                         EA Tinto 2021     3.8   
4   Quinta de São Sebastião         Miradouro da Vinha Tinto 2021     3.8   
5   Quinta de São Sebastião         Miradouro da Vinha Tinto 2022     3.8   
6   Quinta de São Sebastião  Janela Branca Special Selection 2022     3.7   
7                 Vila Real                Cancellus Reserva 2017     3.8   
8       Quinta Dona Mafalda                   Little Mafalda 2022     3.8   
9         Casal das Freiras                   Colheita Tinto 2022     3.5   
10            Monte do Além                     Petit Verdot 2012     3.8   
11                Vila Real              Cancellus Signature 2019     3.9   

In [67]:
# Define function


def get_vivino_data(
    country_code,
    currency_code,
    min_rating,
    ratings_count_min,
    price_range_max,
    price_range_min,
    wine_type_ids,
):
    r = requests.get(
        "https://www.vivino.com/api/explore/explore",
        params={
            "country_code": country_code,
            "currency_code": currency_code,
            "min_rating": min_rating,
            "min_ratings": ratings_count_min,
            "order_by": "price",
            "order": "asc",
            "page": 1,
            "price_range_max": price_range_max,
            "price_range_min": price_range_min,
            "wine_type_ids[]": wine_type_ids,
        },
        headers={
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0"
        },
    )


    results = [
        {
            "name": t["vintage"]["wine"]["winery"]["name"],
            "wine": f'{t["vintage"]["wine"]["name"]} {t["vintage"]["year"]}',
            "region": t["vintage"]["wine"]["region"]["name"],
            "country": t["vintage"]["wine"]["region"]["country"]["name"],
            "rating": t["vintage"]["statistics"]["ratings_average"],
            "flavors": [x["group"] for x in t["vintage"]["wine"]["taste"]["flavor"]],
            "foods": [x["name"] for x in t["vintage"]["wine"]["style"]["food"]],
            "rating_count": t["vintage"]["statistics"]["ratings_count"],
        }
        for t in r.json()["explore_vintage"]["matches"]
    ]

    # Get lists
    wineries = [x["name"] for x in results]
    wines = [x["wine"] for x in results]
    ratings = [x["rating"] for x in results]
    flavors = [x["flavors"] for x in results]
    foods = [x["foods"] for x in results]
    country_code = [x["country"] for x in results]
    region = [x["region"] for x in results]

    # Clean flavors by removing _ and capitalizing
    flavors = [[y.replace("_", " ").title() for y in x] for x in flavors]

    num_reviews = [x["rating_count"] for x in results]

    # Save json
    with open("saved_data.json", "w") as f:
        data = json.dumps(r.json(), indent=4)
        f.write(data)

    return pd.DataFrame(
        {
            "winery": wineries,
            "wine": wines,
            "region": region,
            "country": country_code,
            "rating": ratings,
            "flavors": flavors,
            "foods": foods,
            "num_reviews": num_reviews,
        }
    )


# Get data
dataframe = get_vivino_data(
    country_code="FR",
    currency_code="EUR",
    min_rating=1,
    ratings_count_min=1000,
    price_range_max=500,
    price_range_min=0,
    wine_type_ids=1,
)

dataframe.head()

Unnamed: 0,winery,wine,region,country,rating,flavors,foods,num_reviews
0,Château de Lavagnac,Bordeaux 2015,Bordeaux,France,3.6,"[Red Fruit, Black Fruit, Earth, Oak, Non Oak, ...","[Beef, Veal, Game (deer, venison), Poultry]",76
1,Château de Callac,Graves Rouge 2011,Graves,France,3.6,"[Earth, Oak, Black Fruit, Non Oak, Red Fruit, ...","[Beef, Lamb, Game (deer, venison), Poultry]",221
2,Vinovalie,Les Blousons Noirs Malbec 2020,Cahors,France,3.6,"[Black Fruit, Spices, Red Fruit, Floral, Veget...","[Beef, Lamb, Poultry]",54
3,Les Hauts de Palette,Chateau du Barail Bordeaux 2023,Bordeaux,France,3.3,"[Oak, Microbio, Citrus Fruit, Earth, Non Oak, ...","[Beef, Veal, Game (deer, venison), Poultry]",384
4,Château Lacoste Garzac,Bordeaux 2022,Bordeaux,France,3.4,"[Oak, Black Fruit, Earth, Non Oak, Red Fruit, ...","[Beef, Veal, Game (deer, venison), Poultry]",2051


In [70]:
import PyPDF2

def extract_text_from_pdf(pdf_path):
    """
    Extract text from a PDF file.
    
    Args:
        pdf_path (str): Path to the PDF file
        
    Returns:
        dict: Dictionary containing page numbers and their corresponding text
    """
    # Dictionary to store text from each page
    text_by_page = {}
    
    try:
        # Open the PDF file in binary read mode
        with open(pdf_path, 'rb') as file:
            # Create a PDF reader object
            pdf_reader = PyPDF2.PdfReader(file)
            
            # Get the number of pages
            num_pages = len(pdf_reader.pages)
            
            # Extract text from each page
            for page_num in range(num_pages):
                # Get the page object
                page = pdf_reader.pages[page_num]
                
                # Extract text from the page
                text = page.extract_text()
                
                # Store the text in our dictionary
                text_by_page[page_num + 1] = text
                
        return text_by_page
    
    except FileNotFoundError:
        print(f"Error: The file {pdf_path} was not found.")
        return None
    except PyPDF2.PdfReadError:
        print("Error: Invalid or corrupted PDF file.")
        return None
    except Exception as e:
        print(f"An unexpected error occurred: {str(e)}")
        return None



In [None]:
print(extract_text_from_pdf("menus/coucou-wine.pdf"))

{1: "SPARKLING  |  FRANCE  |  CHAMPAGNE\nNON-VINTAGE\n1148 Alexandre Bonnet / 'La Geande' 7 Cepages' / Extra Brut NV 310\n1147 Alexandre Filaine / 'Cuvée Spéciale' / Brut NV 235\n1159 Alexandre Filaine / 'DMY' / Brut NV 295\n1116 Georges Laval / 'Cumières' 1er cru / Brut Nature NV 288\n1199 Laurent Perrier / 'Grand Siècle - No. 29' / Brut NV 685\nBLANC DE BLANCS\n1100 Pascal Agrapart / 'Minèral' / Extra Brut 2018 350\n1187 Pascal Agrapart / 'Vénus' / Extra Brut 2018 650\n1124 Raphaël et Vincent Bérêche  / 'Côte' /  Grand Cru /  Extra Brut 2005 498\n1203 Billecart-Salmon / 'Cuvée Louis Salmon' / Brut 2008 585\n1146 Franck Bonville / 'Les Belles Voyes' / Brut 2016 298\n1115 Ulysse Collin / 'Les Roises' / Extra Brut 2011 1450\n1168 Ulysse Collin / 'Les Pierrières' /  Brut 2018 725\n1106 Ulysse Collin / 'Les Pierrières' /  Brut 2011 1095\n1144 Emilien Feneuil / 'Cuvée Totum' / Extra Brut chardonnay -  petit meslier 2018 495\n1178 Emilien Feneuil / 'Cuvée Mix' / Extra Brut chardonnay -  pet

In [99]:
import re
from dataclasses import dataclass
from typing import List, Optional, Dict
import json

@dataclass
class WineEntry:
    """Data class to store wine information"""
    raw_text: str
    id: Optional[str] = None
    producer: Optional[str] = None
    name: Optional[str] = None
    vintage: Optional[str] = None
    price: Optional[str] = None
    region: Optional[str] = None
    type: Optional[str] = None
    details: Optional[str] = None

class FlexibleWineParser:
    def __init__(self, config=None):
        # Default configuration that can be overridden
        self.config = {
            'id_pattern': r'^\d{3,4}',  # Matches 3-4 digit IDs at start of line
            'price_pattern': r'\d+$',    # Matches numbers at end of line
            'vintage_pattern': r'\b(19|20)\d{2}\b',  # Matches years from 1900-2099
            'separators': [' / ', ' - ', '  ', ' '],  # Possible field separators
            'region_indicators': ['|', '•', '-'],  # Possible region separator characters
            'ignore_lines': ['', ' '],  # Lines to skip
            'header_pattern': r'^[A-Z\s|•-]+$'  # Pattern for header lines
        }
        if config:
            self.config.update(config)

    def is_header(self, line: str) -> bool:
        """Check if a line is a header (region or category)"""
        return bool(re.match(self.config['header_pattern'], line))

    def extract_price(self, text: str) -> tuple[str, Optional[str]]:
        """Extract price from text and return remaining text and price"""
        if match := re.search(self.config['price_pattern'], text):
            price = match.group()
            remaining = text[:match.start()].strip()
            return remaining, price
        return text, None

    def extract_vintage(self, text: str) -> tuple[str, Optional[str]]:
        """Extract vintage from text and return remaining text and vintage"""
        if match := re.search(self.config['vintage_pattern'], text):
            vintage = match.group()
            remaining = text[:match.start()] + " " + text[match.end():]
            return remaining.strip(), vintage
        return text, None

    def extract_id(self, text: str) -> tuple[str, Optional[str]]:
        """Extract ID from text and return remaining text and ID"""
        if match := re.match(self.config['id_pattern'], text):
            id_num = match.group()
            remaining = text[match.end():].strip()
            return remaining, id_num
        return text, None

    def split_producer_name(self, text: str) -> tuple[Optional[str], Optional[str]]:
        """Split text into producer and name using various separators"""
        for sep in self.config['separators']:
            if sep in text:
                parts = [p.strip() for p in text.split(sep, 1)]
                return parts[0], parts[1]
        return text, None

    def parse_line(self, line: str, current_region: Optional[str] = None, 
                  current_type: Optional[str] = None) -> Optional[WineEntry]:
        """Parse a single line of wine list text"""
        line = line.strip()
        
        # Skip empty or ignorable lines
        if line in self.config['ignore_lines']:
            return None

        # Check if this is a header line
        if self.is_header(line):
            if any(ind in line for ind in self.config['region_indicators']):
                return WineEntry(raw_text=line, region=line)
            return WineEntry(raw_text=line, type=line)

        # Extract components
        remaining, id_num = self.extract_id(line)
        if not id_num:  # If no ID found, might be a header or category
            if self.is_header(line):
                return WineEntry(raw_text=line, type=line)
            return None

        remaining, price = self.extract_price(remaining)
        remaining, vintage = self.extract_vintage(remaining)
        producer, name = self.split_producer_name(remaining)

        return WineEntry(
            raw_text=line,
            id=id_num,
            producer=producer,
            name=name,
            vintage=vintage,
            price=price,
            region=current_region,
            type=current_type
        )

    def parse_text(self, text: str) -> List[WineEntry]:
        """Parse entire wine list text"""
        entries = []
        current_region = None
        current_type = None
        
        for line in text.split('\n'):
            entry = self.parse_line(line, current_region, current_type)
            if entry:
                if entry.region:
                    current_region = entry.region
                elif entry.type and not entry.id:
                    current_type = entry.type
                else:
                    entries.append(entry)

        return entries

    def to_json(self, entries: List[WineEntry]) -> str:
        """Convert entries to JSON string"""
        return json.dumps([vars(entry) for entry in entries], indent=2)

    def to_dict(self, entries: List[WineEntry]) -> List[Dict]:
        """Convert entries to list of dictionaries"""
        return [vars(entry) for entry in entries]

# Test parser

text = extract_text_from_pdf("menus/coucou-wine.pdf")
pages = [text[i] for i in text.keys()]

parser = FlexibleWineParser()

for page in pages:
    print("\nParsing sample:")
    entries = parser.parse_text(page)
    print(parser.to_json(entries))


Parsing sample:
[]

Parsing sample:
[]

Parsing sample:
[]

Parsing sample:
[]

Parsing sample:
[]

Parsing sample:
[]

Parsing sample:
[]

Parsing sample:
[]

Parsing sample:
[]

Parsing sample:
[]

Parsing sample:
[]

Parsing sample:
[]

Parsing sample:
[]

Parsing sample:
[]

Parsing sample:
[]

Parsing sample:
[]

Parsing sample:
[]

Parsing sample:
[]

Parsing sample:
[]

Parsing sample:
[
  {
    "raw_text": "1510 Alexandre Bonnet / 'La For\u00eat / Ros\u00e9 de Riceys 2019 170",
    "id": "1510",
    "producer": "Alexandre Bonnet",
    "name": "'La For\u00eat / Ros\u00e9 de Riceys",
    "vintage": "2019",
    "price": "170",
    "region": null,
    "type": "FRANCE",
    "details": null
  },
  {
    "raw_text": "1537 Pascal Cotat / Sancerre 2022 260",
    "id": "1537",
    "producer": "Pascal Cotat",
    "name": "Sancerre",
    "vintage": "2022",
    "price": "260",
    "region": null,
    "type": "FRANCE",
    "details": null
  },
  {
    "raw_text": "1533 Hope Well / 'Monday's

In [144]:
from dotenv import load_dotenv
import os

load_dotenv(dotenv_path='config.env')
google_key = os.getenv('GOOGLE_KEY')

if google_key is None:
    raise ValueError("GOOGLE_KEY not found in .env file")

In [145]:
import google.generativeai as genai
from typing import List, Dict
import json
import PyPDF2

class GeminiWineParser:
    def __init__(self, api_key: str):
        """Initialize the Gemini parser with API key"""
        genai.configure(api_key=api_key)
        self.model = genai.GenerativeModel('gemini-1.5-pro')
        
    def parse_wine_list(self, text: str) -> List[Dict]:
        """
        Parse wine list text using Gemini 1.5
        
        Args:
            text (str): The wine list text to parse
            
        Returns:
            List[Dict]: List of parsed wine entries
        """
        prompt = f"""Extract wine information from the text below into a structured format.
        For each wine entry, extract:
        - ID number
        - Producer
        - Wine name
        - Type (e.g., NON-VINTAGE, BLANC DE BLANCS)
        - Region
        - Vintage (if available)
        - Price
        
        Format as JSON with missing fields as null.
        
        Text to parse:
        {text}
        
        Respond with only valid JSON in this exact format:
        {{
            "wines": [
                {{
                    "id": "1234",
                    "producer": "Producer Name",
                    "name": "Wine Name",
                    "type": "Wine Type",
                    "region": "Region",
                    "vintage": "2020",
                    "price": "123"
                }}
            ]
        }}"""

        try:
            response = self.model.generate_content(
                prompt,
                generation_config={
                    'temperature': 0.0,  # Use deterministic output
                    'top_p': 1.0,
                    'top_k': 1
                }
            )
            
            # Find the JSON in the response
            response_text = response.text
            # Look for JSON between ```json and ``` if present
            if '```json' in response_text:
                json_str = response_text.split('```json')[1].split('```')[0].strip()
            else:
                json_str = response_text.strip()
                
            # Parse the JSON response
            json_response = json.loads(json_str)
            return json_response["wines"]
            
        except Exception as e:
            print(f"Error parsing wine list: {str(e)}")
            return []

    def parse_pdf_and_wine_list(self, pdf_path: str, page_number: int = 1) -> List[Dict]:
        """
        Extract text from PDF and parse wine list
        
        Args:
            pdf_path (str): Path to PDF file
            page_number (int): Page number to parse (default: 1)
            
        Returns:
            List[Dict]: List of parsed wine entries
        """
        try:
            with open(pdf_path, 'rb') as file:
                reader = PyPDF2.PdfReader(file)
                if page_number <= len(reader.pages):
                    text = reader.pages[page_number - 1].extract_text()
                    return self.parse_wine_list(text)
                else:
                    raise ValueError(f"PDF has only {len(reader.pages)} pages")
        except Exception as e:
            print(f"Error processing PDF: {str(e)}")
            return []
    
    def save_to_json(self, wines: List[Dict], output_file: str):
        """Save parsed wines to a JSON file"""
        try:
            with open(output_file, 'w') as f:
                json.dump({'wines': wines}, f, indent=2)
            print(f"Successfully saved to {output_file}")
        except Exception as e:
            print(f"Error saving to file: {str(e)}")


parser = GeminiWineParser(google_key)

# Example wine list text
sample_text = """SPARKLING  |  FRANCE  |  CHAMPAGNE
NON-VINTAGE
1148 Alexandre Bonnet / 'La Geande' 7 Cepages' / Extra Brut NV 310
1147 Alexandre Filaine / 'Cuvée Spéciale' / Brut NV 235
BLANC DE BLANCS
1100 Pascal Agrapart / 'Minèral' / Extra Brut 2018 350"""

# Parse text directly
results = parser.parse_wine_list(sample_text)

# Print results
print(json.dumps(results, indent=2))

[
  {
    "id": "1148",
    "producer": "Alexandre Bonnet",
    "name": "'La Geande' 7 Cepages'",
    "type": "NON-VINTAGE",
    "region": "CHAMPAGNE",
    "vintage": null,
    "price": "310"
  },
  {
    "id": "1147",
    "producer": "Alexandre Filaine",
    "name": "'Cuv\u00e9e Sp\u00e9ciale'",
    "type": "NON-VINTAGE",
    "region": "CHAMPAGNE",
    "vintage": null,
    "price": "235"
  },
  {
    "id": "1100",
    "producer": "Pascal Agrapart",
    "name": "'Min\u00e8ral'",
    "type": "BLANC DE BLANCS",
    "region": "CHAMPAGNE",
    "vintage": "2018",
    "price": "350"
  }
]


In [124]:
# Try with full page
for page in pages[:1]:
    print("\nParsing sample:")
    results = parser.parse_wine_list(page)
    
    # Create dataframe
    df = pd.DataFrame(results)
    print(df)


Parsing sample:
      id                    producer                              name  \
0   1148            Alexandre Bonnet            'La Geande' 7 Cepages'   
1   1147           Alexandre Filaine                  'Cuvée Spéciale'   
2   1159           Alexandre Filaine                             'DMY'   
3   1116               Georges Laval                'Cumières' 1er cru   
4   1199             Laurent Perrier           'Grand Siècle - No. 29'   
5   1100             Pascal Agrapart                         'Minèral'   
6   1187             Pascal Agrapart                           'Vénus'   
7   1124  Raphaël et Vincent Bérêche                            'Côte'   
8   1203            Billecart-Salmon              'Cuvée Louis Salmon'   
9   1146             Franck Bonville                'Les Belles Voyes'   
10  1115               Ulysse Collin                      'Les Roises'   
11  1168               Ulysse Collin                  'Les Pierrières'   
12  1106             

In [125]:
df

Unnamed: 0,id,producer,name,type,region,vintage,price
0,1148,Alexandre Bonnet,'La Geande' 7 Cepages',NON-VINTAGE,CHAMPAGNE,,310
1,1147,Alexandre Filaine,'Cuvée Spéciale',NON-VINTAGE,CHAMPAGNE,,235
2,1159,Alexandre Filaine,'DMY',NON-VINTAGE,CHAMPAGNE,,295
3,1116,Georges Laval,'Cumières' 1er cru,NON-VINTAGE,CHAMPAGNE,,288
4,1199,Laurent Perrier,'Grand Siècle - No. 29',NON-VINTAGE,CHAMPAGNE,,685
5,1100,Pascal Agrapart,'Minèral',BLANC DE BLANCS,CHAMPAGNE,2018.0,350
6,1187,Pascal Agrapart,'Vénus',BLANC DE BLANCS,CHAMPAGNE,2018.0,650
7,1124,Raphaël et Vincent Bérêche,'Côte',BLANC DE BLANCS,CHAMPAGNE,2005.0,498
8,1203,Billecart-Salmon,'Cuvée Louis Salmon',BLANC DE BLANCS,CHAMPAGNE,2008.0,585
9,1146,Franck Bonville,'Les Belles Voyes',BLANC DE BLANCS,CHAMPAGNE,2016.0,298
