# Imports

In [156]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import requests
import json
import pycountry

# API Testing

In [30]:
r = requests.get(
    "https://www.vivino.com/api/explore/explore",
    params = {
        "country_code": "FR",
        "country_codes[]":"pt",
        "currency_code":"EUR",
        "grape_filter":"varietal",
        "min_rating":"1",
        "order_by":"price",
        "order":"asc",
        "page": 1,
        "price_range_max":"500",
        "price_range_min":"0",
        "wine_type_ids[]":"1"
    },
    headers= {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0"
    }
)
results = [
    (
        t["vintage"]["wine"]["winery"]["name"], 
        f'{t["vintage"]["wine"]["name"]} {t["vintage"]["year"]}',
        t["vintage"]["statistics"]["ratings_average"],
        t["vintage"]["statistics"]["ratings_count"],
    )
    for t in r.json()["explore_vintage"]["matches"]
]
dataframe = pd.DataFrame(results,columns=['Winery','Wine','Rating','num_review'])

print(dataframe)


                     Winery                                  Wine  Rating  \
0                  Graça 28                          Reserva 2022     4.1   
1                 Vila Real                    Vale do Corgo 2021     3.8   
2    Casa Ermelinda Freitas      Sandstone Castelão - Shiraz 2023     4.0   
3                   Cartuxa                         EA Tinto 2021     3.8   
4   Quinta de São Sebastião         Miradouro da Vinha Tinto 2021     3.8   
5   Quinta de São Sebastião         Miradouro da Vinha Tinto 2022     3.8   
6   Quinta de São Sebastião  Janela Branca Special Selection 2022     3.7   
7                 Vila Real                Cancellus Reserva 2017     3.8   
8       Quinta Dona Mafalda                   Little Mafalda 2022     3.8   
9         Casal das Freiras                   Colheita Tinto 2022     3.5   
10            Monte do Além                     Petit Verdot 2012     3.8   
11                Vila Real              Cancellus Signature 2019     3.9   

# Vivino API Call

In [None]:
# Define function

def get_vivino_data(
    name=None,
    country_code=None,
    currency_code=None,
    min_rating=0,
    ratings_count_min=0,
    price_range_max=100000,
    price_range_min=0,
    wine_type_ids=1,
):
    r = requests.get(
        "https://www.vivino.com/api/explore/explore",
        params={
            "search_query": name,
            "country_code": country_code,
            "currency_code": currency_code,
            "min_rating": min_rating,
            "min_ratings": ratings_count_min,
            "order_by": "price",
            "order": "asc",
            "page": 1,
            "price_range_max": price_range_max,
            "price_range_min": price_range_min,
            "wine_type_ids[]": wine_type_ids,
        },
        headers={
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0"
        },
    )


    results = [
        {
            "name": t["vintage"]["wine"]["winery"]["name"],
            "id": t["vintage"]["id"],
            "wine": f'{t["vintage"]["wine"]["name"]} {t["vintage"]["year"]}',
            "region": t["vintage"]["wine"]["region"]["name"],
            "country": t["vintage"]["wine"]["region"]["country"]["name"],
            "rating": t["vintage"]["statistics"]["ratings_average"],
            "flavors": [x["group"] for x in t["vintage"]["wine"]["taste"]["flavor"]],
            "foods": [x["name"] for x in t["vintage"]["wine"]["style"]["food"]],
            "rating_count": t["vintage"]["statistics"]["ratings_count"],
        }
        for t in r.json()["explore_vintage"]["matches"]
    ]

    # Get lists
    wineries = [x["name"] for x in results]
    links = ["https://vivino.com/wines/" + str(x["id"]) for x in results] # Will expand to a bigger link when searched
    wines = [x["wine"] for x in results]
    ratings = [x["rating"] for x in results]
    flavors = [x["flavors"] for x in results]
    foods = [x["foods"] for x in results]
    country_code = [x["country"] for x in results]
    region = [x["region"] for x in results]

    # Clean flavors by removing _ and capitalizing
    flavors = [[y.replace("_", " ").title() for y in x] for x in flavors]

    # Get number of reviews
    num_reviews = [x["rating_count"] for x in results]

    return pd.DataFrame(
        {
            "winery": wineries,
            "wine": wines,
            "link": links,
            "region": region,
            "country": country_code,
            "rating": ratings,
            "flavors": flavors,
            "foods": foods,
            "num_reviews": num_reviews,
        }
    )


# Get data
dataframe = get_vivino_data(
    name="",
    country_code="FR",
    currency_code="USD",
    min_rating=1,
    ratings_count_min=1000,
    price_range_max=500,
    price_range_min=0,
    wine_type_ids=1,
)

dataframe.head()

Unnamed: 0,winery,wine,link,region,country,rating,flavors,foods,num_reviews
0,Château de Lavagnac,Bordeaux 2015,https://vivino.com/wines/93384618,Bordeaux,France,3.6,"[Red Fruit, Black Fruit, Earth, Oak, Non Oak, ...","[Beef, Veal, Game (deer, venison), Poultry]",76
1,Château de Callac,Graves Rouge 2011,https://vivino.com/wines/2137427,Graves,France,3.6,"[Earth, Oak, Black Fruit, Non Oak, Red Fruit, ...","[Beef, Lamb, Game (deer, venison), Poultry]",221
2,Vinovalie,Les Blousons Noirs Malbec 2020,https://vivino.com/wines/169303871,Cahors,France,3.6,"[Black Fruit, Spices, Red Fruit, Floral, Veget...","[Beef, Lamb, Poultry]",54
3,Les Hauts de Palette,Chateau du Barail Bordeaux 2023,https://vivino.com/wines/176759025,Bordeaux,France,3.3,"[Oak, Microbio, Citrus Fruit, Earth, Non Oak, ...","[Beef, Veal, Game (deer, venison), Poultry]",384
4,Château Lacoste Garzac,Bordeaux 2022,https://vivino.com/wines/171614480,Bordeaux,France,3.4,"[Oak, Black Fruit, Earth, Non Oak, Red Fruit, ...","[Beef, Veal, Game (deer, venison), Poultry]",2052


In [None]:
import PyPDF2

def extract_text_from_pdf(pdf_path):
    """
    Extract text from a PDF file.
    
    Args:
        pdf_path (str): Path to the PDF file
        
    Returns:
        dict: Dictionary containing page numbers and their corresponding text
    """
    # Dictionary to store text from each page
    text_by_page = {}
    
    try:
        # Open the PDF file in binary read mode
        with open(pdf_path, 'rb') as file:
            # Create a PDF reader object
            pdf_reader = PyPDF2.PdfReader(file)
            
            # Get the number of pages
            num_pages = len(pdf_reader.pages)
            
            # Extract text from each page
            for page_num in range(num_pages):
                # Get the page object
                page = pdf_reader.pages[page_num]
                
                # Extract text from the page
                text = page.extract_text()
                
                # Store the text in our dictionary
                text_by_page[page_num + 1] = text
                
        return text_by_page
    
    # Error Messaging
    except FileNotFoundError:
        print(f"Error: The file {pdf_path} was not found.")
        return None
    except PyPDF2.PdfReadError:
        print("Error: Invalid or corrupted PDF file.")
        return None
    except Exception as e:
        print(f"An unexpected error occurred: {str(e)}")
        return None



In [None]:
print(extract_text_from_pdf("menus/coucou-wine.pdf"))

{1: "SPARKLING  |  FRANCE  |  CHAMPAGNE\nNON-VINTAGE\n1148 Alexandre Bonnet / 'La Geande' 7 Cepages' / Extra Brut NV 310\n1147 Alexandre Filaine / 'Cuvée Spéciale' / Brut NV 235\n1159 Alexandre Filaine / 'DMY' / Brut NV 295\n1116 Georges Laval / 'Cumières' 1er cru / Brut Nature NV 288\n1199 Laurent Perrier / 'Grand Siècle - No. 29' / Brut NV 685\nBLANC DE BLANCS\n1100 Pascal Agrapart / 'Minèral' / Extra Brut 2018 350\n1187 Pascal Agrapart / 'Vénus' / Extra Brut 2018 650\n1124 Raphaël et Vincent Bérêche  / 'Côte' /  Grand Cru /  Extra Brut 2005 498\n1203 Billecart-Salmon / 'Cuvée Louis Salmon' / Brut 2008 585\n1146 Franck Bonville / 'Les Belles Voyes' / Brut 2016 298\n1115 Ulysse Collin / 'Les Roises' / Extra Brut 2011 1450\n1168 Ulysse Collin / 'Les Pierrières' /  Brut 2018 725\n1106 Ulysse Collin / 'Les Pierrières' /  Brut 2011 1095\n1144 Emilien Feneuil / 'Cuvée Totum' / Extra Brut chardonnay -  petit meslier 2018 495\n1178 Emilien Feneuil / 'Cuvée Mix' / Extra Brut chardonnay -  pet

# Create PDF Parser

In [144]:
from dotenv import load_dotenv
import os

load_dotenv(dotenv_path='config.env')
google_key = os.getenv('GOOGLE_KEY')

if google_key is None:
    raise ValueError("GOOGLE_KEY not found in .env file")

In [153]:
import google.generativeai as genai
from typing import List, Dict
import json
import PyPDF2

# Made with Claude 3.5

class GeminiWineParser:
    def __init__(self, api_key: str):
        """Initialize the Gemini parser with API key"""
        genai.configure(api_key=api_key)
        self.model = genai.GenerativeModel('gemini-1.5-pro')
        
    def parse_wine_list(self, text: str) -> List[Dict]:
        """
        Parse wine list text using Gemini 1.5
        
        Args:
            text (str): The wine list text to parse
            
        Returns:
            List[Dict]: List of parsed wine entries
        """
        prompt = f"""Extract wine information from the text below into a structured format.
        For each wine entry, extract:
        - ID number
        - Producer
        - Wine name
        - Type (e.g., NON-VINTAGE, BLANC DE BLANCS)
        - Region
        - Vintage (if available)
        - Price
        
        Format as JSON with missing fields as null.
        
        Text to parse:
        {text}
        
        Respond with only valid JSON in this exact format:
        {{
            "wines": [
                {{
                    "id": "1234",
                    "producer": "Producer Name",
                    "name": "Wine Name",
                    "type": "Wine Type",
                    "region": "Region",
                    "country": "Country",
                    "country_code": "Country Code",
                    "vintage": "2020",
                    "price": "123"
                }}
            ]
        }}"""

        try:
            response = self.model.generate_content(
                prompt,
                generation_config={
                    'temperature': 0.0,  # Use deterministic output
                    'top_p': 1.0,
                    'top_k': 1
                }
            )
            
            # Find the JSON in the response
            response_text = response.text
            # Look for JSON between ```json and ``` if present
            if '```json' in response_text:
                json_str = response_text.split('```json')[1].split('```')[0].strip()
            else:
                json_str = response_text.strip()
                
            # Parse the JSON response
            json_response = json.loads(json_str)
            return json_response["wines"]
            
        except Exception as e:
            print(f"Error parsing wine list: {str(e)}")
            return []

    def parse_pdf_and_wine_list(self, pdf_path: str, page_number: int = 1) -> List[Dict]:
        """
        Extract text from PDF and parse wine list
        
        Args:
            pdf_path (str): Path to PDF file
            page_number (int): Page number to parse (default: 1)
            
        Returns:
            List[Dict]: List of parsed wine entries
        """
        try:
            with open(pdf_path, 'rb') as file:
                reader = PyPDF2.PdfReader(file)
                if page_number <= len(reader.pages):
                    text = reader.pages[page_number - 1].extract_text()
                    return self.parse_wine_list(text)
                else:
                    raise ValueError(f"PDF has only {len(reader.pages)} pages")
        except Exception as e:
            print(f"Error processing PDF: {str(e)}")
            return []
    
    def save_to_json(self, wines: List[Dict], output_file: str):
        """Save parsed wines to a JSON file"""
        try:
            with open(output_file, 'w') as f:
                json.dump({'wines': wines}, f, indent=2)
            print(f"Successfully saved to {output_file}")
        except Exception as e:
            print(f"Error saving to file: {str(e)}")

# Add API Key
parser = GeminiWineParser(google_key)

# Example wine list text
sample_text = """SPARKLING  |  FRANCE  |  CHAMPAGNE
NON-VINTAGE
1148 Alexandre Bonnet / 'La Geande' 7 Cepages' / Extra Brut NV 310
1147 Alexandre Filaine / 'Cuvée Spéciale' / Brut NV 235
BLANC DE BLANCS
1100 Pascal Agrapart / 'Minèral' / Extra Brut 2018 350"""

# Parse text directly
results = parser.parse_wine_list(sample_text)

# Print results
print(json.dumps(results, indent=2))

[
  {
    "id": "1148",
    "producer": "Alexandre Bonnet",
    "name": "'La Geande' 7 Cepages'",
    "type": "NON-VINTAGE",
    "region": "Champagne",
    "country": "France",
    "country_code": "FR",
    "vintage": null,
    "price": "310"
  },
  {
    "id": "1147",
    "producer": "Alexandre Filaine",
    "name": "'Cuv\u00e9e Sp\u00e9ciale'",
    "type": "NON-VINTAGE",
    "region": "Champagne",
    "country": "France",
    "country_code": "FR",
    "vintage": null,
    "price": "235"
  },
  {
    "id": "1100",
    "producer": "Pascal Agrapart",
    "name": "'Min\u00e8ral'",
    "type": "BLANC DE BLANCS",
    "region": "Champagne",
    "country": "France",
    "country_code": "FR",
    "vintage": "2018",
    "price": "350"
  }
]


In [150]:
text = extract_text_from_pdf("menus/coucou-wine.pdf")
pages = [text[i] for i in text.keys()]

In [161]:
parser = GeminiWineParser(google_key)

# Try with full page
for page in pages[:2]:
    print("\nParsing sample:")
    results = parser.parse_wine_list(page)
    
# Create dataframe
df = pd.DataFrame(results)
df.head(10)


Parsing sample:
Error parsing wine list: Expecting value: line 147 column 9 (char 3580)

Parsing sample:


Unnamed: 0,id,producer,name,type,region,country,country_code,vintage,price
0,1114,Raphaël et Vincent Bérêche,'Rive Gauche',BLANC DE NOIRS Extra Brut,Champagne,France,,2020.0,265
1,1153,Clandestin Boreal,' Vieilles Vignes',BLANC DE NOIRS Brut Nature,Champagne,France,,,265
2,1145,Cedric Bouchard 'Roses de Jeanne','Les Ursules',BLANC DE NOIRS Brut Nature,Champagne,France,,2018.0,795
3,1198,Billecart-Salmon,'Le Clos Saint-Hilaire',BLANC DE NOIRS Brut,Champagne,France,,2006.0,1100
4,1197,Billecart-Salmon,'Le Clos Saint-Hilaire',BLANC DE NOIRS Brut,Champagne,France,,2005.0,1150
5,1162,Egly-Ouriet,'Les Crayères Vieilles Vignes',BLANC DE NOIRS Grand Cru Brut,Champagne,France,,,675
6,1176,Ulysse Collin,'Les Maillons',BLANC DE NOIRS Extra Brut,Champagne,France,,2019.0,625
7,1136,Ulysse Collin,'Les Maillons',BLANC DE NOIRS Extra Brut,Champagne,France,,2012.0,1050
8,1143,Ulysse Collin,'Les Maillons',BLANC DE NOIRS Extra Brut,Champagne,France,,2010.0,1100
9,1188,Marc Hébrart,'Noces de Craie',BLANC DE NOIRS Grand Cru Extra Brut,Champagne,France,,2018.0,310


In [168]:
def get_vivino_table(wine_df):
    # Run get_vivino_data for each wine in the dataframe
    vivino_data = []

    countries = wine_df["country"].unique()
    print(f"Getting data for {countries} countries")
    country_mapping = {country.name : country.alpha_2 for country in pycountry.countries if country.name in countries}

    wine_df["country_code"] = wine_df["country"].map(country_mapping)

    return wine_df

# Call the function
get_vivino_table(df)


Getting data for ['France'] countries


Unnamed: 0,id,producer,name,type,region,country,country_code,vintage,price
0,1114,Raphaël et Vincent Bérêche,'Rive Gauche',BLANC DE NOIRS Extra Brut,Champagne,France,FR,2020.0,265
1,1153,Clandestin Boreal,' Vieilles Vignes',BLANC DE NOIRS Brut Nature,Champagne,France,FR,,265
2,1145,Cedric Bouchard 'Roses de Jeanne','Les Ursules',BLANC DE NOIRS Brut Nature,Champagne,France,FR,2018.0,795
3,1198,Billecart-Salmon,'Le Clos Saint-Hilaire',BLANC DE NOIRS Brut,Champagne,France,FR,2006.0,1100
4,1197,Billecart-Salmon,'Le Clos Saint-Hilaire',BLANC DE NOIRS Brut,Champagne,France,FR,2005.0,1150
5,1162,Egly-Ouriet,'Les Crayères Vieilles Vignes',BLANC DE NOIRS Grand Cru Brut,Champagne,France,FR,,675
6,1176,Ulysse Collin,'Les Maillons',BLANC DE NOIRS Extra Brut,Champagne,France,FR,2019.0,625
7,1136,Ulysse Collin,'Les Maillons',BLANC DE NOIRS Extra Brut,Champagne,France,FR,2012.0,1050
8,1143,Ulysse Collin,'Les Maillons',BLANC DE NOIRS Extra Brut,Champagne,France,FR,2010.0,1100
9,1188,Marc Hébrart,'Noces de Craie',BLANC DE NOIRS Grand Cru Extra Brut,Champagne,France,FR,2018.0,310


In [192]:
import requests

url = "https://www.vivino.com/api/explore/explore"

response = requests.get(url, headers={
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0"
})

print(response.json())  # Print available fields


{'error': 'At least one filter should be set'}


In [195]:
import requests
from bs4 import BeautifulSoup


base_url = "https://www.vivino.com/search/wines"

# Send request to Vivino search page
params = {"q": "Del la Mar el Mero Albariño 2022"}
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0"
}

response = requests.get(base_url, params=params, headers=headers)

# Check if request was successful
if response.status_code != 200:
    print("Failed to fetch data")
    # return None

soup = BeautifulSoup(response.text, "html.parser")

# Find first wine result
first_result = soup.select_one(".card.card-lg")
if not first_result:
    print("No results found.")
    # return None

# Extract wine details
try:
    wine_name = first_result.select_one(".wine-card__name").text.strip()
    link = "https://www.vivino.com" + first_result.select_one("a")["href"]
    country = first_result.select_one(".wine-card__region [data-item-type='country']").text.strip()
    region = first_result.select_one(".wine-card__region .link-color-alt-grey").text.strip()
    rating = first_result.select_one(".average__number").text.strip() if first_result.select_one(".average__number") else "N/A"
    price = first_result.select_one(".wine-price-value").text.strip() if first_result.select_one(".wine-price-value") else "N/A"

except AttributeError:
    print("Error extracting data")
    # return None

# Return wine data
data = {
    "name": wine_name,
    "link": link,
    "country": country,
    "region": region,
    "rating": rating,
    "price": price,
}

data


{'name': 'Attis De la Mar el Mero Albariño 2022',
 'link': 'https://www.vivino.com/US/en/wines/173352511',
 'country': 'Spain',
 'region': 'Rías Baixas',
 'rating': '3.9',
 'price': '—'}

In [200]:
data["link"]

'https://www.vivino.com/US/en/wines/173352511'

In [197]:
link_respone = requests.get(data['link'], headers=headers)

In [199]:
link_soup = BeautifulSoup(link_respone.text, "html.parser")

link_soup

<!DOCTYPE html>

<html lang="en">
<head>
<meta charset="utf-8"/>
<meta content="#a61a30" name="theme-color"/>
<title>2022 Attis De la Mar el Mero Albariño | Vivino US
</title>
<meta content="De la Mar el Mero Albariño is a White wine. See reviews and pricing for the 2022 vintage." name="description"/>
<script>
  window.gdprAppliesGlobally=false;(function(){function n(e){if(!window.frames[e]){if(document.body&&document.body.firstChild){var t=document.body;var r=document.createElement("iframe");r.style.display="none";r.name=e;r.title=e;t.insertBefore(r,t.firstChild)}else{setTimeout(function(){n(e)},5)}}}function e(r,a,o,c,s){function e(e,t,r,n){if(typeof r!=="function"){return}if(!window[a]){window[a]=[]}var i=false;if(s){i=s(e,n,r)}if(!i){window[a].push({command:e,version:t,callback:r,parameter:n})}}e.stub=true;e.stubVersion=2;function t(n){if(!window[r]||window[r].stub!==true){return}if(!n.data){return}var i=typeof n.data==="string";var e;try{e=i?JSON.parse(n.data):n.data}catch(t){retu

In [224]:
food_container = link_soup.select_one(".foodPairing__foodContainer--1bvxM")

# Extract food pairing names
food_pairings = [str(a).split('aria-label="')[1].split('"')[0] for a in food_container.find_all("a")]

# Print the extracted food pairings
print(food_pairings)

['Shellfish', 'Vegetarian', 'Pasta', 'Appetizers and snacks', 'Lean fish']


In [221]:
str(food_container.find_all("a")[0]).split('aria-label="')[1].split('"')[0]

'Shellfish'