In [1]:
import pandas as pd

# NLP-specific:
import requests
from bs4 import BeautifulSoup

---

## Scraping the Red Wine product entries from the URL and creating a DataFrame

- The url __[Wine.com](https://www.wine.com/list/wine/red-wine/7155-124?sortBy=mostInteresting)__ was webscrapted to gather information about a selection of popular red wine products.
- This information included:
> 1. Product ID
> 2. Wine Type
> 3. Wine Name
> 4. Wine Origin
> 5. Average Rating
> 6. Numerical Rating
> 7. Current Price
> 8. Prediscounted Price
> 9. Nominal Discount Value
> 10. Percent Savings

In [2]:
# Define the URLs
urls = [
    'https://www.wine.com/list/wine/red-wine/7155-124?sortBy=mostInteresting',
    'https://www.wine.com/list/wine/red-wine/7155-124?sortBy=mostPopular',
    'https://www.wine.com/list/wine/red-wine/7155-124?sortBy=topRated',
    'https://www.wine.com/list/wine/red-wine/7155-124?sortBy=userRatingDesc',
    'https://www.wine.com/list/wine/red-wine/7155-124?sortBy=priceLowToHigh',
    'https://www.wine.com/list/wine/red-wine/7155-124?sortBy=priceHighToLow',
    'https://www.wine.com/list/wine/red-wine/7155-124?sortBy=savings',
    'https://www.wine.com/list/wine/red-wine/7155-124?sortBy=justIn',
    'https://www.wine.com/list/wine/red-wine/7155-124?sortBy=oldToNew',
    'https://www.wine.com/list/wine/red-wine/7155-124?sortBy=newToOld',
    'https://www.wine.com/list/wine/red-wine/7155-124?sortBy=wineryAToZ',
    'https://www.wine.com/list/wine/red-wine/7155-124?sortBy=wineryZToA'
]

In [3]:
def scrape_wine_data(url):
    req = requests.get(url)
    soup = BeautifulSoup(req.content, 'html.parser')
    results = []

    # Extracting the number of elements available for each property
    num_elements = len(soup.body.find_all('meta', class_='schema_productID'))

    for idx in range(num_elements):
        new_entry = {}
        
        # Define the elements and their corresponding class names
        elements_info = [
            ('product_id', 'meta', 'schema_productID'),
            ('wine_type', 'span', 'listGridItemOrigin_varietal'),
            ('wine_name', 'span', 'listGridItemInfo_name'),
            ('wine_origin', 'span', 'listGridItemOrigin_text'),
            ('rating_avg', 'span', 'averageRating_average'),
            ('rating_num', 'span', 'averageRating_number'),
            ('price_current', 'span', 'productPrice_price-saleWhole'),
            ('price_prediscount', 'span', 'productPrice_price-regWhole'),
            ('discount_nom', 'span', 'productPrice_savings-amount'),
            ('savings_percent', 'span', 'productPrice_savings-percentage')
        ]
        
        for entry_name, tag, class_name in elements_info:
            elements = soup.body.find_all(tag, class_=class_name)
            if idx < len(elements):
                new_entry[entry_name] = elements[idx].text if tag == 'span' else elements[idx].get('content')
        
        results.append(new_entry)

    # Creating the DataFrame
    df = pd.DataFrame(results)
    return df

In [4]:
# Scrape data from each URL to create a list of individual DFs
dataframes = [scrape_wine_data(url) for url in urls]

In [5]:
dataframes[0].head()

Unnamed: 0,product_id,wine_type,wine_name,wine_origin,rating_avg,rating_num,price_current,price_prediscount,discount_nom,savings_percent
0,1405250,Malbec,Chateau Du Caillau Cahors 2021,"Cahors, Southwest, France",5.0,19,15,39,$23.01,59
1,1358914,Tempranillo,Bodegas Lan D-12 2019,"Rioja, Spain",4.8,28,10,19,$2.01,15
2,1288787,Other Red Blends,Bodegas La Purisma Old Vines Red Blend 2019,"Yecla, Spain",4.3,61,22,13,$7.01,23
3,1301819,Tempranillo,Eguren Ugarte Cosecha 2021,"Rioja, Spain",4.6,27,11,10,$6.01,33
4,1384660,Gamay,Domaine Gilles Coperet Brouilly Saburin 2021,"Beaujolais, Burgundy, France",4.4,12,29,18,$5.01,14


In [6]:
# Concatenate the DataFrames
df = pd.concat(dataframes, ignore_index=True)

In [7]:
df.tail()

Unnamed: 0,product_id,wine_type,wine_name,wine_origin,rating_avg,rating_num,price_current,price_prediscount,discount_nom,savings_percent
295,1149849,Other Red Blends,ZOE Red 2021,"Other Peloponnese, Greece",0.0,0,,13,,
296,1058449,Sangiovese,Ziobaffa Organic Toscana 2018,"Tuscany, Italy",4.0,122,,11,,
297,839115,Grenache,Zestos Old Vine Garnacha 2020,"Vinos de Madrid, Spain",3.9,7,,11,,
298,1500366,Grenache,Zestos Old Vine Garnacha 2021,"Vinos de Madrid, Spain",0.0,0,,12,,
299,1230356,Other Red Blends,Zeni Marogne Valpolicella Superiore Ripasso 2020,"Valpolicella, Veneto, Italy",4.7,42,,19,,


In [9]:
# Saving df to .csv
df.to_csv('../data/df.csv', index=False)