# Beautiful Soup 🍲

In [None]:
from bs4 import BeautifulSoup
import pandas as pd
import requests

In [None]:
# URL of Webpage to Scrape
url = 'https://www.wineenthusiast.com/toplists/enthusiast-100-2023/' # Top 100 Wines

# Column order of the scraped table; also used so an empty scrape still yields these columns
WINE_COLUMNS = ['Rank', 'Name', 'Type', 'Location', 'Points', 'Price']


def _field_text(entry, tag, css_class):
    """Return the stripped text of the first `tag` with class `css_class` inside `entry`.

    Returns None when the card has no such element, so one incomplete card
    does not abort the whole scrape with an AttributeError.
    """
    node = entry.find(tag, class_=css_class)
    return node.text.strip() if node is not None else None


# Send a GET request; the timeout stops the cell from hanging on a stalled connection
response = requests.get(url, timeout=30)

# Fail loudly on a bad response. Continuing would either hit an undefined
# `wine_data` or silently reuse a stale one left in the kernel by an earlier run.
if response.status_code != 200:
    raise RuntimeError(f'Requests Denied: {response.status_code}')

# Parse the HTML Content
soup = BeautifulSoup(response.content, 'html.parser')

# Find all the wine cards
wine_entries = soup.find_all('div', class_='card-container')

# One record per wine card; every value is still the raw page text at this point
wine_data = [
    {
        'Rank': _field_text(entry, 'h3', 'review-ranking'),
        'Name': _field_text(entry, 'p', 'review-item'),
        'Type': _field_text(entry, 'div', 'wine-type'),
        'Location': _field_text(entry, 'p', 'review-location'),
        'Points': _field_text(entry, 'span', 'points'),
        'Price': _field_text(entry, 'span', 'price'),
    }
    for entry in wine_entries
]

# Convert into Dataframe.
# 'Rank' is still text here, so this sort is lexicographic ('#1.', '#10.', '#100.', ...).
ratings = pd.DataFrame(wine_data, columns=WINE_COLUMNS).sort_values(by='Rank', ascending=True)

# Optional Save
# ratings.to_csv("Wine Ratings.csv", index=False)
ratings

Unnamed: 0,Rank,Name,Type,Location,Points,Price
99,#1.,Duckhorn 2019 Monitor Ledge Vineyard Cabernet ...,Cabernet Sauvignon,Red Wine from\n ...,96 pts,$105
90,#10.,Cayuse 2020 En Cerise Vineyard Syrah,Syrah,Red Wine from\n ...,96 pts,$94
0,#100.,Sipwell 2021 Tiny Victories Sparkling Albariño,Albariño,Sparkling Wine from\n ...,90 pts,$6
89,#11.,Louis M. Martini 2019 Lot No. 1 Cabernet Sauvi...,Cabernet Sauvignon,Red Wine from\n ...,96 pts,$250
88,#12.,Anne-Sophie Pic & Michel Chapoutier 2021 Syrah,Syrah,Red Wine from\n ...,95 pts,
...,...,...,...,...,...,...
5,#95.,Manzanos 2021 111 Graciano,Spanish Red,Red Wine from\n ...,91 pts,$24
4,#96.,Morandé 2019 Morandé Adventure Tirazis Syrah,Syrah,Red Wine from\n ...,91 pts,$32
3,#97.,Mary Taylor 2019 Marine Descombe,Gamay,Red Wine from\n ...,91 pts,$32
2,#98.,Raats Family 2019 Dolomite Cabernet Franc,Cabernet Franc,Red Wine from\n ...,90 pts,$21


### **Initial Insights**
The scraped website is Wine Enthusiast's list of the top 100 wines.
Link : https://www.wineenthusiast.com/toplists/enthusiast-100-2023/

The data was scraped from the website successfully. Further cleaning is needed to correct the data types. For the wines with empty values under price, I can confirm that the website has no price data for them.


## Data Preprocessing

In [None]:
# Work on an independent copy so the scraped `ratings` frame stays untouched
# and the scraping cell does not have to be re-run while cleaning.
main_df = ratings.copy(deep=True)

In [None]:
# Print the column dtypes and non-null counts of the working copy
main_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 100 entries, 99 to 1
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Rank      100 non-null    object
 1   Name      100 non-null    object
 2   Type      100 non-null    object
 3   Location  100 non-null    object
 4   Points    100 non-null    object
 5   Price     100 non-null    object
dtypes: object(6)
memory usage: 5.5+ KB


In [None]:
# Display the working copy to inspect the current values
main_df

Unnamed: 0,Rank,Name,Type,Location,Points,Price
99,#1.,Duckhorn 2019 Monitor Ledge Vineyard Cabernet Sauvignon,Cabernet Sauvignon,"Red Wine from\n Napa Valley, Napa, California",96 pts,$105
90,#10.,Cayuse 2020 En Cerise Vineyard Syrah,Syrah,"Red Wine from\n Walla Walla Valley (OR), Oregon Other, Oregon",96 pts,$94
0,#100.,Sipwell 2021 Tiny Victories Sparkling Albariño,Albariño,"Sparkling Wine from\n Lodi, Central Valley, California",90 pts,$6
89,#11.,Louis M. Martini 2019 Lot No. 1 Cabernet Sauvignon,Cabernet Sauvignon,"Red Wine from\n Napa Valley, Napa, California",96 pts,$250
88,#12.,Anne-Sophie Pic & Michel Chapoutier 2021 Syrah,Syrah,"Red Wine from\n Cornas, Rhône Valley, France",95 pts,
...,...,...,...,...,...,...
5,#95.,Manzanos 2021 111 Graciano,Spanish Red,"Red Wine from\n Rioja, Northern Spain, Spain",91 pts,$24
4,#96.,Morandé 2019 Morandé Adventure Tirazis Syrah,Syrah,"Red Wine from\n Casablanca Valley, Chile",91 pts,$32
3,#97.,Mary Taylor 2019 Marine Descombe,Gamay,"Red Wine from\n Juliénas, Beaujolais, France",91 pts,$32
2,#98.,Raats Family 2019 Dolomite Cabernet Franc,Cabernet Franc,"Red Wine from\n Stellenbosch, South Africa",90 pts,$21


In [None]:
# Only the price label changes. Renaming by name keeps this cell safe to re-run
# after later cells have added columns; assigning a fixed six-item list to
# `.columns` would raise a length-mismatch error in that case.
main_df = main_df.rename(columns={'Price': 'Price($)'})

In [None]:
# Extract the rank number from labels such as '#10.' and convert it to int.
# `.astype(str)` makes the cell safe to re-run once 'Rank' is already numeric
# (the `.str` accessor raises on an integer column).
main_df['Rank'] = main_df['Rank'].astype(str).str.extract(r'(\d+)', expand=False).astype(int)

# Sort numerically by rank; rebinding instead of `inplace=True` keeps the step explicit
main_df = main_df.sort_values(by='Rank', ascending=True)

In [None]:
# Display the frame after the rank conversion and sort
main_df

Unnamed: 0,Rank,Name,Type,Location,Points,Price($)
99,1,Duckhorn 2019 Monitor Ledge Vineyard Cabernet Sauvignon,Cabernet Sauvignon,"Red Wine from\n Napa Valley, Napa, California",96 pts,$105
98,2,Ratti 2019 Serradenari Nebbiolo,Nebbiolo,"Red Wine from\n Barolo, Piedmont, Italy",97 pts,$175
97,3,Schnaitmann 2019 Lämmler GG Dry Riesling,Riesling,"White Wine from\n Württemberg, Germany",97 pts,$75
96,4,Brewer-Clifton 2021 Pinot Noir,Pinot Noir,"Red Wine from\n Sta. Rita Hills, Central Coast, California",95 pts,$45
95,5,Rippon 2019 Rippon Mature Vine Lake Wanaka Pinot Noir,Pinot Noir,"Red Wine from\n Central Otago, New Zealand",97 pts,$65
...,...,...,...,...,...,...
4,96,Morandé 2019 Morandé Adventure Tirazis Syrah,Syrah,"Red Wine from\n Casablanca Valley, Chile",91 pts,$32
3,97,Mary Taylor 2019 Marine Descombe,Gamay,"Red Wine from\n Juliénas, Beaujolais, France",91 pts,$32
2,98,Raats Family 2019 Dolomite Cabernet Franc,Cabernet Franc,"Red Wine from\n Stellenbosch, South Africa",90 pts,$21
1,99,"Pietro Beconcini 2021 Fresco di Nero, Rosé of Tempranillo Rosé",Rosé,"Rosé Wine from\n Toscana, Tuscany, Italy",90 pts,$25


In [None]:
# Keep only the digits of values such as '96 pts' and store them as integers
points_digits = main_df['Points'].str.extract(r'(\d+)', expand=False)
main_df['Points'] = points_digits.astype(int)

# Swap the literal 'N/A' placeholder for '0'.
# NOTE(review): the scraped table shows blank prices rather than 'N/A', so this
# replacement presumably matches nothing -- confirm what the page actually returns.
main_df['Price($)'] = main_df['Price($)'].replace({'N/A': '0'})

In [None]:
# List any rows whose price is still the literal 'N/A' (an empty result means none)
main_df.loc[main_df['Price($)'].eq('N/A')]

Unnamed: 0,Rank,Name,Type,Location,Points,Price($)


In [None]:
# Convert price text such as '$105' to float; values without digits become NaN.
# Thousands separators are removed first and an optional decimal part is kept,
# so '$1,200' -> 1200.0 and '$24.50' -> 24.5 instead of being cut at the first
# non-digit. `.astype(str)` keeps the cell re-runnable once the column is numeric.
price_text = main_df['Price($)'].astype(str).str.replace(',', '', regex=False)
main_df['Price($)'] = price_text.str.extract(r'(\d+(?:\.\d+)?)', expand=False).astype(float)

In [None]:
pd.set_option("display.max_colwidth", 10000) # Show long text cells in full instead of truncating them

# Collapse runs of spaces/tabs into a single space and trim the ends.
# The previous pattern used JavaScript regex-literal syntax ('/.../g'), which
# Python does not understand, so it did not normalise the text.
# The newline after 'from' is deliberately kept: the later Location extraction
# looks for 'from' followed by a newline.
main_df['Location'] = main_df['Location'].str.replace(r'[ \t]+', ' ', regex=True).str.strip()

In [None]:
# Derive the wine category (e.g. 'Red Wine') from the word preceding 'Wine' in Location
main_df['WineType'] = main_df['Location'].str.extract(r'(\w+ Wine)', expand=False)

In [None]:
# Display the frame with the numeric columns and the new WineType column
main_df

Unnamed: 0,Rank,Name,Type,Location,Points,Price($),WineType
99,1,Duckhorn 2019 Monitor Ledge Vineyard Cabernet Sauvignon,Cabernet Sauvignon,"Red Wine from\n Napa Valley, Napa, California",96,105.0,Red Wine
98,2,Ratti 2019 Serradenari Nebbiolo,Nebbiolo,"Red Wine from\n Barolo, Piedmont, Italy",97,175.0,Red Wine
97,3,Schnaitmann 2019 Lämmler GG Dry Riesling,Riesling,"White Wine from\n Württemberg, Germany",97,75.0,White Wine
96,4,Brewer-Clifton 2021 Pinot Noir,Pinot Noir,"Red Wine from\n Sta. Rita Hills, Central Coast, California",95,45.0,Red Wine
95,5,Rippon 2019 Rippon Mature Vine Lake Wanaka Pinot Noir,Pinot Noir,"Red Wine from\n Central Otago, New Zealand",97,65.0,Red Wine
...,...,...,...,...,...,...,...
4,96,Morandé 2019 Morandé Adventure Tirazis Syrah,Syrah,"Red Wine from\n Casablanca Valley, Chile",91,32.0,Red Wine
3,97,Mary Taylor 2019 Marine Descombe,Gamay,"Red Wine from\n Juliénas, Beaujolais, France",91,32.0,Red Wine
2,98,Raats Family 2019 Dolomite Cabernet Franc,Cabernet Franc,"Red Wine from\n Stellenbosch, South Africa",90,21.0,Red Wine
1,99,"Pietro Beconcini 2021 Fresco di Nero, Rosé of Tempranillo Rosé",Rosé,"Rosé Wine from\n Toscana, Tuscany, Italy",90,25.0,Rosé Wine


In [None]:
# Keep only the place name: the pattern matches 'from', a newline and the
# following spaces/tabs, then captures the rest of that line.
# Rows that do not match the pattern end up as NaN.
region_pattern = 'from\\n [ \t]*([^\n\r]*)'
main_df['Location'] = main_df['Location'].str.extract(region_pattern, expand=False)

In [None]:
# Extract the vintage year from the wine name (kept as text; NaN when absent).
# A raw string avoids the invalid '\d' escape warning, and the pattern takes the
# first standalone four-digit 19xx/20xx number rather than any first digit run,
# so other numbers in a name (e.g. '111' or 'No. 1') are not mistaken for it.
main_df['Date'] = main_df['Name'].str.extract(r'\b((?:19|20)\d{2})\b', expand=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  main_df['Date'] = main_df['Name'].str.extract('(\d+)')


In [None]:
# Reorder the columns for presentation. `.copy()` makes the result an independent
# frame, so assigning columns to it later (e.g. when re-running an earlier cell)
# does not raise pandas' SettingWithCopyWarning.
main_df = main_df[['Rank', 'Date', 'Name', 'WineType', 'Type', 'Location', 'Points', 'Price($)']].copy()

In [None]:
# Display the frame with its columns in the reordered layout
main_df

Unnamed: 0,Rank,Date,Name,WineType,Type,Location,Points,Price($)
99,1,2019,Duckhorn 2019 Monitor Ledge Vineyard Cabernet Sauvignon,Red Wine,Cabernet Sauvignon,"Napa Valley, Napa, California",96,105.0
98,2,2019,Ratti 2019 Serradenari Nebbiolo,Red Wine,Nebbiolo,"Barolo, Piedmont, Italy",97,175.0
97,3,2019,Schnaitmann 2019 Lämmler GG Dry Riesling,White Wine,Riesling,"Württemberg, Germany",97,75.0
96,4,2021,Brewer-Clifton 2021 Pinot Noir,Red Wine,Pinot Noir,"Sta. Rita Hills, Central Coast, California",95,45.0
95,5,2019,Rippon 2019 Rippon Mature Vine Lake Wanaka Pinot Noir,Red Wine,Pinot Noir,"Central Otago, New Zealand",97,65.0
...,...,...,...,...,...,...,...,...
4,96,2019,Morandé 2019 Morandé Adventure Tirazis Syrah,Red Wine,Syrah,"Casablanca Valley, Chile",91,32.0
3,97,2019,Mary Taylor 2019 Marine Descombe,Red Wine,Gamay,"Juliénas, Beaujolais, France",91,32.0
2,98,2019,Raats Family 2019 Dolomite Cabernet Franc,Red Wine,Cabernet Franc,"Stellenbosch, South Africa",90,21.0
1,99,2021,"Pietro Beconcini 2021 Fresco di Nero, Rosé of Tempranillo Rosé",Rosé Wine,Rosé,"Toscana, Tuscany, Italy",90,25.0
