# **Wine Cellars Web Scraping Program**

---

In [2]:
# Importing the necessary libraries:
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
import pandas as pd
import time
from selenium.webdriver.support.wait import WebDriverWait
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import ElementClickInterceptedException

In [None]:
!pip install webdriver_manager

**Scraping the wine scoring scale data from the "https://www.wine-searcher.com/critics-8-cellartracker?page=1" website**

In [26]:
# Scraping the scoring scale data:
webpage_scoring_scale = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()))
webpage_scoring_scale.get("https://www.wine-searcher.com/critics-8-cellartracker?page=1")
content_page = webpage_scoring_scale.page_source
result_scoring_scale = BeautifulSoup(content_page, 'html.parser')
prd_page1 = result_scoring_scale.find_all('div', {'class':'heading-sm'})

scoring_scale = []
for score_scale_item in prd_page1:
    score_scale_item.ul
    for item in score_scale_item.ul:
        scoring_scale.append(item.string)

for i in scoring_scale:
    if i == '\n':
        scoring_scale.remove(i)

scoring_scale

['98–100 – Extraordinary',
 '94–97 – Outstanding',
 '90–93 – Very good',
 '86–89 – Good',
 '80–85 – Average',
 '70–79 – Below average',
 '50–69 – Avoid']

In [27]:
df_scoring_scale = pd.DataFrame(scoring_scale, columns=['Scoring_Scale'])
df_scoring_scale

Unnamed: 0,Scoring_Scale
0,98–100 – Extraordinary
1,94–97 – Outstanding
2,90–93 – Very good
3,86–89 – Good
4,80–85 – Average
5,70–79 – Below average
6,50–69 – Avoid


In [28]:
# Splitting the df_scoring_scale dataframe into 2 columns, 'Scoring' and 'Scale'

scoring = []
scale = []

for i in list(df_scoring_scale['Scoring_Scale']):
    rep = i.replace(i[6:], '')
    stp = rep.rstrip()
    scoring.append(stp)

for j in list(df_scoring_scale['Scoring_Scale']):
    rep = j.replace(j[:8], '')
    stp = rep.lstrip()
    scale.append(stp)

In [29]:
scoring

['98–100', '94–97', '90–93', '86–89', '80–85', '70–79', '50–69']

In [30]:
scale

['Extraordinary',
 'Outstanding',
 'Very good',
 'Good',
 'Average',
 'Below average',
 'Avoid']

In [35]:
df_scoring_scale_updated = pd.DataFrame(zip(scoring, scale), columns=['Scoring', 'Scale'])
df_scoring_scale_updated

Unnamed: 0,Scoring,Scale
0,98–100,Extraordinary
1,94–97,Outstanding
2,90–93,Very good
3,86–89,Good
4,80–85,Average
5,70–79,Below average
6,50–69,Avoid


In [36]:
df_scoring_scale_updated.to_csv('C:\\Users\\cesar\\OneDrive\\Documents\\Cesar documents\\Data Science Projects\\Wine Ratings Analysis\\Dataset\\scoring-scale.csv', index=False)

**Scraping the data needed from the "https://www.winemag.com/ratings/" website**

In [3]:
# Creating and installing the webdriver using Selenium:
webpage = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()))

# Loading the initial webpage:
webpage.get("https://www.winemag.com/ratings/?s=&drink_type=wine&sort_by=pub_date_web&sort_dir=desc")

# Creating a counter for each page:
counter = 1

# Creating the lists that will hold the data scraped:
product_name = []
appellation = []
rating = []
price = []

# Scraping the first page:
content_first_page = webpage.page_source
result = BeautifulSoup(content_first_page, 'html.parser')
product_content = result.find_all('li', {'class': 'review-item'})

for item in product_content:
    product_name.append(item.h3.string)
    appellation.append(item.find('span', {'class': 'appellation'}))
    rating.append(item.find('span', {'class': 'rating'}))
    price.append(item.find('span', {'class': 'price'}))
      
# Loading the second page and waiting for 30 seconds until the scraping of the second page is done as per below:
WebDriverWait(webpage, 30).until(EC.element_to_be_clickable((By.XPATH, "/html/body/div[1]/div/section/section/div[1]/div[1]/div[7]/section/div[3]/div/div/span[2]"))).click()
counter += 1

# Looping each page and scraping it:
while webpage.find_element(By.XPATH, '/html/body/div[1]/div/section/section/div[1]/div[1]/div[7]/section/div[3]/div/div/span[2]') and counter <= 1850:
    try:
        content = webpage.page_source
        result = BeautifulSoup(content, 'html.parser')
        product_content = result.find_all('li', {'class': 'review-item'})
        
        for item in product_content:
            product_name.append(item.h3.string)
            appellation.append(item.find('span', {'class': 'appellation'}))
            rating.append(item.find('span', {'class': 'rating'}))
            price.append(item.find('span', {'class': 'price'}))
    
        # Since the page is dynamic and has notifications poping-up, it is imperative to find the button and execute it; in this case a click of this button:
        next_button = webpage.find_element(By.XPATH, "/html/body/div[1]/div/section/section/div[1]/div[1]/div[7]/section/div[3]/div/div/span[2]")
        webpage.execute_script("arguments[0].click();", next_button)
        counter += 1
        time.sleep(5)
    except:
        print('Unexpected error occured on this page! Loading the next page...')
        next_button = webpage.find_element(By.XPATH, "/html/body/div[1]/div/section/section/div[1]/div[1]/div[7]/section/div[3]/div/div/span[2]")
        webpage.execute_script("arguments[0].click();", next_button)
        counter += 1

Unexpected error occured on this page! Loading the next page...
Unexpected error occured on this page! Loading the next page...
Unexpected error occured on this page! Loading the next page...
Unexpected error occured on this page! Loading the next page...
Unexpected error occured on this page! Loading the next page...
Unexpected error occured on this page! Loading the next page...


In [4]:
product_name

['Charles Heidsieck NV Champagne Charlie Brut  (Champagne)',
 "Stag's Leap Wine Cellars 2019 Fay Estate Grown Cabernet Sauvignon (Stags Leap District)",
 'Marqués de Riscal 2016 150 Aniversario Gran Reserva  (Rioja)',
 'Salon 2012 Le Mesnil Blanc de Blancs Brut Chardonnay (Champagne)',
 'Philipponnat 2013 Clos des Goisses Extra Brut  (Champagne)',
 "Stag's Leap Wine Cellars 2019 S.L.V. Estate Grown Cabernet Sauvignon (Stags Leap District)",
 'Patrimony 2019 Cabernet Franc (Adelaida District)',
 'Cliff Lede 2019 Poetry Red (Stags Leap District)',
 'Corison 2019 Helios Sunbasket Vineyard Cabernet Franc (St. Helena)',
 'Venge 2019 DLCV Cabernet Sauvignon (Oakville)',
 'Pol Roger 2013 Cuvée Sir Winston Churchill Brut  (Champagne)',
 'Cliff Lede 2019 Rock Block Series Roundabout Midnight Cabernet Sauvignon (Stags Leap District)',
 'Cliff Lede 2019 Beckstoffer To Kalon Cabernet Sauvignon (Napa Valley)',
 'Cliff Lede 2019 Songbook Cabernet Sauvignon (Napa Valley)',
 'Patrimony 2019 Cabernet S

In [6]:
appellation_list = []
for i in appellation:
    appellation_list.append(i.string)
    print(i.string)

Champagne
Napa
Northern Spain
Champagne
Champagne
Napa
Central Coast
Napa
Napa
Napa
Champagne
Napa
Napa
Napa
Central Coast
Veneto
Napa
Northern Spain
Champagne
Central Coast
Champagne
Napa
Northern Spain
Champagne
Champagne
Napa
Central Coast
Napa
Napa
Napa
Champagne
Napa
Napa
Napa
Central Coast
Veneto
Napa
Northern Spain
Champagne
Central Coast
Central Coast
Napa
Napa
Napa
Champagne
Napa
Napa
Napa
Central Coast
Veneto
Napa
Northern Spain
Champagne
Central Coast
Napa
Central Coast
Napa
Northern Spain
Napa
Central Coast
Central Coast
Northern Spain
Central Coast
Central Coast
Napa
Napa
Sonoma
Sonoma
Northern Spain
Northern Spain
Northern Spain
Central Coast
Central Italy
Central Coast
Napa
Central Coast
Napa
Sonoma
Napa
Napa
Sonoma
Sonoma
Napa
Central Coast
Champagne
Champagne
Napa
Napa
Veneto
Sonoma
Central Coast
Sonoma
Sonoma
Northern Spain
Northern Spain
Sierra Foothills
Central Coast
Napa
Napa
Champagne
Napa-Sonoma
Napa
Sonoma
Veneto
US
Central Coast
Sonoma
Sonoma
Napa
Central Coast

In [9]:
rating_list = []
for i in rating:
    rating_list.append(i.strong.string)
    print(i.strong.string)

100
100
99
99
99
99
98
98
98
98
98
98
98
98
97
97
97
97
97
97
100
100
99
99
99
99
98
98
98
98
98
98
98
98
97
97
97
97
97
97
98
98
98
98
98
98
98
98
97
97
97
97
97
97
97
97
97
97
96
96
96
96
96
96
96
96
96
96
96
96
96
96
96
96
96
96
96
96
96
96
96
95
95
95
95
95
95
95
95
95
95
95
95
95
95
95
95
95
95
95
95
95
95
95
95
95
95
95
95
95
95
95
95
95
95
95
95
95
95
95
95
95
95
95
95
95
95
95
95
95
95
95
95
95
95
95
95
95
95
95
95
95
95
95
95
95
95
95
95
95
95
95
95
95
95
95
95
95
95
95
95
95
95
94
94
94
94
94
94
94
94
94
94
94
94
94
94
94
94
94
94
94
94
94
94
94
94
94
94
94
94
94
94
94
94
94
94
94
94
94
94
94
94
94
94
94
94
94
94
94
94
94
94
94
94
94
94
94
94
94
94
94
94
94
94
94
94
94
94
94
94
94
94
94
94
94
94
94
94
94
94
94
94
94
94
94
94
94
94
94
94
94
94
94
94
94
94
94
94
94
94
94
94
94
94
94
94
94
94
94
94
94
94
94
94
94
94
94
94
94
94
93
93
93
93
93
93
93
93
93
93
93
93
93
93
93
93
93
93
93
93
93
93
93
93
93
93
93
93
93
93
93
93
93
93
93
93
93
93
93
93
93
93
93
93
93
93
93
93
93
93
93


In [11]:
price_list = []
for i in price:
    price_list.append(i.string)
    print(i.string)

$700
$155
$60
$992
$320
$195
$275
$325
$110
$545
N/A
$110
$225
$225
$275
$190
$175
$620
$290
$42
$700
$155
$60
$992
$320
$195
$275
$325
$110
$545
N/A
$110
$225
$225
$275
$190
$175
$620
$290
$42
$275
$325
$110
$545
N/A
$110
$225
$225
$275
$190
$175
$620
$290
$42
$245
$132
$250
$150
$125
$74
$65
$105
$98
$94
$165
$75
$325
$275
$110
$60
$55
$60
$28
$125
$150
$150
$85
$58
$150
$250
$75
$70
$140
$98
$170
$215
$125
$115
$60
$75
$132
$70
$75
$60
N/A
$100
$37
$85
$60
$195
$80
$145
$48
$190
$150
$52
$66
$75
$65
$70
$90
$18
$825
$320
N/A
N/A
$110
$40
N/A
$65
$150
$115
$55
$56
$125
$82
$275
$65
$150
$65
$50
N/A
$85
N/A
$175
$50
N/A
$65
$85
$85
$125
$99
$42
N/A
$40
$110
$65
$60
$250
$98
$52
$25
$282
$225
$75
$80
$95
$125
$350
$68
$68
$125
$80
N/A
$150
$110
$48
N/A
$159
$115
$245
$135
$65
$125
$98
$140
$135
$48
$70
$150
$88
$75
$36
$45
$40
$55
$55
$50
$88
$499
$110
$98
$60
$85
$98
$79
$79
$102
$110
$62
$85
$90
$74
$54
$56
$80
$97
$28
$60
$90
$120
$50
$165
$95
$65
$75
$22
$90
$90
$110
$60
$330
$200


In [14]:
df = pd.DataFrame(zip(product_name, appellation_list, rating_list, price_list), columns=['Product_Name', 'Appellation', 'Rating', 'Price' ])
df

Unnamed: 0,Product_Name,Appellation,Rating,Price
0,Charles Heidsieck NV Champagne Charlie Brut (...,Champagne,100,$700
1,Stag's Leap Wine Cellars 2019 Fay Estate Grown...,Napa,100,$155
2,Marqués de Riscal 2016 150 Aniversario Gran Re...,Northern Spain,99,$60
3,Salon 2012 Le Mesnil Blanc de Blancs Brut Char...,Champagne,99,$992
4,Philipponnat 2013 Clos des Goisses Extra Brut ...,Champagne,99,$320
...,...,...,...,...
11099,Bodegas Franco-Españolas 2016 Royal Reserva Te...,Northern Spain,92,$45
11100,Cedarville 2019 Estate Grenache (El Dorado),Sierra Foothills,92,$36
11101,Doña Paula 2018 Alluvia Parcel Bush Vines Malb...,Mendoza Province,92,$100
11102,El Enemigo 2017 Gran Enemigo El Cepillo Single...,Mendoza Province,92,$100


In [17]:
df.to_csv("C:\\Users\\cesar\\OneDrive\\Documents\\Cesar documents\\Data Science Projects\\Wine Ratings Analysis\\Dataset\\wine-data.csv", index=False)

In [16]:
# Getting rid of the redundant records using 'groupby':

df2 = df.groupby(['Product_Name', 'Appellation', 'Rating', 'Price']).count().reset_index()
df2

Unnamed: 0,Product_Name,Appellation,Rating,Price
0,1+1=3 NV Cygnus Sador Brut Nature Reserva Spar...,Catalonia,89,$22
1,"10,000 Hours 2019 Red (Red Mountain)",Columbia Valley,88,$35
2,"10,000 Hours 2019 Syrah (Red Mountain)",Columbia Valley,90,$35
3,1000 Stories 2020 Bourbon Barrel Aged Batch No...,California Other,88,$19
4,12 Linajes 2017 Reserva Tempranillo (Ribera d...,Northern Spain,94,$65
...,...,...,...,...
10206,Zuccardi 2020 Poligonos Malbec (Paraje Altamira),Mendoza Province,90,$30
10207,Zuccardi 2020 Poligonos San Pablo Cabernet Fra...,Mendoza Province,91,$30
10208,Zuccardi 2020 Q Malbec (Uco Valley),Mendoza Province,90,$20
10209,Ökonomierat Rebholz 2015 Hansjörg Rebholz Sieb...,Germany,94,$155


In [48]:
# Getting rid of the currency in the 'Price' column and changing its name to 'Price_usd'

Price_usd = []
for i in df2['Price']:
    Price_usd.append(i.strip('$'))

df3 = pd.DataFrame(Price_usd, columns=['Price_usd'])

df4 = pd.DataFrame(zip(product_name, appellation_list, rating_list, Price_usd), columns=['Product_Name', 'Appellation', 'Rating', 'Price_usd'])
df4

Unnamed: 0,Product_Name,Appellation,Rating,Price_usd
0,Charles Heidsieck NV Champagne Charlie Brut (...,Champagne,100,20
1,Stag's Leap Wine Cellars 2019 Fay Estate Grown...,Napa,100,22
2,Salon 2012 Le Mesnil Blanc de Blancs Brut Char...,Champagne,99,35
3,Philipponnat 2013 Clos des Goisses Extra Brut ...,Champagne,99,35
4,Stag's Leap Wine Cellars 2019 S.L.V. Estate Gr...,Napa,99,19
...,...,...,...,...
8522,Bruliam 2019 Sangacimo Roberts Road Pinot Noir...,Sonoma,90,130
8523,Brutocao 2015 Bliss Vineyard Zinfandel (Mendoc...,Mendocino County,90,30
8524,Brutocao 2018 Hopland Ranches Estate Cabernet ...,Mendocino County,90,30
8525,Kaiken 2018 Obertura Cabernet Franc (Mendoza),Mendoza Province,90,30
