# Imports


In [265]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import re
from datetime import datetime
import math
import time

# Load in Existing Dataset:

In [283]:
# Load the lots saved by an earlier run and index the frame by the file's
# first column.
scraped_data = pd.read_csv("scraped_data.csv")
scraped_data = scraped_data.set_index(scraped_data.columns[0])

# OG Scraping

In [287]:
# Accumulator shared by the scraping functions: lot number -> scraped fields.
lots_dict = {}

# Labels searched for on each lot page; a matching label becomes the key under
# which that field's value is stored.
categories = [
    "Distillery",
    "Age",
    "Vintage",
    "Region",
    "Bottler",
    "Cask Type",
    "Bottled Strength",
    "Bottle Size",
    "Distillery Status",
]

# Request headers sent with every HTTP call (desktop Chrome user agent).
headers = {
    "user-agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/102.0.0.0 Safari/537.36"
    ),
}

# strptime pattern used to parse a lot's end date.
# NOTE(review): this is month.day.year -- confirm against the site's format.
date_format = "%m.%d.%y"

# Divisor used to turn a lot count into a results-page number.
per_page = 40

In [288]:
def og_scraper():
    '''Scrape finished lots from Whisky Auctioneer into ``lots_dict``.

    The first number shown in the ``<p class="left">`` element of the
    current-auction page (presumably the live lot count -- confirm) is divided
    by ``per_page`` to pick the first search-results page. Pages are then
    walked forward one at a time until a page contains no lot rows.

    Lots whose number is already in ``scraped_data`` and lots still open for
    bidding ("Bid Now") are skipped. Every other lot is parsed from its
    listing row and its own page, then stored in the module-level
    ``lots_dict`` keyed by lot number.

    Returns:
        dict: ``lots_dict``, mapping lot number to a dict with ``name``,
        ``price``, ``reserve_met``, ``end_date``, whichever ``categories``
        labels the lot page lists, and ``details``.
    '''
    url_stem = "https://whiskyauctioneer.com/auction-search"
    live_url = "https://whiskyauctioneer.com/current-auction"
    # Without a timeout a stalled connection would hang the scrape forever.
    request_timeout = 30

    # Membership is tested once per lot, so build the lookup a single time.
    known_lots = set(scraped_data.index)

    page = requests.get(live_url, headers=headers, timeout=request_timeout)
    soup = BeautifulSoup(page.content, "html.parser")
    live_count = int(re.findall(r'\d+', soup.find("p", class_="left").text)[0])
    # Clamp at 0 so a small count cannot produce a negative page number.
    i = max(math.floor(live_count / per_page) - 1, 0)

    while True:
        print(i)  # progress indicator: page currently being scraped

        url = url_stem if i == 0 else url_stem + f'?page={i}'
        page = requests.get(url, headers=headers, timeout=request_timeout)
        soup = BeautifulSoup(page.content, "html.parser")

        result = soup.find("div", class_="views-row views-row-1 views-row-odd views-row-first producthomepage")
        if result is None:
            # No lot rows on this page: we have run past the last page.
            break

        while result:
            # Advance first so that every `continue` below moves on to the
            # next row instead of re-processing the same one forever.
            row = result
            result = row.find_next_sibling("div")

            lot_label = row.find("span", class_="lotnumber label-lot")
            digits = re.findall(r'\d+', lot_label.text) if lot_label else []
            if not digits:
                continue  # sibling <div> that is not a lot row
            lot_num = int(digits[0])

            if lot_num in known_lots:
                continue  # already saved by an earlier run

            if "Bid Now" in row.text:
                continue  # lot belongs to an auction that is still live

            outcome = row.find("div", class_="lotwin cru").text
            end_text = row.find("div", class_="enddatein").text
            lot_dict = {
                "name": row.find("a")["_title"],
                "price": int(''.join(re.findall(r'\d+', outcome))),
                # 1 means the reserve WAS met. NOTE: rows saved before this
                # fix have the flag inverted (1 meant "Reserve not met").
                "reserve_met": 0 if "Reserve not met" in outcome else 1,
                "end_date": datetime.strptime(end_text.split(':')[1].strip(), date_format).date(),
            }

            # Fetch the lot's own page for the per-bottle attributes.
            lot_url = row.find("a", href=True)['href']
            lot_page = requests.get(lot_url, headers=headers, timeout=request_timeout)
            lot_soup = BeautifulSoup(lot_page.content, "html.parser")

            product = lot_soup.find("div", class_="whiskyproduct")
            for child in product.find_all(True, recursive=False):
                label, colon, _ = child.text.partition(':')
                if not colon:
                    continue  # no "label: value" pair in this element
                # Match on the label only, so a category word appearing in the
                # value cannot hijack the key. The last match wins, so
                # "Distillery Status" beats "Distillery".
                matches = [val for val in categories if val in label]
                if not matches:
                    continue
                lot_dict[matches[-1]] = child.text.replace("\xa0", '').split(':')[1]

            lot_dict['details'] = lot_soup.find(
                "div", {"class": "field-item even", "property": "content:encoded"}
            ).text.replace("\xa0", '')

            lots_dict[lot_num] = lot_dict

        i += 1
        time.sleep(3)  # throttle between listing pages

    return lots_dict
        

In [291]:
# Run the full scrape; scraped lots accumulate in the module-level lots_dict.
og_scraper()

206
207


KeyboardInterrupt: 

In [None]:
# og_scraper() skips lots that are already in scraped_data, so lots_dict holds
# only the newly scraped ones. Merge them into the loaded rows instead of
# overwriting the file with lots_dict alone, which would drop the older rows.
if lots_dict:
    new_lots = pd.DataFrame.from_dict(lots_dict, orient="index")
    scraped_data = pd.concat([scraped_data, new_lots])
    # If a lot number appears twice, keep the most recently scraped row.
    scraped_data = scraped_data[~scraped_data.index.duplicated(keep="last")]
scraped_data.to_csv("scraped_data.csv")

# Iterative Scraping

In [281]:
def scraper():
    '''Scrape only lots newer than the saved dataset into ``lots_dict``.

    The first number shown in the ``<p class="left">`` element of the
    current-auction page (presumably the live lot count -- confirm) is divided
    by ``per_page`` to pick the first search-results page. Pages are then
    walked forward until either a lot number already present in
    ``scraped_data`` is reached or a page contains no lot rows.

    Lots still open for bidding ("Bid Now") are skipped. Every other lot is
    parsed from its listing row and its own page, then stored in the
    module-level ``lots_dict`` keyed by lot number.

    Returns:
        dict: ``lots_dict``, mapping lot number to a dict with ``name``,
        ``price``, ``reserve_met``, ``end_date``, whichever ``categories``
        labels the lot page lists, and ``details``.
    '''
    url_stem = "https://whiskyauctioneer.com/auction-search"
    live_url = "https://whiskyauctioneer.com/current-auction"
    # Without a timeout a stalled connection would hang the scrape forever.
    request_timeout = 30

    # Membership is tested once per lot, so build the lookup a single time.
    known_lots = set(scraped_data.index)

    page = requests.get(live_url, headers=headers, timeout=request_timeout)
    soup = BeautifulSoup(page.content, "html.parser")
    live_count = int(re.findall(r'\d+', soup.find("p", class_="left").text)[0])
    # Clamp at 0 so a small count cannot produce a negative page number.
    i = max(math.floor(live_count / per_page) - 1, 0)

    more_new_data = True
    while more_new_data:
        print(i)  # progress indicator: page currently being scraped

        url = url_stem if i == 0 else url_stem + f'?page={i}'
        page = requests.get(url, headers=headers, timeout=request_timeout)
        soup = BeautifulSoup(page.content, "html.parser")

        result = soup.find("div", class_="views-row views-row-1 views-row-odd views-row-first producthomepage")
        if result is None:
            # No lot rows on this page: we have run past the last page.
            break

        while result:
            # Advance first so that every `continue` below moves on to the
            # next row instead of re-processing the same one.
            row = result
            result = row.find_next_sibling("div")

            lot_label = row.find("span", class_="lotnumber label-lot")
            digits = re.findall(r'\d+', lot_label.text) if lot_label else []
            if not digits:
                continue  # sibling <div> that is not a lot row
            lot_num = int(digits[0])

            if lot_num in known_lots:
                # Reached the saved data: stop without re-scraping this lot.
                more_new_data = False
                break

            if "Bid Now" in row.text:
                continue  # lot belongs to an auction that is still live

            outcome = row.find("div", class_="lotwin cru").text
            end_text = row.find("div", class_="enddatein").text
            lot_dict = {
                "name": row.find("a")["_title"],
                "price": int(''.join(re.findall(r'\d+', outcome))),
                # 1 means the reserve WAS met. NOTE: rows saved before this
                # fix have the flag inverted (1 meant "Reserve not met").
                "reserve_met": 0 if "Reserve not met" in outcome else 1,
                "end_date": datetime.strptime(end_text.split(':')[1].strip(), date_format).date(),
            }

            # Fetch the lot's own page for the per-bottle attributes.
            lot_url = row.find("a", href=True)['href']
            lot_page = requests.get(lot_url, headers=headers, timeout=request_timeout)
            lot_soup = BeautifulSoup(lot_page.content, "html.parser")

            product = lot_soup.find("div", class_="whiskyproduct")
            for child in product.find_all(True, recursive=False):
                label, colon, _ = child.text.partition(':')
                if not colon:
                    continue  # no "label: value" pair in this element
                # Match on the label only, so a category word appearing in the
                # value cannot hijack the key. The last match wins, so
                # "Distillery Status" beats "Distillery".
                matches = [val for val in categories if val in label]
                if not matches:
                    continue
                lot_dict[matches[-1]] = child.text.replace("\xa0", '').split(':')[1]

            lot_dict['details'] = lot_soup.find(
                "div", {"class": "field-item even", "property": "content:encoded"}
            ).text.replace("\xa0", '')

            lots_dict[lot_num] = lot_dict

        if more_new_data:
            i += 1
            time.sleep(3)  # throttle between listing pages

    return lots_dict
        

In [284]:
#scraper()

In [277]:
# The incremental scrape collects only lots that are not yet in scraped_data,
# so merge them into the loaded rows instead of overwriting the file with
# lots_dict alone, which would discard the existing dataset.
if lots_dict:
    new_lots = pd.DataFrame.from_dict(lots_dict, orient="index")
    scraped_data = pd.concat([scraped_data, new_lots])
    # If a lot number appears twice, keep the most recently scraped row.
    scraped_data = scraped_data[~scraped_data.index.duplicated(keep="last")]
scraped_data.to_csv("scraped_data.csv")

# Output

In [None]:
# An expression is identified by distillery (e.g. Wild Turkey), age, proof, and vintage (year).
# Goal: uniquely identify each expression and track its price over time.
# only care about 