In [63]:
# import dependencies
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import time
import re

In [2]:
# getting the links and names of all rums currently in the data set
ids = []
links = []
names = []

for i in range(1, 17865):
    url = f'https://www.rum-x.com/rums/{i}'
    page = requests.get(url)
    
    if page.status_code == 200:
        content = page.content
        soup = BeautifulSoup(content, features='lxml')
        
        ids.append(i)
        links.append(soup.link.get("href"))
        names.append(soup.link.get("href").split('/')[-1])
    
    else:
        ids.append(i)
        links.append(np.nan)
        names.append(np.nan)

rumDF = pd.DataFrame({
    'ID': ids,
    'Link': links,
    'Name': names
})

rumDF = rumDF.dropna()
rumDF.to_csv('rumPrelim.csv', index=False)

In [3]:
df = pd.read_csv('rumPrelim.csv')
print(df['Link'][13])
df.head()

https://www.rum-x.com/rums/14/bleeding-heart-rum-company-don-papa-10-years


Unnamed: 0,ID,Link,Name
0,1,https://www.rum-x.com/rums/1/maison-ferrand-we...,maison-ferrand-west-indies-plantation-extra-ol...
1,2,https://www.rum-x.com/rums/2/bleeding-heart-ru...,bleeding-heart-rum-company-don-papa-rum
2,3,https://www.rum-x.com/rums/3/destilerias-unida...,destilerias-unidas-s-a-diplomatico-botucal-res...
3,4,https://www.rum-x.com/rums/4/industrias-licore...,industrias-licoreras-de-guatemala-ron-zacapa-s...
4,5,https://www.rum-x.com/rums/5/kraken-black-spic...,kraken-black-spiced-rum


In [10]:
# testing getting all the details we want from the rum page
url = df['Link'][13]
page = requests.get(url)
content = page.content
soup = BeautifulSoup(content, features='lxml')

In [11]:
# getting the rum facts div
rumFacts = soup.find_all("div", {"class": "rum-facts"})[0]

facts = dict()
for fact in soup.find_all("div", {"class": "rum-facts"})[0].find_all('div', {'class':['fact-info', 'main-link']}):
    values = fact.getText().strip().split('\n')
    facts[values[0].strip().lower().replace(' ', '_')] = values[1].strip().lower()
facts

{'country': 'philippines',
 'distillery': 'bleeding heart rum company',
 'brand': 'don papa',
 'rumx_id': 'rx14',
 'abv': '43%',
 'category': 'spiced rum',
 'made_from': 'molasses',
 'distillation': 'column still',
 'age': '10 years',
 'bottle_volume': '70cl',
 'price_range': 'under 100€',
 'type_of_spirit': 'rum'}

In [12]:
# full description
rumDesc = soup.find_all("p", {"class": "description-read-more__text"})[0].getText().lower()
rumDesc

"discover the rich and enchanting world of don papa 10 years, a premium rum hailing from the beautiful philippines. expertly crafted at the renowned bleeding heart rum company distillery, this sumptuous spirit captivates the senses with its delightful vanilla, spice, and sweet notes, making it a must-try for rum connoisseurs and enthusiasts alike. distilled from the finest molasses using a column still and aged for a full decade, don papa 10 years boasts a vibrant symphony of flavors. revel in the delicate hints of orange and mild spices that dance on your palate, as the rum's smooth and well-rounded finish leaves you craving more. indulge in the exquisite aroma of caramel, citrus, and sweet vanilla that envelops you as you raise your glass to experience the magic of this timeless rum. its natural richness and complexity make it an ideal pairing with desserts or simply enjoyed neat. rum lovers who appreciate a beautifully balanced blend will adore the bold yet delicate character of don

In [41]:
# overall rating and number of reviews
rumRating = float(soup.find_all('a', {'href': '#community-reviews'})[1].find_all('span', {'class': 'overall-rating-text'})[0].getText().replace(',', ''))
rumNumReviews = int(soup.find_all('a', {'href': '#community-reviews'})[1].find_all('span', {'class': 'overall-rating-max'})[2].getText().split()[0].replace(',', ''))
print(rumRating, rumNumReviews)

7.1 144


In [57]:
# price and unit
rumPrice = float(soup.find_all('div', {'class': 'rum-collection'})[0].find_all('span', {'class': 'overall-rating-text'})[0].getText().split()[0])
rumPriceUnit = soup.find_all('div', {'class': 'rum-collection'})[0].find_all('span', {'class': 'overall-rating-text'})[0].getText().split()[1]
print(rumPrice, rumPriceUnit)

64.0 €


In [78]:
# get smells, tastes, finishes
smells = []
tastes = []
finishes = []
last_tag = ''

for tag in soup.find_all('div', {'class': 'mt-4'})[0].find_all(['h4', 'span']):
    if tag.name == 'h4':
        last_tag = tag.text
    elif tag.name == 'span':
        if last_tag == 'Aroma':
            smells.append(tag.text)
        elif last_tag == 'Taste':
            tastes.append(tag.text)
        elif last_tag == 'Finish':
            finishes.append(tag.text)
print(smells)
print(tastes)
print(finishes)

['Vanilla', 'Orange', 'Caramel', 'Sweet', 'Citrus', 'OrangePeal']
['Vanilla', 'Spice', 'Sweet', 'Orange', 'Mild', 'Caramel']
['Vanilla', 'Sweet', 'Orange', 'Fruity', 'Spicy', 'Tropical fruit']


In [79]:
# getting bottles closed, opened, emptied, rate
rumNumClosed = float(soup.find_all('div', {'class': 'rum-collection'})[0].find_all('div', {'class': 'col'})[0].getText().strip().split()[0].replace(',', ''))
rumNumOpened = float(soup.find_all('div', {'class': 'rum-collection'})[0].find_all('div', {'class': 'col'})[1].getText().strip().split()[0].replace(',', ''))
rumNumEmptied = float(soup.find_all('div', {'class': 'rum-collection'})[0].find_all('div', {'class': 'col'})[2].getText().strip().split()[0].replace(',', ''))
rumOpenRate = round((rumNumOpened) / (rumNumClosed + rumNumOpened + rumNumEmptied), 4)

print(rumNumClosed,rumNumOpened, rumNumEmptied, rumOpenRate)

418.0 127.0 44.0 0.2156


In [None]:
# trying to determine the number of rum fact categories so i know what columns to expect to extract from the fact cards
categories = set()
for l in df['Link']:
    if not pd.isna(l):
        try:
            page = requests.get(l)
            content = page.content
            soup = BeautifulSoup(content, features='lxml')
            rumFacts = soup.find_all("div", {"class": "rum-facts"})[0]
            facts = dict()
            for fact in soup.find_all("div", {"class": "rum-facts"})[0].find_all('div', {'class':'fact-info'}):
                values = fact.getText().strip().split('\n')
                facts[values[0].strip().lower().replace(' ', '_')] = values[1].strip().lower()
            categories.update(set(facts.keys()))
        except:
            print(f'couldnt get facts: {l}')
            

In [84]:
# testing building a more complete dataset
rumPrelim = pd.read_csv('Data/rumPrelim.csv')

# lists to hold scrapped data
abvs = []
ages = []
bottle_volumes = []
bottlers = []
brands = []
cask_numbers = []
categories = []
countries = []
distillations = []
distilleries = []
made_froms = []
marks = []
num_bottles = []
price_ranges = []
vintages = []
descriptions = []
ratings = []
num_reviews = []
prices = []
price_units = []
smell_notes = []
taste_notes = []
finish_notes = []
num_closed = []
num_opened = []
num_emptied = []
open_rates = []


# iterate over each link and try to get as many pieces of information as possible
for i in range(len(rumPrelim)):
    time.sleep(1)
    url = rumPrelim['Link'][i]
    page = requests.get(url)
    
    # case for page loaded
    if page.status_code == 200:
        content = page.content
        soup = BeautifulSoup(content, features='lxml')
        
        # handle rum facts data
        try:
            facts = dict()
            for fact in soup.find_all("div", {"class": "rum-facts"})[0].find_all('div', {'class':'fact-info'}):
                values = fact.getText().strip().split('\n')
                facts[values[0].strip().lower().replace(' ', '_')] = values[1].strip().lower()
            
            # abv
            if 'abv' in facts.keys():
                if facts['abv'] != '':
                    abvs.append(facts['abv'].replace('%', ''))
                else:
                    abvs.append(np.nan)
            else:
                abvs.append(np.nan)
                
            # age
            if 'age' in facts.keys():
                if facts['age'] != '':
                    ages.append(facts['age'])
                else:
                    ages.append(np.nan)
            else:
                ages.append(np.nan)

            # bottle_volume
            if 'bottle_volume' in facts.keys():
                if facts['bottle_volume'] != '':
                    bottle_volumes.append(facts['bottle_volume'])
                else:
                    bottle_volumes.append(np.nan)
            else:
                bottle_volumes.append(np.nan)

            # bottler
            if 'bottler' in facts.keys():
                if facts['bottler'] != '':
                    bottlers.append(facts['bottler'])
                else:
                    bottlers.append(np.nan)
            else:
                bottlers.append(np.nan) 

            # brands
            if 'brand' in facts.keys():
                if facts['brand'] != '':
                    brands.append(facts['brand'])
                else:
                    brands.append(np.nan)
            else:
                brands.append(np.nan)
                
            # cask_number
            if 'cask_number' in facts.keys():
                if facts['cask_number'] != '':
                    cask_numbers.append(facts['cask_number'])
                else:
                    cask_numbers.append(np.nan)
            else:
                cask_numbers.append(np.nan) 
                
            # category
            if 'category' in facts.keys():
                if facts['category'] != '':
                    categories.append(facts['category'])
                else:
                    categories.append(np.nan)
            else:
                categories.append(np.nan)  
                
            # country
            if 'country' in facts.keys():
                if facts['country'] != '':
                    countries.append(facts['country'])
                else:
                    countries.append()
            else:
                countries.append(np.nan) 
                
            # distillation
            if 'distillation' in facts.keys():
                if facts['distillation'] != '':
                    distillations.append(facts['distillation'])
                else:
                    distillations.append(np.nan)
            else:
                distillations.append(np.nan) 
                
            # distillery
            if 'distillery' in facts.keys():
                if facts['distillery'] != '':
                    distilleries.append(facts['distillery'])
                else:
                    distilleries.append(np.nan)
            else:
                distilleries.append(np.nan) 
                
            # made_from
            if 'made_from' in facts.keys():
                if facts['made_from'] != '':
                    made_froms.append(facts['made_from'])
                else:
                    made_froms.append(np.nan)
            else:
                made_froms.append(np.nan)
                
            # mark
            if 'mark' in facts.keys():
                if facts['mark'] != '':
                    marks.append(facts['mark'])
                else:
                    marks.append(np.nan)
            else:
                marks.append(np.nan)
                
            # no._of_bottles
            if 'no._of_bottles' in facts.keys():
                if facts['no._of_bottles'] != '':
                    num_bottles.append(facts['no._of_bottles'])
                else:
                    num_bottles.append(np.nan)
            else:
                num_bottles.append(np.nan)
                
            # price_range
            if 'price_range' in facts.keys():
                if facts['price_range'] != '':
                    price_ranges.append(facts['price_range'])
                else:
                    price_ranges.append(np.nan)
            else:
                price_ranges.append(np.nan)
                
            # vintage
            if 'vintage' in facts.keys():
                if facts['vintage'] != '':
                    vintages.append(facts['vintage'])
                else:
                    vintages.append(np.nan)
            else:
                vintages.append(np.nan)

        except:
            abvs.append(np.nan)
            ages.append(np.nan)
            bottle_volumes.append(np.nan)
            bottlers.append(np.nan)
            brands.append(np.nan)
            cask_numbers.append(np.nan)
            categories.append(np.nan)
            countries.append(np.nan)
            distillations.append(np.nan)
            distilleries.append(np.nan)
            made_froms.append(np.nan)
            marks.append(np.nan)
            num_bottles.append(np.nan)
            price_ranges.append(np.nan)
            vintages.append(np.nan)
            
        # handle ratings and reviews
        try:
            rumRating = float(soup.find_all('a', {'href': '#community-reviews'})[1].find_all('span', {'class': 'overall-rating-text'})[0].getText().replace(',', ''))
            ratings.append(rumRating)
        except:
            ratings.append(np.nan)
            
        try:
            rumNumReviews = int(soup.find_all('a', {'href': '#community-reviews'})[1].find_all('span', {'class': 'overall-rating-max'})[2].getText().split()[0].replace(',', ''))
            num_reviews.append(rumNumReviews)
        except:
            num_reviews.append(np.nan)
            
        # handle prices and units
        try:
            rumPrice = float(soup.find_all('div', {'class': 'rum-collection'})[0].find_all('span', {'class': 'overall-rating-text'})[0].getText().split()[0])
            prices.append(rumPrice)
        except:
            prices.append(np.nan)
            
        try:
            rumPriceUnit = soup.find_all('div', {'class': 'rum-collection'})[0].find_all('span', {'class': 'overall-rating-text'})[0].getText().split()[1]
            price_units.append(rumPriceUnit)
        except:
            price_units.append(np.nan)

        smells = []
        tastes = []
        finishes = []
        last_tag = ''
        try:
            for tag in soup.find_all('div', {'class': 'mt-4'})[0].find_all(['h4', 'span']):
                if tag.name == 'h4':
                    last_tag = tag.text
                elif tag.name == 'span':
                    if last_tag == 'Aroma':
                        smells.append(tag.text)
                    elif last_tag == 'Taste':
                        tastes.append(tag.text)
                    elif last_tag == 'Finish':
                        finishes.append(tag.text)
            smell_notes.append(smells)
            taste_notes.append(tastes)
            finish_notes.append(finishes)
        except:
            smell_notes.append(np.nan)
            taste_notes.append(np.nan)
            finish_notes.append(np.nan)
                
            
        # handle closed, opened, emptied, rate
        try:
            rumNumClosed = float(soup.find_all('div', {'class': 'rum-collection'})[0].find_all('div', {'class': 'col'})[0].getText().strip().split()[0].replace(',', ''))
            num_closed.append(rumNumClosed)
        except:
            num_closed.append(np.nan)
            
        try:
            rumNumOpened = float(soup.find_all('div', {'class': 'rum-collection'})[0].find_all('div', {'class': 'col'})[1].getText().strip().split()[0].replace(',', ''))
            num_opened.append(rumNumOpened)
        except:
            num_opened.append(np.nan)
            
        try:
            rumNumEmptied = float(soup.find_all('div', {'class': 'rum-collection'})[0].find_all('div', {'class': 'col'})[2].getText().strip().split()[0].replace(',', ''))
            num_emptied.append(rumNumEmptied)
        except:
            num_emptied.append(np.nan)
            
        try:
            rumOpenRate = round((rumNumOpened) / (rumNumClosed + rumNumOpened + rumNumEmptied), 4)
            open_rates.append(rumOpenRate)
        except:
            open_rates.append(np.nan)
            
        # handle description
        try:
            rumDesc = soup.find_all("p", {"class": "description-read-more__text"})[0].getText().lower()
            descriptions.append(rumDesc)
        except:
            descriptions.append(np.nan)
        
    # case for page not loaded 
    else:
        print(f'Page not loaded: {url}')
        abvs.append(np.nan)
        ages.append(np.nan)
        bottle_volumes.append(np.nan)
        bottlers.append(np.nan)
        brands.append(np.nan)
        cask_numbers.append(np.nan)
        categories.append(np.nan)
        countries.append(np.nan)
        distillations.append(np.nan)
        distilleries.append(np.nan)
        made_froms.append(np.nan)
        marks.append(np.nan)
        num_bottles.append(np.nan)
        price_ranges.append(np.nan)
        vintages.append(np.nan)
        descriptions.append(np.nan)
        ratings.append(np.nan)
        num_reviews.append(np.nan)
        prices.append(np.nan)
        price_units.append(np.nan)
        smell_notes.append(np.nan)
        taste_notes.append(np.nan)
        finish_notes.append(np.nan)
        num_closed.append(np.nan)
        num_opened.append(np.nan)
        num_emptied.append(np.nan)
        open_rates.append(np.nan)
  
    
moreData = pd.DataFrame({
    'Category': categories,
    'Bottler': bottlers,
    'Distillery': distilleries,
    'Brand': brands,
    'Country': countries,
    'Distillation': distillations,
    'Raw_Material': made_froms,
    'Age': ages,
    'Abv': abvs,
    'Bottle_Volume': bottle_volumes,
    'Number_Casks': cask_numbers,
    'Mark': marks,
    'Number_Bottles': num_bottles,
    'Vintage': vintages,
    'Price_Range': price_ranges,
    'Price': prices,
    'Price_Units': price_units,
    'Rating': ratings,
    'Number_Reviews': num_reviews,
    'Smell_Notes': smell_notes,
    'Taste_Notes': taste_notes,
    'Finish_Notes': finish_notes,
    'Number_Closed': num_closed,
    'Number_Opened': num_opened,
    'Number_Emptied': num_emptied,
    'Open_Rate': open_rates,
    'Description': descriptions
}) 

rumFullDF = pd.concat([rumPrelim, moreData], axis=1)
rumFullDF.to_csv('rum_12_2023.csv', index=False)

In [85]:
rumFullDF

Unnamed: 0,ID,Link,Name,Category,Bottler,Distillery,Brand,Country,Distillation,Raw_Material,...,Rating,Number_Reviews,Smell_Notes,Taste_Notes,Finish_Notes,Number_Closed,Number_Opened,Number_Emptied,Open_Rate,Description
0,1,https://www.rum-x.com/rums/1/maison-ferrand-we...,maison-ferrand-west-indies-plantation-extra-ol...,pot & column still rum,,west indies,plantation,barbados,pot and column still,molasses,...,8.0,1023.0,"[Vanilla, Caramel, Coconut, Sweet, Tropical fr...","[Vanilla, Sweet, Caramel, Coconut, Banana, Woody]","[Vanilla, Sweet, Caramel, Coconut, Caramelized...",2090.0,721.0,485.0,0.2188,discover the rich and indulgent flavors of pla...
1,2,https://www.rum-x.com/rums/2/bleeding-heart-ru...,bleeding-heart-rum-company-don-papa-rum,rum based spirit,,bleeding heart rum company,don papa,philippines,column still,molasses,...,7.0,661.0,"[Vanilla, Caramel, Sweet, Synthetic, Orange, F...","[Vanilla, Sweet, Caramel, Orange, Synthetic, S...","[Vanilla, Sweet, Caramel, Synthetic, Sugar, Fr...",1447.0,498.0,387.0,0.2136,discover the captivating flavors of don papa r...
2,3,https://www.rum-x.com/rums/3/destilerias-unida...,destilerias-unidas-s-a-diplomatico-botucal-res...,pot & column still rum,,destilerías unidas s. a.,diplomático / botucal,venezuela,pot and column still,sugar cane honey,...,7.5,1033.0,"[Vanilla, Caramel, Sweet, Raisin, Honey, Dried...","[Sweet, Caramel, Vanilla, Mild, Sugar, Raisin]","[Sweet, Caramel, Vanilla, Raisin, Chocolate, W...",2374.0,858.0,600.0,0.2239,introducing the diplomático / botucal reserva ...
3,4,https://www.rum-x.com/rums/4/industrias-licore...,industrias-licoreras-de-guatemala-ron-zacapa-s...,column still rum,,industrias licoreras de guatemala,ron zacapa,guatemala,column still,sugar cane honey,...,7.5,757.0,"[Caramel, Vanilla, Woody, Honey, Dried fruit, ...","[Caramel, Sweet, Vanilla, Woody, Alcoholic, Oak]","[Sweet, Spice, Woody, Caramel, Chocolate, Barrel]",1416.0,1353.0,338.0,0.4355,discover the enticing world of guatemalan rum ...
4,5,https://www.rum-x.com/rums/5/kraken-black-spic...,kraken-black-spiced-rum,spiced rum,,,,trinidad,,molasses,...,6.0,558.0,"[Caramel, Vanilla, Spice, Cinnamon, Coffee, Ch...","[Vanilla, Caramel, Sweet, Spice, Cinnamon, Dar...","[Vanilla, Caramel, Sweet, Spice, Diluted, Dark...",1119.0,391.0,306.0,0.2153,discover the enchanting world of kraken black ...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17318,17860,https://www.rum-x.com/rums/17860/famille-ricci...,famille-ricci-ten-cane-zodiac-vierge-2023,pot still rum,,ten cane,,trinidad,pot still,molasses,...,9.1,4.0,[],[],[],,,,0.0000,"the ""zodiac vierge 2023"" was produced in trini..."
17319,17861,https://www.rum-x.com/rums/17861/the-duchess-c...,the-duchess-caroni-nautilus,,,caroni,,trinidad,,molasses,...,9.0,1.0,[],[],[],1.0,1.0,0.0,0.5000,"the ""nautilus"" was produced in trinidad at the..."
17320,17862,https://www.rum-x.com/rums/17862/kyoto-fine-wi...,kyoto-fine-wine-and-spirits-caroni-shinanoya-t...,,,caroni,,trinidad,,molasses,...,9.3,1.0,[],[],[],,,,0.5000,"the ""shinanoya (the purple haze)"" was produced..."
17321,17863,https://www.rum-x.com/rums/17863/vb-j-bally-br...,vb-j-bally-brut-de-fut-selection-vb,rhum agricole aoc,,j. bally,,martinique,creole column,sugar cane juice,...,8.6,7.0,"[Demerara sugar, Citrus, Vanilla, Woody, Allsp...","[Dark chocolate, Woody, Espresso, Roasted, Iod...","[Coffee, Allspice, Grapes, Tannins, Citrus, Io...",4.0,1.0,0.0,0.2000,"the ""brut de fût (sélection v&amp;b)"" was prod..."


In [86]:
# exploring new columns
rumFullDF.head(5)

Unnamed: 0,ID,Link,Name,Category,Bottler,Distillery,Brand,Country,Distillation,Raw_Material,...,Rating,Number_Reviews,Smell_Notes,Taste_Notes,Finish_Notes,Number_Closed,Number_Opened,Number_Emptied,Open_Rate,Description
0,1,https://www.rum-x.com/rums/1/maison-ferrand-we...,maison-ferrand-west-indies-plantation-extra-ol...,pot & column still rum,,west indies,plantation,barbados,pot and column still,molasses,...,8.0,1023.0,"[Vanilla, Caramel, Coconut, Sweet, Tropical fr...","[Vanilla, Sweet, Caramel, Coconut, Banana, Woody]","[Vanilla, Sweet, Caramel, Coconut, Caramelized...",2090.0,721.0,485.0,0.2188,discover the rich and indulgent flavors of pla...
1,2,https://www.rum-x.com/rums/2/bleeding-heart-ru...,bleeding-heart-rum-company-don-papa-rum,rum based spirit,,bleeding heart rum company,don papa,philippines,column still,molasses,...,7.0,661.0,"[Vanilla, Caramel, Sweet, Synthetic, Orange, F...","[Vanilla, Sweet, Caramel, Orange, Synthetic, S...","[Vanilla, Sweet, Caramel, Synthetic, Sugar, Fr...",1447.0,498.0,387.0,0.2136,discover the captivating flavors of don papa r...
2,3,https://www.rum-x.com/rums/3/destilerias-unida...,destilerias-unidas-s-a-diplomatico-botucal-res...,pot & column still rum,,destilerías unidas s. a.,diplomático / botucal,venezuela,pot and column still,sugar cane honey,...,7.5,1033.0,"[Vanilla, Caramel, Sweet, Raisin, Honey, Dried...","[Sweet, Caramel, Vanilla, Mild, Sugar, Raisin]","[Sweet, Caramel, Vanilla, Raisin, Chocolate, W...",2374.0,858.0,600.0,0.2239,introducing the diplomático / botucal reserva ...
3,4,https://www.rum-x.com/rums/4/industrias-licore...,industrias-licoreras-de-guatemala-ron-zacapa-s...,column still rum,,industrias licoreras de guatemala,ron zacapa,guatemala,column still,sugar cane honey,...,7.5,757.0,"[Caramel, Vanilla, Woody, Honey, Dried fruit, ...","[Caramel, Sweet, Vanilla, Woody, Alcoholic, Oak]","[Sweet, Spice, Woody, Caramel, Chocolate, Barrel]",1416.0,1353.0,338.0,0.4355,discover the enticing world of guatemalan rum ...
4,5,https://www.rum-x.com/rums/5/kraken-black-spic...,kraken-black-spiced-rum,spiced rum,,,,trinidad,,molasses,...,6.0,558.0,"[Caramel, Vanilla, Spice, Cinnamon, Coffee, Ch...","[Vanilla, Caramel, Sweet, Spice, Cinnamon, Dar...","[Vanilla, Caramel, Sweet, Spice, Diluted, Dark...",1119.0,391.0,306.0,0.2153,discover the enchanting world of kraken black ...


In [87]:
rumFullDF.columns

Index(['ID', 'Link', 'Name', 'Category', 'Bottler', 'Distillery', 'Brand',
       'Country', 'Distillation', 'Raw_Material', 'Age', 'Abv',
       'Bottle_Volume', 'Number_Casks', 'Mark', 'Number_Bottles', 'Vintage',
       'Price_Range', 'Price', 'Price_Units', 'Rating', 'Number_Reviews',
       'Smell_Notes', 'Taste_Notes', 'Finish_Notes', 'Number_Closed',
       'Number_Opened', 'Number_Emptied', 'Open_Rate', 'Description'],
      dtype='object')

In [88]:
# converting comma notation into period for float conversion
rumFullDF['Abv'] = rumFullDF['Abv'].apply(lambda x: float(str(x).replace(',', '.')))

In [89]:
# checking the category column, since it only contains 1 value i will drop it
rumFullDF['Category'].value_counts()

column still rum          4091
pot still rum             2292
rhum agricole aoc         1103
rhum agricole              957
pot & column still rum     755
spiced rum                 743
white rum                  733
flavoured rum              696
rum based spirit           330
clairin                    112
navy rum                    82
rumverschnitt               78
grogue                       9
tuzemský                     7
Name: Category, dtype: int64

In [90]:
# bottler values were correctly scraped, will drop this column and revist it later
rumFullDF['Bottler'].value_counts()

Series([], Name: Bottler, dtype: int64)

In [91]:
# all price units are null or euros, we will work with the assumption all prices are euros
rumFullDF['Price_Units'].value_counts()

€    11003
Name: Price_Units, dtype: int64

In [92]:
# drop these 3 columns for reduced storage
rumFullDF.drop(columns=['Bottler', 'Price_Units'] , inplace=True)

In [93]:
# save as csv
rumFullDF.to_csv('rum_12_2023.csv', index=False)