In [27]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [25]:
# Location of the Boston cocktails CSV in the TidyTuesday repository (2020-05-26 release).
url = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-05-26/boston_cocktails.csv"

# Download the CSV straight into a DataFrame.
df = pd.read_csv(url)

# Save a local copy of the data exactly as downloaded.
df.to_csv('original.csv')

# A cell only renders its final expression, so the preview has to come last.
df.head()

In [4]:
# List the column labels of the loaded frame.
df.columns

Index(['name', 'category', 'row_id', 'ingredient_number', 'ingredient',
       'measure'],
      dtype='object')

In [20]:
# Count how many rows mention each ingredient.
df['ingredient'].value_counts()

Gin                                       176
Fresh lemon juice                         138
Simple Syrup                              115
Vodka                                     114
Light Rum                                 113
                                         ... 
Lemon or Orange Sherbet                     1
Wide spiral of lemon zest                   1
Sparkling wine, raspberry and cucumber      1
Acai berry flavored vodka                   1
Apple Flavored Brandy                       1
Name: ingredient, Length: 569, dtype: int64

In [6]:
# Count missing values per column; the recorded output shows 0 for every column.

df.isna().sum()

name                 0
category             0
row_id               0
ingredient_number    0
ingredient           0
measure              0
dtype: int64

In [7]:
# Check for duplicate rows while ignoring letter case (nothing is removed here).
# Every string cell is lower-cased column by column; non-string values pass
# through untouched. Series.map is used because DataFrame.applymap is
# deprecated in recent pandas releases.
df_lower = df.apply(
    lambda col: col.map(lambda s: s.lower() if isinstance(s, str) else s)
)

# Number of rows that exactly repeat an earlier row after lower-casing.
df_lower.duplicated().sum()

0

In [8]:
# Collect every row whose cocktail name contains "martini", regardless of
# letter case; rows with a missing name count as non-matches.
is_martini = df['name'].str.contains('martini', case=False, na=False)
martini_df = df.loc[is_martini]
martini_df

# TODO: decide which rows to drop by row_id -- review the martini variants and
# think about other cocktails that may need the same review.

Unnamed: 0,name,category,row_id,ingredient_number,ingredient,measure
189,Martini (Medium),Cocktail Classics,54,1,Gin,1 1/2 oz
190,Martini (Medium),Cocktail Classics,54,2,Dry Vermouth,1/2 oz
191,Martini (Medium),Cocktail Classics,54,3,Sweet Vermouth,1/2 oz
192,Martini (Dry) (5-to-1),Cocktail Classics,55,1,Gin,1 2/3 oz
193,Martini (Dry) (5-to-1),Cocktail Classics,55,2,Dry Vermouth,1/3 oz
328,Dry Martini (5-to-1),Cocktail Classics,98,1,Gin,1 2/3 oz
329,Dry Martini (5-to-1),Cocktail Classics,98,2,Dry Vermouth,1/3 oz
452,Hennessy Martini,Cocktail Classics,136,1,Hennessy V.S Cognac,2 oz
453,Hennessy Martini,Cocktail Classics,136,2,Lemon Juice,1/2 oz
579,Sake Martini,Cocktail Classics,178,1,Sake,1 oz


In [9]:
# Exclude the "Non-alcoholic Drinks" category, then list the categories left.
keep_rows = df['category'].ne("Non-alcoholic Drinks")
df = df.loc[keep_rows]
df['category'].unique()

array(['Cocktail Classics', 'Cordials and Liqueurs', 'Whiskies', 'Brandy',
       'Vodka', 'Rum - Daiquiris', 'Rum', 'Tequila', 'Shooters', 'Gin'],
      dtype=object)

In [10]:
# Show the column labels of df at this point, before any columns are dropped.
df.columns

Index(['name', 'category', 'row_id', 'ingredient_number', 'ingredient',
       'measure'],
      dtype='object')

In [11]:
# Drop row_id, measure and ingredient_number; name, category and ingredient
# remain. errors='ignore' keeps the cell safe to re-run once the columns are
# already gone (df is reassigned, so a second run would otherwise raise KeyError).
df = df.drop(columns=['row_id', 'measure', 'ingredient_number'], errors='ignore')

In [21]:
# Ingredient frequencies for the current state of df.
df['ingredient'].value_counts()

Gin                                       176
Fresh lemon juice                         138
Simple Syrup                              115
Vodka                                     114
Light Rum                                 113
                                         ... 
Lemon or Orange Sherbet                     1
Wide spiral of lemon zest                   1
Sparkling wine, raspberry and cucumber      1
Acai berry flavored vodka                   1
Apple Flavored Brandy                       1
Name: ingredient, Length: 569, dtype: int64

In [23]:
# Show the row(s) that use 'Apple Flavored Brandy' as an ingredient.
df.loc[df['ingredient'].eq('Apple Flavored Brandy')]

Unnamed: 0,name,category,ingredient
3616,Apple Pie Cocktail,Cocktail Classics,Apple Flavored Brandy


In [24]:
# Show every ingredient row of the 'Apple Pie Cocktail' recipe.
df.loc[df['name'].eq('Apple Pie Cocktail')]

Unnamed: 0,name,category,ingredient
3614,Apple Pie Cocktail,Cocktail Classics,Old Mr. Boston Rum
3615,Apple Pie Cocktail,Cocktail Classics,Sweet Vermouth
3616,Apple Pie Cocktail,Cocktail Classics,Apple Flavored Brandy
3617,Apple Pie Cocktail,Cocktail Classics,Grenadine
3618,Apple Pie Cocktail,Cocktail Classics,Lemon Juice


In [29]:
# Build one row per cocktail name with a 0/1 column per ingredient:
# first glue each cocktail's ingredients into a single '|'-separated string,
# then let str.get_dummies split that string back into indicator columns.
# NOTE(review): ingredient spellings are used as-is, so case variants such as
# 'Sweet Vermouth' and 'sweet vermouth' end up as separate columns.
ingredients_by_name = df.groupby('name')['ingredient'].agg('|'.join)
df_onehot = ingredients_by_name.str.get_dummies(sep='|')

# Pairwise correlation between the ingredient indicator columns.
df_onehot.corr()

Unnamed: 0,100-proof Vodka,151-Proof Rum,17-year-old J. Wray and Nephew Ltd. Rum,7-Up,Absinthe,Absinthe Substitute,Absinthe or pastis,Acai berry flavored vodka,African rum,Agave nectar,...,orgeat or almond syrup,pineapple,pomegranate molasses (available at Middle Eastern grocers),port,powdered sugar,red wine,springs mint,sweet vermouth,thin Peeled fresh ginger,"tiki bitters, such as Bittermen's Elemakule"
100-proof Vodka,1.000000,-0.002268,-0.001012,-0.001012,-0.002686,-0.002686,-0.002027,-0.001012,-0.001012,-0.003215,...,-0.002486,-0.001012,-0.001012,-0.001012,-0.001432,-0.001012,-0.001012,-0.001012,-0.001012,-0.001012
151-Proof Rum,-0.002268,1.000000,-0.002268,-0.002268,-0.006018,-0.006018,-0.004543,-0.002268,-0.002268,-0.007204,...,-0.005569,-0.002268,-0.002268,-0.002268,-0.003209,-0.002268,-0.002268,-0.002268,-0.002268,-0.002268
17-year-old J. Wray and Nephew Ltd. Rum,-0.001012,-0.002268,1.000000,-0.001012,-0.002686,-0.002686,-0.002027,-0.001012,-0.001012,-0.003215,...,-0.002486,-0.001012,-0.001012,-0.001012,-0.001432,-0.001012,-0.001012,-0.001012,-0.001012,-0.001012
7-Up,-0.001012,-0.002268,-0.001012,1.000000,-0.002686,-0.002686,-0.002027,-0.001012,-0.001012,-0.003215,...,-0.002486,-0.001012,-0.001012,-0.001012,-0.001432,-0.001012,-0.001012,-0.001012,-0.001012,-0.001012
Absinthe,-0.002686,-0.006018,-0.002686,-0.002686,1.000000,-0.007128,-0.005380,-0.002686,-0.002686,0.112026,...,-0.006596,-0.002686,-0.002686,-0.002686,-0.003801,-0.002686,-0.002686,-0.002686,-0.002686,-0.002686
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
red wine,-0.001012,-0.002268,-0.001012,-0.001012,-0.002686,-0.002686,-0.002027,-0.001012,-0.001012,-0.003215,...,-0.002486,-0.001012,-0.001012,-0.001012,-0.001432,1.000000,-0.001012,-0.001012,-0.001012,-0.001012
springs mint,-0.001012,-0.002268,-0.001012,-0.001012,-0.002686,-0.002686,-0.002027,-0.001012,-0.001012,-0.003215,...,-0.002486,-0.001012,-0.001012,-0.001012,-0.001432,-0.001012,1.000000,-0.001012,-0.001012,-0.001012
sweet vermouth,-0.001012,-0.002268,-0.001012,-0.001012,-0.002686,-0.002686,-0.002027,-0.001012,-0.001012,-0.003215,...,-0.002486,1.000000,-0.001012,-0.001012,-0.001432,-0.001012,-0.001012,1.000000,-0.001012,-0.001012
thin Peeled fresh ginger,-0.001012,-0.002268,-0.001012,-0.001012,-0.002686,-0.002686,-0.002027,-0.001012,-0.001012,-0.003215,...,-0.002486,-0.001012,-0.001012,-0.001012,-0.001432,-0.001012,-0.001012,-0.001012,1.000000,-0.001012


In [35]:
# 'name' is already the index of df_onehot (the recorded output reports
# name='name'), so no set_index call is required.
df_onehot.index


Index(['1626', '19th Century', 'A. J.', 'Absinthe Cocktail',
       'Absinthe Drip Cocktail', 'Absinthe Special Cocktail',
       'Academic Review', 'Acapulco', 'Adam and Eve', 'Adderly Cocktail',
       ...
       'Xanthia Cocktail', 'Yale Cocktail', 'Yellow Rattler',
       'Yellow Rose Of Texas', 'Yellowjacket', 'Yokahama Romance', 'Yolanda',
       'Zaza Cocktail', 'Zero Mint', 'Zombie'],
      dtype='object', name='name', length=989)

In [36]:
# Write the one-hot table to onehot.csv; index=True also writes the index
# (the cocktail names) as the first column.
df_onehot.to_csv('onehot.csv', index=True)