In [1]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import requests
import gmaps
import os
import json
import scipy.stats as stats

In [25]:
# Temporary data frame holding the raw contents of the drinks.csv file
DRINKS_CSV = "output_data/drinks.csv"
temp_df = pd.read_csv(DRINKS_CSV)

In [13]:
# Preview the first five rows of the raw data frame. Purely informational; safe to delete.
temp_df.head(n=5)

Unnamed: 0,Drink ID,Name,Category,Alcoholic,Glass,Ingredients,Measure
0,15567,Adam Sunrise,Ordinary Drink,True,Collins Glass,"['Vodka', 'Lemonade', 'Water', 'Sugar']","['1/2 ', '1/2 can ', '1/2 ', '10 tsp ']"
1,17141,Smut,Punch / Party Drink,True,Beer mug,"['Red wine', 'Peach schnapps', 'Pepsi Cola', '...","['1/3 part ', '1 shot ', '1/3 part ', '1/3 par..."
2,15200,A Day at the Beach,Ordinary Drink,True,Highball glass,"['Coconut rum', 'Amaretto', 'Orange juice', 'G...","['1 oz ', '1/2 oz ', '4 oz ', '1/2 oz ']"
3,15743,Fuzzy Asshole,Coffee / Tea,True,Coffee mug,"['Coffee', 'Peach schnapps']","['1/2 ', '1/2 ']"
4,17168,Amaretto Sunset,Ordinary Drink,True,Collins Glass,"['Triple sec', 'Amaretto', 'Cider', 'Ice']","['1/2 jigger ', '3 shots ', '1/2 cup ', 'Add 1..."


In [23]:
# List conversion function. Let me know if there are any issues
def convert(s):
    """Turn the text form of a list (e.g. "['Vodka', 'Sugar']") into a list of strings.

    Parameters
    ----------
    s : object
        Value to convert. It is passed through ``str()`` first, so an actual
        list works as well as its string representation.

    Returns
    -------
    list of str
        The individual items. An empty list literal (``"[]"``) gives ``[]``.
        Text that is not a list literal of strings is handled by the legacy
        rule: split on ", " and drop every ``[``, ``'`` and ``]`` character.
    """
    import ast  # local import keeps this cell self-contained

    text = str(s)

    # Preferred path: parse the text as a Python literal. Unlike splitting on
    # ", ", this keeps items that themselves contain a comma or an apostrophe
    # intact, and it yields [] (not ['']) for an empty list.
    try:
        parsed = ast.literal_eval(text)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        parsed = None
    if isinstance(parsed, (list, tuple)) and all(isinstance(item, str) for item in parsed):
        return list(parsed)

    # Fallback: the original behaviour, used for anything that is not a clean
    # list of strings (plain text, NaN, lists with non-string items, ...).
    unwanted = {ord(ch): None for ch in "[']"}
    return [piece.translate(unwanted) for piece in text.split(", ")]

In [14]:
# Working copy of the raw data. The next cell overwrites 'Ingredients' in drink_df
# while still reading the original strings from temp_df, so the two must be separate objects.
drink_df = temp_df.copy(deep=True)

['Red wine', 'Peach schnapps', 'Pepsi Cola', 'Orange juice']

In [24]:
# Creation of main data frame. All plots should use this data (i.e. drink_df)
# Each 'Ingredients' string from temp_df is turned into a real list with convert().
# Applying over the column (instead of looping over range(len(...)) with .at)
# does not depend on the index labels being exactly 0..n-1.
drink_df['Ingredients'] = temp_df['Ingredients'].apply(convert)

In [18]:
# Creation of sorted list of tuples. Useful for most used ingredient
# ing_count = {}
# for i in range(0, len(drink_df)):
#     for ing in drink_df.loc[i,'Ingredients']:
#         if ing in ing_count:
#             ing_count[ing] = ing_count[ing] + 1
#         else:
#             ing_count[ing] = 1
            
# #print(ing_count)
# a=sorted(ing_count.items(), key=lambda x: x[1], reverse = True)
# print(a)
# #print(drink2_df['Ingredients'].to_string())

In [None]:
# Number of ingredients per drink. Later cells sort, plot and run statistics on
# this column, so it must exist before they are executed.
# len() is applied to every ingredient list in one pass; the cast keeps the
# column an integer column even when the frame has no rows.
drink_df['Ingredients Count'] = drink_df['Ingredients'].apply(len).astype('int64')
drink_df.head()

In [None]:
# Split drink_df into alcoholic and non-alcoholic data frames. Each one is sorted
# by 'Ingredients Count' (largest first) and given a fresh 0..n-1 index.
is_alcoholic = drink_df['Alcoholic'] == True

drink_al_df = (
    drink_df.loc[is_alcoholic]
    .sort_values('Ingredients Count', ascending=False)
    .reset_index(drop=True)
)
drink_non_df = (
    drink_df.loc[~is_alcoholic]
    .sort_values('Ingredients Count', ascending=False)
    .reset_index(drop=True)
)

In [None]:
# Report the number of alcoholic cocktails, then preview the ones with the most ingredients
alcoholic_total = len(drink_al_df)
print(f"Total alcoholic cocktails = {alcoholic_total}.")
drink_al_df.head()

In [None]:
# Report the number of non-alcoholic cocktails, then preview the ones with the most ingredients
non_alcoholic_total = len(drink_non_df)
print(f"Total non-alcoholic cocktails = {non_alcoholic_total}.")
drink_non_df.head()

In [None]:
# Bar chart of the ten alcoholic cocktails with the most ingredients.
# drink_al_df is sorted by 'Ingredients Count' (descending) with a reset index,
# so its first ten rows are the top ten.
top_alcoholic = drink_al_df.head(10)
ax = top_alcoholic.plot(
    x="Name",
    y="Ingredients Count",
    kind='bar',
    legend=False,
    title="Number of Ingredients in Top 10 Alcoholic Cocktails",
)
ax.set_xlabel("Cocktail Name")
ax.set_ylabel("Ingredients Count")
# Slant the cocktail names so long labels do not overlap
plt.xticks(rotation=45, ha='right')
plt.savefig('output_data/Ingredients_Count_Alcohol.png', bbox_inches="tight")
plt.show()

In [None]:
# Bar chart of the ten non-alcoholic cocktails with the most ingredients.
# drink_non_df is sorted by 'Ingredients Count' (descending) with a reset index,
# so its first ten rows are the top ten.
top_non_alcoholic = drink_non_df.head(10)
ax = top_non_alcoholic.plot(
    x="Name",
    y="Ingredients Count",
    kind='bar',
    legend=False,
    title="Number of Ingredients in Top 10 Non-Alcoholic Cocktails",
)
ax.set_xlabel("Cocktail Name")
ax.set_ylabel("Ingredients Count")
# Slant the cocktail names so long labels do not overlap
plt.xticks(rotation=45, ha='right')
plt.savefig('output_data/Ingredients_Count_NonAlcohol.png', bbox_inches="tight")
plt.show()

In [None]:
# Create dataframe to do statistics on: one row per drink with its group label
# and ingredient count, alcoholic drinks first, then non-alcoholic ones.

stat_cols = ["Alcoholic", "Ingredients Count"]
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0
complex_df = pd.concat(
    [drink_al_df[stat_cols], drink_non_df[stat_cols]],
    ignore_index=True,
)
# Relabel only the 'Alcoholic' column so the True/False lookup can never touch
# the numeric 'Ingredients Count' column.
complex_df["Alcoholic"] = complex_df["Alcoholic"].replace(
    {True: "Alcoholic", False: "Non-alcoholic"}
)
complex_df

In [None]:
# Box plot of 'Ingredients Count' for the two groups, to eyeball whether they differ.
# NOTE(review): the title mentions means, but a default box plot marks the median - confirm wording.

complex_df.boxplot(column="Ingredients Count", by="Alcoholic", figsize=(10, 5))
# Blank out the figure-level super-title and the x-axis label; only the axes title is kept
plt.suptitle('')
plt.title("Statistical Means of Ingredients Count")
plt.xlabel("")
plt.savefig('output_data/Box_Plot_AverageIng.png', bbox_inches="tight")
plt.show()

In [None]:
# One-way ANOVA on ingredient counts: alcoholic (group1) vs non-alcoholic (group2)
labelled_alcoholic = complex_df["Alcoholic"] == "Alcoholic"
labelled_non_alcoholic = complex_df["Alcoholic"] == "Non-alcoholic"

group1 = complex_df.loc[labelled_alcoholic, "Ingredients Count"]
group2 = complex_df.loc[labelled_non_alcoholic, "Ingredients Count"]

stats.f_oneway(group1, group2)

In [None]:
# Restrict to the top 10 from both lists, then repeat the box plot and the ANOVA.
# NOTE(review): both samples are picked by the very variable being compared
# (the ten largest ingredient counts per group) - confirm this is the intended test.
stat_cols = ["Alcoholic", "Ingredients Count"]
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# .loc[0:9] is label-based and inclusive, i.e. the first ten rows of each sorted frame.
complex2_df = pd.concat(
    [drink_al_df.loc[0:9, stat_cols], drink_non_df.loc[0:9, stat_cols]],
    ignore_index=True,
)
# Relabel only the 'Alcoholic' column so the True/False lookup can never touch
# the numeric 'Ingredients Count' column.
complex2_df["Alcoholic"] = complex2_df["Alcoholic"].replace(
    {True: "Alcoholic", False: "Non-alcoholic"}
)

complex2_df.boxplot("Ingredients Count", by="Alcoholic", figsize=(10, 5))
plt.suptitle('')
plt.title("Statistical Means of Ingredients Count")
plt.savefig('output_data/Box_Plot_AverageIngTop10.png', bbox_inches="tight")
plt.show()

# One-way ANOVA on the restricted samples: alcoholic (group1) vs non-alcoholic (group2)
group1 = complex2_df.loc[complex2_df["Alcoholic"] == "Alcoholic", "Ingredients Count"]
group2 = complex2_df.loc[complex2_df["Alcoholic"] == "Non-alcoholic", "Ingredients Count"]

stats.f_oneway(group1, group2)