In [31]:
import pandas as pd
# Mapping from the raw CSV headers to the snake_case column names used by the
# rest of the notebook.
columns = {
    'REF': 'ref',
    'Company (Manufacturer)': 'company',
    'Company Location': 'company_location',
    'Review Date': 'review_year',
    'Country of Bean Origin': 'country_bean_origin',
    'Specific Bean Origin or Bar Name': 'specific_bean_origin_or_bar_name',
    'Cocoa Percent': 'percent_cocoa',
    'Ingredients': 'ingredients',
    'Most Memorable Characteristics': 'characteristics',
    'Rating': 'rating',
}

# Load the chocolate reviews and apply the header mapping in one step.
df = pd.read_csv('Chocolate-Data-Set.csv').rename(columns=columns)
# Turn cocoa percentages written as text (the part before '%') into fractions,
# i.e. the parsed number divided by 100.
# This is done as a single vectorised string operation so the column ends up
# with a real float dtype. Writing floats cell by cell into the original text
# column leaves it as a mixed/object column and is rejected outright by strict
# string dtypes.
df['percent_cocoa'] = df['percent_cocoa'].str.split('%').str[0].astype(float) / 100.0
# Replace missing ingredient lists with an explicit 'unknown' marker.
# The result is assigned back to the column: calling fillna(inplace=True) on
# the intermediate Series returned by df.loc[:, 'ingredients'] is not
# guaranteed to reach df (under pandas Copy-on-Write it never does).
df['ingredients'] = df['ingredients'].fillna('unknown')
# Remove the 'ref' column from the frame.
df = df.drop(columns='ref')
# Split company names of the form '<name> aka <alias>' into the primary name
# (kept in 'company') and the alias (stored in the new 'alt_company_name'
# column, which is empty for every other row).
df.insert(1, 'alt_company_name', '')
for company in df['company'].dropna().unique():
    if ' aka ' not in company:
        continue
    # Split on the same ' aka ' token that was tested for, and only once, so
    # the letters "aka" inside a word are never treated as a separator.
    primary, alias = company.split(' aka ', 1)
    # Compute the row mask before 'company' is overwritten below.
    rows = df['company'] == company
    df.loc[rows, 'alt_company_name'] = alias.strip()
    df.loc[rows, 'company'] = primary.strip()
# Map a handful of company locations onto the names used for the rest of the
# column so that all locations share the same level of granularity.
old_values = ['Wales', 'Amsterdam', 'Scotland', 'Sao Tome']
new_values = ['U.K.', 'Netherlands', 'U.K.', 'Sao Tome & Principe']
location_fixes = dict(zip(old_values, new_values))
df['company_location'] = df['company_location'].replace(location_fixes)
# Flag "intra-country" bars: 1 when the company location equals the country
# the beans come from, 0 otherwise.
# The comparison is vectorised, so it does not depend on the frame having a
# default 0..n-1 index the way a positional loop with df.loc[idx, ...] does.
is_intra_country = df['company_location'] == df['country_bean_origin']
df.insert(5, 'intra_country_prod', is_intra_country.astype(int))
# Expand the compact ingredient string ('<count>- <code>,<code>,...') into a
# numeric ingredient count plus one 0/1 indicator column per ingredient code.
ingredient_codes = {
    'B': 'beans',
    'S': 'sugar',
    'S*': 'sweetener_other',
    'C': 'cocoa_butter',
    'V': 'vanilla',
    'L': 'lecithin',
    'Sa': 'salt',
}
parsed_columns = ['num_ingredients', *ingredient_codes.values()]

parsed_rows = []
for value in df['ingredients']:
    row = dict.fromkeys(parsed_columns, 0)
    # Rows without an ingredient list (the 'unknown' marker, or a null that
    # was never filled) keep all-zero values instead of crashing on .split.
    if isinstance(value, str) and value != 'unknown':
        count, _, codes = value.partition('-')
        # Store the count as an integer; the raw text would turn the whole
        # column into a mix of ints and strings.
        row['num_ingredients'] = int(count.strip())
        present = {code.strip() for code in codes.split(',')}
        for code, column in ingredient_codes.items():
            if code in present:
                row[column] = 1
    parsed_rows.append(row)

# Build the new columns once and insert them at positions 9..16, in the same
# order as before, rather than writing individual cells row by row.
parsed = pd.DataFrame(parsed_rows, index=df.index, columns=parsed_columns)
for position, column in enumerate(parsed_columns, start=9):
    df.insert(position, column, parsed[column])

# The raw string is fully represented by the new columns.
df = df.drop(columns='ingredients')
def unique_words(df, column_name):
    """Return the distinct comma-separated terms found in a text column.

    Each non-null cell of ``df[column_name]`` is split on ``','``; every piece
    is stripped of surrounding whitespace and lower-cased before being
    de-duplicated.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to scan.
    column_name : str
        Name of a column whose cells are comma-separated strings.

    Returns
    -------
    list of str
        The distinct terms, sorted alphabetically so the result is
        reproducible from run to run.
    """
    words = set()
    # Iterate over the values themselves rather than looking cells up by
    # 0..n-1 labels, so the function works for any index (filtered, sorted,
    # non-integer, ...). Null cells are skipped.
    for entry in df[column_name].dropna():
        for word in entry.split(','):
            words.add(word.strip().lower())
    return sorted(words)
# Distinct lower-cased terms used in the 'characteristics' column.
unique_characteristics = unique_words(df, column_name='characteristics')
# Number of reviews per year, ordered by year. The value is only displayed
# when this expression is the last line of its cell.
df['review_year'].value_counts().sort_index()
# Load the country -> continent lookup table.
continents = pd.read_csv('Countries-Continents.csv')

# Rewrite selected country names, addressed by row position in the CSV.
# Presumably these match the spellings used in the chocolate data - verify
# against the CSV if it is ever regenerated.
# The update uses a single .iloc[row, column] setter: the chained form
# continents.iloc[i]['Country'] = ... may write to a temporary row object and
# never updates the frame under pandas Copy-on-Write.
country_renames = {
    12: 'DR Congo',
    39: 'Sao Tome & Principe',
    72: 'South Korea',
    85: 'Russia',
    94: 'U.A.E.',
    109: 'Czech Republic',
    143: 'U.K.',
    164: 'St. Lucia',
    165: 'St.Vincent-Grenadines',
    167: 'U.S.A.',
}
country_col = continents.columns.get_loc('Country')
for position, name in country_renames.items():
    continents.iloc[position, country_col] = name

# Append extra rows to the lookup. Values are given in the table's column
# order, i.e. continent first, then country.
extra_rows = pd.DataFrame(
    [
        ['Asia', 'Taiwan'],
        ['North America', 'Puerto Rico'],
        ['North America', 'Martinique'],
    ],
    columns=continents.columns,
)
continents = pd.concat([continents, extra_rows], ignore_index=True)
# Attach the continent of each company's location. The left join keeps every
# review, including those whose location has no match in the lookup.
# The helper 'Country' key is dropped afterwards and 'Continent' is renamed
# to follow the frame's snake_case convention.
columns = {'Continent': 'continent'}
df = (
    df.merge(right=continents, how='left', left_on='company_location', right_on='Country')
      .drop(columns='Country')
      .rename(columns=columns)
)

In [9]:
# Show the distinct values of the bean-origin column.
df["country_bean_origin"].unique()

array(['Tanzania', 'Dominican Republic', 'Madagascar', 'Fiji',
       'Venezuela', 'Uganda', 'India', 'Bolivia', 'Peru', 'Panama',
       'Colombia', 'Burma', 'Brazil', 'Papua New Guinea', 'Ecuador',
       'Cuba', 'Togo', 'Sao Tome', 'Mexico', 'Vanuatu', 'Indonesia',
       'Trinidad', 'Vietnam', 'Nicaragua', 'Ghana', 'Belize', 'Blend',
       'Jamaica', 'Grenada', 'Guatemala', 'Honduras', 'Costa Rica',
       'Haiti', 'Congo', 'Philippines', 'Solomon Islands', 'Malaysia',
       'Sri Lanka', 'Ivory Coast', 'Gabon', 'Taiwan', 'Puerto Rico',
       'Martinique', 'St. Lucia', 'Australia', 'Liberia', 'Sierra Leone',
       'U.S.A.', 'Nigeria', 'St.Vincent-Grenadines', 'Thailand', 'Tobago',
       'Sao Tome & Principe', 'Sumatra', 'El Salvador', 'Cameroon',
       'Samoa', 'China', 'Principe', 'Sulawesi', 'Suriname', 'DR Congo'],
      dtype=object)

In [51]:
def update(liste, coef):
    """Return a new list holding every element of `liste` multiplied by `coef`.

    The input list is left untouched.
    """
    scaled = []
    for value in liste:
        scaled.append(value * coef)
    return scaled

In [3]:
# Preview the first rows of the prepared frame.
df.head()

Unnamed: 0,company,alt_company_name,company_location,review_year,country_bean_origin,intra_country_prod,specific_bean_origin_or_bar_name,percent_cocoa,num_ingredients,beans,sugar,sweetener_other,cocoa_butter,vanilla,lecithin,salt,characteristics,rating,continent
0,5150,,U.S.A.,2019,Tanzania,0,"Kokoa Kamili, batch 1",0.76,3,1,1,0,1,0,0,0,"rich cocoa, fatty, bready",3.25,North America
1,5150,,U.S.A.,2019,Dominican Republic,0,"Zorzal, batch 1",0.76,3,1,1,0,1,0,0,0,"cocoa, vegetal, savory",3.5,North America
2,5150,,U.S.A.,2019,Madagascar,0,"Bejofo Estate, batch 1",0.76,3,1,1,0,1,0,0,0,"cocoa, blackberry, full body",3.75,North America
3,5150,,U.S.A.,2021,Fiji,0,"Matasawalevu, batch 1",0.68,3,1,1,0,1,0,0,0,"chewy, off, rubbery",3.0,North America
4,5150,,U.S.A.,2021,Venezuela,0,"Sur del Lago, batch 1",0.72,3,1,1,0,1,0,0,0,"fatty, earthy, moss, nutty,chalky",3.0,North America


In [67]:
# Build the per-country data for the radar chart.
# Example selection; in the app these two names depend on the user's choice.
country_1 = "Tanzania"
country_2 = "Cuba"
countries = [country_1, country_2]
# Keep only the bars whose beans come from one of the selected countries.
df_countries_filtered = df[df.country_bean_origin.isin(countries)]
# Average each 0/1 ingredient flag per country (the mean is the share of bars
# containing that ingredient).
# groupby() sorts its keys alphabetically, so the rows are re-ordered to match
# `countries`; otherwise row 0 would be "Cuba" while being plotted under the
# label of country_1 ("Tanzania"). A country without any bars yields a row of
# NaN instead of shifting the other country into its slot.
df_countries_filtered = (
    df_countries_filtered[["country_bean_origin", "salt", "lecithin", "sugar", "vanilla", "cocoa_butter", "sweetener_other"]]
    .groupby("country_bean_origin")
    .mean()
    .reindex(countries)
)
# Store the values as plain lists, scaled from fractions to percentages.
list_countries = df_countries_filtered.to_numpy().tolist()
country_1_values = update(list_countries[0], 100)
country_2_values = update(list_countries[1], 100)

In [68]:
# Look up the continent of each selected country in the lookup table.
continent_1 = continents.loc[continents['Country'] == country_1, 'Continent'].iloc[0]
continent_2 = continents.loc[continents['Country'] == country_2, 'Continent'].iloc[0]
print(f"Continent 1 : {continent_1} , Continent 2 : {continent_2}")
continents_list = [continent_1, continent_2]
# Keep only the bars attached to one of these continents.
# NOTE(review): df.continent comes from the merge on company_location, while
# continent_1/2 are the continents of the bean-origin countries - confirm
# that comparing against the manufacturers' continent is what is intended.
df_continents_filtered = df[df.continent.isin(continents_list)]
# Average each 0/1 ingredient flag per continent.
# groupby() sorts its keys and collapses duplicates, so the rows are
# re-ordered to match `continents_list`. This keeps row 0 / row 1 aligned with
# continent_1 / continent_2 and still provides two rows when both countries
# belong to the same continent (previously an IndexError).
df_continents_filtered = (
    df_continents_filtered[["continent", "salt", "lecithin", "sugar", "vanilla", "cocoa_butter", "sweetener_other"]]
    .groupby("continent")
    .mean()
    .reindex(continents_list)
)
# Plain lists, scaled from fractions to percentages.
list_continents = df_continents_filtered.to_numpy().tolist()
continents_1_values = update(list_continents[0], 100)
continents_2_values = update(list_continents[1], 100)


Continent 1 : Africa , Continent 2 : North America


In [90]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Axis labels of the radar chart, in the same order as the averaged columns.
categories = ["Salt","Lecithin","Sugar","Vanilla","Cocoa Butter","Other Sweetener"]


def _radar_trace(values, label, filled, **trace_kwargs):
    """Build one polar trace over the shared `categories` axis.

    `filled` selects between a filled shape ('toself') and an outline only
    ('none'); extra keyword arguments are forwarded to go.Scatterpolar.
    """
    return go.Scatterpolar(
        r=values,
        theta=categories,
        fill='toself' if filled else 'none',
        name=label,
        **trace_kwargs,
    )


# One polar subplot per selected country, side by side.
fig = make_subplots(rows=1, cols=2, specs=[[{'type': 'polar'}]*2])

# Left panel: first country (filled) against its continent average (outline).
fig.add_trace(_radar_trace(country_1_values, country_1, True))
fig.add_trace(_radar_trace(continents_1_values, f'Average in {continent_1}', False))

# Right panel: second country against its continent average.
fig.add_trace(_radar_trace(country_2_values, country_2, True, subplot="polar2"), 1, 2)
fig.add_trace(
    _radar_trace(continents_2_values, f'Average in {continent_2}', False, subplot="polar2"),
    1, 2,
)

# Both radial axes show percentages on a fixed 0-100 scale so the two panels
# are directly comparable.
percent_axis = dict(visible=True, range=[0, 100])
chart_title = dict(
    text=f"Average use of sweeteners in chocolate bars from beans of {country_1} and {country_2}",
    xref='container',
    x=0.5,
)
fig.update_layout(
    title=chart_title,
    polar=dict(radialaxis=percent_axis),
    polar2=dict(radialaxis=percent_axis),
    showlegend=True,
    legend=dict(orientation ='h',xanchor='auto',yanchor='auto',x=0.12),
)

fig.show()

In [2]:
import streamlit as st
# Ingredient labels offered to the user.
ingredient_options = [
    'Lecithin', 'Salt', 'Sugar',
    'Vanilla', 'Cocoa Butter', 'Other Sweetener',
]
# The user's selection is stored in `categories`, which can be used in place
# of the hard-coded category list of the charts to restrict them to a subset.
categories = st.multiselect('Select your ingredients', ingredient_options)
#st.write('You selected:', categories)

2022-03-28 11:22:42.266 
  command:

    streamlit run C:\Users\axeld\anaconda3\lib\site-packages\ipykernel_launcher.py [ARGUMENTS]


In [9]:
# Preview the first rows of the prepared frame (same check as earlier in the notebook).
df.head()

Unnamed: 0,company,alt_company_name,company_location,review_year,country_bean_origin,intra_country_prod,specific_bean_origin_or_bar_name,percent_cocoa,num_ingredients,beans,sugar,sweetener_other,cocoa_butter,vanilla,lecithin,salt,characteristics,rating,continent
0,5150,,U.S.A.,2019,Tanzania,0,"Kokoa Kamili, batch 1",0.76,3,1,1,0,1,0,0,0,"rich cocoa, fatty, bready",3.25,North America
1,5150,,U.S.A.,2019,Dominican Republic,0,"Zorzal, batch 1",0.76,3,1,1,0,1,0,0,0,"cocoa, vegetal, savory",3.5,North America
2,5150,,U.S.A.,2019,Madagascar,0,"Bejofo Estate, batch 1",0.76,3,1,1,0,1,0,0,0,"cocoa, blackberry, full body",3.75,North America
3,5150,,U.S.A.,2021,Fiji,0,"Matasawalevu, batch 1",0.68,3,1,1,0,1,0,0,0,"chewy, off, rubbery",3.0,North America
4,5150,,U.S.A.,2021,Venezuela,0,"Sur del Lago, batch 1",0.72,3,1,1,0,1,0,0,0,"fatty, earthy, moss, nutty,chalky",3.0,North America


In [10]:
# Offer each bar name once, in alphabetical order. Passing the raw column
# would list a name once per review of that bar, which fills the dropdown
# with duplicates.
bar_names = sorted(df['specific_bean_origin_or_bar_name'].dropna().unique())
option = st.selectbox(
     'Choose directly your favorite bar', bar_names
     )

# The dropdown gives the user an alternative to the filtering made from the map.