# Importing the libraries

In [None]:
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt 
# Apply seaborn's default theme to every figure drawn below.
sb.set_theme()

# Importing the cleaned data set

In [None]:
# Load the cleaned data set.
# `rating_category` is read as text on purpose: the cells below select the
# crosstab columns '2'/'1'/'0', key the colour palette on those strings and
# filter with == "2", so the labels must be strings. Without an explicit dtype
# read_csv infers an integer column when the file stores bare digits.
# NOTE(review): assumes the CSV stores the categories as 0/1/2 - confirm.
choc = pd.read_csv('cleaned_chocolate.csv', dtype = {'rating_category': str})

# Visualizing the distribution of cocoa percent

In [None]:
# Histogram of cocoa_percent in bins of width 2, with a KDE curve overlaid.
sb.displot(data = choc, x = 'cocoa_percent',
           kde = True, binwidth = 2,
           height = 7, aspect = 2)
plt.show()

In [None]:
# Horizontal box plot of cocoa_percent on a wide, short canvas.
cocoa_raw = choc["cocoa_percent"]
f = plt.figure(figsize = (16, 4))
sb.boxplot(data = cocoa_raw, orient = 'h')

## Removing outliers for cocoa_percent using IQR rule

In [None]:
# IQR rule: keep only rows whose cocoa_percent lies within 1.5 * IQR of the
# first and third quartiles (both bounds inclusive).
q1 = choc["cocoa_percent"].quantile(0.25)
q3 = choc["cocoa_percent"].quantile(0.75)
iqr = q3 - q1
lower_fence = q1 - 1.5 * iqr
upper_fence = q3 + 1.5 * iqr
choc = choc[choc["cocoa_percent"].between(lower_fence, upper_fence)]

In [None]:
# Same horizontal box plot, redrawn after the IQR filter above.
cocoa_trimmed = choc["cocoa_percent"]
f = plt.figure(figsize = (16, 4))
sb.boxplot(data = cocoa_trimmed, orient = 'h')

## Relationship between cocoa_percent and rating_category

In [None]:
# Cocoa percent is distributed similarly in every rating category, so on its
# own it does not look like an important factor for the rating.
f = plt.figure(figsize = (16, 12))
sb.boxplot(data = choc, x = "cocoa_percent", y = "rating_category")

# Relationship between Ingredient and proportion of each rating category

For each ingredient, we compare the distribution of rating categories between chocolate bars that contain the ingredient and bars that do not.

## Creating helper functions to calculate the percentages

In [None]:
# helper functions for plotting
def ratingProportionPlot(data, y, ylabel = None, title = None):
    """Draw a stacked horizontal bar chart of rating-category shares.

    One bar is drawn per distinct value of ``data[y]``. Each bar is split into
    the row-normalised shares of rating categories '2', '1' and '0', in that
    order.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the column named by `y` and a 'rating_category' column
        whose labels are the strings '2', '1' and '0'.
    y : str
        Name of the column whose values become the bars.
    ylabel : str, optional
        Label for the category (vertical) axis. Defaults to `y`.
    title : str, optional
        Figure title.

    Raises
    ------
    KeyError
        If any of the labels '2', '1', '0' does not occur in
        ``data['rating_category']``.
    """
    # Figure height grows with the number of bars so they stay readable.
    n_levels = data[y].nunique()
    shares = pd.crosstab(index = data[y],
                         columns = data['rating_category'],
                         normalize = 'index')[['2', '1', '0']]
    ax = shares.plot(kind = 'barh',
                     stacked = True,
                     figsize = (15, n_levels * 1.5),
                     title = title)
    # Label the axes on the Axes object instead of through plot(xlabel=...):
    # which axis that keyword labels on a barh chart depends on the pandas
    # version, whereas these calls always target the intended axis.
    ax.set_ylabel(y if ylabel is None else ylabel)
    ax.set_xlabel('Proportion')
    plt.show()

def ratingPercentagePlot(data, x, xlabel, height = 5, aspect = 1, ylim = 50):
    """Draw grouped bars with the percentage of each rating category per level of `x`.

    Within every distinct value of ``data[x]`` the rating categories are
    counted and normalised to percentages that sum to 100. Each bar is
    annotated with its percentage, rounded to one decimal place.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the column named by `x` and a 'rating_category' column
        whose labels are the strings '2', '1' and '0' (the palette keys).
    x : str
        Name of the grouping column.
    xlabel : str
        Display name for the grouping column on the x axis.
    height, aspect : float
        Passed through to ``seaborn.catplot``.
    ylim : float
        Upper limit of the percentage axis.
    """
    # Fixed colour per rating category so every chart uses the same colours.
    palette = {'2':(0.2980392156862745, 0.4470588235294118, 0.6901960784313725),
               '1':(0.8666666666666667, 0.5176470588235295, 0.3215686274509804),
               '0':(0.3333333333333333, 0.6588235294117647, 0.40784313725490196)}
    hue_label = 'Rating Category'

    # Name both index levels explicitly before reset_index(). Relying on the
    # auto-generated 'level_1' column only works when the value-count level is
    # unnamed, which depends on the pandas version and the column dtype.
    shares = (data.groupby(x)['rating_category']
                  .value_counts(normalize = True)
                  .mul(100)
                  .rename('Percentage')
                  .rename_axis([xlabel, hue_label])
                  .reset_index())

    g = sb.catplot(x = xlabel, y = 'Percentage', hue = hue_label, kind = 'bar',
                   data = shares, palette = palette, height = height, aspect = aspect)
    g.ax.set_ylim(0, ylim)

    for bar in g.ax.patches:
        bar_height = bar.get_height()
        # Bars for missing group/category combinations have no finite height;
        # there is nothing meaningful to print for them.
        if not np.isfinite(bar_height):
            continue
        # Format with an f-string so this works whether the height is a plain
        # float or a NumPy scalar (plain floats have no .round method).
        g.ax.text(bar.get_x(), bar_height, f'{bar_height:.1f}%')

    plt.show()

Cocoa Butter

In [None]:
# Stacked rating-category shares for each value of the cocoa_butter column.
ratingProportionPlot(choc, 'cocoa_butter',
                     ylabel = 'cocoa butter',
                     title = 'Proportion of each rating category by cocoa butter')

In [None]:
# Grouped percentage bars for each value of the cocoa_butter column.
ratingPercentagePlot(choc, x = 'cocoa_butter', xlabel = 'Cocoa Butter')

Vanilla

In [None]:
# Stacked rating-category shares for each value of the vanilla column.
ratingProportionPlot(choc, 'vanilla',
                     ylabel = 'vanilla',
                     title = 'Proportion of each rating category by vanilla')

In [None]:
# Grouped percentage bars for each value of the vanilla column.
ratingPercentagePlot(choc, x = 'vanilla', xlabel = 'Vanilla')

Lecithin

In [None]:
# Stacked rating-category shares for each value of the lecithin column.
ratingProportionPlot(choc, 'lecithin',
                     ylabel = 'lecithin',
                     title = 'Proportion of each rating category by lecithin')

In [None]:
# Grouped percentage bars for each value of the lecithin column.
ratingPercentagePlot(choc, x = 'lecithin', xlabel = 'Lecithin')

Salt

In [None]:
# Stacked rating-category shares for each value of the salt column.
ratingProportionPlot(choc, 'salt',
                     ylabel = 'salt',
                     title = 'Proportion of each rating category by salt')

In [None]:
# Grouped percentage bars for each value of the salt column.
ratingPercentagePlot(choc, x = 'salt', xlabel = 'Salt')

Sugar

In [None]:
# Stacked rating-category shares for each value of the sugar column.
ratingProportionPlot(choc, 'sugar',
                     ylabel = 'sugar',
                     title = 'Proportion of each rating category by sugar')

In [None]:
# Grouped percentage bars for each value of the sugar column.
ratingPercentagePlot(choc, x = 'sugar', xlabel = 'Sugar')

Sweetener

In [None]:
# Stacked rating-category shares for each value of sweetener_without_sugar.
ratingProportionPlot(choc, 'sweetener_without_sugar',
                     ylabel = 'sweetener',
                     title = 'Proportion of each rating category by sweetener')

In [None]:
# Grouped percentage bars for each value of sweetener_without_sugar.
ratingPercentagePlot(choc, x = 'sweetener_without_sugar', xlabel = 'Sweetener w/o Sugar')

Correlation between Sugar and Sweetener without Sugar

In [None]:
# Correlation between the sugar and sweetener_without_sugar columns, both
# cast to integers first, shown as an annotated heatmap.
sugars = choc[['sugar', 'sweetener_without_sugar']].astype('int')
sugar_corr = sugars.corr()
sb.heatmap(sugar_corr, annot = True)
plt.show()

We observe that "sugar" and "sweetener_without_sugar" are perfectly negatively correlated with each other, so we can drop one of the two columns.
Interesting note: every dark chocolate bar in our dataset contains some form of sweetener.

In [None]:
# The column carries no information beyond `sugar`, so drop it.
choc = choc.drop(columns = ['sweetener_without_sugar'])

## Frequency of the number of ingredients

In [None]:
# Number of chocolate bars for each ingredient count.
sb.catplot(data = choc, x = 'counts_of_ingredients', kind = 'count')

Most dark chocolate bars have between 2 and 5 ingredients, with bars that have 5 ingredients being quite rare.
Chocolate bars with all 6 ingredients are clear outliers, so we can remove them. Their rarity could be due to the fact that the columns 'sugar' and 'sweetener_without_sugar' are negatively correlated with each other.

In [None]:
# Keep bars with at most 5 ingredients, then redraw the count plot.
max_ingredients = 5
choc = choc.loc[choc['counts_of_ingredients'].le(max_ingredients)]
sb.catplot(data = choc, x = 'counts_of_ingredients', kind = 'count')

## Number of ingredients and the rating proportions

In [None]:
# Stacked rating-category shares for each ingredient count.
ratingProportionPlot(choc, 'counts_of_ingredients',
                     ylabel = 'Number of ingredients',
                     title = 'Proportion of each rating category by number of ingredients')

In [None]:
# Grouped percentage bars for each ingredient count, on a larger canvas.
ratingPercentagePlot(choc, x = 'counts_of_ingredients',
                     xlabel = 'Number of Ingredients',
                     height = 7, aspect = 1.4)

## Number of taste

In [None]:
# Number of chocolate bars for each value of number_of_taste.
sb.catplot(data = choc, x = 'number_of_taste', kind = 'count')

In [None]:
# Stacked rating-category shares for each value of number_of_taste.
ratingProportionPlot(choc, 'number_of_taste', ylabel = 'Number of Taste')

In [None]:
# Grouped percentage bars for each value of number_of_taste; the percentage
# axis is extended to 60 for this chart.
ratingPercentagePlot(choc, x = 'number_of_taste',
                     xlabel = 'Number of Taste',
                     height = 7, aspect = 1.4, ylim = 60)

## Correlation between different ingredients and rating category

In [None]:
# Pairwise correlations between the ingredient columns and rating_category,
# all cast to integers, drawn as an annotated heatmap.
heatmap_columns = ['cocoa_butter', 'vanilla', 'lecithin', 'salt',
                   'sugar', 'rating_category']
fig = plt.figure(figsize = (10, 10))
ingredients = choc[heatmap_columns].astype('int')
sb.heatmap(ingredients.corr(), annot = True, cmap = 'PuBu')
plt.show()

## Further investigation using the Chi-squared Test

Importing the library

In [None]:
from scipy.stats import chi2_contingency

In [None]:
# Chi-squared test of independence between each ingredient column and
# rating_category; only the p-value of each test is kept.
names = [column for column in ingredients.columns if column != 'rating_category']
pvalues = []

for name in names:
    contingency = pd.crosstab(choc[name], choc['rating_category'])
    # The p-value is the second element of the result.
    pvalues.append(chi2_contingency(contingency)[1])

pvalues_df = pd.DataFrame({'ingredients': names, 'p-value': pvalues})
pvalues_df

In [None]:
# One bar per ingredient showing its chi-squared p-value.
sb.barplot(data = pvalues_df, x = 'ingredients', y = 'p-value')
plt.show()

Only 'vanilla' and 'sugar' have p-values small enough to indicate a statistically significant association with 'rating_category'.

## Relationship between Country of Bean Origin and Proportion of Rating Category

Extracting countries with at least 40 data points for visualization

In [None]:
# Count rows per bean origin. The Series name and index name are set
# explicitly because value_counts() names its result differently across
# pandas versions; this way the count column is always called
# 'country_of_bean_origin' and reset_index() always yields an 'index' column
# holding the country names, which the next cell reads.
beanOriginCounts = (choc['country_of_bean_origin']
                    .value_counts()
                    .rename('country_of_bean_origin')
                    .rename_axis(None)
                    .to_frame())
# "At least 40 data points" means the threshold itself is included (>=),
# matching the >= used for the taste threshold further down.
beanOriginTop = beanOriginCounts[beanOriginCounts['country_of_bean_origin'] >= 40].reset_index()
beanOriginTop

In [None]:
# Row-normalised rating-category shares for the frequent bean origins.
# Every row belongs to exactly one origin, so a single filter plus one
# crosstab replaces the per-country loop that grew a frame with concat.
# Both crosstab inputs come from the same filtered frame instead of relying
# on index alignment against the full data set.
topOrigins = beanOriginTop['index']
fromTopOrigins = choc[choc['country_of_bean_origin'].isin(topOrigins)]
bean = pd.crosstab(index = fromTopOrigins['country_of_bean_origin'],
                   columns = fromTopOrigins['rating_category'],
                   normalize = 'index')

# Fixed segment order, then sort the bars by the share of category '2'.
bean = bean[['2', '1', '0']].sort_values('2')

bean.plot(kind = 'barh', stacked = True, figsize = (15,7))
plt.show()

This time we observe quite a clear difference in the rating category distribution across bean origins.

## Relationship between Taste and Proportion of Rating Category

Extracting tastes with at least 50 data points for visualization

In [None]:
# `chocoTaste` is not defined anywhere earlier in this notebook, so on a fresh
# kernel the filter below raised NameError. Build it here from the four taste
# columns: one row per distinct taste with the number of times it occurs.
# NOTE(review): assumes a taste is counted once per column it appears in and
# that missing tastes are stored as NaN (value_counts skips NaN) - confirm
# against the notebook that originally produced chocoTaste.
tasteColumns = ['first_taste', 'second_taste', 'third_taste', 'fourth_taste']
allTastes = pd.concat([choc[column] for column in tasteColumns], ignore_index = True)
chocoTaste = (allTastes.value_counts()
                       .rename_axis('taste')
                       .rename('count_of_taste')
                       .reset_index())

chocoTasteTop = chocoTaste[chocoTaste['count_of_taste'] >= 50]
chocoTasteTop

In [None]:
# Row-normalised rating-category shares for each frequent taste.
# A row can carry several tastes, so each taste needs its own crosstab; the
# per-taste frames are collected in a list and concatenated once instead of
# growing a frame inside the loop behind a first-iteration flag.
tasteColumns = ['first_taste', 'second_taste', 'third_taste', 'fourth_taste']
perTaste = []
for taste in chocoTasteTop['taste']:
    # Rows where the taste appears in any of the four taste columns.
    hasTaste = choc[tasteColumns].eq(taste).any(axis = 1)
    ratings = choc.loc[hasTaste, 'rating_category']
    # Constant label Series so the crosstab yields one row named after the taste.
    label = pd.Series(taste, index = ratings.index, name = '')
    perTaste.append(pd.crosstab(index = label,
                                columns = ratings,
                                normalize = 'index'))

tastes = pd.concat(perTaste, axis = 0)

# Fixed segment order, then sort the bars by the share of category '2'.
tastes = tastes[['2', '1', '0']].sort_values('2')

tastes.plot(kind = 'barh', stacked = True, figsize = (15,15))
plt.show()

Once again, we observe a clear impact on rating category based on its described taste and feeling

## Examining the relationships between different groups of variables

Between cocoa_percent, counts_of_ingredients and rating_category

In [None]:
# Cocoa percent per rating category, split by number of ingredients.
f = plt.figure(figsize = (15, 10))
sb.boxplot(data = choc,
           x = "rating_category", y = "cocoa_percent",
           hue = "counts_of_ingredients", orient = "v")

Some combinations of counts_of_ingredients and cocoa_percent are more favorable for higher ratings, such as a cocoa_percent in the 66-70% range with 5 ingredients.

Between cocoa_percent, number_of_taste and rating_category

In [None]:
# Cocoa percent per rating category, split by number of tastes.
f = plt.figure(figsize = (15, 10))
sb.boxplot(data = choc,
           x = "rating_category", y = "cocoa_percent",
           hue = "number_of_taste", orient = "v")

Investigating different rating categories

In [None]:
# Split the data into one frame per rating category: "2", "1" and "0".
high, mid, low = (choc[choc["rating_category"] == label]
                  for label in ("2", "1", "0"))

In [None]:
# Rating category "2": number of bars for each combination of number of
# tastes and ingredient count.
new = (high.groupby('number_of_taste')['counts_of_ingredients']
           .value_counts()
           .rename('Count')
           .rename_axis(['Number of Taste', 'counts_of_ingredients'])
           .reset_index())

g = sb.catplot(x='Number of Taste', y='Count', hue='counts_of_ingredients', kind='bar', data=new,
               height=7, aspect=1.4)

for bar in g.ax.patches:
    bar_height = bar.get_height()
    # Combinations with no rows produce bars with a NaN height; label them 0
    # at the baseline instead of positioning text at NaN.
    count = 0 if np.isnan(bar_height) else int(bar_height)
    # Centre the label on the bar itself rather than padding the text with
    # spaces, which only looks centred for one particular bar width and font.
    g.ax.text(bar.get_x() + bar.get_width() / 2, count, str(count),
              ha='center', va='bottom')

plt.show()

In [None]:
# Rating category "1": number of bars for each combination of number of
# tastes and ingredient count.
new = (mid.groupby('number_of_taste')['counts_of_ingredients']
          .value_counts()
          .rename('Count')
          .rename_axis(['Number of Taste', 'counts_of_ingredients'])
          .reset_index())

g = sb.catplot(x='Number of Taste', y='Count', hue='counts_of_ingredients', kind='bar', data=new,
               height=7, aspect=1.4)

for bar in g.ax.patches:
    bar_height = bar.get_height()
    # Combinations with no rows produce bars with a NaN height, and int(NaN)
    # raises ValueError; label those bars 0 at the baseline instead.
    count = 0 if np.isnan(bar_height) else int(bar_height)
    # Centre the label on the bar itself rather than padding the text with
    # spaces, which only looks centred for one particular bar width and font.
    g.ax.text(bar.get_x() + bar.get_width() / 2, count, str(count),
              ha='center', va='bottom')

plt.show()

In [None]:
# Rating category "0": number of bars for each combination of number of
# tastes and ingredient count.
new = (low.groupby('number_of_taste')['counts_of_ingredients']
          .value_counts()
          .rename('Count')
          .rename_axis(['Number of Taste', 'counts_of_ingredients'])
          .reset_index())

g = sb.catplot(x='Number of Taste', y='Count', hue='counts_of_ingredients', kind='bar', data=new,
               height=7, aspect=1.4)

for bar in g.ax.patches:
    bar_height = bar.get_height()
    # Combinations with no rows produce bars with a NaN height, and int(NaN)
    # raises ValueError; label those bars 0 at the baseline instead.
    count = 0 if np.isnan(bar_height) else int(bar_height)
    # Centre the label on the bar itself rather than padding the text with
    # spaces, which only looks centred for one particular bar width and font.
    g.ax.text(bar.get_x() + bar.get_width() / 2, count, str(count),
              ha='center', va='bottom')

plt.show()