In [None]:
import numpy as np
import pandas as pd

%pylab inline
pylab.rcParams['figure.figsize'] = (10, 6)

import matplotlib.pyplot as plt
plt.style.use('ggplot')


df_cacao = pd.read_csv('../input/flavors_of_cacao.csv')

df_cacao.columns


First, let's rename the columns for easier access, then check the data and the column types.

In [None]:
# Short snake_case labels, assigned positionally to the existing columns.
column_names = [
    'company',
    'origin_or_name',
    'ref',
    'review_date',
    'cocoa_percent',
    'company_location',
    'rating',
    'bean_type',
    'broad_bean_origin',
]
df_cacao.columns = column_names

In [None]:
# Preview the first rows of the table.
df_cacao.head()

In [None]:
# List the dtype pandas assigned to each column.
df_cacao.dtypes

We can see that 'cocoa_percent' is of type 'object'. Let's convert it to a more appropriate type.

In [None]:
# Strip the '%' sign and convert the column to float.
# Casting to str first keeps this cell re-runnable: once the column is float,
# the `.str` accessor would raise AttributeError. `regex=False` makes the
# replacement a literal match regardless of the pandas default.
df_cacao['cocoa_percent'] = (
    df_cacao['cocoa_percent']
    .astype(str)
    .str.replace('%', '', regex=False)
    .astype(float)
)

Now we can make some basic plots in order to get to know the data.

# Companies

In [None]:
# Number of non-null entries per company, keeping the ten largest counts.
company_counts = df_cacao.groupby('company')['company'].count()
df_ten_companies = (
    company_counts
    .sort_values(ascending=False)
    .iloc[:10]
    .to_frame(name='Count')
)

df_ten_companies

In [None]:
# Horizontal bar chart with one bar per row of the top-ten companies table.
df_ten_companies.plot(kind='barh', title='Top 10 companies')

# Countries

In [None]:
# Number of non-null entries per company location, keeping the ten largest counts.
location_counts = df_cacao.groupby('company_location')['company_location'].count()
df_ten_countries = (
    location_counts
    .sort_values(ascending=False)
    .iloc[:10]
    .to_frame(name='Count')
)
df_ten_countries

In [None]:
# Horizontal bar chart with one bar per row of the top-ten countries table.
df_ten_countries.plot(kind='barh', title='Top 10 countries')

# Cocoa percentage

In [None]:
# Non-null 'ref' entries per cocoa percentage, most frequent first, top 15 kept.
percent_counts = df_cacao.groupby('cocoa_percent')['ref'].count()
df_cocoa_percentage = (
    percent_counts
    .sort_values(ascending=False)
    .reset_index()
    .iloc[:15]
)
df_cocoa_percentage.columns = ['Cocoa percentage', 'Number of chocolate']

# Bars appear in the frame's row order, i.e. by descending count.
df_cocoa_percentage.plot.bar(
    x='Cocoa percentage',
    title='Distribution of chocolate over their cocoa percentage',
)

We can see that 70% is used far more than the others. Let's now take a look at the distribution of the percentage for the top 10 countries from the Countries section.

In [None]:
# One violin per top-ten country, built from that country's cocoa percentages.
location = df_ten_countries.index
df_vio = [
    df_cacao.loc[df_cacao['company_location'] == loc, 'cocoa_percent']
    for loc in location
]

fig, axes = plt.subplots()
axes.violinplot(df_vio, showmeans=True)

# Tick positions 1..N, one per entry of `location`, labelled with the country names.
tick_positions = np.arange(len(location)) + 1
axes.set_xticks(tick_positions)
axes.set_xticklabels(location)
axes.set_title('Cocoa % distribution of the 10 first countries')

We can see that each mean sits just a little above the 70% mark. We can validate this point by looking at the previous bar chart, where the three most common percentages are 70%, 75% and 72%.

From now on, I'll explore the several "inspirations" proposed with the dataset.
# Where are the best cocoa beans grown ?

We have to manipulate the 'rating' and 'broad_bean_origin' columns. For this kind of question, the variance is also an interesting insight into the value, so we will print it as well.


In [None]:
# Mean, variance and number of ratings for each bean origin, best mean first.
# Missing statistics are filled with 0 so the secondary sort key is always
# defined. `fillna` avoids the `np.NaN` alias, which NumPy 2.0 removed.
df_best_beans = (
    df_cacao.groupby('broad_bean_origin')['rating']
    .agg(['mean', 'var', 'count'])
    .fillna(0)
    .sort_values(['mean', 'var'], ascending=[False, False])
)
df_best_beans.head()

As we can see, the top-ranked origins each provide only one kind of cocoa bean.

In [None]:
# Average of the per-origin 'count' column.
ratings_per_origin = df_best_beans['count']
ratings_per_origin.mean()

On average, a "place" grows 17.94 cocoa types. If we tweak the initial question a little and take into account the number of cocoa beans, we will find the places that offer both great quality and variety.

In [None]:
# Keep the 20 origins with the highest 'count', then rank those by mean rating.
most_rated_origins = df_best_beans.sort_values('count', ascending=False).iloc[:20]
df_best_beans = most_rated_origins.sort_values('mean', ascending=False)
df_best_beans.head()

Guatemala seems to be the place to be if you are a bean lover!

In [None]:
# Bar height is the mean rating; the error bars are taken from the 'var' column
# (variance, not standard deviation).
df_best_beans['mean'].plot(
    kind='bar',
    yerr=df_best_beans['var'],
    title="Places with high rating beans",
)

The first places are really close to each other regarding the beans rating. 
Being in a place with great cocoa beans doesn't really matter if you cannot enjoy it as chocolate bars. This leads to the second proposed question.

# Which countries produce the highest-rated bars

This time, we are working with 'company_location' instead of 'broad_bean_origin'.




In [None]:
# Mean, variance and number of ratings for each company location, best mean first.
# Missing statistics are filled with 0 so the secondary sort key is always
# defined. `fillna` avoids the `np.NaN` alias, which NumPy 2.0 removed.
df_highest = (
    df_cacao.groupby('company_location')['rating']
    .agg(['mean', 'var', 'count'])
    .fillna(0)
    .sort_values(['mean', 'var'], ascending=[False, False])
)
df_highest.head()

Here we can see the top-ranking countries. Unfortunately, 4 kinds of bars is not what we could call variety. Let’s make the same tweak as before and see what we get.

In [None]:
# Keep the 20 locations with the highest 'count', then rank those by mean rating.
most_rated_locations = df_highest.sort_values('count', ascending=False).iloc[:20]
df_highest = most_rated_locations.sort_values('mean', ascending=False)

df_highest.head()

In [None]:
# Bar height is the mean rating; the error bars are taken from the 'var' column
# (variance, not standard deviation). A title is set so the figure stands alone.
df_highest.plot.bar(y='mean', yerr='var', title='Countries with high rating bars')

At last, let's visualize the rating violins for these locations


In [None]:
# One violin per location in `df_highest`, built from that location's ratings.
location = df_highest.index

df_vio = [
    df_cacao.loc[df_cacao['company_location'] == loc, 'rating']
    for loc in location
]
fig, axes = plt.subplots(figsize=(14, 10))
axes.violinplot(df_vio, showmeans=True)

# Tick positions 1..N, one per entry of `location`; labels are rotated 45 degrees.
tick_positions = np.arange(len(location)) + 1
axes.set_xticks(tick_positions)
axes.set_xticklabels(location)
axes.xaxis.set_tick_params(rotation=45)
axes.set_title('Rating distribution of the 20 first countries')

From this violin plot, we can see several things:

- The means are very close to each other and decrease slowly
- Ratings over 4 are rare
- Even in the "best" locations you can get a badly rated chocolate bar

# Conclusion

That's it for our exploration. Feedback is much appreciated.
