In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

In [None]:
# Global plot styling: apply the fivethirtyeight theme first, then override
# figure size, resolution and font size on top of it.
plt.style.use('fivethirtyeight')
plt.rcParams.update({
    'figure.figsize': [6.0, 6.0],
    'figure.dpi': 200,
    'font.size': 11,
})

In [None]:
# Load the prepared data set; the path is relative to the working directory.
df = pd.read_csv(filepath_or_buffer='cacao_ready.csv')

In [None]:
# Best-rated subset: records with a Rating of 4.0 or higher.
df_best = df[df['Rating'].ge(4.0)]

In [None]:
# Worst-rated subset: records with a Rating of 2.0 or lower.
df_worst = df[df['Rating'].le(2.0)]

In [None]:
# Print the current working directory; the relative paths used in this
# notebook (the input CSV and the saved figures) resolve against it.
print(os.getcwd())

In [None]:
#path = "../vizs"

#try:
#    os.mkdir(path)
#except OSError:
#    pass

In [None]:

# Distribution of Rating over all records, saved as a PNG.
plt.hist(df['Rating'], bins=17, color='magenta', edgecolor='white')
plt.ylabel('Number of records')
plt.title('Rating')
plt.tight_layout()

plt.savefig('rating_counts.png', dpi=200);

In [None]:
# Rating of every record plotted against its bean type group.
bean_type_group = df['Bean Type Group']
plt.scatter(bean_type_group, df['Rating'], color='brown', marker='o')
plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig('Bean_type_rating.png', dpi=800);
# plt.show()

In [None]:
# Exclude the 'Unknown' bean type so the remaining types are easier to compare.
#
# The rows are filtered into a new frame rather than dropped from `df` with
# inplace=True: DataFrame.drop(..., inplace=True) returns None (so the old
# assignment left `df_known_type` as None) and it silently removed the rows
# from `df` for every cell that runs afterwards.
df_known_type = df[df['Bean Type Group'] != 'Unknown']

plt.hist(x=df_known_type['Bean Type Group'], color='blue', bins=17, edgecolor='white')
plt.title('Bean Types')
plt.ylabel('Number of records')
plt.xticks(rotation=45)
plt.tight_layout()

plt.savefig('Bean_type_records.png', dpi=900)

In [None]:
# Bean type counts among the best-rated records (df_best).
plt.hist(df_best['Bean Type Group'], bins=17, color='blue', edgecolor='white')
plt.ylabel('Number of records')
plt.title('Bean Types')
plt.xticks(rotation=45)
plt.show()

In [None]:
# Bean type counts among the worst-rated records (df_worst).
plt.hist(df_worst['Bean Type Group'], bins=17, color='blue', edgecolor='white')
plt.ylabel('Number of records')
plt.title('Bean Types')
plt.xticks(rotation=45)
plt.show()

In [None]:
# As there aren't many records for bean types other than the main 4,
# it does not make sense to analyse the whole set for dependencies; the set containing only the best records could be useful.

In [None]:
# Mean Cacao % and mean Rating for each 'Review Date' value, over all of df.
stats_by_year_general = pd.pivot_table(
    df,
    values=['Cacao %', 'Rating'],
    index='Review Date',
    aggfunc='mean',
)

stats_by_year_general

In [None]:
# Mean Cacao % (left y-axis, red) and mean Rating (right y-axis, blue)
# per review date, drawn on a shared x-axis.
fig, ax1 = plt.subplots()

color = 'tab:red'
ax1.set_xlabel('Review Date')
ax1.plot(stats_by_year_general.index, stats_by_year_general['Cacao %'], color=color)
ax1.set_ylabel('Cacao %', color=color)
ax1.tick_params(axis='y', labelcolor=color)

# Second y-axis that shares ax1's x-axis.
ax2 = ax1.twinx()

color = 'tab:blue'
ax2.plot(stats_by_year_general.index, stats_by_year_general['Rating'], color=color)
ax2.set_ylabel('Rating', color=color)
ax2.tick_params(axis='y', labelcolor=color)

plt.tight_layout()

plt.savefig('Average_perc_and_rates', dpi=900)




In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# Do not limit the number of columns shown when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
# Use seaborn's 'colorblind' palette as the default colour cycle.
sns.set_palette('colorblind')

# Ask the inline backend to emit figures in the 'retina' format.
%config InlineBackend.figure_format = 'retina'

In [None]:
# Box plot of Rating for every bean origin in df.
sns.catplot(
    data=df,
    x='Bean Origin',
    y='Rating',
    kind='box',
    palette='colorblind',
    height=15,
    aspect=1,
)

# Labels and title for the chart.
plt.xlabel("Bean Origin")
plt.ylabel("Rating")
plt.title("Distribution of the Rating depending on the Bean Origin")
plt.xticks(rotation=80)
# Applies to figures created after this point; the file below uses its own dpi.
plt.rcParams['figure.dpi'] = 500
plt.tight_layout()
plt.savefig('origin_rate', dpi=800)

#plt.show()

In [None]:
# Number of records per bean origin over all of df.
plt.hist(df['Bean Origin'], bins=56, color='brown', edgecolor='white')
plt.ylabel('Number of records')
plt.title('Bean Origin')
plt.xticks(rotation=90)
plt.tight_layout()

plt.savefig('origin_records', dpi=200)

In [None]:
# are any of the countries with low count significant in the worst or best groups?


In [None]:
# Number of records per bean origin among the best-rated records (df_best).
plt.hist(df_best['Bean Origin'], bins=25, color='brown', edgecolor='white')
plt.ylabel('Number of records')
plt.title('Bean Origin')
plt.xticks(rotation=80)
plt.show()

In [None]:
#count_origins = df.pivot_table(values="Bean Origin", index="Y", columns="Z", aggfunc=pd.Series.nunique)



In [None]:
from collections import Counter
# Count how many records each bean origin has.
count_origins = Counter( df['Bean Origin'] )

In [None]:
# List every bean origin with its record count, most frequent first.
count_origins.most_common()

In [None]:
# For further analysis I am choosing only the top five origins; beyond them the counts drop too sharply to compare the data reasonably.

In [None]:
# The five most frequent origins, as (origin, count) pairs.
most_com_origins = count_origins.most_common(5)

In [None]:
def comm_occ_list(counter_list):
    """Return the first element of every entry in ``counter_list``, in order.

    Meant for the ``(value, count)`` pairs produced by
    ``collections.Counter.most_common``: the counts are dropped and only the
    values are kept.

    Parameters
    ----------
    counter_list : iterable
        Entries that support indexing at position 0, e.g. ``(value, count)``
        tuples. Any iterable is accepted, not only sequences with a length.

    Returns
    -------
    list
        The element at index 0 of each entry; an empty list for empty input.
    """
    return [entry[0] for entry in counter_list]
    
        

In [None]:
# Drop the counts, keeping only the origin names.
most_comm_origins_list = comm_occ_list(most_com_origins)

In [None]:
# Keep only the rows whose bean origin is one of the most common origins.
is_common_origin = df['Bean Origin'].isin(most_comm_origins_list)
df_origins = df.loc[is_common_origin]

In [None]:
# Display the subset restricted to the most common bean origins.
df_origins

In [None]:
# Box plot of Rating for the most common bean origins only (df_origins).
sns.catplot(
    data=df_origins,
    x='Bean Origin',
    y='Rating',
    kind='box',
    palette='colorblind',
    height=15,
    aspect=1,
)

# Labels and title for the chart.
plt.xlabel("Bean Origin")
plt.ylabel("Rating")
plt.title("Distribution of the Rating depending on the Bean Origin")
plt.xticks(rotation=80)
# Applies to figures created after this point; the file below uses its own dpi.
plt.rcParams['figure.dpi'] = 500
plt.tight_layout()
plt.savefig('comm_origin_rate', dpi=200)
#plt.show()

In [None]:
# Mean Cacao % and mean Rating for each 'Review Date' value, restricted to
# the most common bean origins (df_origins).
stats_by_year = pd.pivot_table(
    df_origins,
    values=['Cacao %', 'Rating'],
    index='Review Date',
    aggfunc='mean',
)

stats_by_year

In [None]:
from bokeh.plotting import figure, output_file, show, save

In [None]:

# Compare the mean Cacao % per review date for the whole data set with the
# same statistic for the most common bean origins.
# NOTE(review): plot_height / plot_width are kept as written; newer Bokeh
# releases name these height / width - confirm against the installed version.
p2 = figure(
    title='Average Cacao % for all and most common bean origins',
    plot_height=500,
    plot_width=600,
    y_axis_label='Cacao %'
)

# Whole data set (thin line). x is taken from the same frame as y so both
# always have the same length, even if the two pivot tables differ in index.
p2.line(x=stats_by_year_general.index, y=stats_by_year_general['Cacao %'], line_width=2)

# Most common origins only (thick line).
p2.line(x=stats_by_year.index, y=stats_by_year['Cacao %'], line_width=4)

# The figure is a Bokeh plot written to HTML; no matplotlib layout call is
# made here because it would only create an empty matplotlib figure.
output_file('percentage_comparison.html')
save(p2)

In [None]:
# Distinct values of the 'Made in' column.
df['Made in'].unique()

In [None]:
# Box plot of Rating for every value of the 'Made in' column in df.
sns.catplot(x='Made in',
            y='Rating',
            data=df,
            kind='box',
            height=15, aspect=1, orient=None, color=None, palette='colorblind',)

# Labels and title for the chart (title typo "if" -> "of" fixed).
plt.xticks(rotation=80)
plt.xlabel("Made in")
plt.ylabel("Rating")
plt.title("Distribution of the Rating depending on the Country of production")
# Applies to figures created after this point.
plt.rcParams['figure.dpi'] = 500


#plt.show()

In [None]:
# Count how many records each 'Made in' value has.
count_madein = Counter( df['Made in'] )

In [None]:
# The five most frequent 'Made in' values, as (value, count) pairs.
most_com_madein = count_madein.most_common(5)

In [None]:
# Drop the counts, keeping only the 'Made in' values.
most_comm_madein_list = comm_occ_list(most_com_madein)

In [None]:
# Keep only the rows whose 'Made in' value is one of the most common ones.
is_common_madein = df['Made in'].isin(most_comm_madein_list)
df_madein = df.loc[is_common_madein]

In [None]:
# Box plot of Rating for the most common 'Made in' values only (df_madein).
sns.catplot(
    data=df_madein,
    x='Made in',
    y='Rating',
    kind='box',
    palette='colorblind',
    height=15,
    aspect=1,
)

# Labels and title for the chart.
plt.xlabel("Made in")
plt.ylabel("Rating")
plt.title("Distribution of the Rating depending on the Country of production")
plt.xticks(rotation=80)
# Applies to figures created after this point; the file below uses its own dpi.
plt.rcParams['figure.dpi'] = 500
plt.tight_layout()
plt.savefig('comm_madein_rate', dpi=800)

#plt.show()

In [None]:
# Count records per bean origin among the best-rated records (df_best).
count_origins_best = Counter( df_best['Bean Origin'] )

In [None]:
#count_origins_best.most_common()

In [None]:
# I am choosing the top 4, as the lowest of them has about 25% of the highest count and after that the counts are similar.

In [None]:
# Show the six most frequent bean origins among the best-rated records.
count_origins_best.most_common(6)

In [None]:
# Mean, minimum and maximum Rating for each distinct 'Cacao %' value in df.
stats_by_percentage = pd.pivot_table(
    df,
    values=['Rating'],
    index='Cacao %',
    aggfunc=['mean', 'min', 'max'],
)

stats_by_percentage

In [None]:
from bokeh.models import HoverTool, ColumnDataSource


In [None]:
#!conda install selenium geckodriver firefox -c conda-forge
#Installing selenium has been taking way too long,
#from bokeh.io import export_png

In [None]:
# Hover tool showing the @y and @x fields of the hovered glyph, triggered
# along a vertical line through the cursor.
hover = HoverTool(
    tooltips=[
        ('Rating', '@y'),
        ('Cacao %', '@x'),
    ],
    mode='vline'
)

# NOTE(review): plot_height / plot_width are kept as written; newer Bokeh
# releases name these height / width - confirm against the installed version.
p3 = figure(
    title='Ratings depending on Cacao %',
    plot_height=900,
    plot_width=800,
    y_axis_label='Rating',
    tools=[hover, 'crosshair']
)

# NOTE(review): `source` is not referenced by any glyph in this cell; it is
# kept in case later cells rely on it.
source = ColumnDataSource(df)

# Mean rating per Cacao % (thick blue line).
p3.line(x=stats_by_percentage.index, y=stats_by_percentage['mean']['Rating'], line_width=4, color='steelblue')

# Lowest rating per Cacao % (thin red line).
p3.line(x=stats_by_percentage.index, y=stats_by_percentage['min']['Rating'], line_width=2, color='firebrick')

# Highest rating per Cacao % (thin green line).
p3.line(x=stats_by_percentage.index, y=stats_by_percentage['max']['Rating'], line_width=2, color='lawngreen')

# The figure is a Bokeh plot written to HTML; no matplotlib layout call is
# made here because it would only create an empty matplotlib figure.
output_file('ratings_and_percentage.html')
save(p3)
#show(p3)

In [None]:
#hover = HoverTool(tooltips=None, mode='hline')

# Every record drawn as a cross marker: 'Review Date' on x, 'Cacao %' on y.
# NOTE(review): plot_height / plot_width are kept as written; newer Bokeh
# releases name these height / width - confirm against the installed version.
p4 = figure(
    title='Cacao % through the years',
    plot_height=900,
    plot_width=900,
)

p4.cross(df['Review Date'], df['Cacao %'], size=10)

# The figure is a Bokeh plot written to HTML; no matplotlib layout call is
# made here because it would only create an empty matplotlib figure.
output_file('percentage_in_time.html')
save(p4)
#show(p4)