In [2]:
import pandas as pd
import plotly.graph_objects as go

In [3]:
## load data

# Parse the cleaned arabica CSV; the table's own index column arrives as 'Unnamed: 0'.
arabica_data = pd.read_csv(filepath_or_buffer='arabica_data_cleaned.csv')
# Working frame for the rest of the analysis, built from the parsed table.
arabica_df = pd.DataFrame(data=arabica_data)

In [4]:
# Delete the column 'Unnamed: 0' (the row index that was written into the CSV).
# Rebinding the name instead of using `inplace=True`, together with
# errors='ignore', keeps this cell safe to re-run: a second execution is a
# no-op instead of a KeyError on the already-removed column.
arabica_df = arabica_df.drop(columns=['Unnamed: 0'], errors='ignore')
arabica_df.head()

Unnamed: 0,Species,Owner,Country.of.Origin,Farm.Name,Lot.Number,Mill,ICO.Number,Company,Altitude,Region,...,Color,Category.Two.Defects,Expiration,Certification.Body,Certification.Address,Certification.Contact,unit_of_measurement,altitude_low_meters,altitude_high_meters,altitude_mean_meters
0,Arabica,metad plc,Ethiopia,metad plc,,metad plc,2014/2015,metad agricultural developmet plc,1950-2200,guji-hambela,...,Green,0,"April 3rd, 2016",METAD Agricultural Development plc,309fcf77415a3661ae83e027f7e5f05dad786e44,19fef5a731de2db57d16da10287413f5f99bc2dd,m,1950.0,2200.0,2075.0
1,Arabica,metad plc,Ethiopia,metad plc,,metad plc,2014/2015,metad agricultural developmet plc,1950-2200,guji-hambela,...,Green,1,"April 3rd, 2016",METAD Agricultural Development plc,309fcf77415a3661ae83e027f7e5f05dad786e44,19fef5a731de2db57d16da10287413f5f99bc2dd,m,1950.0,2200.0,2075.0
2,Arabica,grounds for health admin,Guatemala,"san marcos barrancas ""san cristobal cuch",,,,,1600 - 1800 m,,...,,0,"May 31st, 2011",Specialty Coffee Association,36d0d00a3724338ba7937c52a378d085f2172daa,0878a7d4b9d35ddbf0fe2ce69a2062cceb45a660,m,1600.0,1800.0,1700.0
3,Arabica,yidnekachew dabessa,Ethiopia,yidnekachew dabessa coffee plantation,,wolensu,,yidnekachew debessa coffee plantation,1800-2200,oromia,...,Green,2,"March 25th, 2016",METAD Agricultural Development plc,309fcf77415a3661ae83e027f7e5f05dad786e44,19fef5a731de2db57d16da10287413f5f99bc2dd,m,1800.0,2200.0,2000.0
4,Arabica,metad plc,Ethiopia,metad plc,,metad plc,2014/2015,metad agricultural developmet plc,1950-2200,guji-hambela,...,Green,2,"April 3rd, 2016",METAD Agricultural Development plc,309fcf77415a3661ae83e027f7e5f05dad786e44,19fef5a731de2db57d16da10287413f5f99bc2dd,m,1950.0,2200.0,2075.0


In [5]:
## Keep only the columns 'Country.of.Origin','Producer' and 'Processing.Method'

# Columns of interest, in the order they should appear in the reduced frame.
columns_to_keep = ['Country.of.Origin', 'Producer', 'Processing.Method']
arabica_df = arabica_df.loc[:, columns_to_keep]
arabica_df.head()

Unnamed: 0,Country.of.Origin,Producer,Processing.Method
0,Ethiopia,METAD PLC,Washed / Wet
1,Ethiopia,METAD PLC,Washed / Wet
2,Guatemala,,
3,Ethiopia,Yidnekachew Dabessa Coffee Plantation,Natural / Dry
4,Ethiopia,METAD PLC,Washed / Wet


In [7]:
## Rename the columns

# Rename through an explicit old -> new mapping rather than assigning a
# positional list to `.columns`, so each label stays attached to the right
# column regardless of column order. 'Producer' already has its final name
# and is therefore not part of the mapping.
arabica_df = arabica_df.rename(columns={
    'Country.of.Origin': 'Country of Origin',
    'Processing.Method': 'Processing Method',
})
arabica_df.head()

Unnamed: 0,Country of Origin,Producer,Processing Method
0,Ethiopia,METAD PLC,Washed / Wet
1,Ethiopia,METAD PLC,Washed / Wet
2,Guatemala,,
3,Ethiopia,Yidnekachew Dabessa Coffee Plantation,Natural / Dry
4,Ethiopia,METAD PLC,Washed / Wet


In [135]:
## Plot the histograms for each remaining column

for column_name in arabica_df:
    # value_counts() returns each category (index) with its frequency (values),
    # sorted by descending count and excluding NaN. Taking BOTH axes from this
    # one Series keeps every bar label paired with its own count. Using
    # .unique() for the labels does not work: it returns values in order of
    # first appearance and includes NaN, so it does not line up with
    # value_counts().
    category_counts = arabica_df[column_name].value_counts()
    data_fig = [go.Bar(x=category_counts.index, y=category_counts.values,
                   marker_color='white')]
    fig = go.Figure(data=data_fig)
    fig.update_layout(template='plotly_dark', title='Arabica: ' + column_name,
                  yaxis=dict(title='Counts'), xaxis=dict(title=column_name))

    fig.show()

In [136]:
## Identify: Which countries have more than 10 and less than 30 entries?

# Count the entries per country once and reuse the result for both bounds.
country_counts = arabica_df["Country of Origin"].value_counts()
mask_country_thirty = country_counts < 30
mask_country_ten = country_counts > 10
# Both conditions must hold, so combine them with a logical AND. Comparing the
# two masks with `==` only happens to give the same answer because no count can
# fail both of these particular bounds at once.
mask = mask_country_ten & mask_country_thirty
# Show only the countries for which the combined condition is True.
mask[mask]

Nicaragua      True
Uganda         True
Kenya          True
El Salvador    True
Indonesia      True
China          True
Malawi         True
Name: Country of Origin, dtype: bool

In [137]:
## Identify: Which is the producer with most entries?

# Entries per producer, then flag every producer whose count equals the largest one.
producer_counts = arabica_df["Producer"].value_counts()
highest_count = max(producer_counts)
max_producer = producer_counts == highest_count
# Keep only the flagged producer(s).
max_producer.loc[max_producer]

La Plata    True
Name: Producer, dtype: bool

In [138]:
## Identify: What is the most common "Processing Method"

# Entries per processing method, then flag the method(s) with the largest count.
method_counts = arabica_df["Processing Method"].value_counts()
largest_count = max(method_counts)
most_common_proccessing = method_counts == largest_count
# Keep only the flagged method(s).
most_common_proccessing.loc[most_common_proccessing]

Washed / Wet    True
Name: Processing Method, dtype: bool

In [139]:
## Identify: What is the least common "Processing Method"

# Entries per processing method, then flag the method(s) with the smallest count.
method_counts = arabica_df["Processing Method"].value_counts()
smallest_count = min(method_counts)
least_common_proccessing = method_counts == smallest_count
# Keep only the flagged method(s).
least_common_proccessing.loc[least_common_proccessing]

Pulped natural / honey    True
Name: Processing Method, dtype: bool