In [None]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
#For interactive plots
import ipywidgets as widgets
from ipywidgets import interact, interact_manual

In [None]:
# Load the livestock production CSV and keep only the rows describing beehives.
# NOTE: despite its name, `fertilizers` holds the whole livestock production table.
fertilizers = pd.read_csv(
    'data/Production_Livestock_E_All_Data_(Normalized).csv',
    encoding='iso-8859-1',
)
beehives = fertilizers[fertilizers['Item'].eq('Beehives')]
beehives.head()




Separate the different countries and areas into two different lists.

In [None]:
# Every distinct label found in the 'Area' column.
countries_and_areas = beehives['Area'].unique()

# Labels containing a '+' go to `areas`; the remaining ones are treated as countries.
has_plus = np.array(['+' in label for label in countries_and_areas], dtype=bool)
areas = countries_and_areas[has_plus]
plain_labels = countries_and_areas[~has_plus]

# The five trailing plain labels are set aside for a closer look.
weird_stuff = plain_labels[-5:]
countries = plain_labels[:-5]

print('Countries: ', countries.tolist(), sep='\n')
print('\nAreas: ', areas.tolist(), sep='\n')
print('\nWeird stuff: ', weird_stuff.tolist(), sep='\n')

I found some weird stuff, as you can see, and I don't know what it is. We should probably try to find out.

We can split the areas list again: some areas are geographical, while others depend on the development status of the country.

I'll check for the weird stuff first:

In [None]:
# Show the beehive rows recorded under the first of the unexplained labels.
beehives[beehives['Area'].eq(weird_stuff[0])]

Ok, so there is no data in the weird stuff, let's forget about it and clean the data.

In [None]:
# Drop every row whose 'Area' is one of the unexplained labels.
# A vectorised `isin` replaces the per-row Python loop, which rebuilt
# `weird_stuff.tolist()` once for every row of the frame.
beehives = beehives[~beehives['Area'].isin(weird_stuff)]

Let's split our areas again.

In [None]:
# An area is "status"-based when its name mentions 'developing' or 'countries'
# (case-insensitive); every other area is considered geographical.
is_status_area = np.array(
    ['developing' in label.lower() or 'countries' in label.lower() for label in areas],
    dtype=bool,
)
status_areas = areas[is_status_area]
geo_areas = areas[~is_status_area]

In [None]:
@interact
def show_beehives_for_country(country=countries):
    """Plot the beehive time series of a single country.

    ``country`` is chosen interactively among the entries of ``countries``.
    The rows of ``beehives`` whose 'Area' equals it are drawn as 'Value'
    against 'Year', with the x axis fixed to 1960-2010.
    """
    rows = beehives[beehives['Area'] == country]
    plt.plot(rows['Year'], rows['Value'], '-*')
    plt.xlim(1960, 2010)
    plt.xlabel('Years')
    plt.ylabel('Number')
    plt.title('Country: ' + country)

In [None]:
@interact
def show_beehives_for_geo_area(area=geo_areas):
    """Plot the beehive time series of a single geographical area.

    ``area`` is chosen interactively among the entries of ``geo_areas``.
    The rows of ``beehives`` whose 'Area' equals it are drawn as 'Value'
    against 'Year', with the x axis fixed to 1960-2010.
    """
    # `beehives` carries the 'Area' / 'Year' / 'Value' columns of the loaded
    # table (the same ones the per-country plot reads); the previous
    # 'country_or_area' / 'year' / 'value' names are not columns of that frame.
    beehives_area = beehives.loc[beehives['Area'] == area]
    plt.plot(beehives_area['Year'], beehives_area['Value'], '-*')
    plt.title('Geographical area: ' + area)
    plt.xlabel('Years')
    plt.ylabel('Number')
    plt.xlim(1960, 2010)

In [None]:
@interact
def show_beehives_for_status_area(area=status_areas):
    """Plot the beehive time series of a single status-based area.

    ``area`` is chosen interactively among the entries of ``status_areas``.
    The rows of ``beehives`` whose 'Area' equals it are drawn as 'Value'
    against 'Year', with the x axis fixed to 1960-2010.
    """
    # `beehives` carries the 'Area' / 'Year' / 'Value' columns of the loaded
    # table (the same ones the per-country plot reads); the previous
    # 'country_or_area' / 'year' / 'value' names are not columns of that frame.
    beehives_area = beehives.loc[beehives['Area'] == area]
    plt.plot(beehives_area['Year'], beehives_area['Value'], '-*')
    plt.title('Status area: ' + area)
    plt.xlabel('Years')
    plt.ylabel('Number')
    plt.xlim(1960, 2010)

Other categories in the fertilizer data frame

In [None]:
# List the other categories of the loaded table. They live in its 'Item'
# column (the one used to select 'Beehives'); there is no 'category' column.
fertilizers['Item'].unique()

Basic ranking of number of beehives per country.  
We could divide by the area of the country to make it fairer?

In [None]:
# Years present in the beehive data; offered as the interactive choices below.
years = beehives['Year'].unique()

@interact
def country_ranking_over_year(year=years):
    """Horizontal bar chart of the ten countries with the most beehives.

    ``year`` is chosen interactively among ``years``. Only rows whose 'Area'
    is listed in ``countries`` are ranked; the ten largest 'Value' entries of
    the selected year are drawn, largest at the top.
    """
    # One mask built on `beehives` itself: real countries in the selected year.
    # The frame uses the 'Area' / 'Year' / 'Value' columns of the loaded table.
    selected = beehives['Area'].isin(countries) & (beehives['Year'] == year)
    ranking = (
        beehives.loc[selected, ['Area', 'Value']]
        .sort_values(by='Value', ascending=False)
    )
    top10 = ranking.head(10)

    plt.figure(figsize=(20, 10))
    plt.xlim(0, 1.2e7)
    plt.barh(top10['Area'], top10['Value'])
    plt.xlabel('number of beehives')
    plt.title('top 10')
    # barh draws the first entry at the bottom; flip so rank 1 is on top.
    plt.gca().invert_yaxis()

Distribution of number of beehives per country over year.
Again we could divide by the area of the country

In [None]:
@interact
def dist_over_year(year=years):
    """Histogram of the number of beehives per country for one year.

    ``year`` is chosen interactively among ``years``. Only rows whose 'Area'
    is listed in ``countries`` are counted; the y axis is logarithmic.
    """
    # One mask built on `beehives` itself: real countries in the selected year.
    # The frame uses the 'Area' / 'Year' / 'Value' columns of the loaded table.
    selected = beehives['Area'].isin(countries) & (beehives['Year'] == year)
    plt.hist(beehives.loc[selected, 'Value'], bins=20)
    plt.yscale('log')
    plt.xlim(1e-20, 1.2e7)
    plt.ylabel('number of countries')
    plt.xlabel('number of beehives')
    plt.title('beehives distribuion across countries')

In [None]:
# Horizontal box plots of the beehive counts, one box per year.
# The beehive frame exposes 'Value' and 'Year' (capitalised), not 'value'/'year'.
show_outliers = False
beehives.boxplot(
    column=['Value'],
    by='Year',
    figsize=(10, 20),
    showfliers=show_outliers,
    vert=False,
)

In [None]:
# Load the land data table and preview its first rows.
land = pd.read_csv(filepath_or_buffer='data/fao_data_land_data.csv')
land.head(5)

In [None]:
# Distinct categories available in the land table.
land['category'].unique()

In [None]:
# Preview the 'land_area' rows recorded for China.
land[(land['category'] == 'land_area') & (land['country_or_area'] == 'China')].head()

In [None]:
# Units in which the 'land_area' rows are expressed.
land.loc[land['category'] == 'land_area', 'unit'].unique()

In [None]:
#compute normalized number of beehives (number of beehives / land area in 1000 Ha)
# Keep only the land-area rows and rename their 'value' column so it cannot be
# confused with the beehive counts after the merge.
land_area = (
    land.loc[land['category'] == 'land_area', ['country_or_area', 'year', 'value']]
    .rename(columns={'value': 'land_area'})
)

# Remove the columns a previous run of this cell added, so that re-running the
# cell does not create suffixed duplicates.
beehives = beehives.drop(
    columns=['country_or_area', 'year', 'land_area', 'norm_value'],
    errors='ignore',
)

# `beehives` is keyed by 'Area' / 'Year' while the land table is keyed by
# 'country_or_area' / 'year', so the join keys are named per side. Both key
# pairs are kept in the merged frame.
# NOTE(review): assumes area names and year values are comparable across the
# two files - TODO confirm.
beehives = pd.merge(
    beehives,
    land_area,
    left_on=['Area', 'Year'],
    right_on=['country_or_area', 'year'],
)
beehives['norm_value'] = beehives['Value'] / beehives['land_area']
beehives.head()

In [None]:
# Years present after the merge with the land table; offered as choices below.
years = beehives['year'].unique()

@interact(top=(3, 100, 1))
def normalized_country_ranking_over_year(year=years, top=10):
    """Horizontal bar chart of the ``top`` countries by beehives per land area.

    ``year`` is chosen interactively among ``years`` and ``top`` with a slider
    from 3 to 100. Only rows whose 'country_or_area' is listed in ``countries``
    are ranked, by decreasing 'norm_value'.
    """
    # One vectorised mask built on `beehives` itself (real countries, selected
    # year) instead of a per-row Python loop followed by a mask taken from a
    # different frame.
    selected = beehives['country_or_area'].isin(countries) & (beehives['year'] == year)
    ranked = beehives.loc[selected].sort_values(by='norm_value', ascending=False)
    leaders = ranked.head(top)

    # The figure height grows with the number of bars.
    plt.figure(figsize=(15, top))
    plt.barh(leaders['country_or_area'], leaders['norm_value'])
    plt.xlabel('number of beehives/land area')
    plt.title('Number of beehives per 1000Ha\ntop %d' % top)
    # barh draws the first entry at the bottom; flip so rank 1 is on top.
    plt.gca().invert_yaxis()
    plt.xlim(0, 250)

In [None]:
# Horizontal box plots of the normalised beehive numbers, one box per year.
show_outliers = False
beehives.boxplot(
    column=['norm_value'],
    by='year',
    vert=False,
    showfliers=show_outliers,
    figsize=(10, 20),
)

In [None]:
@interact
def dist_over_year(year=years):
    """Histogram of the normalised beehive numbers per country for one year.

    ``year`` is chosen interactively among ``years``. Only rows whose
    'country_or_area' is listed in ``countries`` are counted; the y axis is
    logarithmic. This definition reuses the name of the earlier histogram of
    raw counts and therefore replaces it.
    """
    # One vectorised mask built on `beehives` itself (real countries, selected
    # year) instead of a per-row Python loop followed by a mask taken from a
    # different frame.
    selected = beehives['country_or_area'].isin(countries) & (beehives['year'] == year)
    plt.hist(beehives.loc[selected, 'norm_value'], bins=20)
    plt.yscale('log')
    plt.ylabel('number of countries')
    plt.xlabel('number of beehives per 1000 Ha')
    plt.title('Normalized beehives distribuion across countries')

In [None]:
# Keep the rows whose name contains no '+'. The pattern is a raw string: in a
# plain string "\+" is an invalid escape sequence that Python warns about.
beehives_onlyCountries = beehives[~beehives.country_or_area.str.contains(r"\+", na=False)]

Number of Beehives around the world grouped by year 

In [None]:
# Total number of beehives per year. The beehive counts live in the 'Value'
# column (keyed by 'Year'); the frame has no lowercase 'value' column.
# Missing counts are dropped first because `astype(int)` raises on NaN.
beehives_onlyCountries.groupby('Year')['Value'].apply(lambda x: x.dropna().astype(int).sum()).plot()

In [None]:
# Load the crops data table.
harvest_data = pd.read_csv(filepath_or_buffer='data/fao_data_crops_data.csv')


In [None]:
# Preview the first rows of the crops table.
harvest_data.head(5)

In [None]:
# Distinct values of the 'element' column of the crops table.
harvest_data['element'].unique()

In [None]:
# Keep only the crops rows whose element is 'Area Harvested'.
areaHarvested = harvest_data[harvest_data['element'].eq('Area Harvested')]

In [None]:
@interact
def show_beehives_for_country(country=countries):
    """Plot the harvested-area time series of a single country.

    Despite its name, this reads ``areaHarvested`` rather than the beehive
    data; it reuses the name of the earlier beehive plot and therefore
    replaces it. ``country`` is chosen interactively among ``countries`` and
    the matching rows are drawn as 'value' against 'year'.
    """
    rows = areaHarvested[areaHarvested['country_or_area'] == country]
    plt.plot(rows['year'], rows['value'], '-*')
    plt.xlim(1960, 2010)
    plt.xlabel('Years')
    plt.ylabel('Number')
    plt.title('Country: ' + country)

In [None]:
# Keep the rows whose name contains no '+', then plot the yearly total of the
# harvested area. The pattern is a raw string: in a plain string "\+" is an
# invalid escape sequence that Python warns about.
harvested_onlyCountries = areaHarvested[~areaHarvested.country_or_area.str.contains(r"\+", na=False)]
harvested_onlyCountries.groupby('year')['value'].apply(lambda x: x.astype(int).sum()).plot()

In [None]:
@interact
def show_beehives_for_geo_area(area=geo_areas):
    """Plot the harvested-area time series of a single geographical area.

    Despite its name, this reads ``areaHarvested`` rather than the beehive
    data; it reuses the name of the earlier beehive plot and therefore
    replaces it. ``area`` is chosen interactively among ``geo_areas`` and the
    matching rows are drawn as 'value' against 'year'.
    """
    rows = areaHarvested[areaHarvested['country_or_area'] == area]
    plt.plot(rows['year'], rows['value'], '-*')
    plt.xlim(1960, 2010)
    plt.xlabel('Years')
    plt.ylabel('Number')
    plt.title('Geographical area: ' + area)