# Cumulative counts per geographical region

By **Franklin Oliveira**

-----
This notebook contains all the code necessary to build charts from the `carcinos` database, focusing on the collection's cumulative spatial increments. Here you'll find some basic data treatment and the charts' code. 

Database: <font color='blue'>'Planilha geral Atualizada FINAL 5_GERAL_sendo trabalhada no Google drive.xlsx'</font>.    

In [1]:
import datetime
import numpy as np
import pandas as pd

from collections import defaultdict

# quick visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Waffle Charts
# from pywaffle import Waffle 
# docs: https://pywaffle.readthedocs.io/en/latest/examples/block_shape_distance_location_and_direction.html

# visualization
import altair as alt

# enabling notebook renderer
# alt.renderers.enable('notebook')
# alt.renderers.enable('default')

# disabling Altair's rows limit so charts can be built from the whole DataFrame
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

## Importing data...

In [2]:
# loading the already-treated database (semicolon-separated file)
NewTable = pd.read_csv(
    './data/treated_db.csv',
    sep=';',
    encoding='utf-8-sig',
    low_memory=False,
)

In [3]:
# formatting the NaN string: missing values become the literal 'NaN',
# every other value is cast to str and stripped of surrounding whitespace
# NewTable['family'] = NewTable['family'].apply(lambda x: 'NaN' if x=='Nan' else x)
def _format_place(value):
    """Return 'NaN' for missing values, otherwise the stripped string form of `value`."""
    text = str(value)
    if text == 'nan':
        return 'NaN'
    return text.strip()


for place_col in ('continent', 'country', 'state'):
    NewTable[place_col] = NewTable[place_col].apply(_format_place)

## Filtering

At least for now, we'll be considering only specimens of the order Decapoda (thoroughly revised by the Museum's crew).

In [4]:
# keep only Decapoda records; .copy() so later column assignments don't touch NewTable
is_decapoda = NewTable['order'] == 'Decapoda'
decapoda = NewTable[is_decapoda].copy()

<br>

<font size=5>**Color palette**</font>

Colors (per infraorder): 

- <font color='#e26d67'><b>Ascacidae</b></font>
- <font color='#007961'><b>Anomura</b></font>
- <font color='#7a2c39'><b>Achelata</b></font>
- <font color='#b67262'><b>Axiidea</b></font>
- <font color='#ee4454'><b>Brachyura</b></font>
- <font color='#3330b7'><b>Caridea</b></font>
- <font color='#58b5e1'><b>Gebiidea</b></font>
- <font color='#b8e450'><b>Stenopodídea</b></font>
- <font color='#a0a3fd'><b>Astacidae</b></font>
- <font color='#deae9e'><b>Polychelida</b></font>
- <font color='#d867be'><b>Grapsidae</b></font>
- <font color='#fece5f'><b>Xanthoidea</b></font>

In [5]:
# importing customized color palettes
# NOTE(review): the star import presumably provides `cores_continente`, `cores_pais`,
# `cores_estados` and `cores_regioes`, which the charts below use and which are not
# defined anywhere in this notebook — confirm against src/MNViz_colors.py
from src.MNViz_colors import *

<br>

## Counting per continent

In [6]:
# corrects some typos: unaccented continent names -> accented spelling
continent_typos = {
    'America do Sul': 'América do Sul',
    'America do Norte': 'América do Norte',
    'America Central': 'América Central',
    'Africa': 'África',
    'Asia': 'Ásia',
}

# names that are not listed in the mapping are kept as they are
decapoda['continent'] = decapoda['continent'].apply(
    lambda name: continent_typos.get(name, name)
)

In [7]:
# looking good...
# decapoda['continent'].value_counts()

In [8]:
def convert2int(y):
    """Convert `y` to `int` when possible, otherwise return `y` unchanged.

    Parameters
    ----------
    y : any
        Value to convert (e.g. a float year such as 1990.0, a numeric string,
        NaN or free text).

    Returns
    -------
    int or the original object
        `int(y)` if the conversion succeeds; `y` itself when `int()` rejects it
        (NaN, infinities, `None`, non-numeric strings, ...).
    """
    try:
        return int(y)
    # Only the failures int() itself signals are treated as "not convertible":
    # TypeError (e.g. None), ValueError (NaN, non-numeric text), OverflowError (inf).
    # A bare `except:` would also swallow KeyboardInterrupt / SystemExit.
    except (TypeError, ValueError, OverflowError):
        return y

In [9]:
# p.s.: there are strings and numbers (floats) in this column
# decapoda['start_year'].unique()

In [10]:
# years are forced to string so that missing values survive the groupby (as 'nan')
decapoda['start_year'] = decapoda['start_year'].apply(convert2int).astype(str)

# grouping per Year and Continent: number of non-null `class` entries in each group,
# then sorted by continent and year
teste = (
    decapoda
    .groupby(['start_year', 'continent'])['class']
    .count()
    .reset_index(name='counts')
    .sort_values(['continent', 'start_year'])
)

In [11]:
# cumulatively counting: running total of `counts` inside each continent, following
# the current row order of `teste` (sorted by continent and year).
# groupby().cumsum() aligns on the index, so the result no longer depends on the rows
# being contiguous per continent, and the frame is not re-scanned once per continent.
teste['cumulative_sum'] = teste.groupby('continent')['counts'].cumsum()

### Chart: all continents

In [13]:
# selector bound to the legend: clicking a legend entry selects that continent
select_continent = alt.selection_multi(fields=['continent'], bind='legend')

# color scale built from the customized continent palette
continent_palette = alt.Scale(domain=list(cores_continente.keys()),
                              range=list(cores_continente.values()))

# encoding channels, declared up-front to keep the chart definition short
channels = dict(
    x=alt.X('start_year', type="ordinal", title='Sampling Year'),
    y=alt.Y('cumulative_sum', title='',
            sort=alt.EncodingSortField('counts', op="count", order='descending')),
    color=alt.Color('continent:N', title='Continent', scale=continent_palette),
    tooltip=alt.Tooltip(['continent','start_year','counts', 'cumulative_sum']),
    # continents outside the selection are drawn fully transparent
    opacity=alt.condition(select_continent, alt.value(1), alt.value(0)),
)

g1 = (
    alt.Chart(teste, title="Collection's temporal evolution per continent", width=800)
    .mark_line(point=True)
    .encode(**channels)
    .add_selection(select_continent)
    .configure_title(fontSize=16)
    .configure_axis(labelFontSize=12, titleFontSize=12)
    .configure_legend(labelFontSize=12, titleFontSize=12)
)

# g1.save('./graphs/cumCounts/time-spacial/evolution_per_continent.html')
# g1

### Chart: all continents (ex. South America)

In [14]:
# number of records per continent (the output below shows South America holding most of them)
decapoda['continent'].value_counts()

América do Sul      8144
Desconhecido         106
Europa                24
América do Norte      21
Oceania                9
América Central        8
África                 6
NaN                    3
Ásia                   2
Name: continent, dtype: int64

In [17]:
# selector bound to the legend
select_continent = alt.selection_multi(fields=['continent'], bind='legend')

# database: every continent except South America
# NOTE(review): the chart title also says "and missing values", but rows labelled
# 'NaN' / 'Desconhecido' are not filtered out here — confirm which one is intended
db = teste[teste['continent'] != 'América do Sul']

# removing South America from domain and range colors
continents_exSA = list(db['continent'].unique())
colors_exSA = [cores_continente[name] for name in continents_exSA]

exSA_channels = dict(
    x=alt.X('start_year', type="ordinal", title='Sampling Year'),
    y=alt.Y('cumulative_sum', title='',
            sort=alt.EncodingSortField('counts', op="count", order='descending'),
            scale=alt.Scale(domain=[0,120])),
    color=alt.Color('continent:N', title='Continent',
                    scale=alt.Scale(domain=continents_exSA, range=colors_exSA)),
    tooltip=alt.Tooltip(['continent','start_year','counts', 'cumulative_sum']),
    # continents outside the selection are drawn fully transparent
    opacity=alt.condition(select_continent, alt.value(1), alt.value(0)),
)

g1 = (
    alt.Chart(
        db,
        title="Collection's temporal evolution per continent (ex. South America and missing values)",
        width=800,
    )
    .mark_line(point=True)
    .encode(**exSA_channels)
    .add_selection(select_continent)
    .configure_point(size=50)
    .configure_title(fontSize=16)
    .configure_axis(labelFontSize=12, titleFontSize=12)
    .configure_legend(labelFontSize=12, titleFontSize=12)
)

# g1.save('./graphs/cumCounts/time-spacial/evolution_per_OTHER_continents.html')
# g1

<br>

## Counting per Country

In [18]:
# NOTE: temporary fix (there are NaN values in the continent column for the country Brasil)
temp = decapoda.copy()

# every Brazilian record is assigned to South America
is_brazil = temp['country'] == 'Brasil'
temp.loc[is_brazil, 'continent'] = 'América do Sul'

In [19]:
# grouping per year, continent and country: number of non-null `class` entries
# in each group, then sorted by country and year
teste = (
    temp
    .groupby(['start_year', 'continent', 'country'])['class']
    .count()
    .reset_index(name='counts')
    .sort_values(['country', 'start_year'])
)

In [20]:
# cumulatively counting: running total of `counts` inside each country, following
# the current row order of `teste` (sorted by country and year).
# groupby().cumsum() aligns on the index, so the result no longer depends on the rows
# being contiguous per country, and the frame is not re-scanned once per country.
teste['cumulative_sum'] = teste.groupby('country')['counts'].cumsum()

### Chart: all countries

In [22]:
# selector bound to the legend: the chart is filtered down to the selected countries
select_country = alt.selection_multi(fields=['country'], bind='legend')

# fixed x-axis: every sampling year present in `teste`, in sorted order
all_years = sorted(teste['start_year'].unique())

year_axis = alt.X('start_year', type="ordinal", title='Sampling Year',
                  scale=alt.Scale(domain=all_years))
total_axis = alt.Y('cumulative_sum', title='',
                   sort=alt.EncodingSortField('counts', op="count", order='descending'),
                   scale=alt.Scale(domain=[0,8200]))
country_color = alt.Color('country:N', title='Country',
                          legend=alt.Legend(columns=2, symbolLimit=42),
                          scale=alt.Scale(domain=list(cores_pais.keys()),
                                          range=list(cores_pais.values())))

g1 = (
    alt.Chart(teste, title="Collection's temporal evolution per country", width=600)
    .mark_line(point=True)
    .encode(
        x=year_axis,
        y=total_axis,
        color=country_color,
        tooltip=alt.Tooltip(['country','start_year','counts', 'cumulative_sum']),
        # opacity=alt.condition(select_country, alt.value(1), alt.value(0))
    )
    .add_selection(select_country)
    .transform_filter(select_country)
    .configure_title(fontSize=16)
    .configure_axis(labelFontSize=12, titleFontSize=12)
    .configure_legend(labelFontSize=12, titleFontSize=12)
)

# g1.save('./graphs/cumCounts/time-spacial/evolution_per_country.html')
# g1

<font color='red' size=4>TEST: multiple selectors (for country and continent)</font>

In [24]:
# Experimental chart: two layers over the same data so that two legend-bound
# selectors (country and continent) can be used together.

# selectors
select_continent = alt.selection_multi(fields=['continent'], bind='legend')
select_country = alt.selection_multi(fields=['country'], bind='legend')

# Brazil is left out of this chart
db = teste[teste['country'] != 'Brasil']

# charts
# g1: one line per country, colored by country; carries the country selector and is
# filtered by both the country and the continent selections
g1 = alt.Chart(db, title="Collection's temporal evolution per country (ex. Brazil)", width=600).mark_line(point=True).encode(
    # x domain is taken from the full `teste` frame (Brazil included), not from `db`
    x= alt.X('start_year', type="ordinal", title='Sampling Year',
             scale= alt.Scale(domain=sorted(list(teste['start_year'].unique())))),
    y= alt.Y('cumulative_sum', title='', 
             sort=alt.EncodingSortField('counts', op="count", order='descending'),
             scale= alt.Scale(domain=[0,110])
            ),
    color= alt.Color('country:N', title='Country', 
                legend= alt.Legend(columns=2, symbolLimit=42),
                scale= alt.Scale(domain=list(cores_pais.keys()), range=list(cores_pais.values()))),
    tooltip= alt.Tooltip(['continent', 'country','start_year','counts', 'cumulative_sum']),
    detail= alt.Detail('country:N'),
    opacity= alt.condition(select_country, alt.value(1), alt.value(0.05))
).add_selection(select_country).transform_filter(select_country).transform_filter(select_continent)

# g2: same lines (still one per country, via `detail`), colored by continent; carries
# the continent selector and is filtered by the continent selection only.
# Its opacity is still conditioned on the country selection.
g2 = alt.Chart(db, title="Collection's temporal evolution per country (ex. Brazil)", width=600).mark_line(point=True).encode(
    x= alt.X('start_year', type="ordinal", title='Sampling Year',
             scale= alt.Scale(domain=sorted(list(teste['start_year'].unique())))),
    y= alt.Y('cumulative_sum', title='', 
             sort=alt.EncodingSortField('counts', op="count", order='descending'),
             scale= alt.Scale(domain=[0,110])
            ),
    color= alt.Color('continent:N', title='Continent', 
                legend= alt.Legend(columns=2, symbolLimit=42),
                scale= alt.Scale(domain=list(cores_continente.keys()), range=list(cores_continente.values()))),
    tooltip= alt.Tooltip(['continent', 'country','start_year','counts', 'cumulative_sum']),
    detail= alt.Detail('country:N'),
    opacity= alt.condition(select_country, alt.value(1), alt.value(0.05))
).add_selection(select_continent).transform_filter(select_continent)


# creating layers (to make different selectors work together)
# g2 is drawn first and g1 on top of it; scales are resolved independently per layer
chart = alt.layer(g2, g1).resolve_scale('independent').configure_title(fontSize=16).configure_axis(
    labelFontSize=12,
    titleFontSize=12
).configure_legend(
    labelFontSize=12,
    titleFontSize=12
)

# chart.save('./graphs/cumCounts/time-spacial/evolution_per_country-TEST.html')
# chart

### Chart: all countries (ex. Brasil)

In [26]:
# selector bound to the legend: the chart is filtered down to the selected countries
select_country = alt.selection_multi(fields=['country'], bind='legend')

# database: every country except Brazil
db = teste[teste['country'] != 'Brasil']

# the x-axis keeps every sampling year of the full `teste` frame (Brazil included)
exBR_channels = dict(
    x=alt.X('start_year', type="ordinal", title='Sampling Year',
            scale=alt.Scale(domain=sorted(teste['start_year'].unique()))),
    y=alt.Y('cumulative_sum', title='',
            sort=alt.EncodingSortField('counts', op="count", order='descending'),
            scale=alt.Scale(domain=[0,110])),
    color=alt.Color('country:N', title='Country',
                    legend=alt.Legend(columns=2, symbolLimit=42),
                    scale=alt.Scale(domain=list(cores_pais.keys()),
                                    range=list(cores_pais.values()))),
    tooltip=alt.Tooltip(['country','start_year','counts', 'cumulative_sum']),
    # opacity=alt.condition(select_country, alt.value(1), alt.value(0))
)

g1 = (
    alt.Chart(db, title="Collection's temporal evolution per country (ex. Brazil)", width=600)
    .mark_line(point=True)
    .encode(**exBR_channels)
    .add_selection(select_country)
    .transform_filter(select_country)
    .configure_title(fontSize=16)
    .configure_axis(labelFontSize=12, titleFontSize=12)
    .configure_legend(labelFontSize=12, titleFontSize=12)
)

# g1.save('./graphs/cumCounts/time-spacial/evolution_per_country-exBrazil.html')
# g1

<br>

## Counting per Brazilian State

In [27]:
# filtering for Brazil, only
brazil_only = decapoda[decapoda['country'] == 'Brasil']

# number of non-null `class` entries per year, state, region and infraorder
teste2 = (
    brazil_only
    .groupby(['start_year', 'state', 'braz_region', 'infraorder'])['class']
    .count()
    .reset_index(name='counts')
)

### creating column with brazilian regions

In [28]:
# column holding "state, region"
teste2['regiao_e_estado'] = teste2['state'] + ', ' + teste2['braz_region']

# ordering by region and by the sum of counts (both descending)
totals_per_label = (
    teste2
    .groupby(['regiao_e_estado', 'braz_region'])['counts']
    .sum()
    .reset_index(name='soma')
)
sorting = (
    totals_per_label
    .sort_values(['braz_region', 'soma'], ascending=False)['regiao_e_estado']
    .unique()
)

In [29]:
# NOTE: variable teste2 has the information we need (see the counts-per-region chart - time_spacial)
# NOTE(review): teste2 is already aggregated (one row per year/state/region/infraorder),
# so this counts rows — i.e. infraorders present per year and state — rather than
# summing the `counts` column; confirm this is the intended measure
teste = (
    teste2
    .groupby(['start_year', 'regiao_e_estado'])['infraorder']
    .count()
    .reset_index(name='counts')
    .sort_values(['regiao_e_estado', 'start_year'])
)

In [30]:
# cumulatively counting: running total of `counts` inside each "state, region" label,
# following the current row order of `teste` (sorted by label and year).
# groupby().cumsum() aligns on the index, so the result no longer depends on the rows
# being contiguous per label, and the frame is not re-scanned once per label.
teste['cumulative_sum'] = teste.groupby('regiao_e_estado')['counts'].cumsum()

In [31]:
# splitting the "state, region" label back into its two parts
label_parts = [label.split(',') for label in teste['regiao_e_estado']]
teste['state'] = [parts[0] for parts in label_parts]
teste['region'] = [parts[1].strip() for parts in label_parts]

### Chart: cumulative counts per Brazilian State

In [33]:
# selector bound to the legend: the chart is filtered down to the selected states
select_state = alt.selection_multi(fields=['state'], bind='legend')

state_palette = alt.Scale(domain=list(cores_estados.keys()),
                          range=list(cores_estados.values()))

state_channels = dict(
    x=alt.X('start_year', type="ordinal", title='Sampling Year'),
    y=alt.Y('cumulative_sum', title='',
            sort=alt.EncodingSortField('counts', op="count", order='descending'),
            scale=alt.Scale(domain=[0,320])),
    color=alt.Color('state:N', title='State',
                    legend=alt.Legend(columns=2, symbolLimit=42),
                    scale=state_palette),
    tooltip=alt.Tooltip(['region', 'state','start_year','counts', 'cumulative_sum']),
    # opacity=alt.condition(select_region, alt.value(1), alt.value(0.2))
)

g1 = (
    alt.Chart(teste, title="Collection's temporal evolution per Brazilian State", width=600)
    .mark_line(point=True)
    .encode(**state_channels)
    .add_selection(select_state)
    .transform_filter(select_state)
    .configure_title(fontSize=16)
    .configure_axis(labelFontSize=12, titleFontSize=12)
    .configure_legend(labelFontSize=12, titleFontSize=12)
    .configure_view(strokeWidth=0)
)

# g1.save('./graphs/cumCounts/time-spacial/evolution_per_state.html')
# g1

<br>

<font color='red' size='5'>TESTE</font>

gráfico com múltiplos seletores 

In [35]:
# Experimental chart: two layers over the same data so that two legend-bound
# selectors (region and state) can be used together.

# selectors
select_region = alt.selection_multi(fields=['region'], bind='legend')
select_state = alt.selection_multi(fields=['state'], bind='legend')


# charts
# g1: one line per state, colored by state; carries the state selector and is
# filtered by both the state and the region selections
g1 = alt.Chart(teste, title="Collection's temporal evolution per Brazilian State", width=600).mark_line(point=True).encode(
    x= alt.X('start_year', type="ordinal", title='Sampling Year',
             scale= alt.Scale(domain=sorted(list(teste['start_year'].unique())))),
    y= alt.Y('cumulative_sum', title='', 
             sort=alt.EncodingSortField('counts', op="count", order='descending'),
             scale= alt.Scale(domain=[0,320])),
    color= alt.Color('state:N', title='State', 
                legend= alt.Legend(columns=2, symbolLimit=42),
                scale= alt.Scale(domain=list(cores_estados.keys()), range=list(cores_estados.values()))),
    tooltip= alt.Tooltip(['region', 'state','start_year','counts', 'cumulative_sum']),
    detail= alt.Detail('state:N'),
    opacity= alt.condition(select_state, alt.value(1), alt.value(0.05))
).add_selection(select_state).transform_filter(select_state).transform_filter(select_region)

# g2: same lines (still one per state, via `detail`), colored by region; carries the
# region selector and is filtered by the region selection only.
# Its opacity is still conditioned on the state selection.
g2 = alt.Chart(teste, title="Collection's temporal evolution per Brazilian State", width=600).mark_line(point=True).encode(
    x= alt.X('start_year', type="ordinal", title='Sampling Year',
             scale= alt.Scale(domain=sorted(list(teste['start_year'].unique())))),
    y= alt.Y('cumulative_sum', title='', 
             sort=alt.EncodingSortField('counts', op="count", order='descending'),
             scale= alt.Scale(domain=[0,320])),
    color= alt.Color('region:N', title='Region', 
                legend= alt.Legend(columns=5, symbolLimit=42),
                scale= alt.Scale(domain=list(cores_regioes.keys()), range=list(cores_regioes.values()))),
    tooltip= alt.Tooltip(['region', 'state','start_year','counts', 'cumulative_sum']),
    detail= alt.Detail('state:N'),
    opacity= alt.condition(select_state, alt.value(1), alt.value(0.05))
).add_selection(select_region).transform_filter(select_region)


# creating layers (to make different selectors work together)
# g2 is drawn first and g1 on top of it; scales are resolved independently per layer
chart = alt.layer(g2, g1).resolve_scale('independent').configure_title(fontSize=16).configure_axis(
    labelFontSize=12,
    titleFontSize=12
).configure_legend(
    labelFontSize=12,
    titleFontSize=12
)

# chart.save('./graphs/cumCounts/time-spacial/evolution_per_state-TEST.html')
# chart

<br>

### Chart: cumulative counts per Brazilian Region

In [36]:
# grouping per year and Brazilian region, then sorting by region and year
# NOTE(review): teste2 is already aggregated (one row per year/state/region/infraorder),
# so this counts rows rather than summing the `counts` column; confirm this is the
# intended measure
temp = (
    teste2
    .groupby(['start_year', 'braz_region'])['infraorder']
    .count()
    .reset_index(name='counts')
    .sort_values(['braz_region', 'start_year'])
)

In [37]:
# cumulatively counting: running total of `counts` inside each Brazilian region,
# following the current row order of `temp` (sorted by region and year).
# groupby().cumsum() aligns on the index, so the result no longer depends on the rows
# being contiguous per region, and the frame is not re-scanned once per region.
temp['cumulative_sum'] = temp.groupby('braz_region')['counts'].cumsum()

In [40]:
# Selector bound to the legend. The selection has to project the same field the color
# legend encodes: `temp` only has a `braz_region` column (there is no `region` column
# in this frame), so a selection on `region` could never be driven by the legend.
select_region = alt.selection_multi(fields=['braz_region'], bind='legend')

g1 = alt.Chart(temp, title="Collection's temporal evolution per Brazilian Region", 
               width=600).mark_line(point=True).encode(
    # fixed x-axis, taken from the frame being plotted
    x= alt.X('start_year', type="ordinal", title='Sampling Year',
             scale= alt.Scale(domain=sorted(list(temp['start_year'].unique())))),
    y= alt.Y('cumulative_sum', title='', 
             sort=alt.EncodingSortField('counts', op="count", order='descending'),
             scale= alt.Scale(domain=[0,550])),
    color= alt.Color('braz_region:N', title='Region', 
                        legend= alt.Legend(columns=1, symbolLimit=42),
                     scale= alt.Scale(domain=list(cores_regioes.keys()), range=list(cores_regioes.values()))),
    tooltip= alt.Tooltip(['braz_region','start_year','counts', 'cumulative_sum']),
).add_selection(select_region).transform_filter(select_region)  # keep only the selected regions

g1 = g1.configure_title(fontSize=16).configure_axis(
    labelFontSize=12,
    titleFontSize=12
).configure_legend(
    labelFontSize=12,
    titleFontSize=12
)

# g1.save('./graphs/cumCounts/time-spacial/evolution_per_region.html')
# g1

<br>

**That's it!**