# Cumulative counts per geographical region

By **Franklin Oliveira**

-----
This notebook contains all the code needed to build charts from the `poliqueta` database, focusing on the collection's cumulative spatial growth. Here you'll find some basic data treatment and the code for each chart. 

Database: <font color='blue'>'IBUFRJ27.07.2020 - visualização.xlsx'</font> and <font color='blue'>'MNRJP27.07.2020 - visualização.xls'</font>.
    

In [1]:
import datetime
import numpy as np
import pandas as pd

from collections import defaultdict

# quick visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Waffle Charts
# from pywaffle import Waffle 
# docs: https://pywaffle.readthedocs.io/en/latest/examples/block_shape_distance_location_and_direction.html

# visualization
import altair as alt

# enabling notebook renderer
# alt.renderers.enable('notebook')
# alt.renderers.enable('default')

# disabling Altair's row limit so charts can be built from the full DataFrames used below
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

## Importing data...

In [2]:
# Load the merged database: semicolon-separated, decoded as 'utf-8-sig'
# so that a leading byte-order mark, if present, is dropped.
NewTable = pd.read_csv(
    './data/merged_db.csv',
    sep=';',
    encoding='utf-8-sig',
)

In [3]:
# Normalising the missing-value marker to the literal string 'NaN'
def _nan_label(value):
    """Return 'NaN' when str(value) is 'nan'; otherwise the value as a stripped string."""
    text = str(value)
    return 'NaN' if text == 'nan' else text.strip()


# 'family': only the 'Nan' spelling is replaced, every other value passes through untouched
NewTable['family'] = NewTable['family'].map(lambda fam: 'NaN' if fam == 'Nan' else fam)

# geographic columns: missing values become 'NaN', everything else is cast to str and stripped
for geo_col in ('continent', 'country', 'state'):
    NewTable[geo_col] = NewTable[geo_col].map(_nan_label)

<br>

<font size=5>**Color Palette**</font>


<!-- <div class='row' style='padding-top:20px;'>
    <div class='col-md-6'>
        <img src="./src/img1.jpg" width='400px'>
    </div>
    <div class='col-md-6'>
        <img src="./src/img2.jpg" width='400px'>
    </div>
</div>

<br>

A partir das imagens acima, selecionamos cores (centróides) para criar a paleta de cores. Foram elas: 
<ul>
    <li style='color:#3CA67F'><b> #3CA67F </b># verde</li>
    <li style='color:#7A9FBF'><b> #7A9FBF </b># azul</li>
    <li style='color:#D94814'><b> #D94814 </b># laranja</li>
    <li style='color:#D96236'><b> #D96236 </b># laranja 2</li>
    <li style='color:#F2B999'><b> #F2B999 </b># 'cor de pele'</li>
    <li style='color:#A66C4B'><b> #A66C4B </b># marrom 1</li>
    <li style='color:#732C02'><b> #732C02 </b># marrom 2</li>
</ul>

A partir das cores "centróides", utilizamos a ferramenta Color Crafter para selecionar diferentes "shades" e auxiliar para categorização em diferentes grupos sugeridos pela equipe de Poliquetas do Museu Nacional. 

<ul>
    <li style='color:#3CA67F'><b> Verde: </b> ['#daffef', '#bbebd3', '#9adabc', '#77c8a5', '#57b791', '#3ca67f', '#2a9670', '#238762', '#257a56']</li>
    <li style='color:#7A9FBF'><b> Azul: </b> ['#e7e5df', '#ccd2d8', '#b2c0d0', '#96afc8', '#7a9fbf', '#5d90b6', '#3c81ae', '#0673a4', '#00669a']</li>
    <li style='color:#D94814'><b> laranja: </b> ['#ffbd84', '#ffaa74', '#ff9760', '#ff814b', '#fc6b36', '#eb5824', '#d94814', '#c83b03', '#b73000']</li>
    <li style='color:#D96236'><b> laranja 2: ['#ffeba9', '#ffd391', '#ffbb7b', '#fda468', '#f18e56', '#e57846', '#d96236', '#cc4d28', '#bf381b']</b> </li>
    <li style='color:#F2B999'><b> cor de pele: ['#ffe9c3', '#fbd0ad', '#f2b999', '#e8a287', '#dd8c76', '#d27666', '#c76158', '#bb4d4b', '#ae393e']</b> </li>
    <li style='color:#A66C4B'><b> marrom 1: ['#d9c6af', '#ccad96', '#c1977c', '#b48061', '#a66c4b', '#975b39', '#874c2c', '#774124', '#683720']</b> </li>
    <li style='color:#732C02'><b> marrom 2: ['#eebd93', '#dfa47a', '#d28d60', '#c37746', '#b4622f', '#a3501d', '#92420e', '#823606', '#732c02']</b> </li>
</ul>



**Colors  (antigas):** 

<ul>
    <li style='color:#41A681'><b> #41A681 </b># verde1</li>
    <li style='color:#3CA67F'><b> #3CA67F </b># verde2</li>
    <li style='color:#7ACAAB'><b> #7ACAAB </b># verde claro</li>
    <li style='color:#78a1a1'><b> #78a1a1 </b># azul</li>
    <li style='color:#8ABFB0'><b> #8ABFB0 </b># azul claro</li>
    <li style='color:#FFB27C'><b> #FFB27C </b># cor de pele clara</li>
    <li style='color:#F29877'><b> #F29877 </b># cor de pele</li>
    <li style='color:#ed845e'><b> #ed845e </b># laranja claro1</li>
    <li style='color:#D96236'><b> #D96236 </b># laranja claro2</li>
    <li style='color:#D95323'><b> #D95323 </b># laranja 1</li>
    <li style='color:#D94B18'><b> #D94B18 </b># laranja 2</li>
    <li style='color:#D9C2AD'><b> #D9C2AD </b># bege</li>
    <li style='color:#A66C4B'><b> #A66C4B </b># marrom claro</li>
    <li style='color:#86471B'><b> #86471B </b># marrom1</li>
    <li style='color:#732C02'><b> #732C02 </b># marrom2</li>
    <li style='color:#592202'><b> #592202 </b># marrom escuro1</li>
    <li style='color:#3D1806'><b> #3D1806 </b># marrom escuro2</li>
    <li style='color:#0D0D0D'><b> #0D0D0D </b># preto</li>
</ul> -->

In [4]:
# importing customized color palettes
# (explicit names instead of a wildcard import, so it is clear where each
#  palette used in the charts below comes from and nothing else is shadowed)
from src.MNViz_colors import (
    cores_continente,
    cores_estados,
    cores_pais,
    cores_regioes,
)

In [5]:
# distinct values of the 'order' and 'family' columns
ordens, familias = (NewTable[col].unique() for col in ('order', 'family'))

<br>

## Counting per continent

In [6]:
# make sure every entry of the Country column is a plain string
NewTable['country'] = NewTable['country'].map(str)

In [7]:
# looking good...
# NewTable['continent'].value_counts()

In [8]:
# Yearly counts per continent: number of rows with a non-null `class` in each
# (start_year, continent) group, ordered by continent and then by year.
teste = (
    NewTable.groupby(['start_year', 'continent'])['class']
    .count()
    .reset_index(name='counts')
    .sort_values(['continent', 'start_year'])
)

In [9]:
# Cumulative count per continent.
# groupby(...).cumsum() accumulates `counts` within each continent following the
# frame's current row order (rows are sorted by start_year inside each continent
# by the sort_values call in the previous cell) and aligns the result on the
# index, so it no longer depends on each continent occupying a contiguous block
# of rows, as the previous hand-built list did.
teste['cumulative_sum'] = teste.groupby('continent')['counts'].cumsum()

### Chart: all continents

In [11]:
# multi-selection on the `continent` field, bound to the chart legend
select_continent = alt.selection_multi(fields=['continent'], bind='legend')

# one fixed colour per continent, taken from the custom palette
continent_colors = alt.Scale(
    domain=list(cores_continente.keys()),
    range=list(cores_continente.values()),
)

g1 = (
    alt.Chart(teste, title="Collection's temporal evolution per continent", width=600)
    .mark_line(point=True)
    .encode(
        x=alt.X('start_year', type="ordinal", title='Sampling Year'),
        y=alt.Y(
            'cumulative_sum',
            title='',
            sort=alt.EncodingSortField('counts', op="count", order='descending'),
        ),
        color=alt.Color('continent:N', title='Continent', scale=continent_colors),
        tooltip=alt.Tooltip(['continent', 'start_year', 'counts', 'cumulative_sum']),
        # continents outside the selection are drawn fully transparent
        opacity=alt.condition(select_continent, alt.value(1), alt.value(0)),
    )
    .add_selection(select_continent)
    .configure_title(fontSize=16)
    .configure_axis(labelFontSize=12, titleFontSize=12)
    .configure_legend(labelFontSize=12, titleFontSize=12)
)

# g1.save('./graphs/cumCounts/time-spacial/evolution_per_continent.html')
# g1

### Chart: all continents (ex. South America and missing values)

In [13]:
# multi-selection on the `continent` field, bound to the chart legend
select_continent = alt.selection_multi(fields=['continent'], bind='legend')

# continents left out of this chart: South America and the 'NaN' (missing) bucket
excluded = ['South America', 'NaN']

# colour domain and range restricted to the continents that remain
continents_exSA = [c for c in teste['continent'].unique() if c not in excluded]
colors_exSA = [cores_continente[c] for c in continents_exSA]

g1 = (
    alt.Chart(
        teste[~teste['continent'].isin(excluded)],
        title="Collection's temporal evolution per continent (ex. South America and missing values)",
        width=600,
    )
    .mark_line(point=True)
    .encode(
        x=alt.X('start_year', type="ordinal", title='Sampling Year'),
        y=alt.Y(
            'cumulative_sum',
            title='',
            sort=alt.EncodingSortField('counts', op="count", order='descending'),
            scale=alt.Scale(domain=[0, 80]),
        ),
        color=alt.Color(
            'continent:N',
            title='Continent',
            scale=alt.Scale(domain=continents_exSA, range=colors_exSA),
        ),
        tooltip=alt.Tooltip(['continent', 'start_year', 'counts', 'cumulative_sum']),
        # continents outside the selection are drawn fully transparent
        opacity=alt.condition(select_continent, alt.value(1), alt.value(0)),
    )
    .add_selection(select_continent)
    .configure_point(size=50)
    .configure_title(fontSize=16)
    .configure_axis(labelFontSize=12, titleFontSize=12)
    .configure_legend(labelFontSize=12, titleFontSize=12)
)

# g1.save('./graphs/cumCounts/time-spacial/temporal_evolution_per_OTHER_continents.html')
# g1

<br>

## Counting per Country

In [14]:
# NOTE: temporary fix (some rows for the country Brazil have NaN in the continent column)
temp = NewTable.copy()

# every Brazil row gets 'South America' as its continent
is_brazil = temp['country'] == 'Brazil'
temp.loc[is_brazil, 'continent'] = 'South America'

In [15]:
# Yearly counts per (continent, country): number of rows with a non-null `class`
# in each group, ordered by country and then by year.
teste = (
    temp.groupby(['start_year', 'continent', 'country'])['class']
    .count()
    .reset_index(name='counts')
    .sort_values(['country', 'start_year'])
)

In [16]:
# Cumulative count per country.
# groupby(...).cumsum() accumulates `counts` within each country following the
# frame's current row order (rows are sorted by start_year inside each country
# by the sort_values call in the previous cell) and aligns the result on the
# index, so it no longer depends on each country occupying a contiguous block
# of rows, as the previous hand-built list did.
teste['cumulative_sum'] = teste.groupby('country')['counts'].cumsum()

### Chart: all countries

In [18]:
# multi-selection on the `country` field, bound to the chart legend
select_country = alt.selection_multi(fields=['country'], bind='legend')

# every sampling year in the data: keeps the x-axis fixed while countries are filtered
sampling_years = sorted(teste['start_year'].unique())

# one fixed colour per country, taken from the custom palette
country_colors = alt.Scale(
    domain=list(cores_pais.keys()),
    range=list(cores_pais.values()),
)

g1 = (
    alt.Chart(teste, title="Collection's temporal evolution per country", width=600)
    .mark_line(point=True)
    .encode(
        x=alt.X(
            'start_year',
            type="ordinal",
            title='Sampling Year',
            scale=alt.Scale(domain=sampling_years),
        ),
        y=alt.Y(
            'cumulative_sum',
            title='',
            sort=alt.EncodingSortField('counts', op="count", order='descending'),
            scale=alt.Scale(domain=[0, 6000]),
        ),
        color=alt.Color(
            'country:N',
            title='Country',
            legend=alt.Legend(columns=2, symbolLimit=42),
            scale=country_colors,
        ),
        tooltip=alt.Tooltip(['country', 'start_year', 'counts', 'cumulative_sum']),
    )
    .add_selection(select_country)
    # only the selected countries are kept in the chart
    .transform_filter(select_country)
    .configure_title(fontSize=16)
    .configure_axis(labelFontSize=12, titleFontSize=12)
    .configure_legend(labelFontSize=12, titleFontSize=12)
)

# g1.save('./graphs/cumCounts/time-spacial/temporal_evolution_per_country.html')
# g1

<font color='red' size=4>TEST: multiple selectors (for country and continent)</font>

In [20]:
# selectors (each one bound to the legend of its own layer)
select_continent = alt.selection_multi(fields=['continent'], bind='legend')
select_country = alt.selection_multi(fields=['country'], bind='legend')

# Brazil is left out of this chart
db = teste[teste['country'] != 'Brazil']

# charts
# g1: one line per country, coloured by country; filtered by both selectors
g1 = alt.Chart(db, title="Collection's temporal evolution per country (ex. Brazil)", width=600).mark_line(point=True).encode(
    x= alt.X('start_year', type="ordinal", title='Sampling Year',
             # x domain taken from the full frame (Brazil included)
             scale= alt.Scale(domain=sorted(list(teste['start_year'].unique())))),
    y= alt.Y('cumulative_sum', title='', 
             sort=alt.EncodingSortField('counts', op="count", order='descending'),
             scale= alt.Scale(domain=[0,180])
            ),
    color= alt.Color('country:N', title='Country', 
                legend= alt.Legend(columns=2, symbolLimit=42),
                scale= alt.Scale(domain=list(cores_pais.keys()), range=list(cores_pais.values()))),
    tooltip= alt.Tooltip(['continent', 'country','start_year','counts', 'cumulative_sum']),
    detail= alt.Detail('country:N'),
    opacity= alt.condition(select_country, alt.value(1), alt.value(0.05))
).add_selection(select_country).transform_filter(select_country).transform_filter(select_continent)

# g2: the same lines (still one per country, via `detail`) coloured by continent;
# it owns the continent selector and is filtered by it
g2 = alt.Chart(db, title="Collection's temporal evolution per country (ex. Brazil)", width=600).mark_line(point=True).encode(
    x= alt.X('start_year', type="ordinal", title='Sampling Year',
             scale= alt.Scale(domain=sorted(list(teste['start_year'].unique())))),
    y= alt.Y('cumulative_sum', title='', 
             sort=alt.EncodingSortField('counts', op="count", order='descending'),
             scale= alt.Scale(domain=[0,180])
            ),
    color= alt.Color('continent:N', title='Continent', 
                legend= alt.Legend(columns=2, symbolLimit=42),
                scale= alt.Scale(domain=list(cores_continente.keys()), range=list(cores_continente.values()))),
    tooltip= alt.Tooltip(['continent', 'country','start_year','counts', 'cumulative_sum']),
    detail= alt.Detail('country:N'),
    opacity= alt.condition(select_country, alt.value(1), alt.value(0.05))
).add_selection(select_continent).transform_filter(select_continent)


# creating layers (to make different selectors work together)
# The two layers encode colour with different fields, so the colour scale must be
# resolved independently. The channel is named explicitly: a bare positional
# 'independent' is bound to whichever channel happens to come first in the
# installed Altair version's resolve map, which is not guaranteed to be `color`.
chart = alt.layer(g2, g1).resolve_scale(color='independent').configure_title(fontSize=16).configure_axis(
    labelFontSize=12,
    titleFontSize=12
).configure_legend(
    labelFontSize=12,
    titleFontSize=12
)

# chart.save('./graphs/cumCounts/time-spacial/evolution_per_country-TEST.html')
# chart

### Chart: all countries (ex. Brazil)

In [22]:
# multi-selection on the `country` field, bound to the chart legend
select_country = alt.selection_multi(fields=['country'], bind='legend')

# x domain taken from the full frame (Brazil included)
sampling_years = sorted(teste['start_year'].unique())

# one fixed colour per country, taken from the custom palette
country_colors = alt.Scale(
    domain=list(cores_pais.keys()),
    range=list(cores_pais.values()),
)

g1 = (
    alt.Chart(
        teste[teste['country'] != 'Brazil'],
        title="Collection's temporal evolution per country (ex. Brazil)",
        width=600,
    )
    .mark_line(point=True)
    .encode(
        x=alt.X(
            'start_year',
            type="ordinal",
            title='Sampling Year',
            scale=alt.Scale(domain=sampling_years),
        ),
        y=alt.Y(
            'cumulative_sum',
            title='',
            sort=alt.EncodingSortField('counts', op="count", order='descending'),
            scale=alt.Scale(domain=[0, 200]),
        ),
        color=alt.Color(
            'country:N',
            title='Country',
            legend=alt.Legend(columns=2, symbolLimit=42),
            scale=country_colors,
        ),
        tooltip=alt.Tooltip(['country', 'start_year', 'counts', 'cumulative_sum']),
    )
    .add_selection(select_country)
    # only the selected countries are kept in the chart
    .transform_filter(select_country)
    .configure_title(fontSize=16)
    .configure_axis(labelFontSize=12, titleFontSize=12)
    .configure_legend(labelFontSize=12, titleFontSize=12)
)

# g1.save('./graphs/cumCounts/time-spacial/temporal_evolution_per_country-exBrasil.html')
# g1

<br>

## Counting per Brazilian State

In [23]:
# Brazil only: number of rows with a non-null `class` per (start_year, state, order)
from_brazil = NewTable['country'] == 'Brazil'
teste2 = (
    NewTable.loc[from_brazil]
    .groupby(['start_year', 'state', 'order'])['class']
    .count()
    .reset_index(name='counts')
)

### creating column with brazilian regions

In [24]:
# Maps each value of the `state` column to a Brazilian macro-region code:
# N (North), NE (Northeast), CO (Centre-West), SE (Southeast), S (South).
# The 'NaN' placeholder and the composite labels are mapped as well, because
# the lookup below raises KeyError for any state missing from this dict.
regioes = {
    'NaN':'NaN',
    'Rio de Janeiro':'SE',
    'São Paulo':'SE',
    'Espírito Santo': 'SE',
    'Pernambuco':'NE',
    'Santa Catarina':'S',
    'Amazonas':'N',
    'Goiás':'CO',
    'Roraima':'N',
    'Pará':'N',
    'Mato Grosso':'CO',
    'Acre': 'N',
    'Bahia': 'NE',
    'Minas Gerais': 'SE',
    'Mato Grosso do Sul': 'CO',
    'Paraná': 'S',
    'Rondônia': 'N',
    'Ceará': 'NE',
    'Maranhão': 'NE',  # Maranhão belongs to the Northeast region (was mapped to 'N')
    'Rio Grande do Sul': 'S',
    'Paraíba': 'NE',
    'Distrito Federal': 'CO',
    'Alagoas': 'NE',
    'Amapá':'N',
    'Piauí': 'NE',
    'Brasília': 'CO',
    'Tocantins': 'N',
    'Rio Grande do Norte': 'NE',
    'Sergipe': 'NE',
    'Minas Gerais/Goiás/Distrito Federal': 'CO',
    'Santa Catarina-Rio Grande do Sul': 'S'
}

# creating the region column (an unmapped state raises KeyError on purpose,
# so that it gets added to `regioes` instead of being silently dropped)
teste2['region'] = teste2['state'].apply(lambda x: regioes[str(x)])

# column with the "<state>, <region>" label
teste2['regiao_e_estado'] = teste2['state'] + ', ' + teste2['region']

# labels ordered by region and, within each region, by the sum of counts (both descending);
# only `counts` is summed, instead of summing every column and then selecting it
sorting = (
    teste2.groupby(['regiao_e_estado', 'region'])['counts']
    .sum()
    .reset_index(name='soma')
    .sort_values(['region', 'soma'], ascending=False)['regiao_e_estado']
    .unique()
)

In [25]:
# NOTE: teste2 already holds the information needed here (see the per-region count chart in time_spacial)
# NOTE(review): teste2 has one row per (start_year, state, order) with the number of
# records in its `counts` column; counting `order` here yields the number of such rows
# per year and state, not the number of records. Confirm whether summing `counts`
# was intended; the y-axis domains hard-coded in the charts below are tuned to the
# current values.
teste = (
    teste2.groupby(['start_year', 'regiao_e_estado'])['order']
    .count()
    .reset_index(name='counts')
    .sort_values(['regiao_e_estado', 'start_year'])
)

In [26]:
# Cumulative count per "<state>, <region>" label.
# groupby(...).cumsum() accumulates `counts` within each label following the
# frame's current row order (rows are sorted by start_year inside each label
# by the sort_values call in the previous cell) and aligns the result on the
# index, so it no longer depends on each label occupying a contiguous block
# of rows, as the previous hand-built list did.
teste['cumulative_sum'] = teste.groupby('regiao_e_estado')['counts'].cumsum()

In [27]:
# split the "<state>, <region>" label back into its two parts
label_parts = teste['regiao_e_estado'].map(lambda label: label.split(','))
teste['state'] = label_parts.map(lambda parts: parts[0])
teste['region'] = label_parts.map(lambda parts: parts[1].strip())

### Chart: cumulative counts per Brazilian State

In [29]:
# multi-selection on the `state` field, bound to the chart legend
select_state = alt.selection_multi(fields=['state'], bind='legend')

# every sampling year in the data: keeps the x-axis fixed while states are filtered
sampling_years = sorted(list(teste['start_year'].unique()))

# one fixed colour per state, taken from the custom palette
state_colors = alt.Scale(
    domain=list(cores_estados.keys()),
    range=list(cores_estados.values()),
)

g1 = (
    alt.Chart(teste, title="Collection's temporal evolution per Brazilian State", width=600)
    .mark_line(point=True)
    .encode(
        x=alt.X(
            'start_year',
            type="ordinal",
            title='Sampling Year',
            scale=alt.Scale(domain=sampling_years),
        ),
        y=alt.Y(
            'cumulative_sum',
            title='',
            sort=alt.EncodingSortField('counts', op="count", order='descending'),
            scale=alt.Scale(domain=[0, 140]),
        ),
        color=alt.Color(
            'state:N',
            title='State',
            legend=alt.Legend(columns=2, symbolLimit=42),
            scale=state_colors,
        ),
        tooltip=alt.Tooltip(['region', 'state', 'start_year', 'counts', 'cumulative_sum']),
    )
    .add_selection(select_state)
    # only the selected states are kept in the chart
    .transform_filter(select_state)
    .configure_title(fontSize=16)
    .configure_axis(labelFontSize=12, titleFontSize=12)
    .configure_legend(labelFontSize=12, titleFontSize=12)
    .configure_view(strokeWidth=0)
)

# g1.save('./graphs/cumCounts/time-spacial/temporal_evolution_per_brazilian_state.html')
# g1

<br>

<font color='red' size='5'>TEST</font>

chart with multiple selectors

In [31]:
# selectors (each one bound to the legend of its own layer)
select_region = alt.selection_multi(fields=['region'], bind='legend')
select_state = alt.selection_multi(fields=['state'], bind='legend')


# charts
# g1: one line per state, coloured by state; filtered by both selectors
g1 = alt.Chart(teste, title="Collection's temporal evolution per Brazilian State", width=600).mark_line(point=True).encode(
    x= alt.X('start_year', type="ordinal", title='Sampling Year',
             scale= alt.Scale(domain=sorted(list(teste['start_year'].unique())))),
    y= alt.Y('cumulative_sum', title='', 
             sort=alt.EncodingSortField('counts', op="count", order='descending'),
             scale= alt.Scale(domain=[0,140])),
    color= alt.Color('state:N', title='State', 
                legend= alt.Legend(columns=2, symbolLimit=42),
                scale= alt.Scale(domain=list(cores_estados.keys()), range=list(cores_estados.values()))),
    tooltip= alt.Tooltip(['region', 'state','start_year','counts', 'cumulative_sum']),
    detail= alt.Detail('state:N'),
    opacity= alt.condition(select_state, alt.value(1), alt.value(0.05))
).add_selection(select_state).transform_filter(select_state).transform_filter(select_region)

# g2: the same lines (still one per state, via `detail`) coloured by region;
# it owns the region selector and is filtered by it
g2 = alt.Chart(teste, title="Collection's temporal evolution per Brazilian State", width=600).mark_line(point=True).encode(
    x= alt.X('start_year', type="ordinal", title='Sampling Year',
             scale= alt.Scale(domain=sorted(list(teste['start_year'].unique())))),
    y= alt.Y('cumulative_sum', title='', 
             sort=alt.EncodingSortField('counts', op="count", order='descending'),
             scale= alt.Scale(domain=[0,140])),
    color= alt.Color('region:N', title='Region', 
                legend= alt.Legend(columns=5, symbolLimit=42),
                scale= alt.Scale(domain=list(cores_regioes.keys()), range=list(cores_regioes.values()))),
    tooltip= alt.Tooltip(['region', 'state','start_year','counts', 'cumulative_sum']),
    detail= alt.Detail('state:N'),
    opacity= alt.condition(select_state, alt.value(1), alt.value(0.05))
).add_selection(select_region).transform_filter(select_region)


# creating layers (to make different selectors work together)
# The two layers encode colour with different fields, so the colour scale must be
# resolved independently. The channel is named explicitly: a bare positional
# 'independent' is bound to whichever channel happens to come first in the
# installed Altair version's resolve map, which is not guaranteed to be `color`.
chart = alt.layer(g2, g1).resolve_scale(color='independent').configure_title(fontSize=16).configure_axis(
    labelFontSize=12,
    titleFontSize=12
).configure_legend(
    labelFontSize=12,
    titleFontSize=12
)

# chart.save('./graphs/cumCounts/time-spacial/temporal_evolution_per_state-TEST.html')
# chart

<br>

### Chart: cumulative counts per Brazilian Region

In [32]:
# NOTE(review): teste2 has one row per (start_year, state, order) with the number of
# records in its `counts` column; counting `order` here yields the number of such rows
# per year and region, not the number of records. Confirm whether summing `counts`
# was intended; the y-axis domain hard-coded in the chart below is tuned to the
# current values.
temp = (
    teste2.groupby(['start_year', 'region'])['order']
    .count()
    .reset_index(name='counts')
    .sort_values(['region', 'start_year'])
)

In [33]:
# Cumulative count per region.
# groupby(...).cumsum() accumulates `counts` within each region following the
# frame's current row order (rows are sorted by start_year inside each region
# by the sort_values call in the previous cell) and aligns the result on the
# index, so it no longer depends on each region occupying a contiguous block
# of rows, as the previous hand-built list did.
temp['cumulative_sum'] = temp.groupby('region')['counts'].cumsum()

In [35]:
# multi-selection on the `region` field, bound to the chart legend
select_region = alt.selection_multi(fields=['region'], bind='legend')

g1 = alt.Chart(temp, title="Collection's temporal evolution per Brazilian Region", 
               width=600).mark_line(point=True).encode(
    x= alt.X('start_year', type="ordinal", title='Sampling Year',
             # x domain comes from the frame being plotted (`temp`) rather than from
             # the unrelated global `teste`, which is reassigned throughout the notebook;
             # both are aggregated from teste2, so the set of years is the same
             scale= alt.Scale(domain=sorted(list(temp['start_year'].unique())))),
    y= alt.Y('cumulative_sum', title='', 
             sort=alt.EncodingSortField('counts', op="count", order='descending'),
             scale= alt.Scale(domain=[0,150])),
    color= alt.Color('region:N', title='Region', 
                        legend= alt.Legend(columns=1, symbolLimit=42),
                     # one fixed colour per region, taken from the custom palette
                     scale= alt.Scale(domain=list(cores_regioes.keys()), range=list(cores_regioes.values()))),
    tooltip= alt.Tooltip(['region','start_year','counts', 'cumulative_sum']),
).add_selection(select_region).transform_filter(select_region)  # only the selected regions are kept

g1 = g1.configure_title(fontSize=16).configure_axis(
    labelFontSize=12,
    titleFontSize=12
).configure_legend(
    labelFontSize=12,
    titleFontSize=12
)

# g1.save('./graphs/cumCounts/time-spacial/temporal_evolution_per_region.html')
# g1

<br>

**That's it!**