# Counts per researcher

By **Franklin Oliveira**

-----
This notebook contains all the code needed to build the counts-per-researcher charts (determiners and collectors) from the `poliqueta` database. Here you'll find some basic data treatment and the code for each chart.

Database: <font color='blue'>'IBUFRJ27.07.2020 - visualização.xlsx'</font>.

In [1]:
import datetime
import numpy as np
import pandas as pd

from collections import defaultdict

# quick visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Waffle Charts
# from pywaffle import Waffle 
# docs: https://pywaffle.readthedocs.io/en/latest/examples/block_shape_distance_location_and_direction.html

# visualization
import altair as alt

# Renderer selection: the 'notebook' renderer call is kept commented out and
# the 'default' renderer is the one enabled.
# alt.renderers.enable('notebook')
alt.renderers.enable('default')

# Disable Altair's maximum-rows limit for chart data
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

## Importing data...

In [2]:
# Load the merged database: semicolon-separated, UTF-8 encoded CSV.
NewTable = pd.read_csv(
    './data/merged_db.csv',
    sep=';',
    encoding='utf-8',
    low_memory=False,
)

<br>

<font size=5>**Color Palette per Order**</font>

These images were used as inspiration (https://color.adobe.com/create/image)

<img src="./src/img1.jpg" width='500px'>

<img src="./src/img2.jpg" width='500px'>

Cores: 

    '#8ABFB0',  # azul claro
    '#41A681',  # verde
    '#7ACAAB',  # verde claro
    '#D9C2AD',  # bege
    '#0D0D0D',  # preto
    '#D96236',  # laranja
    '#D94B18',  # laranja escuro
    '#FFB27C',  # cor de pele clara
    '#732C02',  # marrom
    '#86471B',  # mostarda
    
cores temporárias:

    '#592202',  # marrom escuro
    '#D96236',  # laranja (mesmo código já listado acima)

In [3]:
# Number of records per value of the `order` column.
# dropna=False keeps the missing values (NaN) as a row of their own, so the
# NaN count can be read directly from the output.
NewTable['order'].value_counts(dropna=False)

Phyllodocida        2430
Eunicida            1496
Sabellida            753
Scolecida            698
Amphinomida          464
Spionida             436
Terebellida          387
NaN                  242
Sipuncula              9
Canalipalpata          6
Crassiclitellata       1
Aspidosiphonida        1
Name: order, dtype: int64

In [4]:
# Distinct values of the `order` column, in order of first appearance.
# NaN (records without an order) is kept as one of the entries.
ordens = NewTable['order'].unique()

# Palette: one hex colour per entry of `ordens`, paired by position.
cores = [
    '#8ABFB0',  # light blue
    '#41A681',  # green
    '#7ACAAB',  # light green
    '#D9C2AD',  # beige
    '#0D0D0D',  # black
    '#D96236',  # orange
    '#D94B18',  # dark orange
    '#FFB27C',  # light skin tone
    '#732C02',  # brown
    '#86471B',  # mustard

    # extra colours added for Canalipalpata and Aspidosiphonida
    # (pairing is positional, hence arbitrary - TODO: pin it explicitly)
    '#592202',
    '#D96236',  # same hex as the orange above
]

# Fail with an explicit message (instead of a bare IndexError) when the data
# contains more distinct orders than the palette has colours.
if len(ordens) > len(cores):
    raise ValueError(
        f"{len(ordens)} distinct orders but only {len(cores)} colours in `cores`; "
        "add colours to the palette."
    )

# order -> colour lookup, pairing `ordens` and `cores` by position
cores_ordem = dict(zip(ordens, cores))

<br>


## Graphs

---

<br>

### Creating chart: counts per determiner per year

In [22]:
# Records per determiner and cataloguing year: tally the non-null `class`
# entries inside each (determiner, year) group and expose them as `counts`.
teste = (
    NewTable
    .groupby(['determiner_full_name', 'cataloged_year'])['class']
    .count()
    .reset_index(name='counts')
)

In [23]:
# Number of records with no determiner name (NaN).
# pandas groupby drops missing keys by default, so these records are absent
# from the grouped tables.
NewTable['determiner_full_name'].isna().sum()

2255

In [24]:
# The chart does not cover every record: rows with a missing determiner name
# are not part of `teste`.
g1 = (
    alt.Chart(teste, width=600, height=500)
    .mark_circle()
    .encode(
        x=alt.X('cataloged_year', type='ordinal', title='Cataloged Year'),
        y=alt.Y(
            'determiner_full_name',
            type='nominal',
            title='Determiner Name',
            # names ordered by the descending `count` aggregate
            sort=alt.EncodingSortField('counts', op="count", order='descending'),
        ),
        # range sets the smallest / largest circle size
        size=alt.Size('counts', scale=alt.Scale(range=[15, 500])),
        tooltip=alt.Tooltip(['determiner_full_name', 'cataloged_year', 'counts']),
    )
)

g1.save('./graphs/determiner/counts_per_determiner.html')

g1

<font color='red' size='5'>mesmo gráfico, ordenando também pela ordem</font>

**OBS:** Note que

- as contagens mudam (porque estamos agrupando por um fator a mais)
- há pontos sobrepostos (semelhante ao que tínhamos para a base crustaceas - para um mesmo ano, um mesmo pesquisador descobriu animais de ordens/famílias diferentes)

In [30]:
# Same tally as before, now split further by collection prefix and order:
# non-null `class` entries per (determiner, year, collection, order) group.
teste1 = (
    NewTable
    .groupby(['determiner_full_name', 'cataloged_year', 'collection_prefix', 'order'])['class']
    .count()
    .reset_index(name='counts')
)

In [35]:
# Bubble chart: one circle per row of `teste1`; circle size encodes `counts`
# and colour encodes the order (palette `cores`, matched to `ordens` by position).
g2 = (
    alt.Chart(teste1, width=400, height=500)
    .mark_circle()
    .encode(
        x=alt.X('cataloged_year', type='ordinal', title='Cataloged Year'),
        y=alt.Y(
            'determiner_full_name',
            type='nominal',
            title='Determiner Name',
            sort=alt.EncodingSortField('counts', op="count", order='descending'),
        ),
        # range sets the smallest / largest circle size
        size=alt.Size('counts', type="quantitative", scale=alt.Scale(range=[15, 500])),
        color=alt.Color(
            'order',
            type="nominal",
            title="Order",
            scale=alt.Scale(domain=ordens, range=cores),
        ),
        tooltip=alt.Tooltip(['determiner_full_name', 'cataloged_year', 'counts', 'order']),
    )
)

# One column per collection prefix, each panel with its own y axis
g2_by_collection = g2.facet(column='collection_prefix').resolve_scale(y='independent')

# saving chart
g2_by_collection.save('./graphs/determiner/counts_per_determiner-w-order.html')

g2_by_collection

<br>

<font color='red'>**mesmo gráfico, agora ordenando pelo primeiro ano que o pesquisador aparece na base**</font>

In [36]:
# Order rows by cataloguing year, then by determiner name.
# The sorted copy is re-bound to `teste1` instead of sorting in place, so the
# frame object built by the earlier cell is not mutated behind its back.
teste1 = teste1.sort_values(['cataloged_year', 'determiner_full_name'])

# Determiner names in order of first appearance in the sorted frame,
# i.e. ordered by the first year in which each name shows up.
sorting = list(teste1['determiner_full_name'].unique())

In [40]:
# Same bubble chart, with the y axis following the explicit name list `sorting`.
g2 = (
    alt.Chart(teste1, width=400, height=500)
    .mark_circle()
    .encode(
        x=alt.X('cataloged_year', type='ordinal', title='Cataloged Year'),
        y=alt.Y(
            'determiner_full_name',
            type='nominal',
            title='Determiner Name',
            sort=sorting,
        ),
        # range sets the smallest / largest circle size
        size=alt.Size('counts', type="quantitative", scale=alt.Scale(range=[15, 500])),
        color=alt.Color(
            'order',
            type="nominal",
            title="Order",
            scale=alt.Scale(domain=ordens, range=cores),
        ),
        tooltip=alt.Tooltip(['determiner_full_name', 'cataloged_year', 'counts', 'order']),
    )
)

# One column per collection prefix, each panel with its own y axis
g2_by_collection = g2.facet(column='collection_prefix').resolve_scale(y='independent')

# saving chart
g2_by_collection.save('./graphs/determiner/counts_per_determiner-c-ordem_reordenado.html')

g2_by_collection

<br>

### Gráfico dos determinadores mais expressivos 

<font color='red' size='5'>mesmo gráfico, ordenado pela soma das contagens </font>

In [41]:
# Total number of records per determiner, largest first.
# Only the `counts` column is summed: aggregating the whole frame would also
# "sum" the year and the text columns (order, collection prefix), which is
# wasted work and can fail on non-numeric data.
sorting = (
    teste1
    .groupby('determiner_full_name')['counts']
    .sum()
    .reset_index(name='sum')
    .sort_values('sum', ascending=False)
)

# Determiner names ordered by their total
sort_list = sorting['determiner_full_name'].unique()

# sorting.head()

In [43]:
# Rows whose determiner appears in `sort_list`
teste1_listed = teste1[teste1['determiner_full_name'].isin(sort_list)]

# Same bubble chart, with the y axis following `sort_list` (names by total records).
g2 = (
    alt.Chart(teste1_listed, width=400, height=600)
    .mark_circle()
    .encode(
        x=alt.X('cataloged_year', type='ordinal', title='Cataloged Year'),
        y=alt.Y(
            'determiner_full_name',
            type='nominal',
            title='Determiner Name',
            sort=sort_list,
        ),
        # range sets the smallest / largest circle size
        size=alt.Size('counts', type="quantitative", scale=alt.Scale(range=[15, 500])),
        color=alt.Color(
            'order',
            type="nominal",
            title="Order",
            scale=alt.Scale(domain=ordens, range=cores),
        ),
        tooltip=alt.Tooltip(['determiner_full_name', 'cataloged_year', 'counts', 'order']),
    )
)

# One column per collection prefix, each panel with its own y axis
g2_by_collection = g2.facet(column='collection_prefix').resolve_scale(y='independent')

# saving chart
g2_by_collection.save('./graphs/determiner/counts_per_determiner-ordenado_pela_soma.html')

g2_by_collection

<br>

### Creating chart: counts per collector per year

<font color='red' size='5'> Collectors' names are sensitive data. Do not publish them without the curator's permission. </font>

In [44]:
# Number of records with no collector name (NaN).
# pandas groupby drops missing keys by default, so these records are absent
# from the grouped tables.
NewTable['collector_full_name'].isna().sum()

1439

In [45]:
# Records per collector and sampling year: tally the non-null `class` entries
# inside each (collector, start_year) group and expose them as `counts`.
teste = (
    NewTable
    .groupby(['collector_full_name', 'start_year'])['class']
    .count()
    .reset_index(name='counts')
)

In [46]:
# Bubble chart of records per collector and sampling year.
g1 = (
    alt.Chart(teste, width=600, height=600)
    .mark_circle()
    .encode(
        x=alt.X('start_year', type='ordinal', title='Sampling Year'),
        y=alt.Y(
            'collector_full_name',
            type='nominal',
            title='Collector Name',
            sort=alt.EncodingSortField('counts', op="count", order='descending'),
        ),
        # range sets the smallest / largest circle size
        size=alt.Size('counts', scale=alt.Scale(range=[15, 500])),
        tooltip=alt.Tooltip(['collector_full_name', 'start_year', 'counts']),
    )
)

# Export and display are left disabled: collector names are sensitive data.
# g1.save('./graphs/collector/counts_per_collector.html')

# g1

<font color='red' size='5'>mesmo gráfico, ordenando também pela ordem</font>

**OBS:** Note que

- as contagens mudam (porque estamos agrupando por um fator a mais)
- há pontos sobrepostos (semelhante ao que tínhamos para a base crustaceas - para um mesmo ano, um mesmo pesquisador descobriu animais de ordens/famílias diferentes)

In [47]:
# Collector tally split further by order and collection prefix:
# non-null `class` entries per (collector, start_year, order, collection) group.
teste1 = (
    NewTable
    .groupby(['collector_full_name', 'start_year', 'order', 'collection_prefix'])['class']
    .count()
    .reset_index(name='counts')
)

In [48]:
# Bubble chart per collector, coloured by order and faceted into one column per
# collection prefix (each panel keeps its own y axis).
g2 = (
    alt.Chart(teste1, width=600, height=600)
    .mark_circle()
    .encode(
        x=alt.X('start_year', type='ordinal', title='Sampling Year'),
        y=alt.Y(
            'collector_full_name',
            type='nominal',
            title='Collector Name',
            sort=alt.EncodingSortField('counts', op="count", order='descending'),
        ),
        # range sets the smallest / largest circle size
        size=alt.Size('counts', type="quantitative", scale=alt.Scale(range=[15, 500])),
        color=alt.Color(
            'order',
            type="nominal",
            title="Order",
            scale=alt.Scale(domain=ordens, range=cores),
        ),
        tooltip=alt.Tooltip(['collector_full_name', 'start_year', 'counts', 'order']),
    )
    # faceting
    .facet(column='collection_prefix')
    .resolve_scale(y='independent')
)

# saving chart (left disabled: collector names are sensitive data)
# g2.save('./graphs/collector/counts_per_collector-w-order.html')

g2

<br>

<font color='red'>**mesmo gráfico, agora ordenando pelo primeiro ano que o pesquisador aparece na base**</font>

In [20]:
# Order rows by sampling year, then by collector name.
# The sorted copy is re-bound to `teste1` instead of sorting in place, so the
# frame object built by the earlier cell is not mutated behind its back.
teste1 = teste1.sort_values(['start_year', 'collector_full_name'])

# Collector names in order of first appearance in the sorted frame,
# i.e. ordered by the first year in which each name shows up.
sorting = list(teste1['collector_full_name'].unique())

In [22]:
# Same collector chart, with names ordered by their earliest sampling year
# (ascending `min` of start_year), faceted by collection prefix.
g2 = (
    alt.Chart(teste1, width=600, height=600)
    .mark_circle()
    .encode(
        x=alt.X('start_year', type='ordinal', title='Sampling Year'),
        y=alt.Y(
            'collector_full_name',
            type='nominal',
            title='Collector Name',
            sort=alt.EncodingSortField('start_year', op="min", order='ascending'),
        ),
        # range sets the smallest / largest circle size
        size=alt.Size('counts', type="quantitative", scale=alt.Scale(range=[15, 500])),
        color=alt.Color(
            'order',
            type="nominal",
            title="Order",
            scale=alt.Scale(domain=ordens, range=cores),
        ),
        tooltip=alt.Tooltip(['collector_full_name', 'start_year', 'counts', 'order']),
    )
    .facet(column='collection_prefix')
    .resolve_scale(y='independent')
)

# saving chart (left disabled: collector names are sensitive data)
# g2.save('./graphs/collector/counts_per_researcher-c-ordem_reordenado.html')

g2

<br>

### Gráfico dos coletores mais expressivos

In [23]:
# Total number of records per collector, largest first.
# Only the `counts` column is summed: aggregating the whole frame would also
# "sum" the year and the text columns (order, collection prefix), which is
# wasted work and can fail on non-numeric data.
sorting = (
    teste1
    .groupby('collector_full_name')['counts']
    .sum()
    .reset_index(name='sum')
    .sort_values('sum', ascending=False)
)

# Collector names ordered by their total
sort_list = sorting['collector_full_name'].unique()

sorting.head()

Unnamed: 0,collector_full_name,sum
106,Ricardo Guimaraes,956
93,Paulo Paiva,781
20,Cenpes Petrobras,531
55,Guilherme Pereira-filho,444
101,Projeto Revizee/score-central,308


In [25]:
# How many of the top collectors (by total records) the chart shows
n_top = 50
top_collectors = list(sort_list[:n_top])

# The data is restricted to the same names used for the y-axis order.
# Filtering on the full `sort_list` kept every collector, so the chart of the
# "most expressive" collectors still listed all of them, with only the first
# 50 sorted.
g2 = (
    alt.Chart(
        teste1[teste1['collector_full_name'].isin(top_collectors)],
        width=600,
        height=600,
    )
    .mark_circle()
    .encode(
        x=alt.X('start_year', type='ordinal', title='Sampling Year'),
        y=alt.Y('collector_full_name', type='nominal', title='Collector Name',
                sort=top_collectors),
        # range sets the smallest / largest circle size
        size=alt.Size('counts', type="quantitative", scale=alt.Scale(range=[15, 500])),
        color=alt.Color('order', type="nominal", title="Order",
                        scale=alt.Scale(domain=ordens, range=cores)),
        tooltip=alt.Tooltip(['collector_full_name', 'start_year', 'counts', 'order']),
    )
)

# One column per collection prefix, each panel with its own y axis
g2 = g2.facet(column='collection_prefix').resolve_scale(y='independent')

# saving chart (left disabled: collector names are sensitive data)
# g2.save('./graphs/collector/counts_per_collector-ordenado_pela_soma.html')

g2

<br>

**The end!**

-----

<font color='red' size='5'>**Experimental:** fazendo com primeiro e segundo nome dos coletores </font>

problema: temos poucos nomes para o segundo coletor

In [26]:
# Combine collector 1 and collector 2 into a single label.
# Casting with astype(str) turns NaN into the literal text 'nan', which would
# surface as labels such as 'Name, nan' or 'nan, nan'. Missing values are
# therefore handled explicitly:
#   both names present  -> 'Collector 1, Collector 2'
#   only collector 1    -> 'Collector 1'
#   collector 1 missing -> NaN (such rows are dropped by a default groupby)
first_collector = NewTable['collector_full_name']
second_collector = NewTable['collector_full_name2']
joined = first_collector.astype(str) + ', ' + second_collector.astype(str)

NewTable['collectors'] = (
    joined
    .where(second_collector.notna(), first_collector)  # no second name: keep the first alone
    .where(first_collector.notna())                    # no first name: leave as missing
)

In [27]:
# Tally for the combined collector label: non-null `class` entries per
# (collectors, start_year, order, collection) group, exposed as `counts`.
teste1 = (
    NewTable
    .groupby(['collectors', 'start_year', 'order', 'collection_prefix'])['class']
    .count()
    .reset_index(name='counts')
)

In [32]:
# Bubble chart per combined collector label, names ordered by the descending
# sum of `counts`, faceted into one column per collection prefix.
g2 = (
    alt.Chart(teste1, width=600, height=900)
    .mark_circle()
    .encode(
        x=alt.X('start_year', type='ordinal', title='Sampling Year'),
        y=alt.Y(
            'collectors',
            type='nominal',
            title='Collector 1, Collector 2',
            sort=alt.EncodingSortField('counts', op="sum", order='descending'),
        ),
        # range sets the smallest / largest circle size
        size=alt.Size('counts', type="quantitative", scale=alt.Scale(range=[15, 500])),
        color=alt.Color(
            'order',
            type="nominal",
            title="Order",
            scale=alt.Scale(domain=ordens, range=cores),
        ),
        tooltip=alt.Tooltip(['collectors', 'start_year', 'counts', 'order']),
    )
    # faceting
    .facet(column='collection_prefix')
    .resolve_scale(y='independent')
)

# saving chart (left disabled: collector names are sensitive data)
# g2.save('./graphs/collector/counts_per_collector-w-order-coletor1e2.html')

g2

<br>

<font color='red'>**mesmo gráfico, agora ordenando pelo primeiro ano que o pesquisador aparece na base**</font>

In [30]:
# Order rows by sampling year, then by the combined collector label.
# The sorted copy is re-bound to `teste1` instead of sorting in place, so the
# frame object built by the earlier cell is not mutated behind its back.
teste1 = teste1.sort_values(['start_year', 'collectors'])

# Collector labels in order of first appearance in the sorted frame,
# i.e. ordered by the first year in which each label shows up.
sorting = list(teste1['collectors'].unique())

In [31]:
# Combined-collector chart with labels ordered by their earliest sampling year
# (ascending `min` of start_year).
g2 = alt.Chart(teste1, width=600, height=900).mark_circle().encode(
    x=alt.X('start_year', type='ordinal', title='Sampling Year'),
    y=alt.Y('collectors', type='nominal', title='Collector 1, Collector 2',
            sort=alt.EncodingSortField('start_year', op="min", order='ascending')),
    # range sets the smallest / largest circle size
    size=alt.Size('counts', type="quantitative", scale=alt.Scale(range=[15, 500])),
    color=alt.Color('order', type="nominal", title="Order",
                    scale=alt.Scale(domain=ordens, range=cores)),
    tooltip=alt.Tooltip(['collectors', 'start_year', 'counts', 'order'])
)

# One column per collection prefix, each panel with its own y axis
g2 = g2.facet(column='collection_prefix').resolve_scale(y='independent')

# saving chart
# Disabled, like every other collector export in this notebook: collector names
# are sensitive data and must not be published without the curator's permission.
# g2.save('./graphs/collector/counts_per_researcher-c-ordem_reordenado-coletor1e2.html')

g2