# Counts per researcher

By **Franklin Oliveira**

-----
This notebook contains all the code needed to build the counts-per-researcher charts (determiners and collectors) from the `poliqueta` database. Here you'll find some basic data treatment and the code for each chart. 

Database: <font color='blue'>'IBUFRJ27.07.2020 - visualização.xlsx'</font>.

In [1]:
import datetime
import numpy as np
import pandas as pd

from collections import defaultdict

# quick visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Waffle Charts
# from pywaffle import Waffle 
# docs: https://pywaffle.readthedocs.io/en/latest/examples/block_shape_distance_location_and_direction.html

# visualization
import altair as alt

# enabling notebook renderer
# alt.renderers.enable('notebook')
alt.renderers.enable('default')

# disabling rows limit
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

## Importing data...

In [2]:
# Load the merged database: a semicolon-separated, UTF-8 encoded CSV file.
NewTable = pd.read_csv(
    './data/merged_db.csv',
    sep=';',
    encoding='utf-8',
    low_memory=False,
)

In [3]:
# Family names are cast to strings so the charts' selectors work properly.
# Any spelling of the text "nan" (case-insensitive) is then rewritten as 'NaN'
# so that it matches the key used in the palette dictionary.
NewTable['family'] = (
    NewTable['family']
    .astype(str)
    .pipe(lambda fam: fam.mask(fam.str.lower() == 'nan', 'NaN'))
)

<br>

<font size=5>**Color Palette**</font>


<!-- <div class='row' style='padding-top:20px;'>
    <div class='col-md-6'>
        <img src="./src/img1.jpg" width='400px'>
    </div>
    <div class='col-md-6'>
        <img src="./src/img2.jpg" width='400px'>
    </div>
</div>

<br>

A partir das imagens acima, selecionamos cores (centróides) para criar a paleta de cores. Foram elas: 
<ul>
    <li style='color:#3CA67F'><b> #3CA67F </b># verde</li>
    <li style='color:#7A9FBF'><b> #7A9FBF </b># azul</li>
    <li style='color:#D94814'><b> #D94814 </b># laranja</li>
    <li style='color:#D96236'><b> #D96236 </b># laranja 2</li>
    <li style='color:#F2B999'><b> #F2B999 </b># 'cor de pele'</li>
    <li style='color:#A66C4B'><b> #A66C4B </b># marrom 1</li>
    <li style='color:#732C02'><b> #732C02 </b># marrom 2</li>
</ul>

A partir das cores "centróides", utilizamos a ferramenta Color Crafter para selecionar diferentes "shades" e auxiliar para categorização em diferentes grupos sugeridos pela equipe de Poliquetas do Museu Nacional. 

<ul>
    <li style='color:#3CA67F'><b> Verde: </b> ['#daffef', '#bbebd3', '#9adabc', '#77c8a5', '#57b791', '#3ca67f', '#2a9670', '#238762', '#257a56']</li>
    <li style='color:#7A9FBF'><b> Azul: </b> ['#e7e5df', '#ccd2d8', '#b2c0d0', '#96afc8', '#7a9fbf', '#5d90b6', '#3c81ae', '#0673a4', '#00669a']</li>
    <li style='color:#D94814'><b> laranja: </b> ['#ffbd84', '#ffaa74', '#ff9760', '#ff814b', '#fc6b36', '#eb5824', '#d94814', '#c83b03', '#b73000']</li>
    <li style='color:#D96236'><b> laranja 2: ['#ffeba9', '#ffd391', '#ffbb7b', '#fda468', '#f18e56', '#e57846', '#d96236', '#cc4d28', '#bf381b']</b> </li>
    <li style='color:#F2B999'><b> cor de pele: ['#ffe9c3', '#fbd0ad', '#f2b999', '#e8a287', '#dd8c76', '#d27666', '#c76158', '#bb4d4b', '#ae393e']</b> </li>
    <li style='color:#A66C4B'><b> marrom 1: ['#d9c6af', '#ccad96', '#c1977c', '#b48061', '#a66c4b', '#975b39', '#874c2c', '#774124', '#683720']</b> </li>
    <li style='color:#732C02'><b> marrom 2: ['#eebd93', '#dfa47a', '#d28d60', '#c37746', '#b4622f', '#a3501d', '#92420e', '#823606', '#732c02']</b> </li>
</ul>



**Colors  (antigas):** 

<ul>
    <li style='color:#41A681'><b> #41A681 </b># verde1</li>
    <li style='color:#3CA67F'><b> #3CA67F </b># verde2</li>
    <li style='color:#7ACAAB'><b> #7ACAAB </b># verde claro</li>
    <li style='color:#78a1a1'><b> #78a1a1 </b># azul</li>
    <li style='color:#8ABFB0'><b> #8ABFB0 </b># azul claro</li>
    <li style='color:#FFB27C'><b> #FFB27C </b># cor de pele clara</li>
    <li style='color:#F29877'><b> #F29877 </b># cor de pele</li>
    <li style='color:#ed845e'><b> #ed845e </b># laranja claro1</li>
    <li style='color:#D96236'><b> #D96236 </b># laranja claro2</li>
    <li style='color:#D95323'><b> #D95323 </b># laranja 1</li>
    <li style='color:#D94B18'><b> #D94B18 </b># laranja 2</li>
    <li style='color:#D9C2AD'><b> #D9C2AD </b># bege</li>
    <li style='color:#A66C4B'><b> #A66C4B </b># marrom claro</li>
    <li style='color:#86471B'><b> #86471B </b># marrom1</li>
    <li style='color:#732C02'><b> #732C02 </b># marrom2</li>
    <li style='color:#592202'><b> #592202 </b># marrom escuro1</li>
    <li style='color:#3D1806'><b> #3D1806 </b># marrom escuro2</li>
    <li style='color:#0D0D0D'><b> #0D0D0D </b># preto</li>
</ul>

 -->

In [4]:
# importing customized color palettes
from src.MNViz_colors import *

In [5]:
# # setting the colors according to the spreadsheet (2020.10.01 - IB e MN - Cores visualização.xlsx)
# ordens = NewTable['order'].unique()
# familias = NewTable['family'].unique()

# # # o agrupamento é feito por famílias (ordem daquelas famílias deve assumir certa cor)
# # cores_ordem = {
# #     'Spionida':'#41A681',   # verde
# #     'Sabellida':'#7ACAAB',  # verde claro
# #     'Canalipalpata':'#78a1a1',  # azul
# #     'Amphinomida':'#8ABFB0',  # azul claro
# #     'Eunicida':'#A66C4B', # marrom claro
# #     'Phyllodocida':'#732C02', # marrom2
# #     'Terebellida':'#ed845e', # laranja claro1
# #     'Scolecida':'#D94B18', # laranja 2
# #     np.NAN:'#0D0D0D',  # preto
    
# #     # ordens não citadas na planilha:
# #     'Sipuncula':'#D9C2AD', # bege
# #     'Crassiclitellata':'#FFB27C', # cor de pele clara
# #     'Aspidosiphonida':'#F29877',  # cor de pele
    
# # }

# # paleta de cores por família
# cores_familia = {
#     'Magelonidae':'#238762',    # verde escuro 
#     'Oweniidae':'#3CA67F',      # verde (centroide)  
#     'Chaetopteridae':'#77c8a5', # verde
#     'Amphinomidae':'#bbebd3',   # verde claro
#     'Lumbrineridae':'#e7e5df',  # azul claro 1
#     'Dorvilleidae':'#b2c0d0',   # azul claro2
#     'Oenonidae':'#7A9FBF',      # azul (centroide)
#     'Eunicidae':'#3c81ae',      # azul
#     'Onuphidae':'#00669a',      # azul escuro
#     'Syllidae':'#ffbd84', 
#     'Typhloscolecidae':'#ffaa74', 
#     'Aphroditidae':'#ff9760', 
#     'Acoetidae':'#ff814b', 
#     'Chrysopetalidae':'#fc6b36', 
#     'Eulepethidae':'#eb5824',
#     'Lopadorrhynchidae':'#d94814',  # laranja (centroide)
#     'Polynoidae':'#c83b03',
#     'Nereididae':'#b73000',
#     'Nephtyidae':'#f18e56',
#     'Glyceridae':'#D96236',         # laranja 2 (centroide)
#     'Tomopteridae':'#bf381b',
#     'Serpulidae':'#fbd0ad',
#     'Sabellidae':'#f2b999', # cor de pele (centroide)
#     'Sabellariidae':'#e8a287',
#     'Spionidae':'#d27666',
#     'Ampharetidae':'#b48061',
#     'Pectinariidae':'#a66c4b',  # marrom 1 (centroide),
#     'Trichobranchidae':'#975b39',
#     'Terebellidae':'#874c2c',
#     'Cirratulidae':'#774124',
#     'Flabelligeridae':'#683720',
#     'Sternaspidae':'#eebd93',
#     'Orbiniidae':'#dfa47a',
#     'Opheliidae':'#d28d60',
#     'Capitellidae':'#c37746',
#     'Arenicolidae':'#b4622f',
#     'Cossuridae':'#a3501d',
#     'Scalibregmatidae':'#92420e',
#     'Paraonidae':'#823606',
#     'Maldanidae':'#732c02', # marrom 2 (centroide)
#     'NaN':'#0D0D0D',  # preto
# }

<br>


## Graphs

---

### Creating chart: counts per determiner per year

In [6]:
# Records per determiner and cataloguing year: the non-null 'class' entries
# of each (determiner, year) group are counted and stored as 'counts'.
teste = (
    NewTable
    .groupby(['determiner_full_name', 'cataloged_year'])['class']
    .count()
    .reset_index(name='counts')
)

In [7]:
# p.s.: 2255 missing names
# NewTable['determiner_full_name'].isna().sum()

In [15]:
# the chart does not include all the counts because of missing names
g1 = alt.Chart(teste, width=600, height=500).mark_circle().encode(
    x= alt.X('cataloged_year', type='ordinal', title='Cataloged Year'),
    y= alt.Y('determiner_full_name', type='nominal', title='Determiner Name',
            # rank determiners by their total number of records; op="count" would
            # rank them by the number of rows (years) they appear in instead
            sort=alt.EncodingSortField('counts', op="sum", order='descending')),
    size= alt.Size('counts', scale=alt.Scale(range=[15, 500])),  # range sets the circle size
    tooltip= alt.Tooltip(['determiner_full_name', 'cataloged_year', 'counts'])
)

# g1.save('./graphs/determiner/counts_per_determiner.html')
# g1

### Faceted chart version (needed some more flexibility, so I went for the hconcat version)

In [16]:
# g2 = alt.Chart(teste1, width=400, height=500).mark_circle().encode(
#     x= alt.X('cataloged_year', type='ordinal', title='Cataloged Year'),
#     y= alt.Y('determiner_full_name', type='nominal', title='Determiner Name', 
#             sort=alt.EncodingSortField('counts', op="count", order='descending')),
#     size= alt.Size('counts', type="quantitative",scale=alt.Scale(range=[15, 500])),  # range ajusta tamanho do circulo
#     color= alt.Color('order', type="nominal", title="Order", 
#                      scale=alt.Scale(domain=list(cores_ordem.keys()), range=list(cores_ordem.values()))),
#     tooltip= alt.Tooltip(['determiner_full_name', 'cataloged_year', 'counts', 'order'])
# )

# # saving chart
# # g2.facet(column='collection_prefix').resolve_scale(y='independent'
# #                                     ).save('./graphs/determiner/counts_per_determiner-w-order.html')

# g2.facet(column='collection_prefix').resolve_scale(y='independent', x='independent')

### hconcat: making each chart separately and concatenating them horizontally

In [17]:
# Records per determiner, cataloguing year, collection and family: the
# non-null 'class' entries of each group are counted and stored as 'counts'.
teste1 = (
    NewTable
    .groupby(['determiner_full_name', 'cataloged_year', 'collection_prefix', 'family'])['class']
    .count()
    .reset_index(name='counts')
)

In [19]:
# database
db1 = teste1[teste1['collection_prefix']=='IBUFRJ']  # first spreadsheet
db2 = teste1[teste1['collection_prefix']=='MNRJP']   # second spreadsheet

# aux. variables
## first database
x_labels1 = db1.sort_values('cataloged_year')['cataloged_year'].unique()
# determiners ranked by total counts, largest first; only 'counts' is summed
# because the remaining columns play no role in the ranking
temp = db1.groupby('determiner_full_name')['counts'].sum().sort_values()
y_labels1 = list(temp.index[::-1])

## second database
x_labels2 = db2.sort_values('cataloged_year')['cataloged_year'].unique()
temp = db2.groupby('determiner_full_name')['counts'].sum().sort_values()
y_labels2 = list(temp.index[::-1])

# both databases
temp = set(db1['counts'].unique()).union(db2['counts'].unique())
# NOTE(review): this domain has one entry per 50 counts while the size range
# below has only two values -- confirm the rendered circle sizes are as intended
counts = list(range(min(temp), max(temp), 50))


# selector: clicking a family in the legend filters both panels
select_family = alt.selection_multi(fields= ['family'], bind= 'legend')

# encodings shared by both panels; using the very same definitions (including
# the 'Counts' legend title) keeps the two panels consistent with each other
shared_encodings = dict(
    size= alt.Size('counts', type="quantitative", title='Counts',
                   legend= alt.Legend(orient='right', direction='horizontal'),
                   scale=alt.Scale(domain= counts, range=[20, 120])),  # range sets the circle size
    order= alt.Order('counts', sort='descending'),  # smaller points in front
    color= alt.Color('family:N', title="Family",
                     scale=alt.Scale(domain=list(cores_familia.keys()), range=list(cores_familia.values())),
                     legend= alt.Legend(columns=2, symbolLimit= 42)),
    tooltip= alt.Tooltip(['determiner_full_name', 'cataloged_year', 'counts', 'family']),
)

g2 = alt.Chart(db1, width=500, height=580,
              title='IBUFRJ').mark_circle().encode(
    x= alt.X('cataloged_year', type='ordinal', title='Cataloged Year',
             scale= alt.Scale(domain= x_labels1)),
    y= alt.Y('determiner_full_name', type='nominal', title='Determiner Name',
             scale= alt.Scale(domain= y_labels1),
             sort=alt.EncodingSortField('counts', op="count", order='descending')),
    **shared_encodings
).add_selection(select_family).transform_filter(select_family)

g3 = alt.Chart(db2, width=100, height=580,
              title='MNRJP').mark_circle().encode(
    x= alt.X('cataloged_year', type='ordinal', title='Cataloged Year',
            scale= alt.Scale(domain= x_labels2)),
    y= alt.Y('determiner_full_name', type='nominal', title='Determiner Name',
             scale= alt.Scale(domain= y_labels2),
             sort=alt.EncodingSortField('counts', op="count", order='descending')),
    **shared_encodings
).add_selection(select_family).transform_filter(select_family)

graph = alt.hconcat(g2, g3)

# configuring graphic elements (font, fontSize, etc)
graph = graph.configure_title(fontSize=16).configure_axis(
    labelFontSize=12,
    titleFontSize=12
).configure_legend(
    labelFontSize=12,
    titleFontSize=12
)

# graph.save('./graphs/determiner/counts_per_determiner-wFamilies.html')
# graph

<br>

<font color='red'>**same chart, now sorting by the first year the determiner shows up in the database**</font>

In [20]:
# rows of each collection ordered by cataloguing year, then by determiner name
temp2 = (
    teste1
    .loc[teste1['collection_prefix'].eq('IBUFRJ')]
    .sort_values(by=['cataloged_year', 'determiner_full_name'])
)
temp3 = (
    teste1
    .loc[teste1['collection_prefix'].eq('MNRJP')]
    .sort_values(by=['cataloged_year', 'determiner_full_name'])
)

# determiner names in order of first appearance in the sorted rows
sorting2 = list(temp2['determiner_full_name'].drop_duplicates())
sorting3 = list(temp3['determiner_full_name'].drop_duplicates())

In [22]:
# database
db1 = teste1[teste1['collection_prefix']=='IBUFRJ']  # first spreadsheet
db2 = teste1[teste1['collection_prefix']=='MNRJP']   # second spreadsheet

# aux. variables
## first database
x_labels1 = db1.sort_values('cataloged_year')['cataloged_year'].unique()
y_labels1 = sorting2

## second database
x_labels2 = db2.sort_values('cataloged_year')['cataloged_year'].unique()
y_labels2 = sorting3

# both databases
temp = set(db1['counts'].unique()).union(db2['counts'].unique())
# NOTE(review): this domain has one entry per 50 counts while the size range
# below has only two values -- confirm the rendered circle sizes are as intended
counts = list(range(min(temp), max(temp), 50))

# selector: clicking a family in the legend filters both panels
select_family = alt.selection_multi(fields= ['family'], bind= 'legend')

# encodings shared by both panels; using the very same definitions (including
# the 'Counts' legend title) keeps the two panels consistent with each other
shared_encodings = dict(
    size= alt.Size('counts', type="quantitative", title='Counts',
                   legend= alt.Legend(orient= 'right', direction= 'horizontal'),
                   scale=alt.Scale(domain= counts, range=[20, 120])),  # range sets the circle size
    order= alt.Order('counts', sort='descending'),  # smaller points in front
    color= alt.Color('family:N', title="Family",
                     scale=alt.Scale(domain=list(cores_familia.keys()),
                                     range=list(cores_familia.values())),
                     legend= alt.Legend(columns=2, symbolLimit= 42)),
    tooltip= alt.Tooltip(['determiner_full_name', 'cataloged_year', 'counts', 'family']),
)

g2 = alt.Chart(db1, width=500, height=580,
              title='IBUFRJ').mark_circle().encode(
    x= alt.X('cataloged_year', type='ordinal', title='Cataloged Year',
             scale= alt.Scale(domain= x_labels1)),
    # y_labels1 is the same list as sorting2 (assigned above)
    y= alt.Y('determiner_full_name', type='nominal', title='Determiner Name',
             scale= alt.Scale(domain= y_labels1),
             sort=sorting2),
    **shared_encodings
).add_selection(select_family).transform_filter(select_family)

g3 = alt.Chart(db2, width=100, height=580,
              title='MNRJP').mark_circle().encode(
    x= alt.X('cataloged_year', type='ordinal', title='Cataloged Year',
             scale= alt.Scale(domain= x_labels2)),
    # y_labels2 is the same list as sorting3 (assigned above)
    y= alt.Y('determiner_full_name', type='nominal', title='Determiner Name',
             scale= alt.Scale(domain= y_labels2),
             sort=sorting3),
    **shared_encodings
).add_selection(select_family).transform_filter(select_family)

graph = alt.hconcat(g2, g3)

# configuring graphic elements (font, fontSize, etc)
graph = graph.configure_title(fontSize=16).configure_axis(
    labelFontSize=12,
    titleFontSize=12
).configure_legend(
    labelFontSize=12,
    titleFontSize=12
)

# graph.save('./graphs/determiner/counts_per_determiner-w-family-rearranged.html')
# graph

<br>

### Creating chart: counts per collector per year

In [23]:
# p.s.: 1059 missing names
# NewTable['collector_full_name'].isna().sum()

In [24]:
# Records per collector and sampling year: the non-null 'class' entries of
# each (collector, year) group are counted and stored as 'counts'.
teste = (
    NewTable
    .groupby(['collector_full_name', 'start_year'])['class']
    .count()
    .reset_index(name='counts')
)

In [26]:
# bubble chart: one circle per (collector, sampling year), sized by 'counts'
g1 = alt.Chart(teste, width=500, height=800).mark_circle().encode(
    x= alt.X('start_year', type='ordinal', title='Sampling Year'),
    y= alt.Y('collector_full_name', type='nominal', title='Collector Name',
            # rank collectors by their total number of records; op="count" would
            # rank them by the number of rows (years) they appear in instead
            sort=alt.EncodingSortField('counts', op="sum", order='descending')),
    size= alt.Size('counts', scale=alt.Scale(range=[15, 500])),  # range sets the circle size
    tooltip= alt.Tooltip(['collector_full_name', 'start_year', 'counts'])
)


# configuring graphic elements (font, fontSize, etc)
g1 = g1.configure_title(fontSize=16).configure_axis(
    labelFontSize=12,
    titleFontSize=12
).configure_legend(
    labelFontSize=12,
    titleFontSize=12
)

# g1.save('./graphs/collector/counts_per_collector.html')
# g1

<br>

### Chart: counts per collector with families

In [27]:
# Records per collector, sampling year, family and collection: the non-null
# 'class' entries of each group are counted and stored as 'counts'.
teste1 = (
    NewTable
    .groupby(['collector_full_name', 'start_year', 'family', 'collection_prefix'])['class']
    .count()
    .reset_index(name='counts')
)

In [29]:
# database
db1 = teste1[teste1['collection_prefix']=='IBUFRJ']  # first spreadsheet
db2 = teste1[teste1['collection_prefix']=='MNRJP']   # second spreadsheet

# aux. variables
## first database
x_labels1 = db1.sort_values('start_year')['start_year'].unique()
# collectors ranked by total counts, largest first; only 'counts' is summed
# because the remaining columns play no role in the ranking
temp = db1.groupby('collector_full_name')['counts'].sum().sort_values()
y_labels1 = list(temp.index[::-1])

## second database
x_labels2 = db2.sort_values('start_year')['start_year'].unique()
temp = db2.groupby('collector_full_name')['counts'].sum().sort_values()
y_labels2 = list(temp.index[::-1])

# both databases
temp = set(db1['counts'].unique()).union(db2['counts'].unique())
# NOTE(review): this domain has one entry per 50 counts while the size range
# below has only two values -- confirm the rendered circle sizes are as intended
counts = list(range(min(temp), max(temp), 50))

# selector: clicking a family in the legend filters both panels
# (created here so this cell does not rely on a selector left over from an earlier cell)
select_family = alt.selection_multi(fields= ['family'], bind= 'legend')

# encodings shared by both panels; using the very same definitions (including
# the 'Counts' legend title) keeps the two panels consistent with each other
shared_encodings = dict(
    size= alt.Size('counts', type="quantitative", title='Counts',
                   legend= alt.Legend(orient= 'right', direction='horizontal'),
                   scale=alt.Scale(domain= counts, range=[20, 150])),  # range sets the circle size
    order= alt.Order('counts', sort='descending'),  # smaller points in front
    color= alt.Color('family:N', title="Family",
                     scale=alt.Scale(domain=list(cores_familia.keys()), range=list(cores_familia.values())),
                     legend= alt.Legend(columns=2, symbolLimit= 42)),
    tooltip= alt.Tooltip(['collector_full_name', 'start_year', 'counts', 'family']),
)

g2 = alt.Chart(db1, width=500, height=800,
              title='IBUFRJ').mark_circle().encode(
    x= alt.X('start_year', type='ordinal', title='Sampling Year',
            scale= alt.Scale(domain= x_labels1)),
    y= alt.Y('collector_full_name', type='nominal', title='Collector Name',
            scale= alt.Scale(domain= y_labels1),
            sort=alt.EncodingSortField('counts', op="count", order='descending')),
    **shared_encodings
).add_selection(select_family).transform_filter(select_family)

g3 = alt.Chart(db2, width=500, height=800,
              title='MNRJP').mark_circle().encode(
    x= alt.X('start_year', type='ordinal', title='Sampling Year',
            scale= alt.Scale(domain= x_labels2)),
    y= alt.Y('collector_full_name', type='nominal', title='Collector Name',
            scale= alt.Scale(domain= y_labels2),
            sort=alt.EncodingSortField('counts', op="count", order='descending')),
    **shared_encodings
).add_selection(select_family).transform_filter(select_family)

graph = alt.hconcat(g2, g3)

# configuring graphic elements (font, fontSize, etc)
graph = graph.configure_title(fontSize=16).configure_axis(
    labelFontSize=12,
    titleFontSize=12
).configure_legend(
    labelFontSize=12,
    titleFontSize=12
)

# graph.save('./graphs/collector/counts_per_collector-w-family.html')
# graph

<br>

<font color='red'>**same chart, now rearranging by the first year the collector shows up in the database**</font>

In [30]:
# rows of each collection ordered by sampling year, then by collector name
temp2 = (
    teste1
    .loc[teste1['collection_prefix'].eq('IBUFRJ')]
    .sort_values(by=['start_year', 'collector_full_name'])
)
temp3 = (
    teste1
    .loc[teste1['collection_prefix'].eq('MNRJP')]
    .sort_values(by=['start_year', 'collector_full_name'])
)

# collector names in order of first appearance in the sorted rows
sorting2 = list(temp2['collector_full_name'].drop_duplicates())
sorting3 = list(temp3['collector_full_name'].drop_duplicates())

In [32]:
# one data subset per collection
db1 = teste1[teste1['collection_prefix']=='IBUFRJ']  # first spreadsheet
db2 = teste1[teste1['collection_prefix']=='MNRJP']   # second spreadsheet

# axis domains
## first database
x_labels1 = db1.sort_values('start_year')['start_year'].unique()
y_labels1 = sorting2

## second database
x_labels2 = db2.sort_values('start_year')['start_year'].unique()
y_labels2 = sorting3

# size-scale domain built from the counts of both databases
temp = set(db1['counts'].unique()).union(db2['counts'].unique())
counts = list(range(min(temp), max(temp), 50))

# size, order, color and tooltip are defined identically for both panels
shared_encodings = dict(
    size= alt.Size('counts', type="quantitative", title= 'Counts',
                   legend= alt.Legend(orient= 'right', direction='horizontal', tickCount= 4),
                   scale=alt.Scale(domain= counts, range=[20, 150], zero=True)),  # range sets the circle size
    order= alt.Order('counts', sort='descending'),  # smaller points in front
    color= alt.Color('family:N', title="Family",
                     scale=alt.Scale(domain=list(cores_familia.keys()), range=list(cores_familia.values())),
                     legend= alt.Legend(columns=2, symbolLimit= 42)),
    tooltip= alt.Tooltip(['collector_full_name', 'start_year', 'counts', 'family']),
)

# left panel: IBUFRJ
g2 = (
    alt.Chart(db1, width=500, height=800, title='IBUFRJ')
    .mark_circle()
    .encode(
        x= alt.X('start_year', type='ordinal', title='Sampling Year',
                 scale= alt.Scale(domain= x_labels1)),
        y= alt.Y('collector_full_name', type='nominal', title='Collector Name',
                 scale= alt.Scale(domain= y_labels1),
                 sort=sorting2),
        **shared_encodings
    )
    .add_selection(select_family)
    .transform_filter(select_family)
)

# right panel: MNRJP
g3 = (
    alt.Chart(db2, width=500, height=800, title='MNRJP')
    .mark_circle()
    .encode(
        x= alt.X('start_year', type='ordinal', title='Sampling Year',
                 scale= alt.Scale(domain= x_labels2)),
        y= alt.Y('collector_full_name', type='nominal', title='Collector Name',
                 scale= alt.Scale(domain= y_labels2),
                 sort=sorting3),
        **shared_encodings
    )
    .add_selection(select_family)
    .transform_filter(select_family)
)

# panels side by side, then font sizes for title, axes and legends
graph = (
    alt.hconcat(g2, g3)
    .configure_title(fontSize=16)
    .configure_axis(labelFontSize=12, titleFontSize=12)
    .configure_legend(labelFontSize=12, titleFontSize=12)
)

# graph.save('./graphs/collector/counts_per_collector-rearranged.html')
# graph

<br>

**The end!**

-----

<font color='red' size='5'>**Experimental:** trying to add first and second collectors </font>

In [33]:
# juntando coletor 1 e coletor 2
NewTable['collectors'] = NewTable['collector_full_name'].astype(str) + ', ' + NewTable['collector_full_name2'].astype(str)

In [34]:
# Records per collector pair, sampling year, family and collection: the
# non-null 'class' entries of each group are counted and stored as 'counts'.
teste1 = (
    NewTable
    .groupby(['collectors', 'start_year', 'family', 'collection_prefix'])['class']
    .count()
    .reset_index(name='counts')
)

In [36]:
# bubble chart of counts per collector pair and sampling year, colored by
# family and faceted into one column per collection
g2 = (
    alt.Chart(teste1, width=600, height=900)
    .mark_circle()
    .encode(
        x= alt.X('start_year', type='ordinal', title='Sampling Year'),
        # collector pairs ordered by their summed counts, largest first
        y= alt.Y('collectors', type='nominal', title='Collector 1, Collector 2',
                 sort=alt.EncodingSortField('counts', op="sum", order='descending')),
        size= alt.Size('counts', type="quantitative",
                       scale=alt.Scale(range=[15, 500])),  # range sets the circle size
        order= alt.Order('counts', sort='descending'),  # smaller points in front
        color= alt.Color('family:N', title="Family",
                         scale=alt.Scale(domain=list(cores_familia.keys()),
                                         range=list(cores_familia.values())),
                         legend= alt.Legend(columns=2, symbolLimit=42)),
        tooltip= alt.Tooltip(['collectors', 'start_year', 'counts', 'family'])
    )
    # faceting
    .facet(column='collection_prefix')
    .resolve_scale(y='independent')
)

# saving chart
# g2.save('./graphs/collector/counts_per_collector-1e2.html')
# g2

<br>

<font color='red'>**same chart, now sorting by the first year the researcher shows up in the database**</font>

In [37]:
# ordering the rows by sampling year, then by collector pair
teste1 = teste1.sort_values(by=['start_year', 'collectors'])

# saving the collector pairs in order of first appearance
sorting = list(teste1['collectors'].drop_duplicates())

In [39]:
# same bubble chart, with the collector pairs ordered by the earliest sampling
# year in which they appear; faceted into one column per collection
g2 = (
    alt.Chart(teste1, width=600, height=900)
    .mark_circle()
    .encode(
        x= alt.X('start_year', type='ordinal', title='Sampling Year'),
        y= alt.Y('collectors', type='nominal', title='Collector 1, Collector 2',
                 sort=alt.EncodingSortField('start_year', op="min", order='ascending')),
        size= alt.Size('counts', type="quantitative",
                       scale=alt.Scale(range=[15, 500])),  # range sets the circle size
        order= alt.Order('counts', sort='descending'),  # smaller points in front
        color= alt.Color('family:N', title="Family",
                         scale=alt.Scale(domain=list(cores_familia.keys()),
                                         range=list(cores_familia.values())),
                         legend= alt.Legend(columns=2, symbolLimit=42)),
        tooltip= alt.Tooltip(['collectors', 'start_year', 'counts', 'family'])
    )
    .facet(column='collection_prefix')
    .resolve_scale(y='independent')
)

# saving chart
# g2.save('./graphs/collector/counts_per_collector-1e2-wFamilies.html')
# g2

<br>

**That's it!**