# Counts per researcher

By **Franklin Oliveira**

-----
This notebook contains all the code necessary to make charts from the `carcinos` database, with a focus on time and space exploration. Here you'll find some basic data treatment and the code for the charts.

Database: <font color='blue'>'Planilha geral Atualizada FINAL 5_GERAL_sendo trabalhada no Google drive.xlsx'</font>


In [1]:
import datetime
import numpy as np
import pandas as pd

from collections import defaultdict

# quick visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Waffle Charts
# from pywaffle import Waffle 
# docs: https://pywaffle.readthedocs.io/en/latest/examples/block_shape_distance_location_and_direction.html

# visualization
import altair as alt

# Renderer selection: the legacy 'notebook' renderer call is kept below for
# reference only; the 'default' renderer is the one actually enabled.
# alt.renderers.enable('notebook')
alt.renderers.enable('default')

# Disable the row limit so the charts below can embed every row of their
# input DataFrames.
# NOTE: this call is the last expression of the cell, so its return value is
# what the cell displays.
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

## Importing data...

In [2]:
# Load the treated database (semicolon-separated, UTF-8 encoded CSV).
NewTable = pd.read_csv(
    './data/treated_db.csv',
    sep=';',
    encoding='utf-8',
    low_memory=False,
)

## Filtering

At least for now, we'll consider only specimens of the order Decapoda (thoroughly revised by the Museum's crew).

In [3]:
# Keep only the rows whose `order` is 'Decapoda'; work on a copy so that later
# changes to `decapoda` never touch `NewTable`.
is_decapoda = NewTable['order'] == 'Decapoda'
decapoda = NewTable.loc[is_decapoda].copy()

<br>

<font size=5>**Color palette**</font>

Colors (per infraorder): 

- <font color='#e26d67'><b>Ascacidae</b></font>
- <font color='#007961'><b>Anomura</b></font>
- <font color='#7a2c39'><b>Achelata</b></font>
- <font color='#b67262'><b>Axiidea</b></font>
- <font color='#ee4454'><b>Brachyura</b></font>
- <font color='#3330b7'><b>Caridea</b></font>
- <font color='#58b5e1'><b>Gebiidea</b></font>
- <font color='#b8e450'><b>Stenopodídea</b></font>
- <font color='#a0a3fd'><b>Astacidae</b></font>
- <font color='#deae9e'><b>Polychelida</b></font>
- <font color='#d867be'><b>Grapsidae</b></font>
- <font color='#fece5f'><b>Xanthoidea</b></font>

In [4]:
# importing customized color palettes
from src.MNViz_colors import *

<br>


## Graphs

---

### counts per determiner per year

In [5]:
# Tally records per determiner and determination year: the number of non-null
# `class` entries in each group becomes the `counts` column.
teste = (
    decapoda
    .groupby(['determiner_full_name', 'det_year'])['class']
    .count()
    .reset_index(name='counts')
)

In [6]:
# NOTE: the chart does not show every count because some names are missing.
x_channel = alt.X('det_year', type='ordinal', title='Determination Year')
y_channel = alt.Y(
    'determiner_full_name',
    type='nominal',
    title='Determiner Name',
    sort=alt.EncodingSortField('counts', op="count", order='descending'),
)
size_channel = alt.Size(
    'counts',
    scale=alt.Scale(range=[15, 500]),  # `range` controls the circle size
)
tooltip_channel = alt.Tooltip(['determiner_full_name', 'det_year', 'counts'])

g1 = (
    alt.Chart(teste, title='Counts per determiner', width=600, height=800)
    .mark_circle()
    .encode(
        x=x_channel,
        y=y_channel,
        size=size_channel,
        tooltip=tooltip_channel,
    )
)

g1.save('./graphs/determiner/counts_per_determiner.html')
g1

<font color='red' size='5'>same chart, now coloring by family too (determiners sorted by total count)</font>

In [7]:
# Same tally as before, additionally broken down by family and infraorder.
teste1 = (
    decapoda
    .groupby(['determiner_full_name', 'det_year', 'family', 'infraorder'])['class']
    .count()
    .reset_index(name='counts')
)

In [8]:
# Colour scale: one fixed colour per family, taken from the imported palette.
family_scale = alt.Scale(
    domain=list(cores_familia_naive.keys()),
    range=list(cores_familia_naive.values()),
)

x_channel = alt.X('det_year', type='ordinal', title='Determination Year')
y_channel = alt.Y(
    'determiner_full_name',
    type='nominal',
    title='Determiner',
    sort=alt.EncodingSortField('counts', op="sum", order='descending'),
)
size_channel = alt.Size(
    'counts:Q',
    title='Counts',
    legend=alt.Legend(orient='right', direction='horizontal'),
    scale=alt.Scale(range=[15, 500]),  # `range` controls the circle size
)
order_channel = alt.Order('counts', sort='descending')  # smaller points in front
color_channel = alt.Color(
    'family:N',
    title="Family",
    scale=family_scale,
    legend=alt.Legend(columns=2, symbolLimit=102),
)
tooltip_channel = alt.Tooltip(['determiner_full_name', 'det_year', 'counts', 'family'])

g = (
    alt.Chart(teste1, title='Counts per determiner (sorted by sum)', width=600, height=1000)
    .mark_circle()
    .encode(
        x=x_channel,
        y=y_channel,
        size=size_channel,
        order=order_channel,
        color=color_channel,
        tooltip=tooltip_channel,
    )
)

# Font sizes shared by the axes and the legends.
font_sizes = dict(labelFontSize=12, titleFontSize=12)
g = (
    g.configure_title(fontSize=16)
    .configure_axis(**font_sizes)
    .configure_legend(**font_sizes)
)

g.save('./graphs/determiner/counts_per_determiner-w-family.html')
g

<br>

<font color='red'>**same chart, now rearranging by the first year the determiner appears in the database**</font>

In [9]:
# Order the rows by determination year, then by determiner name.
teste1 = teste1.sort_values(['det_year', 'determiner_full_name'])

# Remember the order in which determiner names first appear in the sorted rows.
sorting = teste1['determiner_full_name'].unique().tolist()

In [10]:
# Selection bound to the legend on the `family` field; it is also used below
# as the chart's filter.
select_family = alt.selection_multi(fields=['family'], bind='legend')

# database
db = teste1.sort_values('det_year')

# auxiliary values for the encoding channels
x_labels = db['det_year'].unique()
y_labels = sorting
count_values = db['counts'].unique()
# NOTE(review): this multi-value list is passed as the size-scale domain while
# the scale range has two values - confirm this yields the intended sizes.
counts = list(range(min(count_values), max(count_values), 20))

family_scale = alt.Scale(
    domain=list(cores_familia_naive.keys()),
    range=list(cores_familia_naive.values()),
)

x_channel = alt.X(
    'det_year',
    type='ordinal',
    title='Determination Year',
    scale=alt.Scale(domain=x_labels),
)
y_channel = alt.Y(
    'determiner_full_name',
    type='nominal',
    title='Determiner',
    sort=sorting,
    scale=alt.Scale(domain=y_labels),
)
size_channel = alt.Size(
    'counts',
    title='Counts',
    type="quantitative",
    legend=alt.Legend(orient='right', direction='horizontal'),
    scale=alt.Scale(domain=counts, range=[15, 100]),  # `range` controls the circle size
)
order_channel = alt.Order('counts', sort='descending')  # smaller points in front
color_channel = alt.Color(
    'family:N',
    title="Family",
    scale=family_scale,
    legend=alt.Legend(columns=2, symbolLimit=102),
)
tooltip_channel = alt.Tooltip(
    ['determiner_full_name', 'det_year', 'counts', 'infraorder', 'family']
)

g = (
    alt.Chart(db, title='Counts per determiner (rearranged by first year of appearance)',
              width=600, height=1000)
    .mark_circle()
    .encode(
        x=x_channel,
        y=y_channel,
        size=size_channel,
        order=order_channel,
        color=color_channel,
        tooltip=tooltip_channel,
    )
    .add_selection(select_family)
    .transform_filter(select_family)
)

# Font sizes shared by the axes and the legends.
font_sizes = dict(labelFontSize=12, titleFontSize=12)
g = (
    g.configure_title(fontSize=16)
    .configure_axis(**font_sizes)
    .configure_legend(**font_sizes)
)

g.save('./graphs/determiner/counts_per_determiner-w-family-rearranged.html')
g

<br>

### Chart: top 50 determiners 

In [11]:
# Total count per determiner, largest first.
sorting = (
    teste1
    .groupby('determiner_full_name')['counts']
    .sum()
    .reset_index(name='sum')
    .sort_values('sum', ascending=False)
)

# Determiner names in that order.
sort_list = sorting['determiner_full_name'].unique()

In [12]:
# Selection bound to the legend on the `family` field; it is also used below
# as the chart's filter.
select_family = alt.selection_multi(fields=['family'], bind='legend')

# NOTE: no top-50 filter is applied here - every determiner is plotted even
# though the title and the output file name say "Top 50". The filter that was
# left disabled is:
# db = teste1[teste1['determiner_full_name'].isin(sort_list[:50])].sort_values('det_year')
db = teste1.sort_values('det_year')

# auxiliary values for the encoding channels
x_labels = db['det_year'].unique()
y_labels = sort_list
count_values = db['counts'].unique()
counts = list(range(min(count_values), max(count_values), 20))

family_scale = alt.Scale(
    domain=list(cores_familia_naive.keys()),
    range=list(cores_familia_naive.values()),
)

x_channel = alt.X(
    'det_year',
    type='ordinal',
    title='Determination Year',
    scale=alt.Scale(domain=x_labels),
)
y_channel = alt.Y(
    'determiner_full_name',
    type='nominal',
    title='Determiner',
    sort=sort_list,
    scale=alt.Scale(domain=y_labels),
)
size_channel = alt.Size(
    'counts:Q',
    title='Counts',
    legend=alt.Legend(orient='right', direction='horizontal'),
    scale=alt.Scale(domain=counts, range=[15, 100]),  # `range` controls the circle size
)
order_channel = alt.Order('counts', sort='descending')  # smaller points in front
color_channel = alt.Color(
    'family:N',
    title="Family",
    scale=family_scale,
    legend=alt.Legend(columns=2, symbolLimit=102),
)
tooltip_channel = alt.Tooltip(
    ['determiner_full_name', 'det_year', 'counts', 'infraorder', 'family']
)

g = (
    alt.Chart(db, title='Counts per determiner (Top 50)', width=600, height=1000)
    .mark_circle()
    .encode(
        x=x_channel,
        y=y_channel,
        size=size_channel,
        order=order_channel,
        color=color_channel,
        tooltip=tooltip_channel,
    )
    .add_selection(select_family)
    .transform_filter(select_family)
)

# Font sizes shared by the axes and the legends.
font_sizes = dict(labelFontSize=12, titleFontSize=12)
g = (
    g.configure_title(fontSize=16)
    .configure_axis(**font_sizes)
    .configure_legend(**font_sizes)
)

g.save('./graphs/determiner/counts_per_determiner-top50.html')
g

#### Ordering by first year of appearance

In [13]:
# Determiner chart with the y axis ordered by the first determination year in
# which each determiner appears.

# Selection bound to the legend on the `family` field; it is also used below
# as the chart's filter.
select_family = alt.selection_multi(fields=['family'], bind='legend')

# NOTE: no top-50 filter is applied here - every determiner is plotted even
# though the title and the output file name say "Top 50". The filter that was
# left disabled is:
# db = teste1[teste1['determiner_full_name'].isin(sort_list[:50])].sort_values('det_year')
db = teste1.sort_values('det_year')

# auxiliary values for the encoding channels
x_labels = db['det_year'].unique()
# Names in order of appearance in the year-sorted rows.
y_labels = db['determiner_full_name'].unique()
counts = db['counts'].unique()
counts = list(range(min(counts), max(counts), 20))

g = alt.Chart(db, title= 'Counts per determiner (Top 50 - rearranged by first year of appearance)',
            width=600, height=1000).mark_circle().encode(
    x= alt.X('det_year', type='ordinal', title='Determination Year',
             scale= alt.Scale(domain= x_labels)),
    # Sort determiners by their earliest determination year. The year column
    # of the determiner table is `det_year`; `start_year` only exists in the
    # collector tables.
    y= alt.Y('determiner_full_name', type='nominal', title='Determiner',
             sort=alt.EncodingSortField('det_year', op='min', order='ascending'),
             scale= alt.Scale(domain= y_labels)),
    size= alt.Size('counts:Q', title='Counts', legend=alt.Legend(orient= 'right', direction='horizontal'),
                   scale=alt.Scale(domain= counts, range=[15, 100])),  # `range` controls the circle size
    order= alt.Order('counts', sort='descending'),  # smaller points in front
    color= alt.Color('family:N', title="Family",
                     scale=alt.Scale(domain=list(cores_familia_naive.keys()),
                                     range=list(cores_familia_naive.values())),
                     legend= alt.Legend(columns=2, symbolLimit= 102)),
    tooltip= alt.Tooltip(['determiner_full_name', 'det_year', 'counts', 'infraorder','family'])
).add_selection(select_family).transform_filter(select_family)

# configuring graphic elements (font, fontSize, etc)
g = g.configure_title(fontSize=16).configure_axis(
    labelFontSize=12,
    titleFontSize=12
).configure_legend(
    labelFontSize=12,
    titleFontSize=12
)

g.save('./graphs/determiner/counts_per_determiner-top50-rearranged.html')
g

<br>

### Creating chart: counts per collector per year

In [14]:
# Tally records per collector and sampling year: the number of non-null
# `class` entries in each group becomes the `counts` column.
teste = (
    decapoda
    .groupby(['collector_full_name', 'start_year'])['class']
    .count()
    .reset_index(name='counts')
)

In [15]:
x_channel = alt.X('start_year', type='ordinal', title='Sampling Year')
y_channel = alt.Y(
    'collector_full_name',
    type='nominal',
    title='Collector',
    sort=alt.EncodingSortField('counts', op="count", order='descending'),
)
size_channel = alt.Size(
    'counts',
    title='Counts',
    scale=alt.Scale(range=[15, 500]),  # `range` controls the circle size
)
tooltip_channel = alt.Tooltip(['collector_full_name', 'start_year', 'counts'])

g1 = (
    alt.Chart(teste, title='Counts per collector', width=800, height=1400)
    .mark_circle()
    .encode(
        x=x_channel,
        y=y_channel,
        size=size_channel,
        tooltip=tooltip_channel,
    )
)

# Font sizes shared by the axes and the legends.
font_sizes = dict(labelFontSize=12, titleFontSize=12)
g1 = (
    g1.configure_title(fontSize=16)
    .configure_axis(**font_sizes)
    .configure_legend(**font_sizes)
)

g1.save('./graphs/collector/counts_per_collector.html')
g1

<font color='red' size='5'>same chart, now coloring by family</font>


In [16]:
# Tally records per collector, sampling year, infraorder and family: the
# number of non-null `class` entries in each group becomes the `counts` column.
# The tally is built from `decapoda` (the Decapoda-only subset), like every
# other table in this notebook, rather than from the unfiltered `NewTable`.
teste1 = (
    decapoda
    .groupby(['collector_full_name', 'start_year', 'infraorder', 'family'])
    .count()['class']
    .reset_index()
    .rename(columns={'class': 'counts'})
)

In [17]:
# Colour scale: one fixed colour per family, taken from the imported palette.
family_scale = alt.Scale(
    domain=list(cores_familia_naive.keys()),
    range=list(cores_familia_naive.values()),
)

x_channel = alt.X('start_year', type='ordinal', title='Sampling Year')
y_channel = alt.Y(
    'collector_full_name',
    type='nominal',
    title='Collector',
    sort=alt.EncodingSortField('counts', op="sum", order='descending'),
)
size_channel = alt.Size(
    'counts:Q',
    title='Counts',
    legend=alt.Legend(orient='right', direction='horizontal'),
    scale=alt.Scale(range=[15, 500]),  # `range` controls the circle size
)
order_channel = alt.Order('counts', sort='descending')  # smaller points in front
color_channel = alt.Color(
    'family:N',
    title="Family",
    scale=family_scale,
    legend=alt.Legend(columns=2, symbolLimit=102),
)
tooltip_channel = alt.Tooltip(['collector_full_name', 'start_year', 'counts', 'family'])

g = (
    alt.Chart(teste1, title='Counts per collector (sorted by sum)', width=800, height=1400)
    .mark_circle()
    .encode(
        x=x_channel,
        y=y_channel,
        size=size_channel,
        order=order_channel,
        color=color_channel,
        tooltip=tooltip_channel,
    )
)

# Font sizes shared by the axes and the legends.
font_sizes = dict(labelFontSize=12, titleFontSize=12)
g = (
    g.configure_title(fontSize=16)
    .configure_axis(**font_sizes)
    .configure_legend(**font_sizes)
)

g.save('./graphs/collector/counts_per_collector-w-family.html')
g

<br>

<font color='red'>**same chart, now rearranging by the first year the collector appears in the database**</font>

In [18]:
# Order the rows by sampling year, then by collector name.
teste1 = teste1.sort_values(['start_year', 'collector_full_name'])

# Remember the order in which collector names first appear in the sorted rows.
sorting = teste1['collector_full_name'].unique().tolist()

In [19]:
# Colour scale: one fixed colour per family, taken from the imported palette.
family_scale = alt.Scale(
    domain=list(cores_familia_naive.keys()),
    range=list(cores_familia_naive.values()),
)

x_channel = alt.X('start_year', type='ordinal', title='Sampling Year')
# Collectors are ordered by their earliest sampling year.
y_channel = alt.Y(
    'collector_full_name',
    type='nominal',
    title='Collector',
    sort=alt.EncodingSortField('start_year', op="min", order='ascending'),
)
size_channel = alt.Size(
    'counts:Q',
    title='Counts',
    legend=alt.Legend(direction='horizontal', orient='right'),
    scale=alt.Scale(range=[15, 500]),  # `range` controls the circle size
)
order_channel = alt.Order('counts', sort='descending')  # smaller points in front
color_channel = alt.Color(
    'family:N',
    title="Family",
    scale=family_scale,
    legend=alt.Legend(columns=2, symbolLimit=102),
)
tooltip_channel = alt.Tooltip(['collector_full_name', 'start_year', 'counts', 'family'])

g = (
    alt.Chart(teste1, title='Counts per collector (rearranged by first year of appearance)',
              width=800, height=1400)
    .mark_circle()
    .encode(
        x=x_channel,
        y=y_channel,
        size=size_channel,
        order=order_channel,
        color=color_channel,
        tooltip=tooltip_channel,
    )
)

# Font sizes shared by the axes and the legends.
font_sizes = dict(labelFontSize=12, titleFontSize=12)
g = (
    g.configure_title(fontSize=16)
    .configure_axis(**font_sizes)
    .configure_legend(**font_sizes)
)

g.save('./graphs/collector/counts_per_collector-w-family-rearranged.html')
g

<br>

### Chart: top 50 collectors

In [20]:
# Total count per collector, largest first.
sorting = (
    teste1
    .groupby('collector_full_name')['counts']
    .sum()
    .reset_index(name='sum')
    .sort_values('sum', ascending=False)
)

# Collector names in that order.
sort_list = sorting['collector_full_name'].unique()

In [21]:
# Selection bound to the legend on the `family` field; it is also used below
# as the chart's filter.
select_family = alt.selection_multi(fields=['family'], bind='legend')

# Rows belonging to the 50 collectors with the largest totals.
top50_names = sort_list[:50]
top50_rows = teste1[teste1['collector_full_name'].isin(top50_names)]
db = top50_rows.sort_values('start_year')

# auxiliary values for the encoding channels
x_labels = db['start_year'].unique()
y_labels = sort_list[:50]
count_values = db['counts'].unique()
counts = list(range(min(count_values), max(count_values), 200))

family_scale = alt.Scale(
    domain=list(cores_familia_naive.keys()),
    range=list(cores_familia_naive.values()),
)

x_channel = alt.X(
    'start_year',
    type='ordinal',
    title='Sampling Year',
    scale=alt.Scale(domain=x_labels),
)
y_channel = alt.Y(
    'collector_full_name',
    type='nominal',
    title='Collector',
    sort=top50_names,
    scale=alt.Scale(domain=y_labels),
)
size_channel = alt.Size(
    'counts:Q',
    title='Counts',
    legend=alt.Legend(orient='right', direction='horizontal'),
    scale=alt.Scale(domain=counts, range=[15, 115]),  # `range` controls the circle size
)
order_channel = alt.Order('counts', sort='descending')  # smaller points in front
color_channel = alt.Color(
    'family:N',
    title="Family",
    scale=family_scale,
    legend=alt.Legend(columns=2, symbolLimit=102),
)
tooltip_channel = alt.Tooltip(
    ['collector_full_name', 'start_year', 'counts', 'infraorder', 'family']
)

g = (
    alt.Chart(top50_rows, width=800, height=800, title='Counts per collector (Top 50)')
    .mark_circle()
    .encode(
        x=x_channel,
        y=y_channel,
        size=size_channel,
        order=order_channel,
        color=color_channel,
        tooltip=tooltip_channel,
    )
    .add_selection(select_family)
    .transform_filter(select_family)
)

# Font sizes shared by the axes and the legends.
font_sizes = dict(labelFontSize=12, titleFontSize=12)
g = (
    g.configure_title(fontSize=16)
    .configure_axis(**font_sizes)
    .configure_legend(**font_sizes)
)

g.save('./graphs/collector/counts_per_collector-top50.html')
g

<br>

#### ordering by first year of appearance

In [22]:
# Total count per collector and sampling year, earliest year first.
sorting = (
    teste1
    .groupby(['collector_full_name', 'start_year'])['counts']
    .sum()
    .reset_index(name='sum')
    .sort_values('start_year', ascending=True)
)

In [23]:
# Selection bound to the legend on the `family` field; it is also used below
# as the chart's filter.
select_family = alt.selection_multi(fields=['family'], bind='legend')

# Rows belonging to the 50 collectors with the largest totals.
top50_names = sort_list[:50]
top50_rows = teste1[teste1['collector_full_name'].isin(top50_names)]
db = top50_rows.sort_values('start_year')

# auxiliary values for the encoding channels
x_labels = db['start_year'].unique()
# Names in order of appearance in the year-sorted rows, restricted to the top 50.
y_labels = [name for name in db['collector_full_name'].unique() if name in top50_names]
count_values = db['counts'].unique()
counts = list(range(min(count_values), max(count_values), 200))

family_scale = alt.Scale(
    domain=list(cores_familia_naive.keys()),
    range=list(cores_familia_naive.values()),
)

x_channel = alt.X(
    'start_year',
    type='ordinal',
    title='Sampling Year',
    scale=alt.Scale(domain=x_labels),
)
y_channel = alt.Y(
    'collector_full_name',
    type='nominal',
    title='Collector',
    scale=alt.Scale(domain=y_labels),
    sort=alt.EncodingSortField('start_year'),
)
size_channel = alt.Size(
    'counts:Q',
    title='Counts',
    legend=alt.Legend(orient='right', direction='horizontal'),
    scale=alt.Scale(domain=counts, range=[15, 115]),  # `range` controls the circle size
)
order_channel = alt.Order('counts', sort='descending')  # smaller points in front
color_channel = alt.Color(
    'family:N',
    title="Family",
    scale=family_scale,
    legend=alt.Legend(columns=2, symbolLimit=102),
)
tooltip_channel = alt.Tooltip(['collector_full_name', 'start_year', 'counts', 'family'])

g = (
    alt.Chart(top50_rows, width=800, height=800,
              title='Counts per collector (Top 50 - rearranged by first year of appearance)')
    .mark_circle()
    .encode(
        x=x_channel,
        y=y_channel,
        size=size_channel,
        order=order_channel,
        color=color_channel,
        tooltip=tooltip_channel,
    )
    .add_selection(select_family)
    .transform_filter(select_family)
)

# Font sizes shared by the axes and the legends.
font_sizes = dict(labelFontSize=12, titleFontSize=12)
g = (
    g.configure_title(fontSize=16)
    .configure_axis(**font_sizes)
    .configure_legend(**font_sizes)
)

g.save('./graphs/collector/counts_per_collector-top50-rearranged.html')
g

<br>

**The end!**

-----