# Type charts

By **Franklin Oliveira**

-----
This notebook contains all code necessary to make the "type" charts from `poliqueta` database. Here you'll find some basic data treatment and charts' code. 

Database: <font color='blue'>'IBUFRJ27.07.2020 - visualização.xlsx'</font> and <font color='blue'>'MNRJP27.07.2020 - visualização.xls'</font>.

In [1]:
import datetime
import numpy as np
import pandas as pd

from collections import defaultdict

# pacotes para visualização rápida
import seaborn as sns
import matplotlib.pyplot as plt

# pacote para visualização principal
import altair as alt

# habilitando renderizador para notebook
# alt.renderers.enable('notebook')
alt.renderers.enable('default')


# desabilitando limite de linhas
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

## Importing data...

In [2]:
NewTable = pd.read_csv('./data/merged_db.csv', sep=';', encoding='utf-8-sig')

In [3]:
# formatando a string NaN
NewTable['family'] = NewTable['family'].apply(lambda x: 'NaN' if x=='Nan' else x)
NewTable['type'] = NewTable['type'].apply(lambda x: x if str(x) != 'nan' else 'Non-type').astype(str)

<br>

<font size=5>**Color Palette**</font>

In [4]:
# importing customized color palette
from src.MNViz_colors import *

<br>


## Graphs

---

### Types (*per year*) per genus

x: Species1, cor: Type Status1, size: counts

In [5]:
# p.s.: the large majority is non-type
NewTable['type'].value_counts()

Non-type     6811
Paratype      144
Holotype       48
Paratype        1
Neotype         1
Name: type, dtype: int64

In [6]:
# subsetting
teste = NewTable[['min_depth','family','order', 'start_year', 'qualifier', 'catalog_number', 
                  'genus', 'species', 'type']].copy()

# grouping by type, year and order
temp = teste.groupby(['type','start_year', 'family']).count()['species'].reset_index().rename(columns={
    'species':'counts'
})

# p.s.: Cótipo and Topótipo are not types
temp = temp[(temp['type'] != 'Cótipo') & (temp['type'] != 'Topótipo')]

### Gráf. de Tipos

In [8]:
# selector
select_family = alt.selection_multi(fields= ['family'], bind='legend')

# database
db = temp[temp['type'] != 'Non-type']

# auxiliar variables for encoding charts
x_labels = db.sort_values('start_year')['start_year'].unique()
y_labels = db.groupby('type').sum().reset_index().sort_values('counts')['type'].unique()[::-1]
counts = db['counts'].unique()
counts = list(range(min(counts), 20, 1))

tipo = alt.Chart(db, height=150,width=600, title='Types per year').mark_circle().encode(
    x = alt.X('start_year:O', title='Sampling Year',
              scale= alt.Scale(domain= x_labels)),
    y = alt.Y('type:N', title= 'Types',
              scale= alt.Scale(domain= y_labels),
              sort=alt.EncodingSortField('counts', op='sum', order='descending')),
    color= alt.Color('family:N', title='Family',
                     scale=alt.Scale(domain=list(cores_familia.keys()), 
                                     range=list(cores_familia.values())),
                     legend= alt.Legend(columns=4, symbolLimit=102,
                                       direction='horizontal', orient='bottom')), 
    size= alt.Size('counts', title='Counts',
                   scale= alt.Scale(domain= counts, range=[20, 50]),
                   legend= alt.Legend(orient='bottom', direction='horizontal')),
    order= alt.Order('counts', sort='descending'),  # smaller points in front
    tooltip= [alt.Tooltip('type', title='Tipo'),
              alt.Tooltip('start_year', title='Ano de Coleta'),
              alt.Tooltip('counts', title='Contagem')]
).add_selection(select_family).transform_filter(select_family)

tipo = tipo.configure_title(fontSize=16).configure_axis(
    labelFontSize=12,
    titleFontSize=12
).configure_legend(
    labelFontSize=12,
    titleFontSize=12
)

# tipo.save('./graphs/type/types_per_year.html')
# tipo

## Types per Genus 

same graph as above, with gender on Y axis and colored by type

In [9]:
# subsetting
teste = NewTable[['min_depth','family','order', 'start_year', 'qualifier', 'catalog_number', 
                  'genus', 'species', 'type']].copy()

# grouping by type, year and order
temp = teste.groupby(['type','start_year', 'genus', 'family']).count()['order'].reset_index().rename(columns={
    'order':'counts'
})

# p.s.: Cótipo and Topótipo are not types
temp = temp[(temp['type'] != 'Cótipo') & (temp['type'] != 'Topótipo')]

In [10]:
cores_padrao = ['#e45756', '#4c78a8', '#f58518']
tipos = ['Holotype', 'Paratype', 'Neotype']

In [12]:
select_type = alt.selection_multi(fields= ['type'], bind='legend')
select_family = alt.selection_multi(fields= ['family'], bind='legend')

# filtering out non types
db = temp[(temp['type'] != 'nan') & (temp['type'] != 'Non-type')]

# encoding labels
x_labels = db.sort_values('start_year')['start_year'].unique()
y_labels = db.groupby('genus').sum().reset_index().sort_values('counts')['genus'].unique()[::-1]
types = db['type'].unique()
counts = db['counts'].unique()
counts = [counts.min(), counts.max()]
families = db['family'].unique()

tipo = alt.Chart(db, height=500, width= 400, title='Types per Genus').mark_point(filled=False).encode(
    x = alt.X('start_year:O', title='Sampling Year',
              scale= alt.Scale(domain=x_labels)),
    y = alt.Y('genus:N', title= 'Genus',
              sort=alt.EncodingSortField('counts', op='count', order='descending'),
              scale= alt.Scale(domain=y_labels)),
    color= alt.Color('family:N', title='Family',
                    scale= alt.Scale(domain=list(cores_familia.keys()),
                                     range=list(cores_familia.values())),
                    legend= alt.Legend(columns=2, symbolLimit=102, symbolType= 'circle')), 
    size= alt.Size('counts', title= 'Counts',scale= alt.Scale(domain= counts, range=[20,220]),
                   legend= alt.Legend(orient= 'right', direction= 'horizontal')),
    order= alt.Order('counts', title='Counts', sort='descending'),  # smaller points in front
    shape= alt.Shape('type:N', title='Types', 
                     legend= alt.Legend(columns=4),
                     scale= alt.Scale(domain=['Holotype', 'Neotype','Paratype'],
                                      range=['triangle', 'square', 'cross'])),
     tooltip= [alt.Tooltip('genus', title='genus'),
              alt.Tooltip('type', title='type_status'),
              alt.Tooltip('start_year', title='start_year'),
              alt.Tooltip('counts', title='counts')]
).add_selection(select_type, select_family).transform_filter(select_type).transform_filter(select_family)

tipo = tipo.configure_title(fontSize=16).configure_axis(
    labelFontSize=12,
    titleFontSize=12
).configure_legend(
    labelFontSize=12,
    titleFontSize=12
)

# tipo.save('./graphs/type/types_per_genus.html')
# tipo

### rearranging by the first year of appearance

In [13]:
genus_order = list(temp.groupby(['genus']).min()['start_year'].reset_index().sort_values('start_year')['genus'])

In [15]:
select_type = alt.selection_multi(fields= ['type'], bind='legend')
select_family = alt.selection_multi(fields= ['family'], bind='legend')

# filtering out non types
db = temp[(temp['type'] != 'nan') & (temp['type'] != 'Non-type')].sort_values('start_year')

# encoding labels
x_labels = db['start_year'].unique()
y_labels = db['genus'].unique()
types = db['type'].unique()
counts = db['counts'].unique()
counts = [counts.min(), counts.max()]
families = db['family'].unique()

tipo = alt.Chart(db, height=500, width= 400, title='Types per Genus').mark_point(filled=False).encode(
    x = alt.X('start_year:O', title='Sampling Year',
              scale= alt.Scale(domain=x_labels)),
    y = alt.Y('genus:N', title= 'Genus',
              sort=genus_order,
              scale= alt.Scale(domain=y_labels)),
    color= alt.Color('family:N', title='Family',
                    scale= alt.Scale(domain=list(cores_familia.keys()),
                                     range=list(cores_familia.values())),
                    legend= alt.Legend(columns=2, symbolLimit=102, symbolType= 'circle')), 
    size= alt.Size('counts', title='Counts', scale= alt.Scale(domain= counts, range=[20,220]),
                   legend= alt.Legend(orient= 'right', direction= 'horizontal')),
    order= alt.Order('counts', sort='descending'),  # smaller points in front
    shape= alt.Shape('type:N', title='Types', 
                     legend= alt.Legend(columns=4),
                     scale= alt.Scale(domain=['Holotype', 'Neotype','Paratype'],
                                      range=['triangle', 'square', 'cross'])),
    tooltip= [alt.Tooltip('type', title='type_status'),
              alt.Tooltip('start_year', title='start_year'),
              alt.Tooltip('counts', title='counts')]
).add_selection(select_type, select_family).transform_filter(select_type).transform_filter(select_family)

tipo = tipo.configure_title(fontSize=16).configure_axis(
    labelFontSize=12,
    titleFontSize=12
).configure_legend(
    labelFontSize=12,
    titleFontSize=12
)

# tipo.save('./graphs/type/types_per_genus-rearranged.html')
# tipo

## Types per determiner

In [16]:
# subsetting
teste = NewTable[['min_depth','family','order', 'start_year', 'qualifier', 'catalog_number', 
                  'determiner_full_name', 'species', 'type']].copy()

# grouping by type, year and order
temp = teste.groupby(['type','start_year', 'determiner_full_name', 'family']).count()['order'].reset_index().rename(columns={
    'order':'counts'
})

# p.s.: Cótipo and Topótipo are not types
temp = temp[(temp['type'] != 'Cótipo') & (temp['type'] != 'Topótipo')]

In [17]:
determiner_order = list(temp.groupby(['determiner_full_name']).min(
    )['start_year'].reset_index().sort_values('start_year')['determiner_full_name'])

In [19]:
select_type = alt.selection_multi(fields= ['type'], bind='legend')
select_family = alt.selection_multi(fields= ['family'], bind='legend')

# filtering out non types
db = temp[(temp['type'] != 'nan') & (temp['type'] != 'Non-type')].sort_values('start_year')

# encoding labels
x_labels = db.sort_values('start_year')['start_year'].unique()
y_labels = db['determiner_full_name'].unique()
types = db['type'].unique()
counts = db['counts'].unique()
counts = [counts.min(), counts.max()]
families = db['family'].unique()

tipo = alt.Chart(db, height=500, width= 400, title='Types per Determiner').mark_point(filled=False).encode(
    x = alt.X('start_year:O', title='Sampling Year', 
             scale= alt.Scale(domain=x_labels)),
    y = alt.Y('determiner_full_name:N', title= 'Determiner',
              sort=determiner_order, 
              scale= alt.Scale(domain= y_labels)),
    color= alt.Color('family:N', title='Family',
                    scale= alt.Scale(domain=list(cores_familia.keys()), 
                                     range=list(cores_familia.values())),
                    legend= alt.Legend(columns=2, symbolLimit=102)), 
    size= alt.Size('counts:Q', title='Counts', scale=alt.Scale(domain= counts, range=[20,220]),
                   legend= alt.Legend(orient= 'right', direction='horizontal')),
    order= alt.Order('counts', sort='descending'),  # smaller points in front
    shape= alt.Shape('type:N', title='Types', 
                     legend= alt.Legend(columns=4),
                     scale= alt.Scale(domain=['Holotype', 'Neotype','Paratype'],
                                      range=['triangle', 'square', 'cross'])),
    tooltip= [alt.Tooltip('determiner_full_name', title='determiner'),
              alt.Tooltip('type', title='type_status'),
              alt.Tooltip('start_year', title='start_year'),
              alt.Tooltip('counts', title='counts')]
).add_selection(select_type, select_family).transform_filter(select_type).transform_filter(select_family)

tipo = tipo.configure_title(fontSize=16).configure_axis(
    labelFontSize=12,
    titleFontSize=12
).configure_legend(
    labelFontSize=12,
    titleFontSize=12
)

# tipo.save('./graphs/type/types_per_determiner-rearranged.html')
# tipo

## Types per family

In [20]:
# subsetting
teste = NewTable[['min_depth','family','order', 'start_year', 'qualifier', 'catalog_number', 
                  'genus', 'species', 'type']].copy()

# grouping by type, year and order
temp = teste.groupby(['type','start_year', 'family', 'order']).count()['genus'].reset_index().rename(columns={
    'genus':'counts'
})

# p.s.: Cótipo and Topótipo are not types
temp = temp[(temp['type'] != 'Cótipo') & (temp['type'] != 'Topótipo')]

In [21]:
family_order = list(temp.groupby(['family']).min(
    )['start_year'].reset_index().sort_values('start_year')['family'])

In [23]:
select_type = alt.selection_multi(fields= ['type'], bind='legend')
select_family = alt.selection_multi(fields= ['family'], bind='legend')

# filtering out non types
db = temp[(temp['type'] != 'nan') & (temp['type'] != 'Non-type')].sort_values('start_year')

# aux. variables for encoding labels
x_labels = db.sort_values('start_year')['start_year'].unique()
y_labels = db['family'].unique()
types = db['type'].unique()
counts = db['counts'].unique()
counts = [counts.min(), counts.max()]
families = db['family'].unique()

tipo = alt.Chart(db, height=500, width= 400, title='Types per Family').mark_point(filled=False).encode(
    x = alt.X('start_year:O', title='Sampling Year',
              scale= alt.Scale(domain=x_labels)),
    y = alt.Y('family:N', title= 'Family',
              sort=family_order,
              scale= alt.Scale(domain=y_labels)),
    color= alt.Color('family:N', title='Family',
                    scale= alt.Scale(domain=list(cores_familia.keys()), 
                                     range=list(cores_familia.values())),
                    legend= alt.Legend(columns=2, symbolLimit=102)), 
    size= alt.Size('counts:Q', title='Counts', scale=alt.Scale(domain= counts, range=[20,320]),
                   legend= alt.Legend(orient= 'right', direction= 'horizontal')),
    order= alt.Order('counts', sort='descending'),  # smaller points in front
    shape= alt.Shape('type:N', title='Types', 
                     legend= alt.Legend(columns=4),
                     scale= alt.Scale(domain=['Holotype', 'Neotype','Paratype'],
                                      range=['triangle', 'square', 'cross'])),
    tooltip= [alt.Tooltip('family', title='family'),
              alt.Tooltip('type', title='type'),
              alt.Tooltip('start_year', title='start_year'),
              alt.Tooltip('counts', title='counts')]
).add_selection(select_type, select_family).transform_filter(select_type).transform_filter(select_family)

tipo = tipo.configure_title(fontSize=16).configure_axis(
    labelFontSize=12,
    titleFontSize=12
).configure_legend(
    labelFontSize=12,
    titleFontSize=12
)

# tipo.save('./graphs/type/types_per_family.html')
# tipo

<br>

**The end!**

-----