# Type charts

By **Franklin Oliveira**

-----
This notebook contains all the code necessary to make the "type" charts from the `poliqueta` database. Here you'll find some basic data treatment and the code for the charts.

Database: <font color='blue'>'IBUFRJ27.07.2020 - visualização.xlsx'</font> and <font color='blue'>'MNRJP27.07.2020 - visualização.xls'</font>.

In [1]:
import datetime
import numpy as np
import pandas as pd

from collections import defaultdict

# pacotes para visualização rápida
import seaborn as sns
import matplotlib.pyplot as plt

# pacote para visualização principal
import altair as alt

# enabling the renderer for the notebook
# alt.renderers.enable('notebook')
alt.renderers.enable('default')


# disabling the row limit; this call is the last expression of the cell,
# so its return value is what the cell displays
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

## Importing data...

In [2]:
# read the merged database (semicolon-delimited, decoded as 'utf-8-sig')
NewTable = pd.read_csv(
    './data/merged_db.csv',
    sep=';',
    encoding='utf-8-sig',
)

<br>

<font size=5>**Color Palette per Order**</font>

These images were used as inspiration (https://color.adobe.com/create/image)


<div class='row' style='padding-top:20px;'>
    <div class='col-md-6'>
        <img src="./src/img1.jpg" width='400px'>
    </div>
    <div class='col-md-6'>
        <img src="./src/img2.jpg" width='400px'>
    </div>
</div>

<br>

**Colors:** 

<ul>
    <li style='color:#41A681'><b> #41A681 </b># verde</li>
    <li style='color:#7ACAAB'><b> #7ACAAB </b># verde claro</li>
    <li style='color:#78a1a1'><b> #78a1a1 </b># azul</li>
    <li style='color:#8ABFB0'><b> #8ABFB0 </b># azul claro</li>
    <li style='color:#FFB27C'><b> #FFB27C </b># cor de pele clara</li>
    <li style='color:#F29877'><b> #F29877 </b># cor de pele</li>
    <li style='color:#ed845e'><b> #ed845e </b># laranja claro1</li>
    <li style='color:#D96236'><b> #D96236 </b># laranja claro2</li>
    <li style='color:#D95323'><b> #D95323 </b># laranja 1</li>
    <li style='color:#D94B18'><b> #D94B18 </b># laranja 2</li>
    <li style='color:#D9C2AD'><b> #D9C2AD </b># bege</li>
    <li style='color:#A66C4B'><b> #A66C4B </b># marrom claro</li>
    <li style='color:#86471B'><b> #86471B </b># marrom1</li>
    <li style='color:#732C02'><b> #732C02 </b># marrom2</li>
    <li style='color:#592202'><b> #592202 </b># marrom escuro1</li>
    <li style='color:#3D1806'><b> #3D1806 </b># marrom escuro2</li>
    <li style='color:#0D0D0D'><b> #0D0D0D </b># preto</li>
</ul>



In [3]:
# colors are assigned according to the spreadsheet (2020.10.01 - IB e MN - Cores visualização.xlsx)
# distinct values found in the `order` column
ordens = NewTable.loc[:, 'order'].unique()

# the grouping is done by families (the order of those families must take a given color)
cores_ordem = {
    'Spionida': '#41A681',          # green
    'Sabellida': '#7ACAAB',         # light green
    'Canalipalpata': '#78a1a1',     # blue
    'Amphinomida': '#8ABFB0',       # light blue
    'Eunicida': '#A66C4B',          # light brown
    'Phyllodocida': '#732C02',      # brown 2
    'Terebellida': '#ed845e',       # light orange 1
    'Scolecida': '#D94B18',         # orange 2
    # missing order; `np.nan` is the canonical lowercase spelling, valid in every
    # NumPy release (uppercase aliases are not guaranteed to exist in NumPy 2.x)
    np.nan: '#0D0D0D',              # black

    # orders not mentioned in the spreadsheet:
    'Sipuncula': '#D9C2AD',         # beige
    'Crassiclitellata': '#FFB27C',  # light skin tone
    'Aspidosiphonida': '#F29877',   # skin tone
}

In [4]:
# OLD color palette

# ordens = NewTable['order'].unique()
# cores = [
#     '#8ABFB0',  # light blue
#     '#41A681',  # green
#     '#7ACAAB',  # light green
#     '#D9C2AD',  # beige
#     '#0D0D0D',  # black
#     '#D96236',  # orange
#     '#D94B18',  # dark orange
#     '#FFB27C',  # light skin tone
#     '#732C02',  # brown
#     '#86471B',  # mustard
    
#     # new colors for Canalipalpata and Aspidosiphonida (the order is random. Fix it later)
#     '#592202',
#     '#D96236'
# ]

# cores_ordem = defaultdict()
# for j in range(len(ordens)):
#     ordem = ordens[j]
#     cores_ordem[ordem] = cores[j]
    
# cores_ordem = dict(cores_ordem)

<br>


## Graphs

---

### Types (*per year*) per genus

x: Species1, color: Type Status1, size: counts

In [5]:
# p.s.: the large majority is non-type
# (NaN entries are dropped from the tally, so only typed records are listed)
NewTable['type'].value_counts(dropna=True)

Paratype    119
Holotype     35
Neotype       1
Name: type, dtype: int64

In [6]:
# list every column available in the loaded table
NewTable.columns

Index(['Unnamed: 0', 'catalog_number', 'cataloged_date', 'Determined date 1',
       'Start Date', 'start_year', 'start_month', 'determined_year',
       'determined_month', 'cataloged_month', 'cataloged_year', 'class',
       'kingdom', 'genus', 'species', 'family', 'phylum', 'order', 'type',
       'author', 'author_year', 'determiner_full_name', 'collector_full_name',
       'collector_full_name2', 'collector_first_name', 'collector_last_name',
       'qualifier', 'min_depth', 'max_depth', 'lat', 'long', 'continent',
       'water_body', 'country', 'state', 'county', 'locality',
       'determiner_last_name', 'determiner_first_name', 'collection_prefix'],
      dtype='object')

In [7]:
# subsetting: keep only the columns used by the type charts
teste = NewTable[[
    'min_depth', 'family', 'order', 'start_year', 'qualifier', 'catalog_number',
    'genus', 'species', 'type',
]].copy()

# grouping by type, year and order
# `size()` counts the rows of each group. The previous `count()['family']`
# counted only rows with a non-null `family`, so records missing a family
# were silently left out of `counts`.
temp = (
    teste.groupby(['type', 'start_year', 'order'])
    .size()
    .reset_index(name='counts')
)

# p.s.: Cótipo and Topótipo are not types
temp = temp[~temp['type'].isin(['Cótipo', 'Topótipo'])]

243 info. de tipos

### Gráf. de Tipos

In [8]:
# each order keeps the color assigned to it in `cores_ordem`
order_scale = alt.Scale(domain=list(cores_ordem.keys()), range=list(cores_ordem.values()))

# type rows are ranked by the sum of their counts, largest first
type_by_total = alt.EncodingSortField('counts', op='sum', order='descending')

tipo = (
    alt.Chart(temp, height=150, title='Types per year')
    .mark_circle()
    .encode(
        x=alt.X('start_year:O', title='Sampling Year'),
        y=alt.Y('type:N', title='Type', sort=type_by_total),
        color=alt.Color('order', title='order', scale=order_scale),
        size=alt.Size('counts'),
        tooltip=[
            alt.Tooltip('type', title='type'),
            alt.Tooltip('start_year', title='start year'),
            alt.Tooltip('counts', title='counts'),
        ],
    )
)

# write the chart to an HTML file
tipo.save('./graphs/tipo/tipos_por_ano.html')

tipo

## Types per Genus 

same graph as above, with genus on the Y axis and colored by type

In [9]:
# subsetting: keep only the columns used by the type charts
teste = NewTable[[
    'min_depth', 'family', 'order', 'start_year', 'qualifier', 'catalog_number',
    'genus', 'species', 'type',
]].copy()

# grouping by type, year and genus
# `size()` counts the rows of each group. The previous `count()['family']`
# counted only rows with a non-null `family`, so records missing a family
# were silently left out of `counts`.
temp = (
    teste.groupby(['type', 'start_year', 'genus'])
    .size()
    .reset_index(name='counts')
)

# p.s.: Cótipo and Topótipo are not types
temp = temp[~temp['type'].isin(['Cótipo', 'Topótipo'])]

In [10]:
# type status -> color; the two lists below stay aligned position by position
type_palette = {'Holotype': '#e45756', 'Paratype': '#4c78a8', 'Neotype': '#f58518'}
tipos = list(type_palette)
cores_padrao = list(type_palette.values())

In [11]:
# colors in `cores_padrao` are matched to the statuses listed in `tipos`
type_scale = alt.Scale(range=cores_padrao, domain=tipos)

# genus rows are ranked by the sum of their counts, largest first
genus_by_total = alt.EncodingSortField('counts', op='sum', order='descending')

tipo = (
    alt.Chart(temp, height=300, title='Types per Genus')
    .mark_circle(opacity=0.8)
    .encode(
        x=alt.X('start_year:O', title='Sampling Year'),
        y=alt.Y('genus:N', title='Genus', sort=genus_by_total),
        color=alt.Color('type:N', title='type', scale=type_scale),
        size=alt.Size('counts', scale=alt.Scale()),
        tooltip=[
            alt.Tooltip('type', title='type'),
            alt.Tooltip('start_year', title='start year'),
            alt.Tooltip('counts', title='counts'),
        ],
    )
)

# write the chart to an HTML file
tipo.save('./graphs/tipo/tipos_por_genero.html')

tipo

## Types per family

In [12]:
# subsetting: keep only the columns used by the type charts
teste = NewTable[[
    'min_depth', 'family', 'order', 'start_year', 'qualifier', 'catalog_number',
    'genus', 'species', 'type',
]].copy()

# grouping by type, year and family
# `size()` counts the rows of each group. The previous `count()['genus']`
# counted only rows with a non-null `genus`, so records missing a genus
# were silently left out of `counts`.
temp = (
    teste.groupby(['type', 'start_year', 'family'])
    .size()
    .reset_index(name='counts')
)

# p.s.: Cótipo and Topótipo are not types
temp = temp[~temp['type'].isin(['Cótipo', 'Topótipo'])]

In [13]:
# type status -> color; the two lists below stay aligned position by position
type_palette = {'Holotype': '#e45756', 'Paratype': '#4c78a8', 'Neotype': '#f58518'}
tipos = list(type_palette)
cores_padrao = list(type_palette.values())

In [14]:
# colors in `cores_padrao` are matched to the statuses listed in `tipos`
type_scale = alt.Scale(range=cores_padrao, domain=tipos)

# family rows are ranked by the sum of their counts, largest first
family_by_total = alt.EncodingSortField('counts', op='sum', order='descending')

tipo = (
    alt.Chart(temp, height=300, title='Types per Family')
    .mark_circle(opacity=0.8)
    .encode(
        x=alt.X('start_year:O', title='Sampling Year'),
        y=alt.Y('family:N', title='Family', sort=family_by_total),
        color=alt.Color('type:N', title='type', scale=type_scale),
        size=alt.Size('counts', scale=alt.Scale()),
        tooltip=[
            alt.Tooltip('type', title='type'),
            alt.Tooltip('start_year', title='start year'),
            alt.Tooltip('counts', title='counts'),
        ],
    )
)

# write the chart to an HTML file
tipo.save('./graphs/tipo/tipos_por_familia.html')

tipo

<br>

<font size='6'>Para a base de poliquetas acaba aqui, por enquanto </font>

<br>

#### defining some parameters

# `colors`, `types` and `opacities` are parallel lists: position i of each one
# describes the same type status (rows are laid out the same way in all three).
colors = ['#d62728', '#f58518', '#d95f02',
          '#d62729', '#f58519', '#d95f03',
          '#4daf4a', '#8c6d31',
          '#79706e', '#bab0ac', '#d8b5a5'
         ]

types = ['Holotipo', 'Alotipo', 'Neotipo',
         'Sintipo', 'Lectotipo', 'Paralectotipo',
         'Paratipo', 'Topotipo',
         'Tipo', 'Co-tipo', 'Material tipo'
        ]

# one opacity per type; the list previously carried a 12th value with no
# matching type
opacities = [1, 1, 1,
             0.4, 0.4, 0.4,
             1, 1,
             1, 1, 1]

# guard against the three lists drifting apart again
assert len(types) == len(colors) == len(opacities)

# creating dictionary with type:color
# (a plain dict: looking up an unknown type raises KeyError, exactly as the
# factory-less defaultdict used before did)
type_color = dict(zip(types, colors))

<br>

### ordering per holotype year (year in which the holotype was discovered)

# NOTE(review): `teste1` is not created anywhere in the visible part of this
# notebook — TODO confirm where it should come from before running this section.

# number of records per (determiner, type status, holotype year); `count()['class']`
# counts the non-null `class` values of each group
counts = teste1.groupby(['determiner_first_and_last_name', 'type_status','holotipo_year']).count()['class'].reset_index()
counts.rename(columns={'class':'counts'}, inplace=True)

# attach the `counts` column to every record
# NOTE(review): `teste1` is reassigned from itself, so re-running this cell merges again
teste1 = pd.merge(teste1, counts, on=['determiner_first_and_last_name', 'type_status','holotipo_year'])

# sorting in subsample (again)
# rows that have a determiner name, a type status and a start year
temp = teste1[(~teste1['determiner_first_and_last_name'].isna()) & (~teste1['type_status'].isna())
                      & (~teste1['start_year'].isna())].copy()

# in-place sort; `temp` above was copied before this sort, so it keeps the previous row order
teste1.sort_values(['holotipo_year','determiner_first_and_last_name', 'counts'], inplace=True)

# names ordering for y axis
# (this value is overwritten further down by the holotype-based ordering)
names_ordering = list(
    temp[~temp['determiner_first_and_last_name'].isna()]['determiner_first_and_last_name'].unique()
)

holotipo = teste1[(teste1['type_status'] == 'Holotipo')].copy()  # filtering only holotype
holotipo = holotipo[~holotipo['determiner_first_and_last_name'].isna()].copy()  # making sure names are
#                                                                                nonempty...
# sorting
holotipo.sort_values('holotipo_year', inplace=True)

# sorting per holotipo year
# (unlike the list built above, this one is not de-duplicated)
names_ordering = list(holotipo['determiner_first_and_last_name'].values)

# holotype records per order and start year; replaces the `temp` subsample built above
# NOTE(review): the column merged above is named `counts`; the `count` column read
# here must already exist in `teste1` — TODO confirm
temp = holotipo.groupby(['order','start_year']).count()['count'].reset_index()

# rows whose order is the literal string 'Nan' are left out of the chart
holotypes_known_order = temp[temp['order'] != 'Nan']

# order rows are ranked by their largest `count`, biggest first
order_by_max_count = alt.EncodingSortField(field='count', op='max', order='descending')

g1 = (
    alt.Chart(holotypes_known_order, width=800, height=400,
              title='Number of holotypes of each order per year')
    .mark_circle(size=60, color='red')
    .encode(
        x=alt.X('start_year', type='ordinal', title='Year'),
        y=alt.Y('order', type='nominal', title='Order', sort=order_by_max_count),
        size=alt.Size('count'),
    )
)

# saving graph (exports are currently commented out)
# g1.save(f'./types/counts_per_year/holotypes_per_order.svg')
# g1.save(f'./types/counts_per_year/holotypes_per_order.png')
# g1.save(f'./types/counts_per_year/holotypes_per_order.html')

g1

### separating and grouping types

# every type status present in the data
types = teste1['type_status'].unique()

# statuses charted one by one below; every other status goes to `group_types`
main_types = ['Holotipo', 'Alotipo', 'Paratipo']
group_types = [status for status in types if status not in main_types]

# replicating for ALL main types
for t in main_types:
    # records of the current status that have a determiner name
    temp1 = teste1[teste1['type_status'] == t].copy()
    temp = temp1[temp1['determiner_first_and_last_name'].notna()].copy()

    # determiner names ordered by holotype year
    temp = temp.sort_values('holotipo_year')
    names_ordering = list(temp['determiner_first_and_last_name'].values)

    # grouping and counting per order and year
    temp = temp.groupby(['order', 'start_year']).count()['count'].reset_index()

    ### Chart
    g1 = (
        alt.Chart(temp[temp['order'] != 'Nan'], width=800, height=400,
                  title=f'Number of {t} of each order per year')
        .mark_circle(size=60, color=f'{type_color[t]}')
        .encode(
            x=alt.X('start_year', type='ordinal', title='Year'),
            y=alt.Y('order', type='nominal', title='Order',
                    sort=alt.EncodingSortField(field='count', op='max', order='descending')),
            size=alt.Size('count'),
            # alternative encodings kept for reference:
            #     color= alt.Color('holotipo_year', type='quantitative', scale= alt.Scale(scheme='reds')),
            #     opacity= alt.Opacity(scale= alt.Scale(domain=types,range=opacities), type='quantitative')
            #     size=alt.Size('depth')
        )
    )

    # saving graph (exports are currently commented out)
    # g1.save(f'./types/counts_per_year/{t}_per_order.svg')
    # g1.save(f'./types/counts_per_year/{t}_per_order.png')
    # g1.save(f'./types/counts_per_year/{t}_per_order.html')

# g1

<br>

#### graph of other types

# distinct orders in the data
t = teste1['order'].unique()

# records of the remaining type statuses, leaving out the literal 'Nan' order
has_order = teste1['order'] != 'Nan'
is_other_type = teste1['type_status'].isin(group_types)

# counts per order, year and type status
temp = (
    teste1[has_order & is_other_type]
    .groupby(['order', 'start_year', 'type_status'])
    .count()['count']
    .reset_index()
)

colors = ['']

<font color='red' size='5'>**p.s.:** temporary adjustment!!!! </font>

I'm grouping `Material tipo`, `Topotipo` and `Co-tipo` under `Tipo` (each of those labels is replaced by `Tipo`).

# labels folded into the generic 'Tipo' status, applied one after the other
for old_label in ('Material tipo', 'Topotipo', 'Co-tipo'):
    temp['type_status'] = temp['type_status'].str.replace(old_label, 'Tipo')

**graph**

# making graph for other types (group_types)

# statuses present in the data, each paired with its color from `type_color`
status_domain = list(temp['type_status'].unique())
status_scale = alt.Scale(domain=status_domain,
                         range=[type_color[status] for status in status_domain])

# order rows are ranked by their largest `count`, biggest first
order_by_max_count = alt.EncodingSortField(field='count', op='max', order='descending')

g1 = (
    alt.Chart(temp, width=800, height=400, title='Number of types of each order per year')
    .mark_circle(size=60)
    .encode(
        x=alt.X('start_year', type='ordinal', title='Year'),
        y=alt.Y('order', type='nominal', title='Order', sort=order_by_max_count),
        color=alt.Color('type_status', title='type', scale=status_scale),
        size=alt.Size('count', type='quantitative', scale=alt.Scale(type='bin-ordinal')),
    )
)

# saving graph (exports are currently commented out)
# g1.save(f'./types/counts_per_year/other-types_per_order.svg')
# g1.save(f'./types/counts_per_year/other-types_per_order.png')
# g1.save(f'./types/counts_per_year/other-types_per_order.html')

g1

<br>

## New proposal

compare: year in which the holotype was discovered vs. year in which it was first cataloged in the Museum

# p.s.: there are overlapping points (it's affecting opacity)
# Axis titles follow the notebook's own definition: `holotipo_year` is the year in
# which the holotype was discovered, `start_year` is the start year. The two titles
# were previously attached to the opposite axes.
g1 = alt.Chart(holotipo, title='Holotypes discovered and determined years',
               width=600, height=500).mark_circle(size=60).encode(
    x= alt.X('start_year', type='ordinal', title='Start Year'),
    # most recent holotype year at the top
    y= alt.Y('holotipo_year', type='ordinal', title='Discovered Year',
            sort= alt.EncodingSortField(field='holotipo_year', order='descending')),
    tooltip= alt.Tooltip(['holotipo_year', 'start_year']),
    color= alt.Color('order', type='nominal', scale= alt.Scale(scheme='reds')), 
#     opacity= alt.Opacity(scale= alt.Scale(domain=types,range=opacities), type='quantitative')
#     size=alt.Size('depth')
)

# saving graph (exports are currently commented out)
# g1.save(f'./types/holotipo_discoreved_vs_start_year.svg')
# g1.save(f'./types/holotipo_discoreved_vs_start_year.png')
# g1.save(f'./types/holotipo_discoreved_vs_start_year.html')

g1

<br>

**The end!**

-----