# Database Exploration

By **Franklin Oliveira**

-----

This notebook contains some code that I (Franklin) wrote to get acquainted with the `repteis` database. Here you'll find some basic data treatment and adjustments that proved necessary as I started to understand the nature of the information in the file <font color='blue'>'Compilacao Livros Repteis - 2 a 10 - 2020_04_28.xls'</font>.

In [1]:
import datetime
import numpy as np
import pandas as pd

from collections import defaultdict

# pacotes para visualização rápida
import seaborn as sns
import matplotlib.pyplot as plt

# pacote para visualização principal
import altair as alt

# habilitando renderizador para notebook
# alt.renderers.enable('notebook')
# alt.renderers.enable('default')


# disabling Altair's maximum-rows limit on chart data
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

## Importing data...

Importing the data pre-treated in `1-data_treatment.ipynb`. In this notebook, I'm only making some minor adjustments for visualization purposes. For a full traceback of the data treatment, please see the `1-data_treatment` notebook.

In [2]:
# Load the pre-treated table: a semicolon-separated, UTF-8 encoded CSV file.
NewTable = pd.read_csv(
    './data/treated_db.csv',
    sep=';',
    encoding='utf-8',
    low_memory=False,
)

<br>

<font size=5>**Color palette**</font>

<!-- <img src="./src/paleta_cores.jpeg" width='500px'> -->

In [3]:
# importing customized color palettes
from src.MNViz_colors import *

<br>

---

## Graphs

### Total amount of catalogations per year

x: Start Year (from Start Date)
y: number of catalogations per year

-----

## Altitude per family

In [4]:
# Working table for the "altitude per family" charts.
#
# Steps, in order:
#   1. keep only the columns the charts and tooltips use;
#   2. drop records without an altitude;
#   3. coerce altitude to float BEFORE sorting, so the sort is always numeric
#      (sorting an object-dtype column first would order values as text or
#      fail on mixed types);
#   4. sort by altitude, then family;
#   5. discard outliers: only altitudes below 7000 (meters, per the chart
#      axis title) are kept.
teste = (
    NewTable[['altitude', 'familia', 'ordem', 'subordem', 'ano_coleta', 'qualificador_atual',
              'numero_catalogo', 'genero_atual', 'especie_atual', 'subespecie_atual']]
    .dropna(subset=['altitude'])
    .astype({'altitude': float})
    .sort_values(['altitude', 'familia'])
)

# removing outliers
teste = teste[teste['altitude'] < 7000].copy()

<br>

**Note:** still using the old palette (per order)

In [8]:
# database: every record except those whose family is the "#n/d" placeholder
db = teste.loc[teste['familia'] != "#n/d"]

# aux. variables: order names and their colors, index-aligned
# (later cells read `ordens` and `cores`)
ordens = [*cores_ordem.keys()]
cores = [*cores_ordem.values()]

# encoding channels, built one at a time for readability
family_axis = alt.X(
    'familia', type='nominal', title='Family',
    # families are ordered by their highest recorded altitude
    sort=alt.EncodingSortField('altitude', op='max', order='ascending'),
)
altitude_axis = alt.Y('altitude', type='quantitative', title='Altitude (in meters)')
order_color = alt.Color('ordem', title='Order',
                        scale=alt.Scale(domain=ordens, range=cores))
hover_fields = alt.Tooltip(['numero_catalogo', 'genero_atual', 'especie_atual', 'subespecie_atual',
                            'qualificador_atual', 'ano_coleta', 'altitude'])

# one circle per record, colored by order
temp = (
    alt.Chart(db, title='Altitude per family')
    .mark_circle()
    .encode(x=family_axis, y=altitude_axis, color=order_color, tooltip=hover_fields)
)

# optional exports / display (kept disabled):
# temp.facet(column='ordem').configure_title(fontSize=16).configure_axis(
#     labelFontSize=12,
#     titleFontSize=12
# ).configure_legend(
#     labelFontSize=12,
#     titleFontSize=12
# ).save('./graphs/altitude/altitude_per_family-faceted.html')

# temp.save('./graphs/altitude/altitude_per_family-order_palette.html')
# temp

#### Filtering only order `Squamata`

In [10]:
# database: Squamata records with a known family ("#n/d" placeholder excluded)
is_squamata = teste['ordem'] == 'Squamata'
has_family = teste['familia'] != "#n/d"
db = teste[is_squamata & has_family]

# encoding channels
family_axis = alt.X(
    'familia', type='nominal', title='Family',
    # families are ordered by their highest recorded altitude
    sort=alt.EncodingSortField('altitude', op='max', order='ascending'),
)
altitude_axis = alt.Y('altitude', type='quantitative', title='Altitude (in meters)')
hover_fields = alt.Tooltip(['numero_catalogo', 'genero_atual', 'especie_atual', 'subespecie_atual',
                            'ordem', 'subordem',
                            'qualificador_atual', 'ano_coleta', 'altitude'])

# single-color chart: every circle uses the first color of the order palette
# (`cores` comes from the previous chart cell)
alt_squam = (
    alt.Chart(db, title='Altitude per family (Squamata)')
    .mark_circle(color=cores[0])
    .encode(x=family_axis, y=altitude_axis, tooltip=hover_fields)
    .configure_title(fontSize=16)
    .configure_axis(labelFontSize=12, titleFontSize=12)
    .configure_legend(labelFontSize=12, titleFontSize=12)
)

# alt_squam.save('./graphs/altitude/altitude_per_family-squamata.html')
# alt_squam

<br>

### New Palette (per family)

In [29]:
# database: records with a known family ("#n/d" placeholder excluded)
db = teste.loc[teste['familia'] != "#n/d"]

# aux. variables
# (disabled alternative: restrict the palette to the families present in the data)
# familias = [f for f in cores_familia.keys() if f in teste['familia'].unique()]
# cores_temp = [cores_familia[f] for f in familias]

# families listed from the lowest to the highest maximum altitude
x_labels = db.sort_values('altitude', ascending=False)['familia'].unique()[::-1]
# altitude range of the plotted data
y_max = db['altitude'].max()
y_min = db['altitude'].min()

# selector: clicking legend entries filters the chart by family
select_family = alt.selection_multi(fields=['familia'], bind='legend')

# encoding channels
family_axis = alt.X(
    'familia', type='nominal', title='Family',
    scale=alt.Scale(domain=x_labels),
    sort=alt.EncodingSortField('altitude', op='max', order='ascending'),
)
altitude_axis = alt.Y(
    'altitude', type='quantitative', title='Altitude (in meters)',
    # the axis starts at zero, or lower if there are negative altitudes
    scale=alt.Scale(domain=[min(y_min, 0), y_max]),
)
family_color = alt.Color(
    'familia:N', title='Family',
    legend=alt.Legend(columns=2, symbolLimit=50),
    # full family palette: every family in `cores_familia` gets a legend entry
    scale=alt.Scale(domain=list(cores_familia.keys()),
                    range=list(cores_familia.values())),
)
hover_fields = alt.Tooltip(['numero_catalogo', 'genero_atual', 'especie_atual', 'subespecie_atual',
                            'qualificador_atual', 'ano_coleta', 'altitude'])

g = (
    alt.Chart(db, title='Altitude per family', width=500)
    .mark_circle()
    .encode(x=family_axis, y=altitude_axis, color=family_color, tooltip=hover_fields)
    .add_selection(select_family)
    .transform_filter(select_family)
)

### Faceted version (per order)
# g.facet(column='ordem').configure_title(fontSize=16).configure_axis(
#     labelFontSize=12,
#     titleFontSize=12
# ).configure_legend(
#     labelFontSize=12,
#     titleFontSize=12
# ).save('./graphs/altitude/altitude_per_family-faceted.html')

# font sizes for title, axes and legend
g = (
    g.configure_title(fontSize=16)
    .configure_axis(labelFontSize=12, titleFontSize=12)
    .configure_legend(labelFontSize=12, titleFontSize=12)
)

# g.save('./graphs/altitude/altitude_per_family.html')
# g

In [25]:
# database - only order Squamata, with a known family ("#n/d" placeholder excluded)
is_squamata = teste['ordem'] == 'Squamata'
db = teste[is_squamata & (teste['familia'] != "#n/d")]

# aux. variables
# palette restricted to the families that occur among Squamata records,
# in the order they appear in `cores_familia`
squamata_families = teste.loc[is_squamata, 'familia'].unique()
familias = [f for f in cores_familia.keys() if f in squamata_families]
cores_temp = [cores_familia[f] for f in familias]
# families listed from the lowest to the highest maximum altitude
x_labels = db.sort_values('altitude', ascending=False)['familia'].unique()[::-1]
# altitude range of the plotted data
y_max = db['altitude'].max()
y_min = db['altitude'].min()

# selector: clicking legend entries filters the chart by family
select_family = alt.selection_multi(fields=['familia'], bind='legend')

# encoding channels
family_axis = alt.X(
    'familia', type='nominal', title='Family',
    scale=alt.Scale(domain=x_labels),
    sort=alt.EncodingSortField('altitude', op='max', order='ascending'),
)
altitude_axis = alt.Y(
    'altitude', type='quantitative', title='Altitude (in meters)',
    # the axis starts at zero, or lower if there are negative altitudes
    scale=alt.Scale(domain=[min(y_min, 0), y_max]),
)
family_color = alt.Color(
    'familia:N', title='Family',
    legend=alt.Legend(columns=1, symbolLimit=50),
    scale=alt.Scale(domain=familias, range=cores_temp),
)
hover_fields = alt.Tooltip(['numero_catalogo', 'genero_atual', 'especie_atual', 'subespecie_atual',
                            'ordem', 'subordem',
                            'qualificador_atual', 'ano_coleta', 'altitude'])

# chart (font sizes for title, axes and legend are set at the end of the chain)
alt_squam = (
    alt.Chart(db, title='Altitude per family (Squamata)', width=500)
    .mark_circle(color=cores[0])
    .encode(x=family_axis, y=altitude_axis, color=family_color, tooltip=hover_fields)
    .add_selection(select_family)
    .transform_filter(select_family)
    .configure_title(fontSize=16)
    .configure_axis(labelFontSize=12, titleFontSize=12)
    .configure_legend(labelFontSize=12, titleFontSize=12)
)

# alt_squam.save('./graphs/altitude/altitude_per_family-squamata.html')
# alt_squam

<br>

## Altitude per genus

In [9]:
# Working table for the "altitude per genus" charts, built in a single chain:
# select columns -> altitude as float -> sort -> drop missing altitudes.
teste = (
    NewTable[['altitude', 'especie_atual', 'genero_atual', 'ordem', 'subordem',
              'familia', 'ano_coleta', 'qualificador_atual', 'numero_catalogo', 'subespecie_atual']]
    .copy()
    # making sure altitude is a floating point number
    .astype({'altitude': float})
    # sorting
    .sort_values(['altitude', 'genero_atual'])
    # dropping na
    .dropna(subset=['altitude'])
)

# removing outlier: only altitudes below 7000 are kept
teste = teste.loc[teste['altitude'] < 7000].copy()

In [24]:
# database: records whose order is known (null orders are treated as outliers)
db = teste[teste['ordem'].notna()]

# aux. variables
# genera listed from the lowest to the highest maximum altitude
x_labels = db.sort_values('altitude', ascending=False)['genero_atual'].unique()[::-1]
# altitude range of the plotted data
y_max = db['altitude'].max()
y_min = db['altitude'].min()

# selector: clicking legend entries filters the chart by family
select_family = alt.selection_multi(fields=['familia'], bind='legend')

# encoding channels
genus_axis = alt.X(
    'genero_atual', type='nominal', title='Genus',
    scale=alt.Scale(domain=x_labels),
    # x-axis ordered by the maximum altitude of each genus
    sort=alt.EncodingSortField('altitude', op="max", order="ascending"),
)
altitude_axis = alt.Y(
    'altitude', type='quantitative', title='Altitude (in meters)',
    # extending the axis below zero shows whether there are specimens below sea level
    scale=alt.Scale(domain=[min(y_min, 0), y_max]),
)
family_color = alt.Color(
    'familia:N', title='Family',
    legend=alt.Legend(columns=2, symbolLimit=50),
    scale=alt.Scale(domain=list(cores_familia.keys()), range=list(cores_familia.values())),
)
hover_fields = alt.Tooltip(['numero_catalogo', 'genero_atual', 'especie_atual', 'subespecie_atual',
                            'ordem', 'subordem',
                            'qualificador_atual', 'ano_coleta', 'altitude'])

g = (
    alt.Chart(db, title='Altitude per genus', width=900, height=400)
    .mark_circle()
    .encode(x=genus_axis, y=altitude_axis, color=family_color, tooltip=hover_fields)
    .add_selection(select_family)
    .transform_filter(select_family)
)

# font sizes for title, axes and legend
g = (
    g.configure_title(fontSize=16)
    .configure_axis(labelFontSize=12, titleFontSize=12)
    .configure_legend(labelFontSize=12, titleFontSize=12)
)

# g.save('./graphs/altitude/genus/altitude-per-genus.html')
# g

In [25]:
# expert input: altitude is not meaningful for crocodilians
# separating per order: one chart is built and saved for each of the first two
# entries of `ordens` (presumably the crocodilian order comes after them in
# the palette - verify against `cores_ordem`)
for j in range(2):
    # records of the current order only; a null order never equals a name, so
    # no separate null check is needed
    db_ordem = teste[teste['ordem'] == ordens[j]]

    # nothing to plot (and no valid axis range) for an order without records
    if db_ordem.empty:
        continue

    # defining color channel domain and range, respectively: the families that
    # occur in this order, in the order they appear in `cores_familia`
    familias_ordem = set(db_ordem['familia'].unique())
    familias = [f for f in cores_familia.keys() if f in familias_ordem]
    cores_temp = [cores_familia[f] for f in familias]

    # aux. variables - computed from the records of THIS order, so the x axis
    # lists only genera that are plotted and the y axis fits the plotted data
    # genera listed from the lowest to the highest maximum altitude
    x_labels = db_ordem.sort_values('altitude', ascending=False)['genero_atual'].unique()[::-1]
    y_max = db_ordem['altitude'].max()
    y_min = db_ordem['altitude'].min()

    # selector: clicking legend entries filters the chart by family
    select_family = alt.selection_multi(fields=['familia'], bind='legend')

    temp = alt.Chart(db_ordem, width=800,
                     title=f'Altitude per genus ({ordens[j]})').mark_circle(color= cores[j]).encode(
        x = alt.X('genero_atual', type='nominal', title='Genus',
                  scale= alt.Scale(domain= x_labels),
                  sort=alt.EncodingSortField('altitude', op="max", order="ascending")),
        y = alt.Y('altitude', type='quantitative', title='Altitude (in meters)',
                  # the axis starts at zero, or lower if there are negative altitudes
                  scale= alt.Scale(domain= [min(y_min, 0), y_max])),
        color= alt.Color('familia:N', title='Family',
                         legend= alt.Legend(columns=1, symbolLimit=42),
                         scale= alt.Scale(domain= familias, range= cores_temp)),
        tooltip = alt.Tooltip(['numero_catalogo','especie_atual','subespecie_atual',
                               'ordem', 'subordem',
                               'qualificador_atual', 'ano_coleta','altitude'])
    ).add_selection(select_family).transform_filter(select_family)

    # font sizes for title, axes and legend
    temp = temp.configure_title(fontSize=16).configure_axis(
                labelFontSize=12,
                titleFontSize=12
            ).configure_legend(
                labelFontSize=12,
                titleFontSize=12
            )
    # one HTML file per order
    temp.save(f'./graphs/altitude/genus/altitude_per_genus-{ordens[j]}.html')

# temp

<br>

## Separating per variance groups

**p.s.:** in this case, variance doesn't make much sense for groups with only one or two observations. So I'm separating per "density" (counting) groups as a proxy. 

p.s.: I ended up separating by number of points (i.e., more and less represented genera)

In [26]:
# expert input: altitude makes more sense for the order Squamata only
squamata = teste.loc[teste['ordem'] == 'Squamata'].copy()
squamata['altitude'] = squamata['altitude'].astype(float)

# counting records per genus: one row per genus, with the count in `counts`
sort = (
    squamata.groupby('genero_atual')['ordem']
    .count()
    .rename('counts')
    .reset_index()
)

In [27]:
# a genus with at most `threshold` records goes to the less represented group
threshold = 2

# True for genera with MORE than `threshold` records (i.e., counts > 2)
is_well_represented = sort['counts'] > threshold

# more variability (counts > threshold)
grupo1 = sort.loc[is_well_represented, 'genero_atual']

# less variability (counts <= threshold)
grupo2 = sort.loc[~is_well_represented, 'genero_atual']

#### higher variance group

genera with more than 2 points (i.e., at least 3)

In [30]:
# database: Squamata records of the better represented genera (`grupo1`)
db_grupo1 = squamata[squamata['genero_atual'].isin(grupo1)]

# defining color channel domain and range, respectively: the families that
# occur in this group, in the order they appear in `cores_familia`
familias_grupo1 = set(db_grupo1['familia'].unique())
familias = [f for f in cores_familia.keys() if f in familias_grupo1]
cores_temp = [cores_familia[f] for f in familias]

# aux. variables - computed from the plotted records, so the x axis lists only
# genera that are plotted and the y axis fits the plotted data
# genera listed from the lowest to the highest maximum altitude
x_labels = db_grupo1.sort_values('altitude', ascending=False)['genero_atual'].unique()[::-1]
y_max = db_grupo1['altitude'].max()
y_min = db_grupo1['altitude'].min()

# selector: clicking legend entries filters the chart by family
select_family = alt.selection_multi(fields=['familia'], bind='legend')

# ordering x-axis per maximum altitude of each genus
g = alt.Chart(db_grupo1, width= 800,
                 title='Altitude per genus (Squamata)').mark_circle(color= cores[0]).encode(
    x = alt.X('genero_atual', type='nominal', title='Genus',
              scale= alt.Scale(domain= x_labels),
              sort=alt.EncodingSortField('altitude', op='max', order="ascending")),
    y = alt.Y('altitude', type='quantitative', title='Altitude (in meters)',
              # the axis starts at zero, or lower if there are negative altitudes
              scale= alt.Scale(domain= [min(y_min, 0), y_max])),
    color = alt.Color('familia:N', title='Family',
                      legend= alt.Legend(columns=1, symbolLimit=42),
                      scale= alt.Scale(domain= familias, range= cores_temp)),
    tooltip = alt.Tooltip(['numero_catalogo','especie_atual','subespecie_atual',
                           'ordem', 'subordem', 'familia',
                           'qualificador_atual', 'ano_coleta','altitude'])
).add_selection(select_family).transform_filter(select_family)

# font sizes for title, axes and legend
g = g.configure_title(fontSize=16).configure_axis(
    labelFontSize=12,
    titleFontSize=12
).configure_legend(
    labelFontSize=12,
    titleFontSize=12
)

# g.save('./graphs/altitude/genus/altitude_per_genus-higher-var.html')
# g

#### lower variance group

less represented genera (at most 2 points)

In [31]:
# cannot be used as a graphical mark (too many missing values)
teste['qualificador_atual'].value_counts(dropna=False)

NaN     625
sp.      10
cf.       5
aff.      5
c.        1
Name: qualificador_atual, dtype: int64

In [35]:
# database: Squamata records of the less represented genera (`grupo2`)
db_grupo2 = squamata[squamata['genero_atual'].isin(grupo2)]

# filtering - defining color channel's domain and range, respectively: the
# families that occur in this group, in the order they appear in `cores_familia`
familias_grupo2 = set(db_grupo2['familia'].unique())
familias = [f for f in cores_familia.keys() if f in familias_grupo2]
cores_temp = [cores_familia[f] for f in familias]

# aux. variables - computed from the plotted records, so the x axis lists only
# genera that are plotted and the y axis fits the plotted data
# genera listed from the lowest to the highest maximum altitude
x_labels = db_grupo2.sort_values('altitude', ascending=False)['genero_atual'].unique()[::-1]
y_max = db_grupo2['altitude'].max()
y_min = db_grupo2['altitude'].min()

# selector: clicking legend entries filters the chart by family
select_family = alt.selection_multi(fields=['familia'], bind='legend')

# ordering x-axis per maximum altitude of each genus
g = alt.Chart(db_grupo2, title='Altitude per genus (Squamata)',
                width=800).mark_circle(color= cores[0]).encode(
    x = alt.X('genero_atual', type='nominal', title='Genus',
              scale= alt.Scale(domain= x_labels),
              sort=alt.EncodingSortField('altitude', op="max", order="ascending")),
    y = alt.Y('altitude', type='quantitative', title='Altitude (in meters)',
              # the axis starts at zero, or lower if there are negative altitudes
              scale= alt.Scale(domain= [min(y_min, 0), y_max])),
    color = alt.Color('familia:N', title='Family',
                  legend= alt.Legend(columns=1, symbolLimit=42),
                  scale= alt.Scale(domain= familias, range= cores_temp)),
    tooltip = alt.Tooltip(['numero_catalogo','especie_atual','subespecie_atual',
                           'ordem', 'subordem',
                           'qualificador_atual', 'ano_coleta','altitude'])
).add_selection(select_family).transform_filter(select_family)

# font sizes for title, axes and legend
g = g.configure_title(fontSize=16).configure_axis(
    labelFontSize=12,
    titleFontSize=12
).configure_legend(
    labelFontSize=12,
    titleFontSize=12
)

# g.save('./graphs/altitude/genus/altitude_per_genus-lower-var.html')
# g

<br>

**That's it!**

-----