# Cumulative Counts per researcher & family

By **Franklin Oliveira**

-----
This notebook contains all the code needed to build the cumulative-count line charts for the `poliqueta` database. It covers some basic data treatment and the code for each chart.

Database: <font color='blue'>'IBUFRJ27.07.2020 - visualização.xlsx'</font> and <font color='blue'>'MNRJP27.07.2020 - visualização.xls'</font>.

In [1]:
import datetime
import numpy as np
import pandas as pd

from collections import defaultdict

# quick visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Waffle Charts
# from pywaffle import Waffle 
# docs: https://pywaffle.readthedocs.io/en/latest/examples/block_shape_distance_location_and_direction.html

# visualization
import altair as alt

# enabling notebook renderer
# alt.renderers.enable('notebook')
# alt.renderers.enable('default')

# disabling Altair's max-rows check so the full tables built below can be passed to alt.Chart
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

## Importing data...

In [2]:
# Read the treated database: semicolon-separated, UTF-8 with BOM.
NewTable = pd.read_csv(
    './data/treated_db.csv',
    sep=';',
    encoding='utf-8-sig',
    low_memory=False,
)

## Filtering

At least for now, we'll consider only specimens of the order Decapoda (thoroughly reviewed by the Museum's staff).

In [3]:
# Boolean mask of the rows whose order is Decapoda; the subset is copied so that
# later column assignments do not write back into NewTable.
is_decapoda = NewTable['order'] == 'Decapoda'
decapoda = NewTable.loc[is_decapoda].copy()

<br>

<font size=5>**Color palette**</font>

Colors (per infraorder): 

- <font color='#e26d67'><b>Ascacidae</b></font>
- <font color='#007961'><b>Anomura</b></font>
- <font color='#7a2c39'><b>Achelata</b></font>
- <font color='#b67262'><b>Axiidea</b></font>
- <font color='#ee4454'><b>Brachyura</b></font>
- <font color='#3330b7'><b>Caridea</b></font>
- <font color='#58b5e1'><b>Gebiidea</b></font>
- <font color='#b8e450'><b>Stenopodidea</b></font>
- <font color='#a0a3fd'><b>Astacidae</b></font>
- <font color='#deae9e'><b>Polychelida</b></font>
- <font color='#d867be'><b>Grapsidae</b></font>
- <font color='#fece5f'><b>Xanthoidea</b></font>

In [4]:
# importing customized color palettes
# NOTE(review): star import -- the only name this notebook appears to rely on from it is
# `cores_familia_naive` (used for the colour scale of the family chart); confirm and
# consider importing it explicitly.
from src.MNViz_colors import *

In [5]:
# p.s.: there are strings and numbers (floats) in these columns
# decapoda['det_year'].unique()
# decapoda['start_year'].unique()

In [6]:
def convert2int(y):
    """Return ``int(y)`` when ``y`` can be converted, otherwise ``y`` unchanged.

    Used to turn float-like years (e.g. ``1999.0``) into integers while leaving
    missing values and non-numeric entries untouched.

    Parameters
    ----------
    y : any
        Value to convert (number, string, NaN, None, ...).

    Returns
    -------
    int or the original value
        ``int(y)`` on success; ``y`` itself when ``int()`` raises ``TypeError``
        (e.g. ``None``), ``ValueError`` (e.g. NaN or a non-numeric string) or
        ``OverflowError`` (e.g. infinity).
    """
    try:
        return int(y)
    # only conversion failures are swallowed; anything else (KeyboardInterrupt,
    # programming errors) is allowed to propagate
    except (TypeError, ValueError, OverflowError):
        return y
    
# Normalise both year columns the same way: integers where possible, then
# forced to string so that NaNs are kept.
for year_col in ('det_year', 'start_year'):
    decapoda[year_col] = decapoda[year_col].apply(convert2int).astype(str)

<br>


## Graphs

---

<br>

### Creating chart: counts per determiner per year

To calculate the cumulative counts, we need to consider ALL determiner columns; in this case:

    - 'determiner_full_name'
    - 'determiner_full_name2'

In [7]:
determiner_columns = ['determiner_full_name', 'determiner_full_name2']

# names of all determiners (first and second column)
# NOTE(review): as before, only the first column is stripped; the following cells match
# these names by equality against the raw column values, so the values are not altered further.
determiners = set(decapoda[determiner_columns[0]].str.strip())
for det_col in determiner_columns[1:]:
    determiners.update(decapoda[det_col])

# removing missing values and parsing into a list.
# The test is an exact one: a substring check on 'nan' would also discard real
# names that merely contain those letters (e.g. "Fernando", "Renan").
determiners = [name for name in determiners if pd.notna(name) and str(name) != 'nan']

In [8]:
# Long-format table with one row per (record, determiner column).
# p.s.: kingdom is a non-empty column used only for counting
name_col = determiner_columns[0]

# each determiner column is sliced together with the year and relabelled to the
# first column's name, so that all slices can be stacked into one frame
stacked = []
for det_col in determiner_columns:
    piece = decapoda[[det_col, 'det_year', 'kingdom']].copy()
    piece.columns = [name_col, 'det_year', 'kingdom']
    stacked.append(piece)

df = pd.concat(stacked)

# parsing columns into strings so we don't lose information while grouping
df = df.astype({name_col: str, 'det_year': str})

In [9]:
# number of records per (determiner, year); 'kingdom' is only the column being counted
grouped = (
    df.groupby(['determiner_full_name', 'det_year'])
      .count()
      .reset_index()
      .rename(columns={'kingdom': 'count'})
      .sort_values('det_year')  # years are strings here, so this is a lexicographic sort
)

# keep only the known determiners (this leaves out the 'nan' placeholder group) ...
counts = grouped[grouped['determiner_full_name'].isin(determiners)].copy()

# ... and accumulate each determiner's yearly counts in year order.
# A single grouped cumsum replaces the per-name loop that grew `counts` with
# pd.concat, and keeps the columns in place even when no determiner is selected.
counts['cumulative_sum'] = counts.groupby('determiner_full_name')['count'].cumsum()

In [10]:
# temporary adjustment for axis labels only: keep what comes before the first '.'
counts['det_year'] = counts['det_year'].map(lambda year: str(year).partition('.')[0])

### Chart: cumulative counts per determiner

In [12]:
# hovering selects the determiner whose point is nearest to the cursor
select = alt.selection(type='single', on='mouseover', nearest=True, fields=['determiner_full_name'])

# shared encodings for both layers.
# The y encoding no longer carries a sort definition: it pointed at a field named
# 'counts', which this table does not have (the column is 'count'), and a sort-field
# object is meant for discrete axes, not for a quantitative one.
# Opacity is set per layer below, so no base value is needed.
base = alt.Chart(counts, title='Cumulative contribution of each determiner', width=1000,
              height=500).encode(
    x= alt.X('det_year', type="ordinal", title='Determination Year'),
    y= alt.Y('cumulative_sum', title=''),
    color= alt.Color('determiner_full_name:N', title='', legend=None,
                    scale= alt.Scale(scheme='warmgreys')),
)

# semi-transparent points: they carry the tooltip and host the hover selection
points = base.mark_circle(size=40).encode(
    opacity=alt.value(0.5),
    tooltip= alt.Tooltip(['determiner_full_name','det_year','count','cumulative_sum'])
).add_selection(
    select
)

# one line per determiner; the selected one is drawn thicker
front = base.mark_line(point=True).encode(
    size=alt.condition(~select, alt.value(1), alt.value(4)),
    opacity= alt.value(1),
)

g1 = points + front

g1 = g1.configure_title(fontSize=16).configure_axis(
    labelFontSize=12,
    titleFontSize=12
).configure_legend(
    labelFontSize=12,
    titleFontSize=12
)

# g1.save('./graphs/cumCounts/determiner/cumCount-per-year.html')
# g1

<br>

### Collector's cumulative contribution per year

To calculate the cumulative counts, we need to consider ALL collector columns; in this case:

    - 'collector_full_name'
    - 'collector_full_name2' ... 'collector_full_name7'

In [13]:
collector_columns = ['collector_full_name', 'collector_full_name2', 
                     'collector_full_name3', 'collector_full_name4', 
                     'collector_full_name5', 'collector_full_name6', 'collector_full_name7']

# names of all collectors (first through seventh column)
# NOTE(review): as before, only the first column is stripped; the following cells match
# these names by equality against the raw column values, so the values are not altered further.
collectors = set(decapoda[collector_columns[0]].str.strip())

for col in collector_columns[1:]:
    collectors.update(decapoda[col])

# removing missing values and parsing into a list.
# The test is an exact one: a substring check on 'nan' would also discard real
# names that merely contain those letters (e.g. "Fernando", "Renan").
collectors = [name for name in collectors if pd.notna(name) and str(name) != 'nan']

In [14]:
# Long-format table with one row per (record, collector column).
# p.s.: kingdom is a non-empty column used only for counting
first_collector_col = collector_columns[0]
long_columns = [first_collector_col, 'start_year', 'kingdom']


def _collector_slice(col):
    # one collector column plus year and counting column, relabelled so that
    # every slice shares the first collector column's name
    part = decapoda[[col, 'start_year', 'kingdom']].copy()
    part.columns = long_columns
    return part


# concatenating all columns into just one to make grouping per collector and year easier
df = pd.concat([_collector_slice(col) for col in collector_columns])

# parsing columns into strings so we don't lose information while grouping
df[first_collector_col] = df[first_collector_col].astype(str)
df['start_year'] = df['start_year'].astype(str)

In [15]:
# number of records per (collector, year); 'kingdom' is only the column being counted
grouped = (
    df.groupby([collector_columns[0], 'start_year'])
      .count()
      .reset_index()
      .rename(columns={'kingdom': 'count'})
      .sort_values('start_year')  # years are strings here, so this is a lexicographic sort
)

# keep only the known collectors (this leaves out the 'nan' placeholder group) ...
counts = grouped[grouped[collector_columns[0]].isin(collectors)].copy()

# ... and accumulate each collector's yearly counts in year order.
# A single grouped cumsum replaces the per-name loop that grew `counts` with
# pd.concat, and keeps the columns in place even when no collector is selected.
counts['cumulative_sum'] = counts.groupby(collector_columns[0])['count'].cumsum()

In [16]:
# temporary adjustment for axis labels only: keep what comes before the first '.'
counts['start_year'] = counts['start_year'].map(lambda year: str(year).partition('.')[0])

### Chart: cumulative counts per collector

In [18]:
# hovering selects the collector whose point is nearest to the cursor
select = alt.selection(type='single', on='mouseover', nearest=True, fields=['collector_full_name'])

# shared encodings for both layers.
# The y encoding no longer carries a sort definition: it pointed at a field named
# 'counts', which this table does not have (the column is 'count'), and a sort-field
# object is meant for discrete axes, not for a quantitative one.
base = alt.Chart(counts, title='Cumulative contribution of each Collector', width=800,
              height=400).encode(
    x= alt.X('start_year', type="ordinal", title='Collecting Year'),
    y= alt.Y('cumulative_sum', title=''),
    color= alt.Color('collector_full_name:N', title='', legend=None,
                     scale= alt.Scale(scheme='warmgreys')),
)

# semi-transparent points: they carry the tooltip and host the hover selection
points = base.mark_circle(size=40).encode(
    opacity=alt.value(0.5),
    tooltip= alt.Tooltip(['collector_full_name','start_year','cumulative_sum'])
).add_selection(
    select
)

# one line per collector; the selected one is drawn thicker
front = base.mark_line(point=True).encode(
    size=alt.condition(~select, alt.value(1), alt.value(3)),
    opacity= alt.value(1),
)

g1 = points + front

g1 = g1.configure_title(fontSize=16).configure_axis(
    labelFontSize=12,
    titleFontSize=12
).configure_legend(
    labelFontSize=12,
    titleFontSize=12
)

# g1.save('./graphs/cumCounts/collector/cumCount-per-year.html')
# g1

<br>

## Cumulative counts per Family


In [19]:
# Records per (year, family): non-null 'class' entries are counted in each group,
# then the rows are ordered by family and, within a family, by year.
teste = (
    decapoda.groupby(['start_year', 'family'])['class']
            .count()
            .reset_index(name='counts')
            .sort_values(['family', 'start_year'])
)

In [20]:
# running total of the yearly counts inside each family, following the current
# row order of `teste` (family, then year)
teste['cumulative_sum'] = teste.groupby('family')['counts'].cumsum()

### Chart: per collected year

In [21]:
# searching back for infraorder of each family.
# One lookup table is built in a single pass (first row of each family, i.e. the same
# value `.unique()[0]` would return) instead of filtering the whole of `decapoda`
# once per row of `teste`.
family_to_infraorder = (
    decapoda.drop_duplicates(subset='family')
            .set_index('family')['infraorder']
)

teste['infraorder'] = teste['family'].map(family_to_infraorder)

In [23]:
# filtering out some families lost while grouping
# familias = [f for f in cores_familia_naive.keys() if f in teste['family'].unique()]
# cores_temp = [cores_familia_naive[f] for f in familias] 


# selector: clicking legend entries chooses which families are shown
select_family = alt.selection_multi(fields=['family'], bind='legend')

# aux. variables for encoding fields
x_labels = teste.sort_values('start_year')['start_year'].unique()  # fixed x domain, in sorted order
y_max = teste['cumulative_sum'].max()                               # fixed upper bound of the y axis

# The y encoding no longer carries a sort definition: a sort-field object is meant
# for discrete axes, and this axis is quantitative with an explicit domain.
g1 = alt.Chart(teste, title='Cumulative evolution of each family', 
               width=800, height=500).mark_line(point=True).encode(
    x= alt.X('start_year', type="ordinal", title='Sampling Year',
             scale= alt.Scale(domain= x_labels)),
    y= alt.Y('cumulative_sum', title='', 
             scale= alt.Scale(domain= [0,y_max])),
    # colour scale pairs every key of cores_familia_naive with its value
    color= alt.Color('family:N', title='Family',
                     legend= alt.Legend(columns=3, symbolLimit=102),
                     scale=alt.Scale(domain=list(cores_familia_naive.keys()), 
                                     range= list(cores_familia_naive.values()))),
    tooltip= alt.Tooltip(['infraorder','family','start_year','counts', 'cumulative_sum']),
    opacity= alt.condition(select_family, alt.value(1), alt.value(0))
).add_selection(select_family).transform_filter(select_family)

g1 = g1.configure_title(fontSize=16).configure_axis(
    labelFontSize=12,
    titleFontSize=12
).configure_legend(
    labelFontSize=12,
    titleFontSize=12
)

# g1.save('./graphs/cumCounts/family/evolution_per_family.html')
# g1

<br>

**The end!**

-----