# Cumulative Counts per researcher & family

By **Franklin Oliveira**

-----
This notebook contains all the code necessary to make the cumulative counts line charts for the `poliqueta` database. Here you'll find some basic data treatment and the code for the charts.

Database: <font color='blue'>'IBUFRJ27.07.2020 - visualização.xlsx'</font> and <font color='blue'>'MNRJP27.07.2020 - visualização.xls'</font>.

In [1]:
import datetime
import numpy as np
import pandas as pd

from collections import defaultdict

# quick visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Waffle Charts
# from pywaffle import Waffle 
# docs: https://pywaffle.readthedocs.io/en/latest/examples/block_shape_distance_location_and_direction.html

# visualization
import altair as alt

# enabling notebook renderer
# alt.renderers.enable('notebook')
# alt.renderers.enable('default')

# disabling rows limit
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

## Importing data...

In [2]:
# merged collections table: semicolon-separated, read as UTF-8 with an optional BOM
NewTable = pd.read_csv(
    './data/merged_db.csv',
    sep=';',
    encoding='utf-8-sig',
)

In [3]:
# only this first collection database contains determination date
# (na=False: a row without catalog number counts as "no match"; otherwise the
#  mask would hold NaN and the boolean indexing below would raise)
is_ibufrj = NewTable['catalog_number'].str.contains('IBUFRJ', na=False)
ibufrj = NewTable[is_ibufrj].copy()

In [4]:
# replacing the exact value 'Nan' in the family column by the string 'NaN'
family_col = NewTable['family']
NewTable['family'] = family_col.mask(family_col == 'Nan', 'NaN')

# determiner names as strings
determiner_names = NewTable['determiner_full_name']
NewTable['determiner_full_name'] = determiner_names.astype(str)

In [5]:
# parsing NaN into string so it's not excluded while grouping
def convert2int(y):
    """Return ``int(y)`` when the conversion works, otherwise ``y`` unchanged.

    Values that ``int`` cannot convert (NaN, infinities, None, non-numeric
    text, ...) are handed back exactly as they came in.
    """
    try:
        return int(y)
    except (ValueError, TypeError, OverflowError):
        # only conversion failures are expected here; a bare ``except`` would
        # also swallow KeyboardInterrupt / SystemExit
        return y
    
# NewTable['determined_year'] = NewTable['determined_year'].apply(convert2int).astype(str)  # forcing to string to keep NaNs
# convert2int is applied to every sampling year, then the column is cast to string
start_year_converted = NewTable['start_year'].apply(convert2int)
NewTable['start_year'] = start_year_converted.astype(str)

<br>

<font size=5>**Color Palette**</font>

<!-- <div class='row' style='padding-top:20px;'>
    <div class='col-md-6'>
        <img src="./src/img1.jpg" width='400px'>
    </div>
    <div class='col-md-6'>
        <img src="./src/img2.jpg" width='400px'>
    </div>
</div> -->

<br>

In [6]:
# importing customized color palettes
from src.MNViz_colors import *

<br>


## Graphs

---

### Creating chart: counts per determiner per year

To calculate the cumulative counts, we need to consider ALL determiner columns, in this case:

    - 'determiner_full_name'
    - 'determiner_full_name1'
    - 'determiner_full_name2': as I'm writing this script, it's all empty

In [7]:
determiner_columns = ['determiner_full_name', 'determiner_full_name1', 'determiner_full_name2']

# every distinct name found in the determiner columns (first, second and third);
# surrounding whitespace is stripped from the first column only
determiners = set(ibufrj[determiner_columns[0]].str.strip()).union(
    *(set(ibufrj[other_col]) for other_col in determiner_columns[1:])
)
# determiners = NewTable['determiner_full_name'].unique()

# entries whose text form is 'nan' are missing values: they are left out of the final list
determiners = [name for name in determiners if not str(name) == 'nan']

In [62]:
# one (determiner, year, kingdom) slice per determiner column, every slice carrying
# the first column's name (p.s.: kingdom is a non-empty column used only for counting)
stacked_names = [determiner_columns[0], 'determined_year', 'kingdom']
slices = []
for det_col in determiner_columns:
    piece = ibufrj[[det_col, 'determined_year', 'kingdom']].copy()
    piece.columns = stacked_names
    slices.append(piece)

# stacking the slices makes grouping per determiner and year a single groupby
df = pd.concat(slices)

# both grouping keys are cast to string so we don't lose information while grouping
for key_col in (determiner_columns[0], 'determined_year'):
    df[key_col] = df[key_col].astype(str)

In [86]:
# the same person written with stray surrounding whitespace must count as one determiner
det_names = df['determiner_full_name'].astype(str).str.strip()

# grouping: records per determiner and year ('kingdom' is just the column being counted)
grouped = (
    df.assign(determiner_full_name=det_names)
      .groupby(['determiner_full_name', 'determined_year'])
      .count()
      .reset_index()
      .rename(columns={'kingdom': 'count'})
      .sort_values('determined_year')  # years are strings here, so this order is lexicographic
)

# cumulatively counting for each determiner, following the year order set above.
# One grouped cumsum replaces the per-name filtering loop, so no determiner can be
# dropped because its spelling differs from a separately built list of names.
# 'nan' is the text form of a missing determiner, so those rows are left out.
counts = grouped[grouped['determiner_full_name'] != 'nan'].copy()
counts['cumulative_sum'] = counts.groupby('determiner_full_name')['count'].cumsum()

In [93]:
# axis labels only: keep the text before the first dot (e.g. '1998.0' becomes '1998')
determined_year_text = counts['determined_year'].astype(str)
counts['determined_year'] = determined_year_text.str.split('.').str[0]

## Chart: cumulative counts per determiner

In [99]:
# mouse-over selection: picks the determiner of the point nearest to the cursor
select = alt.selection(type='single', on='mouseover', nearest=True, fields=['determiner_full_name'])

# encodings shared by both layers
det_year_axis = alt.X('determined_year', type="ordinal", title='Determination Year')
# NOTE(review): the sort names a 'counts' field while the data column is 'count' -- confirm it is needed
det_total_axis = alt.Y('cumulative_sum', title='',
                       sort=alt.EncodingSortField('counts', op="count", order='descending'))
det_color = alt.Color('determiner_full_name:N', title='', legend=None,
                      scale=alt.Scale(scheme='warmgreys'))

base = alt.Chart(
    counts,
    title='Cumulative contribution of each determiner',
    width=800,
    height=400,
).mark_line(point=True).encode(x=det_year_axis, y=det_total_axis, color=det_color)

# layer 1: half-transparent circles that carry the tooltip and the selection
points = base.mark_circle(size=40).encode(
    opacity=alt.value(0.5),
    tooltip=alt.Tooltip(['determiner_full_name','determined_year','count','cumulative_sum']),
).add_selection(select)

# layer 2: one line per determiner, drawn with size 4 instead of 1 while selected
lines = base.mark_line(point=True).encode(
    size=alt.condition(~select, alt.value(1), alt.value(4)),
    opacity=alt.value(1),
)

g1 = alt.layer(points, lines)

# font sizes for title, axes and legend
g1 = g1.configure_title(fontSize=16)
g1 = g1.configure_axis(labelFontSize=12, titleFontSize=12)
g1 = g1.configure_legend(labelFontSize=12, titleFontSize=12)

# g1.save('./graphs/cumCounts/determiner/cumCount-per-year.html')

# g1

<br>

### Collector's cumulative contribution per year

To calculate the cumulative counts, we need to consider ALL collector columns, in this case:

    - 'collector_full_name'
    - 'collector_full_name2' ... 'collector_full_name6' (the code below currently uses the columns up to 'collector_full_name4')

In [101]:
collector_columns = ['collector_full_name', 'collector_full_name2',
                     'collector_full_name3', 'collector_full_name4']

# name of all collectors found in the columns listed above
collectors = set(NewTable['collector_full_name'].str.strip())

for col in collector_columns[1:]:
    collectors = collectors.union(set(NewTable[col]))

# removing NaN and parsing into a list.
# The comparison must be exact: a substring test ('nan' in name) would also throw
# away real collectors whose name merely contains those letters, e.g. "Fernando".
collectors = [name for name in collectors if str(name) != 'nan']

In [104]:
# one (collector, year, kingdom) slice per collector column, every slice carrying
# the first column's name (p.s.: kingdom is a non-empty column used only for counting)
stacked_names = [collector_columns[0], 'start_year', 'kingdom']
slices = []
for col in collector_columns:
    piece = NewTable[[col, 'start_year', 'kingdom']].copy()
    piece.columns = stacked_names
    slices.append(piece)

# stacking the slices makes grouping per collector and year a single groupby
df = pd.concat(slices)

# both grouping keys are cast to string so we don't lose information while grouping
for key_col in (collector_columns[0], 'start_year'):
    df[key_col] = df[key_col].astype(str)

In [114]:
# the same person written with stray surrounding whitespace must count as one collector
collector_key = collector_columns[0]
collector_names = df[collector_key].astype(str).str.strip()

# grouping: records per collector and year ('kingdom' is just the column being counted)
grouped = (
    df.assign(**{collector_key: collector_names})
      .groupby([collector_key, 'start_year'])
      .count()
      .reset_index()
      .rename(columns={'kingdom': 'count'})
      .sort_values('start_year')  # years are strings here, so this order is lexicographic
)

# cumulatively counting for each collector, following the year order set above.
# One grouped cumsum replaces the per-name filtering loop, so no collector can be
# dropped because its spelling differs from a separately built list of names.
# 'nan' is the text form of a missing collector, so those rows are left out.
counts = grouped[grouped[collector_key] != 'nan'].copy()
counts['cumulative_sum'] = counts.groupby(collector_key)['count'].cumsum()

In [115]:
# axis labels only: keep the text before the first dot (e.g. '1998.0' becomes '1998')
start_year_text = counts['start_year'].astype(str)
counts['start_year'] = start_year_text.str.split('.').str[0]

### Chart: cumulative counts per collector

In [123]:
# mouse-over selection: picks the collector of the point nearest to the cursor
select = alt.selection(type='single', on='mouseover', nearest=True, fields=['collector_full_name'])

# encodings shared by both layers
col_year_axis = alt.X('start_year', type="ordinal", title='Sampling Year')
# NOTE(review): the sort names a 'counts' field while the data column is 'count' -- confirm it is needed
col_total_axis = alt.Y('cumulative_sum', title='',
                       sort=alt.EncodingSortField('counts', op="count", order='descending'))
col_color = alt.Color('collector_full_name:N', title='', legend=None,
                      scale=alt.Scale(scheme= 'warmgreys'))

base = alt.Chart(
    counts,
    title='Cumulative contribution of each Collector',
    width=800,
    height=400,
).encode(x=col_year_axis, y=col_total_axis, color=col_color)

# layer 1: half-transparent circles that carry the tooltip and the selection
points = base.mark_circle(size=40).encode(
    opacity=alt.value(0.5),
    tooltip=alt.Tooltip(['collector_full_name','start_year','count','cumulative_sum']),
).add_selection(select)

# layer 2: one line per collector, drawn with size 4 instead of 1 while selected
front = base.mark_line(point=True).encode(
    size=alt.condition(~select, alt.value(1), alt.value(4)),
    opacity=alt.value(1),
)

g1 = alt.layer(points, front)

# font sizes for title, axes and legend
g1 = g1.configure_title(fontSize=16)
g1 = g1.configure_axis(labelFontSize=12, titleFontSize=12)
g1 = g1.configure_legend(labelFontSize=12, titleFontSize=12)

# g1.save('./graphs/cumCounts/collector/cumCount-per-year.html')

# g1

<br>

## Cumulative counts per Family


In [124]:
# records per sampling year and family (the non-null 'class' values are what gets counted)
class_per_year_family = NewTable.groupby(['start_year','family'])['class'].count()
teste = class_per_year_family.reset_index().rename(columns={'class': 'counts'})

# ordering by family first, then by year inside each family
teste = teste.sort_values(['family', 'start_year'])

In [125]:
# running total of 'counts' inside each family, following the row order teste already has
teste['cumulative_sum'] = teste.groupby('family')['counts'].cumsum()

In [126]:
# axis labels only: keep the text before the first dot (e.g. '1998.0' becomes '1998')
teste['start_year'] = teste['start_year'].astype(str).str.split('.').str[0]

### Chart: per collected year

In [134]:
## filtering out some families lost while grouping
# familias = [f for f in cores_familia.keys() if f in teste['family'].unique()]
# cores_temp = [cores_familia[f] for f in familias] 

# multi-selection on 'family', bound to the legend
select_family = alt.selection_multi(fields=['family'], bind='legend')

# aux. variables for encoding fields: sorted year labels and the top of the y axis
x_labels = teste['start_year'].sort_values().unique()
# x_labels = [str(y).split('.')[0] for y in x_labels]
y_max = np.ceil(teste['cumulative_sum'].unique().max())

# encodings
family_year_axis = alt.X('start_year', type="ordinal", title='Sampling Year',
                         scale= alt.Scale(domain= x_labels))
family_total_axis = alt.Y('cumulative_sum', title='',
                          scale= alt.Scale(domain= [0,y_max]),
                          sort=alt.EncodingSortField('counts', op="count", order='descending'))
# colors come from the cores_familia palette: its keys are the domain, its values the range
family_color = alt.Color('family:N', title='Family',
                         legend= alt.Legend(columns=2, symbolLimit=50),
                         scale=alt.Scale(domain=list(cores_familia.keys()),
                                         range= list(cores_familia.values())))
family_tooltip = alt.Tooltip(['family','start_year','counts', 'cumulative_sum'])

g1 = alt.Chart(
    teste,
    title='Cumulative amount of specimens per family',
    width=600,
    height=400,
).mark_line(point=True).encode(
    x=family_year_axis,
    y=family_total_axis,
    color=family_color,
    tooltip=family_tooltip,
#     opacity= alt.condition(select_family, alt.value(1), alt.value(0))
).add_selection(select_family).transform_filter(select_family)

# font sizes for title, axes and legend
g1 = g1.configure_title(fontSize=16)
g1 = g1.configure_axis(labelFontSize=12, titleFontSize=12)
g1 = g1.configure_legend(labelFontSize=12, titleFontSize=12)

# g1.save('./graphs/cumCounts/family/temporal_evolution_per_family.html')
# g1

<br>

**The end!**

-----