In [1]:
# Load IPython's autoreload extension and select mode 2, which re-imports
# modules before each cell executes so that edits to imported source files
# take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Tutorial

A few examples showing how to use the classes from the package `catede` to estimate quantities such as the Shannon entropy and the Kullback-Leibler divergence from data.

## The `Experiment` class 
In these examples the $K=20^3$ categories are the sequences of length $L=3$ generated by a $20$-state Markov chain with a random transition matrix.

In [8]:
from catede.handle_ngrams import markov_class
from catede.estimate import Experiment

# --- simulation parameters ---
# rng seed, reused for both the Markov matrix and the sampling step
seed_1 = 12345
# number of states
A = 20
# length of the L-grams
L = 3
# number of categories a priori: one per possible L-gram
K = pow(A, L)

# --- first system ---
# random Markov matrix
mobj_1 = markov_class(L, n_states=A, seed=seed_1)

# sample size
size = 50_000
# histogram of counts drawn from the first system
seqs_1 = mobj_1.generate_counts(size, seed=seed_1)
# exact Shannon entropy, used later as the reference value
exact_shannon = mobj_1.exact_shannon()

# first experiment
exp_1 = Experiment(seqs_1, categories=K)

## Shannon entropy estimation

$$ S (q)= - \sum_{i=1}^{K} q_{i} \log q_{i}$$

In [9]:
# Estimate the Shannon entropy of the first experiment with three methods;
# only the NSB call requests an error alongside the estimate.
naive = exp_1.shannon(method='naive')
cae = exp_1.shannon(method='Chao-Shen')
nsb, nsb_std = exp_1.shannon(method='NSB', error=True)

# One report line per estimator, with the exact value first for comparison.
print(
    "Shannon entropy",
    f"exact : {exact_shannon:.3f}",
    f"naive : {naive:.3f}",
    f"CAE : {cae:.3f}",
    f"NSB : {nsb:.3f} +- {nsb_std:.3f}",
    sep="\n",
)

Shannon entropy
exact : 8.592
naive : 8.509
CAE : 8.571
NSB : 8.589 +- 0.003


## Simpson index estimation

$$ \lambda (q)= \sum_{i=1}^{K} {q_{i}}^2 $$

In [10]:
# Reference value computed from the Markov model itself.
exact_simpson = mobj_1.exact_simpson()

# Estimates from the sampled counts; only the NSB call requests an error.
naive = exp_1.simpson(method='naive')
cae = exp_1.simpson(method='Chao-Shen')
nsb, nsb_std = exp_1.simpson(method='NSB', error=True)

# Assemble the report and emit it in one go.
report_lines = [
    "Simpson index",
    f"exact : {exact_simpson:.6f}",
    f"naive : {naive:.6f}",
    f"CAE : {cae:.6f}",
    f"NSB : {nsb:.6f} +- {nsb_std:.6f}",
]
print("\n".join(report_lines))

Simpson index
exact : 0.000227
naive : 0.000247
CAE : 0.000239
NSB : 0.000233 +- 0.000001


## The `Divergence` class

In [11]:
from catede.estimate import Divergence

# --- simulation of an independent second system ---
# rng seed for the second system
seed_2 = 54321

# random Markov matrix generation
mobj_2 = markov_class(L, n_states=A, seed=seed_2)
# histogram of counts drawn from the second system
seqs_2 = mobj_2.generate_counts(size, seed=seed_2)
# exact Shannon entropy of the second system
exact_sh_entropy_2 = mobj_2.exact_shannon()

# second experiment, built over the same K categories as the first
exp_2 = Experiment(seqs_2, categories=K)

# divergence object pairing the two experiments
div_to1from2 = Divergence(exp_1, exp_2)

## Kullback-Leibler divergence estimation

$$ D_{\rm KL} \left( q \Vert t \right) = \sum_{i=1}^{K} q_{i} \log \frac{q_{i}}{t_{i}}$$

In [12]:
# Kullback Leibler divergence estimation #
# Exact value computed from the two Markov models, used as the reference below.
exact_DKL_to1from2 = mobj_1.exact_kullbackleibler(mobj_2)
naive = div_to1from2.kullback_leibler(method='naive')
# NOTE(review): `error=True` is requested here, yet the result is bound to a
# single name and formatted as a float below; presumably this method returns
# no uncertainty -- confirm against the catede API.
zhang = div_to1from2.kullback_leibler(method='Zhang-Grabchak', error=True)
dpm, dpm_std = div_to1from2.kullback_leibler(method='DPM', error=True)

# Header spelling fixed ("Leilber" -> "Leibler").
print("Kullback Leibler divergence")
print(f"exact : {exact_DKL_to1from2:.3f}")
print(f"naive : {naive:.3f}")
print(f"Z : {zhang:.3f}")
print(f"DPM : {dpm:.3f} +- {dpm_std:.3f}")

Kullback Leilber divergence
exact : 0.949
naive : 0.587
Z : 0.828
DPM : 0.935 +- 0.011


## Squared Hellinger divergence estimation

$$ D_{\rm H}^2 \left( q \Vert t \right) = 1 - \sum_{i=1}^{K} \sqrt{q_{i}} \sqrt{t_{i}} $$

In [13]:
# Squared Hellinger divergence estimation #
# Reference value computed from the two Markov models.
exact_DH_to1from2 = mobj_1.exact_squared_hellinger(mobj_2)

# Estimates from the sampled counts; the DPM call also requests an error.
naive = div_to1from2.squared_hellinger(method='naive')
dpm, dpm_std = div_to1from2.squared_hellinger(method='DPM', error=True)

# Report the exact value followed by each estimate.
print(
    "Squared Hellinger divergence",
    f"exact : { exact_DH_to1from2:.3f}",
    f"naive : {naive:.3f}",
    f"DPM : {dpm:.3f} +- {dpm_std:.3f}",
    sep="\n",
)

Squared Hellinger divergence
exact : 0.204
naive : 0.278
DPM : 0.202 +- 0.002
