In [8]:
import itertools, json
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from matplotlib import colors

import sys
sys.path.append('..')
from lib import *
from lib.maxent import *

In [9]:
output = True
N = 9
q = naminoacids

proteome = proteome_path('Human')
seed = 1234
prng = np.random.RandomState(seed)

In [25]:
params = np.load('data/Human_full_k9.npz')
hi = params['hi']
Jij = params['Jij']

In [26]:
datasets = ['train', 'test', 'model']
sample_matrices = {}
for dataset in datasets:
    sample_matrices[dataset] =  np.loadtxt('data/%s_matrix.csv.gz' % dataset).astype(int)

In [27]:
energies = [energy_potts(x, hi, Jij) for x in sample_matrices['model']]

In [28]:
F0 = -np.sum(np.log(np.sum(np.exp(hi), axis=1)))

In [30]:
def Fprime(alpha):
    jump = lambda x: local_jump(x, q)
    x0 = prng.randint(q, size=N)
    matrix = mcmcsampler(x0, lambda x: energy_potts(x, hi, alpha*Jij), jump, 1e6, nsample=10, nburnin=1e3)
    return np.mean([energy_potts(x, np.zeros_like(hi), Jij) for x in matrix])

In [31]:
xs = np.linspace(0, 1, 4)
Fprimes = [Fprime(x) for x in xs]

In [32]:
Fint = scipy.integrate.simps(Fprimes, xs)
Fint

0.10532807605400357

In [33]:
F0, np.mean(energies), Fint

(-28.01652616764632, -2.072497153551555, 0.10532807605400357)

In [34]:
energies_ind = [energy_potts(x, hi, np.zeros_like(Jij)) for x in independent_matrix]

NameError: name 'independent_matrix' is not defined

In [None]:
np.mean(energies_ind), np.mean(energies)

In [None]:
def calc_Sind(hi):
    fis = np.exp(hi)/np.sum(np.exp(hi), axis=1)[:, np.newaxis]
    return np.sum(scipy.stats.entropy(fis.T))

In [None]:
Sind = calc_Sind(hi)
S = np.mean(energies) - (F0 + Fint)
S, Sind

In [None]:
Sind, np.mean(energies_ind) - F0

In [None]:
Suni = np.log2(20)

In [None]:
Sind*np.log2(np.exp(1))/N, S*np.log2(np.exp(1))/N

In [28]:
df = pd.read_csv('../kmerentropy/data/entropy.csv')

In [29]:
np.array(df['Human'])/np.arange(1, 6)

array([4.17756346, 4.16934288, 4.16129658, 4.14931991, 4.11369582])

In [31]:
(Sind-S)*np.log2(np.exp(1))

0.30957087318646137

In [21]:
def entropy_thermodynamic_integration(hi, Jij, integration_intervals=1, mcmc_kwargs=dict()):
    F0 = -np.sum(np.log(np.sum(np.exp(hi), axis=1)))
    N, q = hi.shape
    
    jump = lambda x: local_jump(x, q)
    x0 = prng.randint(q, size=N)
    matrix = mcmcsampler(x0, lambda x: energy_potts(x, hi, Jij), jump, **mcmc_kwargs)
    energy_mean = np.mean([energy_potts(x, hi, Jij) for x in matrix])
    
    def Fprime(alpha):
        jump = lambda x: local_jump(x, q)
        x0 = prng.randint(q, size=N)
        matrix = mcmcsampler(x0, lambda x: energy_potts(x, hi, alpha*Jij), jump, **mcmc_kwargs)
        return np.mean([energy_potts(x, np.zeros_like(hi), Jij) for x in matrix])
    
    xs = np.linspace(0, 1, integration_intervals+1)
    Fprimes = [Fprime(x) for x in xs]
    Fint = scipy.integrate.simps(Fprimes, xs)
    
    S = energy_mean - (F0 + Fint)
    return S

In [22]:
mcmc_kwargs = dict(nsteps=1e6, nsample=10, nburnin=1e3)

In [23]:
entropy_thermodynamic_integration(hi, Jij, integration_intervals=3, mcmc_kwargs=mcmc_kwargs)

25.828290270215764