In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib

from numba import jit

import sys
sys.path.append('..')
from lib import *
from lib.maxent import *
plt.style.use('../peptidome.mplstyle')

In [None]:
# Coincidence probability for every maxent model and for the training data.
# For each entry of `models` a k-mer matrix is loaded from data/ and its
# empirical coincidence probability is computed with calc_coincidence_prob.
# For the fitted models the value stored in `coincidence_probs` is instead
# the mean of exp(log-likelihood) over the rows of that matrix.
proteome = 'Human'
models = ['independent', 'ncov', 'nskew', 'nskewfcov', 'train']
k = 9

coincidence_probs = {}

for model in models:
    matrix = load_matrix('data/{proteome}_{model}_k{k}_matrix.csv.gz'.format(
                         proteome=proteome, model=model, k=k))
    coincidence_empirical = calc_coincidence_prob(matrix)
    if model == 'train':
        # The data itself has no fitted parameters: use the empirical value.
        coincidence_prob = coincidence_empirical
        print(model, coincidence_empirical)
    else:
        energy = make_energy(np.load('../maxent/data/{proteome}_{model}_k{k}_params.npz'.format(
            proteome=proteome, model=model, k=k)))
        entropy_table = pd.read_csv('../maxent/data/{proteome}_{model}_k{k}_entropy.csv'.format(
            proteome=proteome, model=model, k=k),
                                    header=None, index_col=0)
        # `np.float` was removed in NumPy 1.24; use the builtin and pull the
        # scalar out explicitly because float() on a one-element Series is
        # deprecated in pandas.
        # NOTE(review): F is presumably the free energy (log normalisation) of
        # the model -- confirm against the script writing the entropy file.
        F = float(entropy_table.loc['F'].iloc[0])
        loglikelihood  = lambda seq: -energy(seq) + F
        logp = np.array([loglikelihood(row) for row in matrix])
        coincidence_prob = np.mean(np.exp(logp))
        print(model, coincidence_prob, coincidence_empirical)
    coincidence_probs[model] = coincidence_prob

In [None]:
# Turn the {model: probability} mapping into a named pandas Series; the model
# names become the index.
coincidence_probs = pd.Series(data=coincidence_probs,
                              name='coincidence_prob')

In [None]:
# Inspect the value stored for the independent-site model.
# `series` was never defined in this notebook (NameError on a fresh kernel);
# the Series holding the results is `coincidence_probs`.
coincidence_probs['independent']

In [None]:
# Read test.csv, using its first column as the index, and collapse a
# single-column result to a Series. The `squeeze=` keyword of read_csv was
# removed in pandas 2.0, so squeeze the returned frame instead.
pd.read_csv('test.csv', index_col=0).squeeze('columns')

In [None]:
# effective diversities
# `logp` is whatever the model loop assigned last, i.e. the log-likelihoods of
# the final non-'train' entry of `models`.
mean_logp = np.mean(logp)
mean_prob = np.mean(np.exp(logp))
# First entry: exp of minus the mean log-likelihood; second entry: inverse of
# the mean likelihood. Both are shown in scientific notation.
'%e' % np.exp(-mean_logp), '%e' % (1/mean_prob)

# Test reproducibility on the independent-site model

In [None]:
# Analytic check for the independent-site model: load the array stored under
# key 'f' in the fitted parameters and raise sum(f**2) to the k-th power.
# NOTE(review): f is presumably the vector of per-site amino-acid frequencies
# -- confirm against the fitting script.
model = 'independent'
# Context manager closes the .npz archive once `f` has been read.
with np.load('../maxent/data/{proteome}_{model}_k{k}_params.npz'.format(
        proteome=proteome, model=model, k=k)) as params:
    f = params['f']
# Use k rather than a hard-coded 9 so the check follows the k-mer length.
np.sum(f**2)**k

In [None]:
# Sampling check: draw a matrix with the same shape as the most recently
# loaded `matrix`, each entry sampled independently with probabilities `f`,
# and compute its empirical coincidence probability.
# The number of symbols is taken from `f` instead of a hard-coded 20
# (np.random.choice requires len(p) to equal the number of choices anyway).
# The draw is unseeded, so the value changes slightly between runs.
model_matrix = np.random.choice(len(f), size=matrix.shape, p=f)
coincidence_empirical = calc_coincidence_prob(model_matrix)
coincidence_empirical

# Plot

In [None]:
# Add a uniform baseline (1/20**k), sort ascending and replace the internal
# model names by the labels used on the figure.
coincidence_probs['uniform'] = 1/20**k
# Rebind instead of sorting in place.
coincidence_probs = coincidence_probs.sort_values()
labels = {'uniform' : 'uniform',
          'independent' : '1st moment',
          'ncov' : '2nd moment',
          'nskew' : '3rd moment',
          'nskewfcov' : '2-point',
          'train' : 'data'
         }
# rename() leaves index entries without a mapping untouched, so running this
# cell a second time keeps the labels (Index.map(dict) would turn the already
# relabelled entries into NaN).
coincidence_probs = coincidence_probs.rename(index=labels)

In [None]:
# Bar chart of the coincidence probabilities on a log scale, written to
# coincidence_prob.png in the working directory.
figure_size = (1.0, 1.8)
fig, ax = plt.subplots(figsize=figure_size)
coincidence_probs.plot(ax=ax, kind='bar', log=True)
ax.set_ylabel('Coincidence probability')
fig.savefig('coincidence_prob.png')

# Empirical coincidences in data

In [None]:
# Print the empirical coincidence probability of the train and test matrices
# of each proteome. Note that this rebinds `models`, `k`, `proteome` and
# `model`, which later cells read.
proteomes = ['Human', 'Humannozf']
models = ['train', 'test']
k = 9
matrix_path_template = 'data/{proteome}_{model}_k{k}_matrix.csv.gz'
for proteome in proteomes:
    for model in models:
        matrix_path = matrix_path_template.format(
            proteome=proteome, model=model, k=k)
        matrix = load_matrix(matrix_path)
        coincidence_empirical = calc_coincidence_prob(matrix)
        print(proteome, model, coincidence_empirical)

In [None]:
# For the human proteome, load the matrix of every entry in `models` and keep
# its matrix_to_kmers() conversion, keyed by the entry name.
proteome = 'Human'
matrices = {}
kmer_matrix_template = 'data/{proteome}_{model}_k{k}_matrix.csv.gz'
for model in models:
    matrix_file = kmer_matrix_template.format(
        proteome=proteome, model=model, k=k)
    matrix = load_matrix(matrix_file)
    matrices[model] = matrix_to_kmers(matrix)

In [None]:
# Distinct entries that occur in both the test and the train collection.
test_kmers = set(matrices['test'])
train_kmers = set(matrices['train'])
intersection = test_kmers & train_kmers

In [None]:
# Test entries (in original order, repeats kept) that also occur in train.
overlap = list(filter(lambda kmer: kmer in intersection, matrices['test']))

In [None]:
# Number of shared test entries divided by the number of (train, test) pairs.
# NOTE(review): the numerator counts each test entry once, regardless of how
# often it occurs in train -- confirm this is the intended estimator.
n_train = len(matrices['train'])
n_test = len(matrices['test'])
len(overlap) / (n_train * n_test)