In [None]:
import numpy as np
import pandas as pd
from scipy.stats import entropy
import scipy.stats
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt

plt.style.use('../peptidome.mplstyle')

import sys
sys.path.append('..')
from lib import *

In [None]:
# --- Human proteome: total residue count, exact duplicate sequences, then de-duplicate ---
df = load_proteome_as_df('Human')
print('complete length', '%e' % df['Sequence'].str.len().sum())
print('multiple seqs', df['Sequence'].duplicated().sum())
df = df.drop_duplicates(subset=['Sequence'])

# --- Cluster representatives (print label: ">.9 identity") ---
# Reports how many sequences / residues are lost relative to the de-duplicated proteome.
df_clustered = load_proteome_as_df_path(datadir + 'cluster/UP000005640Human_rep_seq.fasta')
print('>.9 identity',
      len(df) - len(df_clustered),
      '%e' % (df['Sequence'].str.len().sum() - df_clustered['Sequence'].str.len().sum()))

# --- Second set of cluster representatives (print label: ">.5 identity") ---
# Reported relative to df_clustered, not to the full proteome.
df_clustered_50 = load_proteome_as_df_path(datadir + 'cluster/UP000005640Human_50_rep_seq.fasta')
print('>.5 identity',
      len(df_clustered) - len(df_clustered_50),
      '%e' % (df_clustered['Sequence'].str.len().sum()
              - df_clustered_50['Sequence'].str.len().sum()))

# --- uniref viruses ---
# NOTE: this rebinds `df`; everything below that reads `df` sees the viral
# sequences, not the human proteome loaded at the top of this cell.
df = load_unirefproteome_as_df_path(datadir + 'human-viruses-uniref90.fasta')
print('complete length', '%e' % df['Sequence'].str.len().sum())

In [None]:
def to_kmers_nonoverlapping(seqs, k):
    """Yield consecutive, non-overlapping length-k fragments of each sequence.

    Every sequence is tiled from its start into the windows [0:k], [k:2k], ...;
    a trailing remainder shorter than k is discarded. Fragments for which
    `isvalidaa` returns a falsy value are skipped.

    Parameters
    ----------
    seqs : iterable of str
        Sequences to cut into fragments.
    k : int
        Fragment length.

    Yields
    ------
    str
        Each valid length-k fragment, in order of appearance.
    """
    for seq in seqs:
        # Exactly len(seq)//k full windows fit into the sequence. The previous
        # bound, (len(seq)-k+1)//k, yielded one window too few unless
        # len(seq) % k == k-1 (e.g. nothing at all for len(seq) == k).
        for i in range(len(seq) // k):
            s = seq[k*i:k*(i+1)]
            if isvalidaa(s):
                yield s

In [None]:
# Cut every sequence currently in `df` into non-overlapping fragments and
# label each fragment with its running index (as a string).
fragmentlength = 40
fragments = list(to_kmers_nonoverlapping(df['Sequence'], fragmentlength))
df_fragments = pd.DataFrame({
    'identifier': list(map(str, range(len(fragments)))),
    'Sequence': fragments,
})

In [None]:
# Total number of residues contained in the fragments, in scientific notation.
'{:e}'.format(len(fragments) * fragmentlength)

In [None]:
# Write the fragments to disk; this file is the input of the mmseqs call below.
write_fasta(df_fragments, 'fragments.fasta.gz',
            idcolumn='identifier', seqcolumn='Sequence')

In [None]:
# Shell escape: cluster the fragment FASTA with MMseqs2 `easy-cluster`.
# Flags passed: --min-seq-id 0.5, -c 0.8, --cov-mode 0.
# Positional arguments are presumably <input> <output prefix> <tmp dir> -- confirm
# against the MMseqs2 usage; the next cell reads 'fragments_50_rep_seq.fasta'.
!mmseqs easy-cluster fragments.fasta.gz fragments_50 tmp --min-seq-id 0.5 -c 0.8 --cov-mode 0

In [None]:
# Read the cluster representatives back in and report their total length.
df_fragments_clustered = load_proteome_as_df_path(
    'fragments_50_rep_seq.fasta',
    parse_accessions=False,
    parse_genes=False,
)
print('fragment clustered length',
      '%e' % df_fragments_clustered['Sequence'].str.len().sum())

In [None]:
def counter_to_series(counter):
    """Convert a mapping of item -> count into a pandas Series named 'count'.

    Parameters
    ----------
    counter : mapping (e.g. collections.Counter)
        Keys become the index of the result, values its data.

    Returns
    -------
    pandas.Series
        Always a Series, including when `counter` holds exactly one key.
    """
    count_df = pd.DataFrame.from_dict(counter, orient='index', columns=['count'])
    # Select the column directly. The former `.T.squeeze()` collapsed a
    # one-entry counter to a bare scalar (breaking later indexing such as
    # series[series > 1]) and built a needless 1 x N intermediate frame.
    return count_df['count']
def fraction_multiple(series):
    """Return the share of the total that comes from entries greater than 1.

    Computes sum(series[series > 1]) / sum(series).
    """
    total = np.sum(series)
    from_repeated = np.sum(series[series > 1])
    return from_repeated / total

In [None]:
# k: k-mer length handed to count_kmers_iterable and filter_unique below.
# filterlength: third argument of filter_unique (meaning defined in lib).
k, filterlength = 9, 12

In [None]:
# For each selected dataset, print fraction_multiple of the k-mer counts
# before and after passing the sequences through filter_unique.
# Other candidate datasets (df, df_clustered, df_clustered_50) are switched off.
# NOTE: count_series / count_series_unique of the last iteration are read by
# the cells further down.
for d in (df_fragments_clustered,):
    count_series = counter_to_series(
        count_kmers_iterable(d['Sequence'], k, clean=True))
    count_series_unique = counter_to_series(
        count_kmers_iterable(filter_unique(d['Sequence'], k, filterlength), k))
    print(fraction_multiple(count_series),
          fraction_multiple(count_series_unique))

In [None]:
# Histogram (log y axis) of the values in count_series and count_series_unique.
# Bin edges run up to max+1 so that every integer count c gets its own
# unit-width bin [c, c+1). With edges ending at max, matplotlib's last bin is
# closed on both sides and would lump the counts max-1 and max together.
counts, bins, _ = plt.hist(count_series, log=True,
                           bins=np.arange(0, count_series.max()+2, 1),
                           label='all')
plt.hist(count_series_unique, log=True, bins=bins, label='after filter_unique')
plt.xlabel('occurrences per k-mer')
plt.ylabel('number of distinct k-mers')
#plt.xscale('log')
plt.legend();

In [None]:
# Rebind `counts` (used for the histogram bin heights in the plotting cell above)
# to the filtered k-mer counts; the next cell computes its statistic from this.
counts = count_series_unique

In [None]:
# Coincidence statistic of the counts: sum_i n_i*(n_i - 1) / N**2, where n_i
# is the count of entry i and N the total count.
# n*(n-1) equals 2*binom(n, 2) and is zero for n in {0, 1}, so no mask is needed.
# Computing it directly avoids relying on scipy.special, which this notebook
# never imports explicitly, and the float conversion keeps N**2 from
# overflowing int64 for very large totals.
np.sum(counts*(counts - 1))/float(np.sum(counts))**2

In [None]:
# Total number of counted k-mers before and after filtering.
'%.3e, %.3e' % (np.sum(count_series), np.sum(count_series_unique))

In [None]:
# The 50 largest entries of count_series, in ascending order.
count_series.sort_values().iloc[-50:]

In [None]:
# The 20 largest entries of count_series_unique, in ascending order.
count_series_unique.sort_values().iloc[-20:]