In [7]:
import json
from DBAASP.peptide import Peptide
import numpy as np
import pandas as pd
import numpy as np

In [4]:
def read_fasta(file_path):
    """
    Reads a FASTA file and returns a list of tuples containing sequence IDs and sequences.

    Sequence data that is wrapped over several lines is joined into a single
    string, and blank lines are ignored. A header that is not followed by any
    sequence data produces no entry.

    :param file_path: The path to the FASTA file.
    :return: A list of ``(sequence_id, sequence)`` tuples in file order, where
        ``sequence_id`` is the header line without the leading ``>`` and
        without surrounding whitespace.
    :raises ValueError: If sequence data appears before the first header line.
    """
    sequences = []
    id_ = None
    chunks = []  # sequence fragments collected for the current header

    with open(file_path, 'r') as f:
        # Stream the file line by line instead of materialising every line first.
        for line in f:
            if line.startswith('>'):
                # A new record starts: flush the one collected so far.
                if chunks:
                    sequences.append((id_, ''.join(chunks)))
                id_ = line[1:].strip()
                chunks = []
                continue

            fragment = line.strip()
            if not fragment:
                # Blank lines (e.g. a trailing newline) carry no sequence data.
                continue
            if id_ is None:
                raise ValueError(
                    "Sequence data found before the first FASTA header in %r" % (file_path,)
                )
            chunks.append(fragment)

    # Flush the final record, which has no following header to trigger it.
    if chunks:
        sequences.append((id_, ''.join(chunks)))
    return sequences
def sequence_entropy(sequence: str) -> float:
    """
    Compute the Shannon entropy (in bits) of the symbols in a sequence.

    Each distinct symbol contributes ``p * log2(p)``, where ``p`` is its
    relative frequency in the sequence; the result is the negated sum.

    :param sequence: The sequence whose symbol distribution is measured
    :return: The Shannon entropy of the sequence
    """
    from collections import Counter
    from math import log2

    length = len(sequence)
    weighted_logs = []
    for occurrences in Counter(sequence).values():
        fraction = occurrences / length
        if fraction > 0:
            weighted_logs.append(fraction * log2(fraction))
    # Keep the built-in sum() so the floating-point accumulation is unchanged.
    return -sum(weighted_logs)

In [5]:
# Load the cached raw DBAASP dump; its entries are iterated one by one below.
with open('.cache/DBAASP_raw.json', 'r') as f:
    data = json.load(f)

# One list of per-sequence entropies for each synthesis type.
synthesis_comp = {
    synthesis_type: []
    for synthesis_type in ('Nonribosomal', 'Ribosomal', 'Synthetic')
}
for sample in data:
    peptide = Peptide(sample)
    # Only peptides whose complexity is "Monomer" are included.
    if peptide.complexity == "Monomer":
        synthesis_comp[peptide.synthesisType].append(
            sequence_entropy(peptide.sequence)
        )

In [11]:
# First row holds the column names; one summary row per source follows.
table = [["Synthesis Type", "Count", "Mean Entropy", "Std Entropy", "Median entropy"]]
table.extend(
    [label, len(values), np.mean(values), np.std(values), np.median(values)]
    for label, values in synthesis_comp.items()
)

# Add Peptide Atlas
# NOTE: this rebinds `data` to the list returned by read_fasta.
data = read_fasta("../peptide_atlas/.cache/peptide_atlas.fasta")
entropies = [sequence_entropy(record[1]) for record in data]
table += [[
    "Peptide Atlas",
    len(entropies),
    np.mean(entropies),
    np.std(entropies),
    np.median(entropies),
]]

# Split the header row off again when building the frame.
df = pd.DataFrame(data=table[1:], columns=table[0])
df

Unnamed: 0,Synthesis Type,Count,Mean Entropy,Std Entropy,Median entropy
0,Nonribosomal,786,1.898336,0.767331,2.0
1,Ribosomal,3222,3.196063,0.455751,3.270027
2,Synthetic,19028,2.426342,0.811254,2.582877
3,Peptide Atlas,3570953,3.090888,0.386158,3.108459
