In [1]:
import torch, re
import torch.utils.data as data_utils
from tqdm import tqdm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [2]:
def load_data():
    ds = []
    # Read from src
    with open('data/train_set.fasta') as f:
        lines = f.read().split(">")[1:]
        ds = [0]*len(lines)
        for i, line in enumerate(tqdm(lines)):
            header, seq, sep = line.strip().split("\n")
            # seq = " join(seq)
            # sep = re.sub("S|T|L|P", "0", sep) # simplify to SP existance
            # sep = re.sub("I|M|O",   "1", sep) # simplify to SP non-existance
            # sep = 1. if "0" in sep else 0.
            ac, kingdom, type_, id_ = header.split("|")
            ds[i] = {
                'header': {
                    'uniprot_ac': ac,
                    'kingdom': kingdom,
                    'type': type_,
                    'id': id_
                },
                'seq': seq,
                'sep': sep
            }
    return ds

In [3]:
ds = load_data()
df = pd.DataFrame(ds)

  0%|          | 0/20290 [00:00<?, ?it/s]

100%|██████████| 20290/20290 [00:00<00:00, 563644.50it/s]


In [4]:
kingdoms = pd.DataFrame([i['kingdom'] for i in df.get('header')])
sequences = df.get('seq')
classes = df.get('sep')

In [25]:
# Find all items with a specific SP
q = "L"
ds_with_q = [item for item in ds if q in item['sep']]
pd.DataFrame([item['seq'][:len(item) - item['sep'][::-1].index(q) -1] for item in ds_with_q]).value_counts()

0                              
MKKYLALALIAPLLISCS                 28
MRYLATLLLSLAVLITAGCG               27
MKLRWFAFLIVLLAGCS                  23
MPLPDFRLIRLLPLAALVLTACS            22
MQKNAAHTYAISSLLVLSLTGCA            20
                                   ..
MKLLSKVMILALAASMLQACN               1
MKLNLKASGVARQLTTLAKTVAALSVLTACA     1
MKLNLRFPSYFLPVVAASAFLVSCA           1
MKLNQFGAAIGLLATGALLSGCG             1
MYRRLLLNLFCMVFLQACL                 1
Name: count, Length: 902, dtype: int64

In [28]:
# Which kingdoms shares the most probable 
q = "MQKNAAHTYAISSLLVLSLTGCA"
pd.DataFrame(map(lambda x: x['header']['kingdom'], filter(lambda x: x['seq'][:len(q)] == q, ds))).value_counts()

0       
NEGATIVE    20
Name: count, dtype: int64

In [21]:
ds_with_pep = list(filter(lambda x: x['header']['type'] != "NO_SP", ds))
print("Peptide distribution:", pd.DataFrame(map(lambda x:   
                "S" if "S" in x['sep'] else \
                "T" if "T" in x['sep'] else \
                "P" if "P" in x['sep'] else \
                "L" if "L" in x['sep'] else \
                "N/A", ds_with_pep)).value_counts())

Peptide distribution: 0
S    2582
L    1615
T     398
P      70
Name: count, dtype: int64


In [29]:
print("Type distribution:", pd.DataFrame([x['header']['type'] for x in ds]).value_counts())

Type distribution: 0      
NO_SP      15625
SP          2582
LIPO        1615
TAT          365
PILIN         70
TATLIPO       33
Name: count, dtype: int64


In [30]:
print("Kingdom distribution:", kingdoms.value_counts())

Kingdom distribution: 0       
EUKARYA     16396
NEGATIVE     2764
POSITIVE      935
ARCHAEA       195
Name: count, dtype: int64


In [37]:
ds1 = torch.utils.data.DataLoader(ds['seq'])

In [42]:
next(iter(ds1))

['M A P T L F Q K L F S K R T G L G A P G R D A R D P D C G F S W P L P E F D P S Q I R L I V Y Q D C E R R G R N V L F D S S V K R R N E D I']