In [209]:
import sys
sys.path.append('..')
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import projgrad
plt.style.use('../peptidome.mplstyle')

from lib import *


In [42]:
columnsstr = 'seq id> <alignment start> <alignment end> <envelope start> <envelope end> <hmm acc> <hmm name> <type> <hmm start> <hmm end> <hmm length> <bit score> <E-value> <clan'
columns = columnsstr.split('> <')
df_pfam = pd.read_csv(datadir+'9606_pfam.tsv.gz', sep='\t', skiprows=3, names=columns)

In [29]:
proteome = [(h, seq) for h, seq in fasta_iter(proteome_path('Human'))]
headers, seqs = list(zip(*proteome))
seqids = [h.split('|')[1] for h in headers]
df = pd.DataFrame(dict(seqid=seqids, seq=seqs))
seqids_set = set(df['seqid'])

In [56]:
len(set(df['seqid'])-set(df_pfam['seq id'])), len(set(df['seqid'])), len(df_pfam['seq id'])

(2040, 21080, 100194)

In [308]:
df_pfam_filtered = df_pfam[[(s in seqids_set) for s in df_pfam['seq id']]]
df_pfam_filtered['hmm acc'].value_counts().head()

PF00096    5390
PF00400     751
PF00028     693
PF00041     612
PF07679     566
Name: hmm acc, dtype: int64

In [73]:
zfs = df_pfam_filtered[df_pfam_filtered['hmm acc'] == 'PF00096']
zfs.head()

Unnamed: 0,seq id,alignment start,alignment end,envelope start,envelope end,hmm acc,hmm name,type,hmm start,hmm end,hmm length,bit score,E-value,clan
1319,A0A087WUU8,444,466,444,466,PF00096,zf-C2H2,PfamLive::Result::SequenceOntology=HASH(0x89b8...,1,23,23,28.4,0.0036,CL0361
1320,A0A087WUU8,388,410,388,410,PF00096,zf-C2H2,PfamLive::Result::SequenceOntology=HASH(0x89b8...,1,23,23,27.4,0.0074,CL0361
1321,A0A087WUU8,304,326,304,326,PF00096,zf-C2H2,PfamLive::Result::SequenceOntology=HASH(0x89b8...,1,23,23,26.7,0.012,CL0361
1322,A0A087WUU8,276,298,276,298,PF00096,zf-C2H2,PfamLive::Result::SequenceOntology=HASH(0x89b8...,1,23,23,25.6,0.028,CL0361
1323,A0A087WUU8,332,354,332,354,PF00096,zf-C2H2,PfamLive::Result::SequenceOntology=HASH(0x89b8...,1,23,23,23.8,0.1,CL0361


In [97]:
dfind = df.set_index('seqid')
seq_zf = [dfind.loc[row['seq id']]['seq'][row['envelope start']:row['envelope end']] for ind, row in zfs.iterrows()]

In [307]:
seq_zf[:5]

['KCKECGKAFKRSSNLTEHRIIH',
 'KCEECGKAFNTSSHLTTHKRIH',
 'ICEHCGRAFNQSSNLTKHKRIH',
 'KCKECGKAFNQSSTLTRHKIIH',
 'KCEECGKAFNVSSTLTQHKRIH']

In [147]:
pfams_ordered = df_pfam_filtered['hmm acc'].value_counts()
pfams_ordered = pfams_ordered.reset_index()

In [180]:
multiple_domains = set(pfams_ordered[pfams_ordered['hmm acc'] > 1]['index'])

In [189]:
df_multiple = df_pfam_filtered[df_pfam_filtered['hmm acc'].isin(multiple_domains)]

In [190]:
seqs = []
for ind, (seqid, seq) in df.iterrows():
    pfam = df_multiple[df_multiple['seq id']==seqid]
    if pfam.shape[0] == 0:
        seqs.append(seq)
    else:
        start = 0
        pfam = pfam.sort_values('envelope start')
        for ind, row in pfam.iterrows():
            end = row['envelope start']
            seqs.append(seq[start:end])
            start = row['envelope end']
        end = row['envelope end']
        seqs.append(seq[end:])
with open('data/human_filtered.txt', 'w') as f:
    for seq in seqs:
        f.write(seq + '\n')

In [191]:
seqs = []
for ind, (seqid, seq) in df.iterrows():
    pfam = df_multiple[df_multiple['seq id']==seqid]
    if pfam.shape[0] == 0:
        seqs.append(seq)
    else:
        newseq = ''
        start = 0
        pfam = pfam.sort_values('envelope start')
        for ind, row in pfam.iterrows():
            end = row['envelope start']
            newseq += seq[start:end]
            newseq += 'X'*(row['envelope end']-row['envelope start'])
            start = row['envelope end']
        end = row['envelope end']
        newseq += seq[end:]
        seqs.append(newseq)
with open('data/human_replaced.txt', 'w') as f:
    for seq in seqs:
        f.write(seq + '\n')

In [203]:
zincfingers = set(df_pfam_filtered[df_pfam_filtered['hmm acc'] == 'PF00096']['seq id'].unique())

In [204]:
df_nozf = df[~df['seqid'].isin(zincfingers)]
df_nozf.to_csv('data/human_nozf.csv')

In [286]:
pfam_ordered_nproteins = df_pfam_filtered.groupby(['seq id', 'hmm acc']).mean().reset_index()['hmm acc'].value_counts()
pfam_ordered_nproteins = pfam_ordered_nproteins.reset_index()

In [296]:
# keep one random protein per family
remove = set()
for pfam in pfam_ordered_nproteins[pfam_ordered_nproteins['hmm acc'] > 1]['index']:
    family = set(df_pfam_filtered[df_pfam_filtered['hmm acc'] == pfam]['seq id'].unique())
    family = family - set(random.sample(family, 1))
    remove = remove | family
len(remove)

15501

In [297]:
df_domains = df[~df['seqid'].isin(remove)]
df_domains.to_csv('data/human_uniquedomains.csv')

In [125]:
seqid = "A8MTY0"
seq = dfind.loc['A8MTY0']
pfam = df_pfam[df_pfam['seq id']==seqid]
pfam.sort_values('envelope start')

Unnamed: 0,seq id,alignment start,alignment end,envelope start,envelope end,hmm acc,hmm name,type,hmm start,hmm end,hmm length,bit score,E-value,clan
16584,A8MTY0,3,44,3,44,PF01352,KRAB,PfamLive::Result::SequenceOntology=HASH(0xe0bf...,1,42,42,78.8,4.2e-19,No_clan
16596,A8MTY0,170,192,170,192,PF00096,zf-C2H2,PfamLive::Result::SequenceOntology=HASH(0x89b8...,1,23,23,20.8,0.94,CL0361
16597,A8MTY0,226,248,226,248,PF00096,zf-C2H2,PfamLive::Result::SequenceOntology=HASH(0x89b8...,1,23,23,20.6,1.1,CL0361
16588,A8MTY0,282,304,282,304,PF00096,zf-C2H2,PfamLive::Result::SequenceOntology=HASH(0x89b8...,1,23,23,25.2,0.037,CL0361
16587,A8MTY0,310,332,310,332,PF00096,zf-C2H2,PfamLive::Result::SequenceOntology=HASH(0x89b8...,1,23,23,26.4,0.016,CL0361
16591,A8MTY0,338,360,338,360,PF00096,zf-C2H2,PfamLive::Result::SequenceOntology=HASH(0x89b8...,1,23,23,23.5,0.13,CL0361
16592,A8MTY0,366,388,366,388,PF00096,zf-C2H2,PfamLive::Result::SequenceOntology=HASH(0x89b8...,1,23,23,23.5,0.13,CL0361
16586,A8MTY0,394,416,394,416,PF00096,zf-C2H2,PfamLive::Result::SequenceOntology=HASH(0x89b8...,1,23,23,27.0,0.0097,CL0361
16594,A8MTY0,422,444,422,444,PF00096,zf-C2H2,PfamLive::Result::SequenceOntology=HASH(0x89b8...,1,23,23,22.0,0.39,CL0361
16585,A8MTY0,450,472,450,472,PF00096,zf-C2H2,PfamLive::Result::SequenceOntology=HASH(0x89b8...,1,23,23,28.0,0.0047,CL0361


## Olfactory receptors

In [301]:
df_pfam_filtered[df_pfam_filtered['hmm acc'] == 'PF13853'].head()

Unnamed: 0,seq id,alignment start,alignment end,envelope start,envelope end,hmm acc,hmm name,type,hmm start,hmm end,hmm length,bit score,E-value,clan
4798,A0A096LPK9,33,303,31,305,PF13853,7tm_4,PfamLive::Result::SequenceOntology=HASH(0x107e...,3,278,280,151.2,1.2e-40,CL0192
9231,A0A0G2JMP0,32,305,31,307,PF13853,7tm_4,PfamLive::Result::SequenceOntology=HASH(0x107e...,2,278,280,130.1,3.2e-34,CL0192
9443,A0A0G2JNH3,33,303,31,305,PF13853,7tm_4,PfamLive::Result::SequenceOntology=HASH(0x107e...,3,278,280,147.6,1.5e-39,CL0192
11021,A0A0X1KG70,32,305,31,307,PF13853,7tm_4,PfamLive::Result::SequenceOntology=HASH(0x107e...,2,278,280,128.8,8.3e-34,CL0192
14495,A0A286YEU6,34,304,30,307,PF13853,7tm_4,PfamLive::Result::SequenceOntology=HASH(0x107e...,4,276,280,124.2,2.1e-32,CL0192


In [298]:
seqs = [dfind.loc[row['seq id']]['seq'][row['envelope start']:row['envelope end']]
        for ind, row in df_pfam_filtered[df_pfam_filtered['hmm acc'] == 'PF13853'].iterrows()]