In [7]:
import numpy as np
import tensorflow as tf
import matplotlib.pylab as plt
from scipy import stats
from scipy.spatial.distance import pdist,squareform
import pandas as pd
import os
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster

from keras.callbacks import TensorBoard
tbCallBack = TensorBoard(log_dir='./log', histogram_freq=1,
                         write_graph=True,
                         write_grads=True,
                         batch_size=None,
                         write_images=True)

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  return f(*args, **kwds)
Using TensorFlow backend.


In [1]:
################
# note: if you are modifying the alphabet
# make sure last character is "-" (gap)
################
alphabet = "ARNDCQEGHILKMFPSTWYV-"
states = len(alphabet)
a2n = {}
for a,n in zip(alphabet,range(states)):
    a2n[a] = n
################

def aa2num(aa):
    '''convert aa into num'''
    if aa in a2n: return a2n[aa]
    else: return a2n['-']
def num2aa(num):
    '''convert num to aa'''
    return alphabet[num]
    
# from fasta
def parse_fasta(filename):
    '''function to parse fasta'''
    header = []
    sequence = []
    lines = open(filename, "r")
    for line in lines:
        line = line.rstrip()
        if line[0] == ">":
            header.append(line[1:])
            sequence.append([])
        else:
            sequence[-1].append(line)
    lines.close()
    sequence = [''.join(seq) for seq in sequence]
    return np.array(header), np.array(sequence)

def filt_gaps(msa,gap_cutoff=0.5):
    '''filters alignment to remove gappy positions'''
    tmp = (msa == states-1).astype(np.float)
    non_gaps = np.where(np.sum(tmp.T,-1).T/msa.shape[0] < gap_cutoff)[0]
    return msa[:,non_gaps],non_gaps

def get_eff(msa,eff_cutoff=0.8):
    '''compute effective weight for each sequence'''
    ncol = msa.shape[1]

    # pairwise identity
    msa_sm = 1.0 - squareform(pdist(msa,"hamming"))

    # weight for each sequence
    msa_w = (msa_sm >= eff_cutoff).astype(np.float)
    msa_w = 1/np.sum(msa_w,-1)

    return msa_w

def cluster(msa, method='ward', threshold=0.8):
    Z = linkage(1-squareform(pdist(msa,"hamming")),method) 
    return fcluster(Z,threshold,criterion='distance')

def mk_msa(seqs):
    '''converts list of sequences to msa'''

    msa_ori = []
    for seq in seqs:
        msa_ori.append([aa2num(aa) for aa in seq])
    msa_ori = np.array(msa_ori)

    # remove positions with more than > 50% gaps
    msa, v_idx = filt_gaps(msa_ori,0.5)
    
    msa_cluster = cluster(msa)
    print('original number of sequences:{}\nclustered number of sequences:{}'.format(len(msa_cluster),len()))
    msa_new  = np.zeros([len(np.unique(msa_cluster)),len(msa[0,:])])
    index_once = np.zeros([len(np.unique(msa_cluster))])
    for tick,clus in enumerate(np.unique(msa_cluster)):
        index_once[tick] = msa_cluster.tolist().index(clus)
    msa_new = msa[index_once.astype(int),:]
    # compute effective weight for each sequence
    msa_weights = get_eff(msa_new,0.8)

    # compute effective number of sequences
    ncol = msa_new.shape[1] # length of sequence
    w_idx = v_idx[np.stack(np.triu_indices(ncol,1),-1)]
    
    
    return {"msa_ori":msa_ori,
          "msa":msa_new,
          "weights":msa_weights,
          "neff":np.sum(msa_weights),
          "v_idx":v_idx,
          "w_idx":w_idx,
          "nrow":msa.shape[0],
          "ncol":ncol,
          "ncol_ori":msa_ori.shape[1],
          "cluster":msa_cluster}


'A'

In [87]:
# useless shit
# def parse_fasta_new(file):
#     with open(file) as f:
#         lines = f.readlines()
#     line_numbers = [i for i in range(len(lines)) if '>' in lines[i]]
#     lines = list(map(lambda x: x.replace("\n", "").replace(">",""),lines))
#     return np.array(lines)[line_numbers], np.array(lines)[~np.array(line_numbers)]
    

In [88]:
%%time
names, seqs = parse_fasta_new("4FAZA.fas")

CPU times: user 3.67 ms, sys: 1.5 ms, total: 5.17 ms
Wall time: 3.82 ms


In [89]:
%%time
names, seqs = parse_fasta("4FAZA.fas")

CPU times: user 2.61 ms, sys: 1.29 ms, total: 3.9 ms
Wall time: 2.78 ms


In [90]:
msa = mk_msa(seqs)

In [91]:
msa['msa'].shape

(716, 61)

In [92]:
names_1, seqs_1 = parse_fasta_new("allpdb0148/align_1/allpdb0148.a2m")
msa_1 = mk_msa(seqs_1)

In [14]:
names_2, seqs_2 = parse_fasta("allpdb0148/align_2/allpdb0148.a2m")
msa_2 = mk_msa(seqs_2)

In [33]:
msa_1['cluster']

array([ 664, 1097,  649, ..., 1363, 1379,   62], dtype=int32)

In [94]:
msa_2['msa']

array([[ 5, 10, 11, ..., 10,  6,  6],
       [ 5, 12,  1, ..., 10,  0,  5],
       [ 5, 12,  1, ..., 10,  0,  5],
       ...,
       [20, 20, 20, ..., 13,  6,  3],
       [20, 20, 20, ..., 10, 15, 15],
       [20, 20, 20, ..., 20, 20, 20]])

In [3]:
from itertools import permutations, repeat
with open('concat.txt') as f:
    for clus_num_1,seq_1 in enumerate(msa_1['msa']):
        clus_name_1 = 'cluster1_no_'+str(clus_num_1)
        for clus_num_2,seq_2 in msa_2['msa']:
            clus_name_2 = 'cluster2_no_'+str(clus_num_2)
            f.write('>'clus_name_1+'_'+clus_name_2+'/1-'+str(len(seq_1)+len(seq_2))

SyntaxError: invalid syntax (<ipython-input-3-773e0cd78e98>, line 5)

In [121]:
header = []
sequence = []
lines = open("allpdb0148/align_1/allpdb0148.a2m", "r")
for line in lines:
    line = line.rstrip()
    if line[0] == ">":
        header.append(line[1:])
        sequence.append([])
    else:
        sequence[-1].append(line)
lines.close()
sequence = [''.join(seq) for seq in sequence]

msa1_df = pd.DataFrame()
msa1_df['full_names']=np.array(header)
msa1_df['seqs']=np.array(sequence)
msa1_df['names']=msa1_df.full_names.str.split('/',expand=True)[0]
msa1_df['region'] = msa1_df.full_names.str.split('/',expand=True)[1]
msa1_df['species']=msa1_df.names.str.split('_',expand=True)[1]
#msa1_df['monomer'] = np.ones([len(np.array(header))],dtype = np.int)
msa1_df.head()

Unnamed: 0,full_names,seqs,names,region,species
0,SGF11_YEAST/1-99,mteetitidsisngilnnllttliqDIVARETTQQQLLKTRYPDLR...,SGF11_YEAST,1-99,YEAST
1,tr|A0A1I8HZI1|A0A1I8HZI1_9PLAT/121-201,.........................------------QQQQQQQQK...,tr|A0A1I8HZI1|A0A1I8HZI1_9PLAT,121-201,9PLAT
2,tr|A0A1I8HZI1|A0A1I8HZI1_9PLAT/269-323,.........................---------------QQPPTQ...,tr|A0A1I8HZI1|A0A1I8HZI1_9PLAT,269-323,9PLAT
3,tr|A0A1I8HZI1|A0A1I8HZI1_9PLAT/332-393,.........................---------PKHQRRPEPQKP...,tr|A0A1I8HZI1|A0A1I8HZI1_9PLAT,332-393,9PLAT
4,tr|A0A2C9JJT7|A0A2C9JJT7_BIOGL/328-380,.........................------------------ETN...,tr|A0A2C9JJT7|A0A2C9JJT7_BIOGL,328-380,BIOGL


In [122]:
header = []
sequence = []
lines = open("allpdb0148/align_2/allpdb0148.a2m", "r")
for line in lines:
    line = line.rstrip()
    if line[0] == ">":
        header.append(line[1:])
        sequence.append([])
    else:
        sequence[-1].append(line)
lines.close()
sequence = [''.join(seq) for seq in sequence]

msa2_df = pd.DataFrame()
msa2_df['full_names']=np.array(header)
msa2_df['seqs']=np.array(sequence)
msa2_df['names']=msa1_df.full_names.str.split('/',expand=True)[0]
msa2_df['region'] = msa1_df.full_names.str.split('/',expand=True)[1]
msa2_df['species']=msa1_df.names.str.split('_',expand=True)[1]
#msa2_df['monomer'] = np.ones([len(np.array(header))],dtype = np.int)*2
msa2_df.head()

Unnamed: 0,full_names,seqs,names,region,species
0,SUS1_YEAST/1-96,mtmdtaQLKSQIQQYLVESGNYELISNELKARLLQEGWVDKVKDLT...,SGF11_YEAST,1-99,YEAST
1,tr|A0A091DY92|A0A091DY92_FUKDA/3-92,...kdaQMRAAINQKLIETGERERLKELLRAKLIECGWKDQLKAHC...,tr|A0A1I8HZI1|A0A1I8HZI1_9PLAT,121-201,9PLAT
2,tr|A0A1S3EU27|A0A1S3EU27_DIPOR/3-92,...kdaQMRAAINQKLIETGERERLKELLRAKLIECGWKDQLKAHC...,tr|A0A1I8HZI1|A0A1I8HZI1_9PLAT,269-323,9PLAT
3,tr|A0A1S3WCK5|A0A1S3WCK5_ERIEU/3-92,...kdaQMRAAINQKLIETGERERLKELLRAKLIECGWKDQLKAHC...,tr|A0A1I8HZI1|A0A1I8HZI1_9PLAT,332-393,9PLAT
4,tr|A0A1U7TT05|A0A1U7TT05_TARSY/3-92,...kdaQMRAAINQKLIETGERERLKELLRAKLIECGWKDQLKAHC...,tr|A0A2C9JJT7|A0A2C9JJT7_BIOGL,328-380,BIOGL


In [123]:
species_list = np.unique(np.concatenate((msa2_df['species'].values,msa1_df['species'].values)))
species_list = np.unique(msa2_df['species'])

In [124]:
np.setdiff1d(msa2_df['species'].values,msa1_df['species'].values)

array([], dtype=object)

In [125]:
import itertools

In [142]:
with open('concat.a2m','w+') as f:
    for spec in species_list:
        for prod in (itertools.product(msa1_df[msa1_df['species']==spec]['full_names'].values,msa2_df[msa2_df['species']==spec]['full_names'].values)):
            f.write('>'+msa1_df[msa1_df['full_names']==prod[0]]['names'].values[0]+'-'+msa2_df[msa2_df['full_names']==prod[1]]['names'].values[0]+'/'+msa1_df[msa1_df['full_names']==prod[0]]['region'].values[0]+'-'+msa2_df[msa2_df['full_names']==prod[1]]['region'].values[0]+'\n')
            f.write(msa1_df[msa1_df['full_names']==prod[0]]['seqs'].values[0]+msa2_df[msa2_df['full_names']==prod[1]]['seqs'].values[0]+'\n')

In [136]:
msa1_df[msa1_df['full_names']==prod[1]]['seqs'].values[0]

IndexError: index 0 is out of bounds for axis 0 with size 0

In [140]:
msa2_df[msa2_df['full_names']==prod[1]]['seqs'].values[0]


'......RLTQSVEKQFVESGEQQRIVDTLMKRLKESGWEDEVKKMVHKIIKEKDGATAEAIFEELKTPSRRLVSNETKQEVYQMIRKFVADqm...'