In [1]:
import numpy as np
import matplotlib.pylab as plt
from scipy import stats
from scipy.spatial.distance import pdist,squareform
import pandas as pd
import os
import itertools
import glob
import math
import time

def return_msa_parsed(align_file):
    """Parse a FASTA-style alignment file (``>`` header lines followed by sequence lines).

    Parameters
    ----------
    align_file : str
        Path of the alignment file to read.

    Returns
    -------
    msa_df : pandas.DataFrame
        One row per sequence with two columns: ``species`` (categorical; the
        second ``_``-separated field of the header text that precedes the
        first ``/``) and ``seqs`` (integer position of the sequence, usable
        as an index into ``sequence`` and ``uid``).
    sequence : list of str
        Sequence strings; lines wrapped over several rows are joined.
    uid : list of str
        Header lines without the leading ``>``.

    Raises
    ------
    IndexError
        If a header has no ``_`` before the first ``/``, or if sequence data
        appears before the first header.
    """
    uid = []
    spec = []
    fragments = []  # one list of raw sequence lines per header
    # Context manager guarantees the handle is closed even if parsing fails.
    with open(align_file, "r") as handle:
        for line in handle:
            line = line.rstrip()
            if not line:
                # Blank lines (e.g. a trailing newline) carry no data; indexing
                # line[0] on them would raise IndexError.
                continue
            if line[0] == ">":
                header = line[1:]
                uid.append(header)
                spec.append(header.split('/')[0].split('_')[1])
                fragments.append([])
            else:
                fragments[-1].append(line)
    sequence = [''.join(parts) for parts in fragments]
    print('parsing finished for {} sequences'.format(len(uid)))
    msa_df = pd.DataFrame()
    msa_df['species']=pd.Series(spec,dtype='category')
    # np.int was only an alias of the builtin int and was removed in NumPy 1.24.
    msa_df['seqs']=np.arange(len(uid),dtype=int)
    print('pd initialized')
    return msa_df,sequence,uid

def _seqs_by_species(msa_df):
    """Map each species label to the list of its ``seqs`` values, in row order."""
    grouped = {}
    for label, seq_idx in zip(msa_df['species'].tolist(), msa_df['seqs'].tolist()):
        grouped.setdefault(label, []).append(seq_idx)
    return grouped


def write_concat_a2m(msa1_df,sequence_1,uid1,msa2_df,sequence_2,uid2,filename):
    """Write every same-species pairing of two alignments as concatenated records.

    For each species present in both ``msa1_df`` and ``msa2_df`` (visited in
    sorted order), every sequence of that species from alignment 1 is paired
    with every sequence of that species from alignment 2. Each pair is written
    as a header line ``>uid1-uid2`` followed by the two sequences joined.

    Parameters
    ----------
    msa1_df, msa2_df : pandas.DataFrame
        Frames with a ``species`` column and a ``seqs`` column of integer
        indices into the matching ``sequence_*`` / ``uid*`` lists.
    sequence_1, sequence_2 : list of str
        Sequence strings of alignment 1 and alignment 2.
    uid1, uid2 : list of str
        Record identifiers of alignment 1 and alignment 2.
    filename : str
        Output path; an existing file is overwritten.
    """
    # Group once per frame instead of re-filtering both full frames with a
    # boolean mask for every species (which costs O(species * rows)).
    by_species_1 = _seqs_by_species(msa1_df)
    by_species_2 = _seqs_by_species(msa2_df)
    # Sorted intersection: same visiting order as np.intersect1d on the labels.
    species_list = sorted(set(by_species_1) & set(by_species_2))
    print('total number of species:{}'.format(len(species_list)))
    start = time.time()
    with open(filename,'w') as f:
        for spec in species_list:
            partners = by_species_2[spec]
            for i in by_species_1[spec]:
                # Hoisted: these depend only on the alignment-1 member.
                head_1 = '>'+uid1[i]+'-'
                seq_1 = sequence_1[i]
                for j in partners:
                    f.write(head_1+uid2[j]+'\n')
                    f.write(seq_1+sequence_2[j]+'\n')
    print('time elapsed: {} seconds'.format(time.time()-start))

In [3]:
# Load the table of complexes to process. header=None tells pandas the CSV has
# no header row, so columns are labelled with integers rather than names.
# NOTE(review): the concatenation loop later in this notebook reads the second
# column as the complex identifier (e.g. 'allpdb0027') — confirm against the CSV.
hpc_df = pd.read_csv('low_precision_complexes.csv',header=None)
# Bare last expression so the notebook renders the frame as the cell output.
hpc_df

Unnamed: 0,0,1
0,7,allpdb0027
1,22,allpdb0055
2,28,allpdb0068
3,38,allpdb0084
4,39,allpdb0086
5,48,allpdb0120
6,53,allpdb0127
7,64,allpdb0161
8,66,allpdb0171
9,69,allpdb0174


In [4]:
# For every complex listed in hpc_df: locate its two monomer alignments, parse
# them, and write the species-matched concatenation under benchmark/<complex id>/.
for complex_id in hpc_df.iloc[:, 1]:
    # glob returns matches in arbitrary filesystem order; sort so that the
    # assignment of monomer 1 / monomer 2 is reproducible between runs.
    aligns = sorted(glob.glob('/home/as974/marks/users/kbrock/ecoli_complex/calibration/output/'+complex_id+'/align*'))
    if len(aligns) < 2:
        # Fail with a clear message instead of an opaque IndexError on aligns[1].
        raise FileNotFoundError(
            'expected two align* directories for {}, found {}'.format(complex_id, len(aligns)))
    align_1 = aligns[0]+'/'+complex_id+'.a2m'
    align_2 = aligns[1]+'/'+complex_id+'.a2m'
    file_dir = 'benchmark/'+complex_id
    # Creates missing parents and tolerates an existing directory; any other
    # failure (e.g. permissions) now surfaces instead of being reported as
    # "already exists" by a bare except.
    os.makedirs(file_dir, exist_ok=True)
    msa1_df,sequence_1,uid1 = return_msa_parsed(align_1)
    msa2_df,sequence_2,uid2 = return_msa_parsed(align_2)
    print('finished parsing individual alignments')
    filename = file_dir+'/concatenation_fixed.a2m'
    write_concat_a2m(msa1_df,sequence_1,uid1,msa2_df,sequence_2,uid2,filename)
    print('finished concatenating alignments')
    # Drop the per-complex frames before the next iteration.
    del msa1_df
    del msa2_df

parsing finished for 9787 sequences
pd initialized
parsing finished for 126458 sequences
pd initialized
finished parsing individual alignments
total number of species:1607
time elapsed: 16.308589696884155 seconds
finished concatenating alignments
parsing finished for 2387 sequences
pd initialized
parsing finished for 1432 sequences
pd initialized
finished parsing individual alignments
total number of species:711
time elapsed: 0.8636472225189209 seconds
finished concatenating alignments
parsing finished for 27331 sequences
pd initialized
parsing finished for 5137 sequences
pd initialized
finished parsing individual alignments
total number of species:776
time elapsed: 2.12134051322937 seconds
finished concatenating alignments
parsing finished for 49430 sequences
pd initialized
parsing finished for 14029 sequences
pd initialized
finished parsing individual alignments
total number of species:574
time elapsed: 3.494563341140747 seconds
finished concatenating alignments
parsing finished for 

time elapsed: 326.40416526794434 seconds
finished concatenating alignments
parsing finished for 236541 sequences
pd initialized
parsing finished for 33175 sequences
pd initialized
finished parsing individual alignments
total number of species:2314
time elapsed: 462.9214539527893 seconds
finished concatenating alignments
parsing finished for 73274 sequences
pd initialized
parsing finished for 1487 sequences
pd initialized
finished parsing individual alignments
total number of species:784
time elapsed: 4.061470031738281 seconds
finished concatenating alignments
parsing finished for 3612 sequences
pd initialized
parsing finished for 127026 sequences
pd initialized
finished parsing individual alignments
total number of species:970
time elapsed: 5.6829612255096436 seconds
finished concatenating alignments
parsing finished for 163423 sequences
pd initialized
parsing finished for 1337 sequences
pd initialized
finished parsing individual alignments
total number of species:683
time elapsed: 1.0