In [1]:
import numpy as np
import matplotlib.pylab as plt
from scipy import stats
from scipy.spatial.distance import pdist,squareform
import pandas as pd
import os
import itertools
import glob

In [2]:
# from fasta
def parse_fasta(filename):
    '''Parse a FASTA-style file into parallel arrays of headers and sequences.

    Lines starting with ">" open a new record; the text after ">" is stored as
    the header. Every following non-header line is appended to that record,
    and the pieces are joined into one string per record. Blank lines are
    ignored.

    Parameters
    ----------
    filename : str
        Path of the file to read.

    Returns
    -------
    tuple of (numpy.ndarray, numpy.ndarray)
        Headers (without the leading ">") and the matching joined sequences,
        in file order.

    Raises
    ------
    IndexError
        If sequence text appears before the first ">" header line.
    '''
    header = []
    sequence = []
    # The context manager closes the handle even if parsing fails part-way.
    with open(filename, "r") as lines:
        for line in lines:
            line = line.rstrip()
            if not line:
                # Blank (or whitespace-only) lines carry no data, and indexing
                # line[0] on them would raise IndexError.
                continue
            if line[0] == ">":
                header.append(line[1:])
                sequence.append([])
            else:
                sequence[-1].append(line)
    sequence = [''.join(seq) for seq in sequence]
    return np.array(header), np.array(sequence)

In [3]:
# header=None: the first line of the CSV is read as data and the columns are
# labelled with integers (0, 1, ...).
hpc_df = pd.read_csv(filepath_or_buffer='high_precision_complexes.csv', header=None)

In [4]:
def return_msa_parsed(align_file):
    '''Read a FASTA-style alignment file into a DataFrame, one row per record.

    Headers are split as ``<names>/<region>`` and ``names`` is further split
    on "_" with the second piece stored as ``species``
    (e.g. "CYB_BOVIN/1-379" -> names "CYB_BOVIN", region "1-379",
    species "BOVIN").

    Parameters
    ----------
    align_file : str
        Path of the alignment file to read.

    Returns
    -------
    pandas.DataFrame
        Columns, in order: ``full_names`` (header text after ">"), ``seqs``
        (all sequence lines of the record joined), ``names``, ``region``,
        ``species``. ``region`` / ``species`` are missing (NaN) for headers
        that contain no "/" / "_" respectively. An empty file yields an
        empty frame with the same columns.

    Raises
    ------
    IndexError
        If sequence text appears before the first ">" header line.
    '''
    header = []
    sequence = []
    # The context manager closes the handle even if parsing fails part-way.
    with open(align_file, "r") as lines:
        for line in lines:
            line = line.rstrip()
            if not line:
                # Blank lines carry no data; line[0] would raise IndexError.
                continue
            if line[0] == ">":
                header.append(line[1:])
                sequence.append([])
            else:
                sequence[-1].append(line)
    sequence = [''.join(seq) for seq in sequence]

    # Explicit string dtype keeps the .str accessor usable when the file
    # contained no records (an empty column would otherwise be float64).
    msa_df = pd.DataFrame({
        'full_names': pd.Series(header, dtype=str),
        'seqs': pd.Series(sequence, dtype=str),
    })

    # Split each header once. .str.get() yields NaN for a missing piece,
    # where indexing the expand=True result raised KeyError if no header
    # contained the separator at all.
    name_region = msa_df['full_names'].str.split('/')
    msa_df['names'] = name_region.str.get(0)
    msa_df['region'] = name_region.str.get(1)
    msa_df['species'] = msa_df['names'].str.split('_').str.get(1)
    return msa_df

In [5]:
def _rows_by_species(msa_df):
    '''Group (names, region, seqs) tuples by species, keeping row order.'''
    grouped = {}
    for spec, name, region, seq in zip(msa_df['species'], msa_df['names'],
                                       msa_df['region'], msa_df['seqs']):
        if pd.isna(spec):
            # Rows without a species cannot be paired with anything.
            continue
        grouped.setdefault(spec, []).append((name, region, seq))
    return grouped


def write_concat_a2m(msa1_df,msa2_df,filename):
    '''Write every same-species pairing of two alignments as concatenated records.

    For each species present in both frames (visited in sorted order), every
    row of ``msa1_df`` is paired with every row of ``msa2_df`` of that
    species. Each pair is written as two lines::

        >{names1}-{names2}/{region1}-{region2}
        {seqs1}{seqs2}

    Parameters
    ----------
    msa1_df, msa2_df : pandas.DataFrame
        Frames with ``species``, ``names``, ``region`` and ``seqs`` columns.
        Rows whose ``species`` is missing are skipped.
    filename : str
        Output path; an existing file is overwritten.

    Returns
    -------
    None
    '''
    # Collect each row's fields once. Looking rows up again by 'full_names'
    # inside the pairing loop rescanned both frames for every pair, and it
    # returned the first match's sequence for every row sharing a header.
    rows_1 = _rows_by_species(msa1_df)
    rows_2 = _rows_by_species(msa2_df)
    # A species found in only one frame has no pairs, so only the
    # intersection can produce output.
    shared_species = sorted(set(rows_1) & set(rows_2))
    with open(filename,'w') as f:
        for spec in shared_species:
            for (name_1, region_1, seq_1), (name_2, region_2, seq_2) in itertools.product(rows_1[spec], rows_2[spec]):
                f.write('>'+name_1+'-'+name_2+'/'+region_1+'-'+region_2+'\n')
                f.write(seq_1+seq_2+'\n')

In [None]:
# Build one concatenated alignment per complex listed in hpc_df.
# NOTE(review): the input root is a hard-coded absolute path; consider moving
# it to a configuration cell so the notebook can run on another machine.
calibration_dir = '/home/as974/marks/users/kbrock/ecoli_complex/calibration/output/'
for _, row in hpc_df.iterrows():
    # The second column's value is used as the input sub-directory name, the
    # .a2m file stem and the output sub-directory name.
    complex_id = row.values[1]

    # glob returns matches in arbitrary order; sort so that the same
    # directory always supplies the first half of the concatenation.
    aligns = sorted(glob.glob(calibration_dir+complex_id+'/align*'))
    if len(aligns) < 2:
        # Indexing aligns[1] would raise IndexError and abort the whole batch.
        print(complex_id + ' skipped: expected 2 align* directories, found ' + str(len(aligns)))
        continue
    align_1 = aligns[0]+'/'+complex_id+'.a2m'
    align_2 = aligns[1]+'/'+complex_id+'.a2m'

    file_dir = 'benchmark/'+complex_id
    if os.path.isdir(file_dir):
        print(complex_id + ' already exists')
    else:
        # makedirs also creates 'benchmark/' when it is missing; any other
        # failure (e.g. permissions) now surfaces instead of being reported
        # as "already exists" by a bare except.
        os.makedirs(file_dir)

    msa1_df = return_msa_parsed(align_1)
    msa2_df = return_msa_parsed(align_2)
    # Show the first record of each alignment as a quick sanity check.
    print(msa1_df.loc[0,:])
    print(msa2_df.loc[0,:])
    file_name = file_dir+'/concat.a2m'
    write_concat_a2m(msa1_df,msa2_df,file_name)
    # Release the parsed alignments before loading the next complex.
    del msa1_df
    del msa2_df

allpdb0042 already exists
full_names                                      CYB_BOVIN/1-379
seqs          mtnirksHPLMKIVNNAFIDLPAPSNISSWWNFGSLLGICLILQIL...
names                                                 CYB_BOVIN
region                                                    1-379
species                                                   BOVIN
Name: 0, dtype: object
full_names                                      CY1_BOVIN/1-325
seqs          maaaaatlrgamvgprgaglpgarargllcgarpgqlplrtpqavs...
names                                                 CY1_BOVIN
region                                                    1-325
species                                                   BOVIN
Name: 0, dtype: object
