In [1]:
%matplotlib inline

# Notes

This is a "messy" scratch notebook that I keep around for one-off tasks that warrant documentation but are neither complicated nor especially important. It can safely be ignored by everyone except me.

# Basic imports

In [28]:
import ete3

import glob
import pandas as pd

from Bio import PDB
from Bio.PDB.Polypeptide import is_aa, three_to_one

# Read in the tree and extract the lineage of interest

In [29]:
# Load the phylogeny from the PAML output file. NOTE(review): format=1 is, per the
# ete3 docs, the Newick flavour that keeps internal node names -- confirm; the
# lineage printed further down does show named internal nodes ('801', '800', ...).
my_tree = ete3.Tree('../Data/Sequences_and_phylogeny/IF2_423_tree_paml_output', format=1)

In [30]:
# Find the one leaf whose name mentions E. coli; anything other than exactly
# one hit means the tree is not what this notebook expects.
ecoli_leaf_names = [leaf.name for leaf in my_tree.get_leaves() if 'E.coli' in leaf.name]
assert len(ecoli_leaf_names) == 1
(name,) = ecoli_leaf_names
print('E. coli name in tree:\t', name)
print()

# Walk from that leaf up to the root, recording each node's name and its
# distance from the root as reported by my_tree.get_distance().
lineage, root_dists = [], []
node = my_tree.search_nodes(name=name)[0]
while node is not None:
    lineage.append(node.name)
    root_dists.append(my_tree.get_distance(node))
    node = node.up
print('E. coli lineage:\t', lineage)

E. coli name in tree:	 Gammaproteobacteria_E.coli_IF2

E. coli lineage:	 ['Gammaproteobacteria_E.coli_IF2', '801', '800', '799', '795', '794', '793', '792', '791', '790', '789', '788', '754', '750', '743', '719', '702', '670', '622', '558', '550', '536', '520', '424']


# Add in knowledge of which of those nodes we care about

In [31]:
# Collect the node numbers for which an 'IF2_Anc<number>' entry exists under
# ../Data/Structures.
rep_names = []
for struct_path in glob.glob('../Data/Structures/*'):
    name = struct_path.split('/')[-1]
    if name.startswith('IF2_Anc'):
        rep_names.append(name.split('Anc')[-1])

# One flag per lineage entry: the first entry (the E. coli leaf) is always
# False, every ancestor is True when it has a structure folder.
rep = [False] + [ancestor in rep_names for ancestor in lineage[1:]]

In [32]:
# Tabulate the lineage: node id, distance from the root, and whether a
# structure folder exists for that node.
df = pd.DataFrame(
    list(zip(lineage, root_dists, rep)),
    columns=['Node_id', 'Dist_to_root', 'representative'],
)
df

Unnamed: 0,Node_id,Dist_to_root,representative
0,Gammaproteobacteria_E.coli_IF2,4.159481,False
1,801,4.1407,False
2,800,4.08319,False
3,799,4.064128,False
4,795,3.961898,False
5,794,3.922597,False
6,793,3.887007,False
7,792,3.706455,False
8,791,3.665965,False
9,790,3.631085,False


# Get shortened version of ancestral sequences for template modeling

The shortened sequences are then written to FASTA files, one per protein family. This is done for both EFTU and IF2 here.

In [85]:
def extract_seq_from_pdb_file(pdb_file_loc):
    '''
    Read a PDB file and return the amino-acid sequence of every chain.

    Parameters
    ----------
    pdb_file_loc : str
        Path to the PDB file, passed straight to Bio.PDB.PDBParser.get_structure.

    Returns
    -------
    pandas.DataFrame
        One row per (model, chain) with columns 'Model_id', 'Chain_id' and
        'aa_sequence'. Residues that are not standard amino acids are written
        as 'X'. If the structure contains no chains the frame has zero rows
        but still carries the three named columns.
    '''
    ###Read in PDB
    pdb_parser = PDB.PDBParser()
    structure = pdb_parser.get_structure(" ", pdb_file_loc)

    rows = []
    for model in structure:
        for chain in model.get_chains():
            ###Enumerate the sequence
            seq_list = []
            for residue in chain:
                resname = residue.get_resname()
                if is_aa(resname, standard=True):
                    seq_list.append(three_to_one(resname))
                else:
                    # Anything that is not a standard residue becomes 'X'
                    seq_list.append('X')
            rows.append((model.id, chain.id, ''.join(seq_list)))

    # Passing `columns=` (rather than assigning df.columns afterwards) keeps
    # this working when `rows` is empty: a frame built from nothing has zero
    # columns, and assigning three names to it raises ValueError.
    return pd.DataFrame(rows, columns=['Model_id', 'Chain_id', 'aa_sequence'])

In [106]:
# Build {structure-folder name: sequence} for every IF2 ancestor folder and
# write the result out as FASTA.
seqs_dicty = {}
for struct_folder in glob.glob('../Data/Structures/IF2_Anc*'):
    all_seqs = []
    for infile_loc in glob.glob(struct_folder+'/*.pdb'):
        single_df = extract_seq_from_pdb_file(infile_loc)
        if single_df.shape[0] == 1:
            all_seqs.append(single_df.iloc[0]['aa_sequence'])
        else:
            # Was `prin(...)`, which raised NameError instead of reporting the
            # problem; the offending file is now named as well.
            print('Error', infile_loc)
    ###Make sure all examples gave the same result
    assert len(set(all_seqs)) == 1, \
        'Expected exactly one distinct sequence in {}, found {}'.format(struct_folder, len(set(all_seqs)))
    ###And add to the dictionary
    seqs_dicty[struct_folder.split('/')[-1]] = all_seqs[0]

with open('../Data/truncated_anc_seqs_IF2.fasta', 'w') as outfile:
    for seq_id, seq in seqs_dicty.items():
        outfile.write('>{}\n{}\n'.format(seq_id, seq))

In [107]:
# Build {structure-folder name: sequence} for every EFTU ancestor folder and
# write the result out as FASTA. Note that this rebinds `seqs_dicty`.
seqs_dicty = {}
for struct_folder in glob.glob('../Data/Structures/EFTU_Anc*'):
    all_seqs = []
    for infile_loc in glob.glob(struct_folder+'/*.pdb'):
        single_df = extract_seq_from_pdb_file(infile_loc)
        if single_df.shape[0] == 1:
            all_seqs.append(single_df.iloc[0]['aa_sequence'])
        else:
            # Was `prin(...)`, which raised NameError instead of reporting the
            # problem; the offending file is now named as well.
            print('Error', infile_loc)
    ###Make sure all examples gave the same result
    assert len(set(all_seqs)) == 1, \
        'Expected exactly one distinct sequence in {}, found {}'.format(struct_folder, len(set(all_seqs)))
    ###And add to the dictionary
    seqs_dicty[struct_folder.split('/')[-1]] = all_seqs[0]

with open('../Data/truncated_anc_seqs_EFTU.fasta', 'w') as outfile:
    for seq_id, seq in seqs_dicty.items():
        outfile.write('>{}\n{}\n'.format(seq_id, seq))

**And the E. coli IF2 and EFTU models**

In [130]:
# Template PDB files whose chain sequences are extracted below; the resulting
# frames are keyed by the file path itself.
infiles = [
    '../Data/Structures/Templates/1zo1.clean.pdb',
    '../Data/Structures/Templates/1efc.pdb',
    '../Data/Structures/Templates/3jcjF.pdb',
]
df_dict = {}
for infile in infiles:
    df_dict[infile] = df = extract_seq_from_pdb_file(infile)



In [131]:
# Sequence of the first chain row extracted from 1efc.pdb
df_dict['../Data/Structures/Templates/1efc.pdb']['aa_sequence'].iloc[0]

'TKPHVNVGTIGHVDHGKTTLTAAITTVLAKTYGGAARAFDQIDNAPEEKARGITINTSHVEYDTPTRHYAHVDCPGHADYVKNMITGAAQMDGAILVVAATDGPMPQTREHILLGRQVGVPYIIVFLNKCDMVDDEELLELVEMEVRELLSQYDFPGDDTPIVRGSALKALEGDAEWEAKILELAGFLDSYIPEPERAIDKPFLLPIEDVFSISGRGTVVTGRVERGIIKVGEEVEIVGIKETQKSTCTGVEMFRKLLDEGRAGENVGVLLRGIKREEIERGQVLAKPGTIKPHTKFESEVYILSKDEGGRHTPFFKGYRPQFYFRTTDVTGTIELPEGVEMVMPGDNIKMVVTLIHPIAMDDGLRFAIREGGRTVGAGVVAKVLSXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX'

In [133]:
# Sequence of the first chain row extracted from 1zo1.clean.pdb
df_dict['../Data/Structures/Templates/1zo1.clean.pdb']['aa_sequence'].iloc[0]

'EPRAPVVTIMGHVDHGKTSLLEYIRSTKVASGEAGGITQHIGAYHVETENGMITFLDTPGHAAFTSMRARGAQATDIVVLVVAADDGVMPQTIEAIQHAKAAQVPVVVAVNKIDKPEADPDRVKNELSQYGILPEEWGGESQFVHVSAKAGTGIDELLDAILLQAEVLELKAVRKGMASGAVIESFLDKGRGPVATVLVREGTLHKGDIVLCGFEYGRVRAMRNELGQEVLEAGPSIPVEILGLSGVPAAGDEVTVVRDEKKAREVALYRQGKFREVKLARQQKSKLENMFANMTEGEVHEVNIVLKADVQGSVEAISDSLLKLSTDEVKVKIIGSGVGGITETDATLAAASNAILVGFNVRADASARKVIEAESLDLRYYSVIYNLIDEVKAAMSGMLSPELKQQIIGLAEVRDVFKSPKFGAIAGCMVTEGVVKRHNPIRVLRDNVVIYEGELESLRRFKDDVNEVRNGMECGIGVKNYNDVRTGDVIEVFEIIEIQRT'

In [127]:
# Sequence of the first chain row extracted from 3jcjF.pdb
df_dict['../Data/Structures/Templates/3jcjF.pdb']['aa_sequence'].iloc[0]

'DTGAAAEPRAPVVTIMGHVDHGKTSLLDYIRSTKVASGEAGGITQHIGAYHVETENGMITFLDTPGHAAFTSMRARGAQATDIVVLVVAADDGVMPQTIEAIQHAKAAQVPVVVAVNKIDKPEADPDRVKNELSQYGILPEEWGGESQFVHVSAKAGTGIDELLDAILLQAEVLELKAVRKGMASGAVIESFLDKGRGPVATVLVREGTLHKGDIVLCGFEYGRVRAMRNELGQEVLEAGPSIPVEILGLSGVPAAGDEVTVVRDEKKAREVALYRQGKFREVKLARQQKSKLENMFANMTEGEVHEVNIVLKADVQGSVEAISDSLLKLSTDEVKVKIIGSGVGGITETDATLAAASNAILVGFNVRADASARKVIEAESLDLRYYSVIYNLIDEVKAAMSGMLSPELKQQIIGLAEVRDVFKSPKFGAIAGCMVTEGVVKRHNPIRVLRDNVVIYEGELESLRRFKDDVNEVRNGMECGIGVKNYNDVRTGDVIEVFEIIEIQRTIA'

503

**Remove ambiguous characters from modern seqs**

In [44]:
# Collect the lines of the modern-sequence FASTA with blank lines dropped and
# every 'X' in a sequence line replaced by '-'. Header lines ('>' first) are
# kept verbatim so an 'X' inside a record name is not touched.
listy = []
# The file is opened in a `with` block so the handle is closed once the lines
# have been read (the previous bare open(...).readlines() never closed it).
with open('../Data/Sequences_and_phylogeny/IF2_423_modern_sequences.fasta', 'r') as infile:
    for line in infile:
        if line[0] == '>':
            listy.append(line)
        elif line == '\n':
            pass
        else:
            listy.append(line.replace('X', '-'))

In [47]:
# Write the cleaned lines back out; each entry in listy already carries its
# own line ending, so they are written unchanged.
with open('../Data/Sequences_and_phylogeny/IF2_423_modern_sequences.clean.fasta', 'w') as outfile:
    outfile.writelines(listy)

**Doing some comparisons**

In [48]:
from Bio import SeqIO

In [58]:
# Pull the sequence of the record with id '520' out of the ancestral FASTA.
# The whole file is scanned; if several records matched, the last would win.
records = SeqIO.parse('../Data/Sequences_and_phylogeny/IF2_423_ancestral_sequences.fasta', 'fasta')
for record in records:
    if record.id != '520':
        continue
    my_seq = str(record.seq)

In [59]:
# NOTE(review): this is an absolute, user-specific path outside the repository,
# so the cell only runs on the original author's machine -- consider copying
# the file under ../Data and pointing here instead.
records = SeqIO.parse('/Users/adamhockenberry/Downloads/IF2_initial_test_N96,N112-ancestors_GRASP.fasta', 'fasta')
for record in records:
    # Every record id is echoed; only 'N96' is kept.
    print(record.id)
    if record.id != 'N96':
        continue
    grasp_seq = str(record.seq)

N96
N112


In [60]:
# Drop every '-' from the GRASP sequence, then compare the two lengths.
grasp_seq = ''.join(grasp_seq.split('-'))
print(len(grasp_seq), len(my_seq))

1028 698


In [66]:
# First 90 characters of each sequence, one per line, for eyeballing.
print(grasp_seq[:90], my_seq[:90], sep='\n')

MSKVRVYELAKELGMSSKELLEVLKDLGIEVKSHMSTLDEEEVEKIRDAFKKSKSKGAGAAPKKTTAKKPAKTEAKKPAPAEEAKEKPSE
MSKVRVYELAKKLGMSSKELLEVLKELGIEVKSHMSTLDEETVEVIRDLFEEEKPEEEKPPEAAEKEAKKNKKGNKKKKKGRKEKAEEEV


In [62]:
# Display the full sequence taken from the record with id '520'.
my_seq

'MSKVRVYELAKKLGMSSKELLEVLKELGIEVKSHMSTLDEETVEVIRDLFEEEKPEEEKPPEAAEKEAKKNKKGNKKKKKGRKEKAEEEVEEEEIKTIKIKPEEITLDELAEKLNVPPNEIIKKLFMKGIMLTINQTLSFEQAEQIAMEYGVLVEIEEEQKAEEEEEPEEVLETRWLELYEDEEEDLVPRPPVVTIMGHVDHGKTTLLDAIRKTNVAEKEAGGITQHIGAYQVEHNGKKITFIDTPGHEAFTEMRARGAQVTDIAILVVAADDGVMPQTIEAINHAKAANVPIIVAINKIDKPNANPDRVKQQLVSEYGLVPEEWGGDTIFVPISAKTGQGIDELLEMILLVAEMQELKANPDGRARGVIIESKLDKGLGPVATVIVQDGTLKVGDVFVAGSTYGKVRAMIDDKGRRVKEAGPSTPVEILGFEEVPDAGSTLYVVESEKQAREIAEKVKEKQEQEEQNRTKRHIRLEDLFKQMQEGEVKELNLILKADTMGSVEALKNSLEKLSNDEVEINIIHAGVGAITESDVMLASASDAIILGFNVRVDSKARKMAEKEGVEIRTYNIIYDLIDDIKKALEGMLEPEEKEEVLGQGEIKQVFKISKVGNIAGVQVTDGKVKRDAKVRILRNGVVIYDGKIESLKHYKDDVKEVAAGQECGIKLENFNDIKEGDILECYEMEEVKRTLEFNNNEN'

**E. coli seq**

In [70]:
# Pick out the record whose id contains 'E.coli' from the modern-sequence
# FASTA, echoing the id that matched.
records = SeqIO.parse('../Data/Sequences_and_phylogeny/IF2_423_modern_sequences.fasta', 'fasta')
for record in records:
    if 'E.coli' not in record.id:
        continue
    print(record.id)
    ecoli_seq = str(record.seq)

Gammaproteobacteria_E.coli_IF2


In [71]:
# Length of the E. coli record exactly as read from the FASTA; any '-'
# characters are still counted here (they are only stripped in the next cell).
len(ecoli_seq)

1995

In [75]:
# Show the E. coli sequence with every '-' removed (ecoli_seq itself is left as is).
ecoli_seq.translate(str.maketrans('', '', '-'))

'MTDVTIKTLAAERQTSVERLVQQFADAGIRKSADDSVSAQEKQTLIDHLNQKNSGPDKLTLQRKTRSTLNIPGTGGKSKSVQIEVRKKRTFVKRDPQEAERLAAEEQAQREAEEQARREAEESAKREAQQKAEREAAEQAKREAAEQAKREAAEKDKVSNQQDDMTKNAQAEKARREQEAAELKRKAEEEARRKLEEEARRVAEEARRMAEENKWTDNAEPTEDSSDYHVTTSQHARQAEDESDREVEGGRGRGRNAKAARAKKGNKHAESKADREEARAAVRGGKGGKRKGSSLQQGFQKPAQAVNRDVVIGETITVGELANKMAVKGSQVIKAMMKLGAMATINQVIDQETAQLVAEEMGHKVILRRENELEEAVMSDRDTGAAAEPRAPVVTIMGHVDHGKTSLLDYIRSTKVASGEAGGITQHIGAYHVETENGMITFLDTPGHAAFTSMRARGAQATDIVVLVVAADDGVMPQTIEAIQHAKAAQVPVVVAVNKIDKPEADPDRVKNELSQYGILPEEWGGESQFVHVSAKAGTGIDELLDAILLQAEVLELKAVRKGMASGAVIESFLDKGRGPVATVLVREGTLHKGDIVLCGFEYGRVRAMRNELGQEVLEAGPSIPVEILGLSGVPAAGDEVTVVRDEKKAREVALYRQGKFREVKLARQQKSKLENMFANMTEGEVHEVNIVLKADVQGSVEAISDSLLKLSTDEVKVKIIGSGVGGITETDATLAAASNAILVGFNVRADASARKVIEAESLDLRYYSVIYNLIDEVKAAMSGMLSPELKQQIIGLAEVRDVFKSPKFGAIAGCMVTEGVVKRHNPIRVLRDNVVIYEGELESLRRFKDDVNEVRNGMECGIGVKNYNDVRTGDVIEVFEIIEIQRTIA'

# Copy the best MODELLER result for each structure into the Results directory

In [117]:
from io import StringIO
import shutil
import os

In [124]:
# response_dict = {}
for base_dir in glob.glob('../Data/Structures/IF2_*')+glob.glob('../Data/Structures/EFTU_*')[:]:
    print(base_dir)
    if 'IF2_Anc559' in base_dir:
        continue
    results_dir = base_dir.replace('/Data/', '/Results/').replace('/Structures/', '/modeller/')
    if os.path.exists(results_dir):
        pass
    else:
        os.mkdir(results_dir)
    log_file = base_dir + '/model-single.log'
    with open(log_file, 'r') as infile:
        tempy = infile.readlines()[-6:]
    df = pd.read_csv(StringIO(''.join(tempy)), delim_whitespace=True, header=None)
    best_version = df[df[2]==df[2].min()].iloc[0][0]
    shutil.copy(base_dir + '/' + best_version, results_dir + '/model1.pdb')
#     my_file_loc = base_dir + '/' + best_version
#     my_id = base_dir.split('/')[-1]
#     print(best_version, my_id)

../Data/Structures/IF2_Anc670
../Data/Structures/IF2_Anc622
../Data/Structures/IF2_Anc536
../Data/Structures/IF2_Anc750
../Data/Structures/IF2_Anc702
../Data/Structures/IF2_Anc743
../Data/Structures/IF2_Anc788
../Data/Structures/IF2_Anc719
../Data/Structures/IF2_Anc559
../Data/Structures/IF2_Anc754
../Data/Structures/IF2_Anc550
../Data/Structures/IF2_Anc558
../Data/Structures/IF2_Anc520
../Data/Structures/EFTU_Anc168
../Data/Structures/EFTU_Anc262


In [None]:
# TODO: unfinished scratch cell. It previously held a dangling `shutil.`,
# which is a SyntaxError and stops a Restart & Run All at this point.