<html>
    <summary></summary>
         <div> <p></p> </div>
         <div style="font-size: 20px; width: 800px;"> 
              <h1>
               <left>Extract Individual Ferritin X-ray Crystal Structures and Amino Acid Sequences from POSA Structural Alignment</left>
              </h1>
              <p><left>============================================================================</left> </p>
<pre>May, 2025
Toroidal diffusion for structural phylogenetics
Clementine Yan, Walter Xie, Alex Popinga, Alexei Drummond
Notebook by: Alex Popinga
</pre>
         </div>

</html>

<details>
  <summary>Copyright info</summary>
</details>



(Input: one multi-structure alignment PDB file. Output: one PDB file per structure and one FASTA file per alignment.)

Define functions that split an alignment file into its individual ferritin structures, derive each structure's amino acid sequence, and write the structures to individual PDB files and the sequences to a single FASTA file.

In [None]:
import os
import re

def extract_ferritin_structures(input_pdb_file, id_suffix_pattern=r"-tran\.pdb$"):
    """Split a multi-structure alignment PDB file into per-structure outputs.

    The file is scanned record by record. A ``HEADER`` record opens a new
    structure; its ID is the last whitespace-separated field of that record
    with ``id_suffix_pattern`` removed. Records are collected up to and
    including the first ``TER`` record, and anything between that ``TER`` and
    the next ``HEADER`` is ignored. Every collected structure is written with
    ``write_pdb`` and its sequence is obtained with ``extract_sequence``.
    Once the whole file has been processed, all sequences are handed to
    ``write_fasta``.

    A structure that is still open when the next ``HEADER`` appears, or when
    the file ends, is written as well, so a missing ``TER`` does not lose data.

    Parameters
    ----------
    input_pdb_file : str
        Path of the alignment PDB file to split.
    id_suffix_pattern : str, optional
        Regular expression removed from the raw HEADER ID. The default strips
        a trailing "-tran.pdb" only, so an ID that ends in a plain ".pdb"
        keeps it and its output file is named "<id>.pdb.pdb". Pass e.g.
        ``r"(?:-tran)?\\.pdb$"`` to strip both forms; note that output files
        are named after the ID alone, so structures from different alignment
        files that share an ID will then overwrite each other.

    Returns
    -------
    None
    """
    # Read everything first so the input is closed before any output is opened.
    with open(input_pdb_file, 'r') as file:
        lines = file.readlines()

    sequences = {}
    current_structure = []
    structure_id = None

    def _emit(sid, records):
        # Write one structure to disk and remember its sequence under its ID.
        write_pdb(sid, records)
        sequences[sid] = extract_sequence(records)

    for line in lines:
        if line.startswith("HEADER"):  # Start of a new structure
            # The previous structure was never closed by TER: keep it anyway.
            if structure_id and current_structure:
                _emit(structure_id, current_structure)
            # Last word in HEADER is assumed to be the ID
            structure_id = re.sub(id_suffix_pattern, "", line.split()[-1])
            current_structure = [line]
        elif line.startswith("TER"):  # Termination of the current structure
            current_structure.append(line)
            if structure_id:
                _emit(structure_id, current_structure)
            current_structure = []
            structure_id = None
        elif structure_id:
            current_structure.append(line)

    # The file ended while a structure was still open (no closing TER).
    if structure_id and current_structure:
        _emit(structure_id, current_structure)

    write_fasta(input_pdb_file, sequences)

def write_pdb(structure_id, pdb_lines):
    """Write PDB records to ``<structure_id>.pdb`` and report the file name.

    Parameters
    ----------
    structure_id : str
        Identifier used as the file name stem; ".pdb" is appended to it.
    pdb_lines : iterable of str
        Records to write, each already carrying its own line ending.

    The file is created relative to the current working directory (unless
    ``structure_id`` itself contains a path) and replaces any existing file
    of the same name.
    """
    out_path = f"{structure_id}.pdb"
    with open(out_path, 'w') as handle:
        for record in pdb_lines:
            handle.write(record)
    print(f"Extracted: {out_path}")

def extract_sequence(pdb_lines):
    """Return the one-letter amino acid sequence of the given PDB records.

    Only ``ATOM`` records whose atom name is ``CA`` are considered, so each
    residue contributes at most one letter. A residue position is identified
    by chain ID, residue number and insertion code; further C-alpha records
    at an already seen position (e.g. alternate conformers) are skipped.
    Residue names outside the 20 standard amino acids are left out of the
    sequence.

    Parameters
    ----------
    pdb_lines : iterable of str
        PDB records in fixed-column format.

    Returns
    -------
    str
        The sequence in order of first appearance; "" if no residue is found.
    """
    amino_acids = {
        'ALA': 'A', 'ARG': 'R', 'ASN': 'N', 'ASP': 'D', 'CYS': 'C',
        'GLN': 'Q', 'GLU': 'E', 'GLY': 'G', 'HIS': 'H', 'ILE': 'I',
        'LEU': 'L', 'LYS': 'K', 'MET': 'M', 'PHE': 'F', 'PRO': 'P',
        'SER': 'S', 'THR': 'T', 'TRP': 'W', 'TYR': 'Y', 'VAL': 'V'
    }
    sequence = []
    seen_positions = set()

    for line in pdb_lines:
        if not (line.startswith("ATOM") and line[13:15].strip() == "CA"):
            continue

        # Chain ID (column 22), residue number (columns 23-26) and insertion
        # code (column 27). The residue name is deliberately not part of the
        # key: two names at one position are alternate conformers of a single
        # residue, while equal names in different chains or with different
        # insertion codes are distinct residues.
        position = (line[21:22], line[22:26].strip(), line[26:27].strip())
        if position in seen_positions:
            continue
        seen_positions.add(position)

        letter = amino_acids.get(line[17:20].strip())
        if letter is not None:
            sequence.append(letter)

    return "".join(sequence)

def write_fasta(input_pdb_file, sequences):
    """Write all sequences of one alignment to a FASTA file and report its name.

    The output name is the base name of ``input_pdb_file`` without its
    extension, followed by "_sequences.fasta". Only the base name is used,
    so the file is created in the current working directory, not next to
    the input file.

    Parameters
    ----------
    input_pdb_file : str
        Path of the alignment PDB file the sequences came from.
    sequences : dict
        Maps structure ID to sequence; entries are written in iteration
        order, each as a ">id" line followed by the sequence on one line.
    """
    stem, _extension = os.path.splitext(os.path.basename(input_pdb_file))
    fasta_filename = stem + "_sequences.fasta"
    with open(fasta_filename, 'w') as fasta_file:
        fasta_file.writelines(
            f">{structure_id}\n{sequence}\n"
            for structure_id, sequence in sequences.items()
        )
    print(f"FASTA file created: {fasta_filename}")

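Run the extraction on each of the two POSA alignment files. Each run writes its own FASTA file; the step that would merge the two into a single FASTA file is currently commented out.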

In [None]:
if __name__ == "__main__":
    # Both path names stay defined at notebook level.
    input_pdb_file_a = "POSA Structural Alignments/ferritins_a.pdb"
    input_pdb_file_b = "POSA Structural Alignments/ferritins_b.pdb"

    # Same extraction for every POSA alignment file, in the original order (a, then b).
    for posa_alignment_file in (input_pdb_file_a, input_pdb_file_b):
        extract_ferritin_structures(posa_alignment_file)
    
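# Combine the two FASTA files into a single file (currently disabled).
# NOTE(review): as written, this looks for the per-alignment FASTA files next to
# the input PDB files, but write_fasta creates them in the current working
# directory, so the paths below need adjusting before this is re-enabled.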
# with open("POSA Structural Alignments/ferritins_sequences.fasta", 'w') as out_file:
#     for input_file in [input_pdb_file_a, input_pdb_file_b]:
#         with open(os.path.splitext(input_file)[0] + "_sequences.fasta", 'r') as in_file:
#             out_file.write(in_file.read())
#     print("Combined FASTA file created: ferritins_sequences.fasta")

Extracted: 1vlgA.pdb
Extracted: 1lb3A.pdb
Extracted: 1krqA.pdb
Extracted: 1jtsA.pdb
Extracted: 1jigA.pdb
Extracted: 1ji5A.pdb
Extracted: 1ji4A.pdb
Extracted: 1jgcA.pdb
Extracted: 1eumA.pdb
Extracted: 1dpsA.pdb
Extracted: 1bg7A.pdb
Extracted: 1uvhA.pdb
Extracted: 1bcfA.pdb
Extracted: 1tk6A.pdb
Extracted: 1tjoA.pdb
Extracted: 1r03A.pdb
Extracted: 1qghA.pdb
Extracted: 1o9rA.pdb
Extracted: 1nfvA.pdb
FASTA file created: ferritins_a_sequences.fasta
Extracted: 1za0A.pdb
Extracted: 3e6sA.pdb
Extracted: 2uw1A.pdb
Extracted: 1otkA.pdb
Extracted: 2fzfA.pdb
Extracted: 2chpA.pdb
Extracted: 2fkzA.pdb
Extracted: 2jd70.pdb
Extracted: 2vzbA.pdb
FASTA file created: ferritins_b_sequences.fasta


Do the same for the MUSTANG alignment.

In [8]:
if __name__ == "__main__":
    # Split the MUSTANG alignment with the same routine used for the POSA files.
    # NOTE(review): judging by the recorded output, the HEADER IDs in this file
    # end in ".pdb" rather than "-tran.pdb", so no suffix is stripped and the
    # extracted files are named "<id>.pdb.pdb".
    input_pdb_file_m = "MUSTANG Structural Alignments/ferritins.pdb"
    extract_ferritin_structures(input_pdb_file_m)

Extracted: 1BCFA.pdb.pdb
Extracted: 1jigA.pdb.pdb
Extracted: 1nfvA.pdb.pdb
Extracted: 1uvhA.pdb.pdb
Extracted: 2jd70.pdb.pdb
Extracted: 1o9rA.pdb.pdb
Extracted: 1vlgA.pdb.pdb
Extracted: 2uw1A.pdb.pdb
Extracted: 1dpsA.pdb.pdb
Extracted: 1jtsA.pdb.pdb
Extracted: 1otkA.pdb.pdb
Extracted: 1eumA.pdb.pdb
Extracted: 1krqA.pdb.pdb
Extracted: 1qghA.pdb.pdb
Extracted: 2chpA.pdb.pdb
Extracted: 2vzbA.pdb.pdb
Extracted: 1jgcA.pdb.pdb
Extracted: 1lb3A.pdb.pdb
Extracted: 1r03A.pdb.pdb
Extracted: 1ji4A.pdb.pdb
Extracted: 2fkzA.pdb.pdb
Extracted: 3e6sA.pdb.pdb
Extracted: 1ji5A.pdb.pdb
Extracted: 1tjoA.pdb.pdb
Extracted: 2fzfA.pdb.pdb
FASTA file created: ferritins_sequences.fasta
