In [1]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import requests
from io import StringIO
from time import sleep
import re
import time

from pathlib import Path
from typing import Iterable, Dict, Set

In [2]:
def pdbid_to_uniprot_length(pdb_id):
    pdb_id = pdb_id.upper()
    url = f"https://www.ebi.ac.uk/pdbe/api/mappings/uniprot/{pdb_id}"
    r = requests.get(url)
    r.raise_for_status()
    data = r.json()
    if pdb_id not in data:
        return None
    results = []
    for up, details in data[pdb_id]["UniProt"].items():
        length = details["sequence"]["length"]
        results.append((up, length))
    return results  # list of (UniProtID, length)

# Example:
# print(pdbid_to_uniprot_length("4HHB"))
# [('P69905', 142), ('P68871', 147)]


In [3]:
folder = Path("prion_pdbs")
files = [f.name for f in folder.iterdir() if (f.is_file() and f.name.endswith(".pdb"))]
print(files)

['1lg4.pdb', '1qm3.pdb', '2iv6.pdb', '1qm2.pdb', '1qm0.pdb', '1oei.pdb', '2iv5.pdb', '2iv4.pdb', '1oeh.pdb', '1e1j.pdb', '1dwz.pdb', '1ag2.pdb', '1e1g.pdb', '1e1p.pdb', '1dwy.pdb', '1e1s.pdb', '1e1w.pdb', '1qlx.pdb', '1qlz.pdb', '1e1u.pdb', '1dx0.pdb', '1hjn.pdb', '1b10.pdb', '1hjm.pdb']


In [4]:
# valid = []
# not_valid = []
# length_limit = 256
# for fname in files:
#     time.sleep(5)
#     if fname is None:
#         continue
#     a = pdbid_to_uniprot_length(fname[:-4])
#     if a is None:
#         print("watch out for : " + str(fname))
#     if (not fname.endswith(".pdb")) or (a and a[1] < length_limit):
#         not_valid.append(fname)
#     else:
#         valid.append(fname)
        
# valid, not_valid

In [5]:
def parse_pdb(pdb_path):
    """
    Parse a PDB file line by line.
    Returns:
      atoms:  list of dicts (serial, name, resname, chain, resseq, x,y,z)
      helices: list of dicts (helix_id, start_res, start_chain, end_res, end_chain, length)
    """
    atoms, helices = [], []
    atom_lines = []
    with open(pdb_path) as f:
        for line in f:
            record = line[0:6].strip()

            # ---- ATOM records ----
            if record == "ATOM":
                atoms.append({
                    "serial":   int(line[6:11]),
                    "name":     line[12:16].strip(),
                    "resname":  line[17:20].strip(),
                    "chain":    line[21].strip(),
                    "resseq":   int(line[22:26]),
                    "x":        float(line[30:38]),
                    "y":        float(line[38:46]),
                    "z":        float(line[46:54]),
                    "occ":      float(line[54:60] or 0),
                    "bfac":     float(line[60:66] or 0),
                })
                atom_lines.append(str(line))

            # ---- HELIX records ----
            elif record == "HELIX":
                helices.append({
                    "ser_num":    line[7:10].strip(),       # helix serial number
                    "helix_id":   line[11:14].strip(),      # helix identifier
                    "start_res":  line[15:18].strip(),      # initial residue name
                    "start_chain":line[19].strip(),         # chain identifier
                    "start_seq":  int(line[21:25]),         # start residue sequence number
                    "end_res":    line[27:30].strip(),      # ending residue name
                    "end_chain":  line[31].strip(),         # chain identifier
                    "end_seq":    int(line[33:37]),         # end residue sequence number
                    "helix_class":line[38:40].strip(),
                    "length":     int(line[71:76]),
                })

    return atom_lines, helices

In [6]:
# now retrieve residue numbers and atomic coords
folder = "prion_pdbs"
for pdb in files:
    fname = f"{folder}/{pdb}"
    atoms, helices = parse_pdb(fname)
    # now process atomic coords
    atom_file_name = f"prion_atomic_coords/{pdb}"
    with open(atom_file_name, 'w') as f:
        for line in atoms:
            f.write(line)