In [13]:
# === TEK HÜCRE: FASTA adından zinciri al, CIF+DSSP ile H/E/C'yi SEQRES'e hizala ===
# Kabul edilen FASTA adı: 1CEE.B.fasta  (nokta/altçizgi/tire de olur: 1CEE_B.fa, 1CEE-B.fasta)
# CIF adı:  1CEE.cif
# DSSP adı: 1CEE.dssp
# Çıktılar: /content/chain_csv_out/  (CSV + _seqres_ss.txt) ve otomatik ZIP indirme

# Gerekli paketler
try:
    from Bio import SeqIO
    from Bio.PDB.MMCIF2Dict import MMCIF2Dict
    import pandas as pd
except Exception:
    import sys, subprocess
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "biopython", "pandas"])
    from Bio import SeqIO
    from Bio.PDB.MMCIF2Dict import MMCIF2Dict
    import pandas as pd

import re, io, zipfile, gzip, os, shutil
from pathlib import Path
from google.colab import files

BASE   = Path("/content")
FASTAD = BASE/"fasta_files"; FASTAD.mkdir(exist_ok=True)
CIFD   = BASE/"cif_files";   CIFD.mkdir(exist_ok=True)
DSSPD  = BASE/"dssp_files";  DSSPD.mkdir(exist_ok=True)
OUTD   = BASE/"chain_csv_out"; OUTD.mkdir(exist_ok=True)

# --- yardımcılar ---
AA3_TO_1 = {'ALA':'A','ARG':'R','ASN':'N','ASP':'D','CYS':'C','GLN':'Q','GLU':'E','GLY':'G',
            'HIS':'H','ILE':'I','LEU':'L','LYS':'K','MET':'M','PHE':'F','PRO':'P','SER':'S',
            'THR':'T','TRP':'W','TYR':'Y','VAL':'V','SEC':'U','PYL':'O'}

def parse_pdb_chain_from_filename(p: Path):
    # 1CEE.B.fasta / 1CEE_B.fa / 1CEE-B → (1CEE, B)
    m = re.match(r"^([0-9A-Za-z]{4})[._-]([A-Za-z0-9])", p.stem)
    if m: return m.group(1).upper(), m.group(2)
    return None, None

def parse_pdb_chain_from_header(header: str):
    # >1CEE.B / >1CEE_B / >1CEE|B / >1CEE B
    m = re.search(r'([0-9A-Za-z]{4})[ _\.\|\-]?([A-Za-z0-9])', header)
    if m: return m.group(1).upper(), m.group(2)
    # sadece PDB varsa:
    m2 = re.search(r'([0-9A-Za-z]{4})', header)
    return (m2.group(1).upper(), None) if m2 else (None, None)

def load_seqres_for_chain(mm: MMCIF2Dict, chain_id: str):
    la  = mm.get('_atom_site.label_asym_id', [])
    ent = mm.get('_atom_site.label_entity_id', [])
    chain2ent = {}
    for a,e in zip(la, ent): chain2ent.setdefault(a, e)
    if chain_id not in chain2ent:
        raise ValueError(f"Chain {chain_id} mmCIF'te yok.")
    entity_id = chain2ent[chain_id]
    eps_ent = mm.get('_entity_poly_seq.entity_id', [])
    eps_mon = mm.get('_entity_poly_seq.mon_id', [])
    seq3 = [m for m,e in zip(eps_mon, eps_ent) if e == entity_id]
    return "".join(AA3_TO_1.get(x,'X') for x in seq3)

def build_auth_to_label_map(mm: MMCIF2Dict, chain_id: str):
    la  = mm.get('_atom_site.label_asym_id', [])
    auth= mm.get('_atom_site.auth_seq_id', [])
    ins = mm.get('_atom_site.pdbx_PDB_ins_code', [])
    lab = mm.get('_atom_site.label_seq_id', [])
    m = {}
    for a,au,ic,ls in zip(la, auth, ins, lab):
        if a!=chain_id or au=='?' or ls=='?': continue
        try: au_i, ls_i = int(au), int(ls)
        except: continue
        ic = '' if ic in ('?','.') else ic.strip()
        m[(au_i, ic)] = ls_i
    return m

def parse_dssp_chain(dssp_path: Path, chain_id: str):
    rows, in_data = [], False
    with open(dssp_path) as f:
        for line in f:
            if line.startswith("  #  RESIDUE"): in_data=True; continue
            if not in_data or len(line)<50: continue
            ch = line[11].strip()  # zincir karakteri
            if (chain_id or '') != (ch or ''):  # None ise boş zincirle eşleşebilir
                continue
            resseq = line[5:10].strip()
            if not resseq: continue
            aa  = line[13].strip()
            ss  = line[16].strip() or 'C'
            icode = line[10].strip()
            asa = line[34:38].strip()
            phi = line[103:109].strip()
            psi = line[109:115].strip()
            rows.append((int(resseq), '' if icode in ('',' ') else icode, aa, ss, asa, phi, psi))
    rows.sort(key=lambda x:(x[0],x[1]))
    return rows

def project_dssp_to_seqres(cif_path: Path, dssp_path: Path, chain_id: str):
    mm = MMCIF2Dict(str(cif_path))
    seqres = load_seqres_for_chain(mm, chain_id)
    L = len(seqres)
    a2l = build_auth_to_label_map(mm, chain_id)
    drows = parse_dssp_chain(dssp_path, chain_id)
    ss_seqres = ['-']*L
    recs=[]
    for auth_seq,icode,aa,ss,asa,phi,psi in drows:
        lab = a2l.get((auth_seq, icode))
        if lab is None or not (1<=lab<=L): continue
        # 8->3 sınıf dönüşümü
        ss_simple = ss if ss in ('H','E','C') else ('H' if ss in ('G','I') else ('E' if ss=='B' else 'C'))
        ss_seqres[lab-1] = ss_simple
        recs.append({"label_seq_id": lab, "aa_seqres": seqres[lab-1], "ss": ss_simple,
                     "auth_seq_id": auth_seq, "icode": icode, "asa": asa, "phi": phi, "psi": psi})
    return seqres, "".join(ss_seqres), pd.DataFrame(recs).sort_values("label_seq_id")

# === 1) FASTA seç–yükle (tek/çoklu; isimden zincir alacağız) ===
print("FASTA (.fa/.fasta) dosyalarını seçin ⤵")
up = files.upload()
for name, data in up.items():
    (FASTAD/Path(name).name).write_bytes(data)
print("FASTA klasörü:", FASTAD)

# === 2) CIF seç–yükle (.cif veya .zip/.cif.gz de olur) ===
print("CIF (.cif/.cif.gz ya da zip içinden) seçin ⤵")
up = files.upload()
for name, data in up.items():
    lower = name.lower()
    if lower.endswith(".zip"):
        with zipfile.ZipFile(io.BytesIO(data)) as z:
            for nm in z.namelist():
                if not (nm.lower().endswith(".cif") or nm.lower().endswith(".cif.gz")):
                    continue
                stem = Path(nm).stem
                if stem.lower().endswith(".cif"): stem = Path(stem).stem
                m = re.match(r"([0-9A-Za-z]{4})", stem)
                if not m: continue
                pdb4 = m.group(1).upper()
                by = z.read(nm)
                try: by = gzip.decompress(by)
                except OSError: pass
                (CIFD/f"{pdb4}.cif").write_bytes(by)
    else:
        stem = Path(name).stem
        if stem.lower().endswith(".cif"): stem = Path(stem).stem
        m = re.match(r"([0-9A-Za-z]{4})", stem)
        if not m: continue
        pdb4 = m.group(1).upper()
        by = data
        if lower.endswith(".gz"):
            try: by = gzip.decompress(by)
            except OSError: pass
        (CIFD/f"{pdb4}.cif").write_bytes(by)
print("CIF klasörü:", CIFD)

# === 3) DSSP seç–yükle (.dssp veya zip) ===
print("DSSP (.dssp ya da zip) seçin ⤵")
up = files.upload()
for name, data in up.items():
    lower = name.lower()
    if lower.endswith(".zip"):
        with zipfile.ZipFile(io.BytesIO(data)) as z:
            for nm in z.namelist():
                if not nm.lower().endswith(".dssp"): continue
                m = re.match(r"([0-9A-Za-z]{4})", Path(nm).stem)
                if not m: continue
                (DSSPD/f"{m.group(1).upper()}.dssp").write_bytes(z.read(nm))
    else:
        m = re.match(r"([0-9A-Za-z]{4})", Path(name).stem)
        if not m: continue
        (DSSPD/f"{m.group(1).upper()}.dssp").write_bytes(data)
print("DSSP klasörü:", DSSPD)

# === 4) İşle (dosya adından zinciri al) ===
ok=skip=fail=0
for f in sorted(FASTAD.glob("*")):
    # dosya adından pdb/chain
    pdb, chain = parse_pdb_chain_from_filename(f)
    # header fallback
    if not pdb:
        try:
            rec = next(SeqIO.parse(str(f), "fasta"))
            pdb2, chain2 = parse_pdb_chain_from_header(rec.description)
            pdb, chain = pdb or pdb2, chain or chain2
        except Exception:
            pass
    if not pdb or not chain:
        print(f"↷ atlandı (pdb/chain bulunamadı): {f.name}"); skip += 1; continue

    cif  = CIFD/f"{pdb}.cif"
    dssp = DSSPD/f"{pdb}.dssp"
    if not cif.exists() or not dssp.exists():
        print(f"↷ atlandı (eksik dosya): {pdb}_{chain} (cif:{cif.exists()} dssp:{dssp.exists()})")
        skip += 1; continue

    try:
        seqres, ss_seqres, df = project_dssp_to_seqres(cif, dssp, chain)
    except Exception as e:
        print(f"✗ {pdb}_{chain}: {e}")
        fail += 1; continue

    df.to_csv(OUTD/f"{pdb}_{chain}.csv", index=False)
    with open(OUTD/f"{pdb}_{chain}_seqres_ss.txt","w") as w:
        w.write(">SEQRES\n"+seqres+"\n>SS_on_SEQRES\n"+ss_seqres+"\n")
    print(f"✓ {pdb}_{chain}: {len(seqres)} aa → {pdb}_{chain}.csv")
    ok += 1

print(f"\nÖZET | OK:{ok}  SKIP:{skip}  FAIL:{fail}")
print("Çıktılar:", OUTD)

# === 5) Zip indir ===
zip_path = BASE/"chain_csv_out.zip"
if zip_path.exists(): zip_path.unlink()
with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as z:
    for p in OUTD.glob("*"):
        z.write(p, arcname=p.name)
files.download(str(zip_path))


FASTA (.fa/.fasta) dosyalarını seçin ⤵


Saving rcsb_pdb_allpdbidswithchain.fasta to rcsb_pdb_allpdbidswithchain (2).fasta
FASTA klasörü: /content/fasta_files
CIF (.cif/.cif.gz ya da zip içinden) seçin ⤵


Saving cif_files.zip to cif_files (3).zip
CIF klasörü: /content/cif_files
DSSP (.dssp ya da zip) seçin ⤵


Saving all_dssp.zip to all_dssp (3).zip
DSSP klasörü: /content/dssp_files
✓ 1CEE_B: 59 aa → 1CEE_B.csv
✓ 1DZL_A: 505 aa → 1DZL_A.csv
✓ 1EBO_E: 131 aa → 1EBO_E.csv
↷ atlandı (eksik dosya): 1G2C_F (cif:False dssp:True)
✓ 1H38_D: 883 aa → 1H38_D.csv
✓ 1HTM_B: 138 aa → 1HTM_B.csv
✓ 1IYT_A: 42 aa → 1IYT_A.csv
✓ 1JFK_A: 134 aa → 1JFK_A.csv
✓ 1JTI_B: 385 aa → 1JTI_B.csv
✓ 1K0N_A: 241 aa → 1K0N_A.csv
✓ 1KCT_A: 394 aa → 1KCT_A.csv
✓ 1MBY_A: 88 aa → 1MBY_A.csv
✓ 1MIQ_B: 375 aa → 1MIQ_B.csv
✗ 1MNM_C: 'label_seq_id'
✗ 1MNM_D: 'label_seq_id'
✓ 1NOC_A: 388 aa → 1NOC_A.csv
✓ 1NQD_A: 122 aa → 1NQD_A.csv
✓ 1NQJ_B: 119 aa → 1NQJ_B.csv
✓ 1NRJ_B: 218 aa → 1NRJ_B.csv
✓ 1OVA_A: 386 aa → 1OVA_A.csv
✓ 1QB3_A: 150 aa → 1QB3_A.csv
✓ 1QLN_A: 883 aa → 1QLN_A.csv
✓ 1QOM_B: 440 aa → 1QOM_B.csv
✓ 1QS8_B: 329 aa → 1QS8_B.csv
✓ 1REP_C: 251 aa → 1REP_C.csv
✓ 1RK4_B: 243 aa → 1RK4_B.csv
✓ 1RKP_A: 326 aa → 1RKP_A.csv
✓ 1SVF_C: 64 aa → 1SVF_C.csv
✓ 1UXM_K: 153 aa → 1UXM_K.csv
✓ 1WP8_C: 89 aa → 1WP8_C.csv
✓ 1WYY_B: 149 aa 

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>