# In [1]:
from __future__ import annotations
from pathlib import Path
from typing import Dict, List, Iterable
import csv
import re

# Filename prefix of the intermediate .txt files (cleanup deletes only files carrying it)
TAG = "AAFEAT_x9k42_"

# Tab-separated column titles of the combined CSV (there is no "Pocket Score" column)
HEADER_TITLE = "\t".join([
    "Protein",
    "Cyclical",
    "Drug Score",
    "Number of alpha spheres",
    "Mean alpha-sphere radius",
    "Mean alpha-sphere Solvent Acc.",
    "Mean B-factor of pocket residues",
    "Hydrophobicity Score",
    "Polarity Score",
    "Amino Acid based volume Score",
    "Pocket volume (Monte Carlo)",
    "Pocket volume (convex hull)",
    "Charge Score",
    "Local hydrophobic density Score",
    "Number of apolar alpha sphere",
    "Proportion of apolar alpha sphere",
    "AS Sequence",
])

# One-letter residue alphabet, followed by the residue classes (sets of
# one-letter codes) that the composition features are summed over
AA_ORDER = [*"ACDEFGHIKLMNPQRSTVWY"]
NON_POLAR = set("GAVLIMFP")
POLAR_UNCHARGED = set("STNQYCW")
CHARGED_POS = set("RHK")
CHARGED_NEG = set("DE")
IONISABLE = set("DEHCYKR")
AROMATIC = set("FWYH")
HYDROPHOBIC = set("FWYHKMTLIVCGA")
SMALL = set("ACSGVTNDP")
TINY = set("ACSG")

# Titles of the composition-feature columns added to Dataset.csv.
# The list order is the column order; spellings are kept verbatim
# (including the lowercase "tiny").
APPEND_LABELS = [
    "Number of residues", "Ionisable groups",
    "Polar", "Non-polar",
    "Charged", "Positive charge", "Negative charge", "Pos-neg charge ratio", "Uncharged",
    "Aromatic", "Hydrophobic",
    "Small", "tiny",
]

# Three-letter residue name -> one-letter code.
# HID/HIE/HIP all map to H, and MSE maps to X.
MAP_3TO1 = dict(
    pair.split(":")
    for pair in (
        "ALA:A CYS:C ASP:D GLU:E PHE:F GLY:G HIS:H "
        "HID:H HIE:H HIP:H ILE:I LYS:K LEU:L MET:M "
        "ASN:N PRO:P GLN:Q ARG:R SER:S THR:T VAL:V "
        "TRP:W TYR:Y MSE:X"
    ).split()
)

# Captures the final number of a line: optional sign, digits with an optional
# decimal point, optional exponent, followed only by whitespace up to the end.
_last_number_re = re.compile(r'([-+]?\d*\.?\d+(?:[eE][-+]?\d+)?)\s*$')
# Captures a lone "L" or "C" (either case) that is bounded on each side by the
# start/end of the string or by "_" / "-".
_cyc_pat = re.compile(r'(?:^|[_\-])([LC])(?:[_\-]|$)', re.IGNORECASE)

# File helpers

def read_text_lines(p: Path) -> List[str]:
    """Return all lines of *p*, each keeping its line ending.

    The file is decoded as UTF-8; bytes that cannot be decoded are dropped.
    """
    with p.open('r', encoding='utf-8', errors='ignore') as handle:
        return list(handle)


def write_lines(p: Path, lines: Iterable[str]) -> None:
    """Write *lines* to *p* as UTF-8, one entry per line.

    An entry that does not already end with a line feed gets one appended;
    an empty entry therefore becomes an empty line.
    """
    with p.open('w', encoding='utf-8', newline='') as sink:
        sink.writelines(
            entry if entry.endswith('\n') else entry + '\n'
            for entry in lines
        )

# Sequence helpers

def normalize_seq(seq: str) -> str:
    """Upper-case *seq* and keep only its alphabetic characters."""
    letters = filter(str.isalpha, seq.upper())
    return ''.join(letters)


def count_residues(seq: str) -> Dict[str, int]:
    """Count how often each residue of AA_ORDER occurs in *seq*.

    Every residue of AA_ORDER gets an entry (0 when absent); characters
    outside AA_ORDER are ignored.
    """
    return {residue: seq.count(residue) for residue in AA_ORDER}

# Compute residue-based features

def feature_counts(seq: str) -> Dict[str, float]:
    """Compute the residue-composition features of *seq*.

    The sequence is normalised first (upper-cased, letters only). Each class
    feature is the number of residues belonging to the matching residue set.

    Returns:
        A dict keyed by the APPEND_LABELS titles. "Uncharged" is the
        normalised length minus the charged count, so letters outside
        AA_ORDER count as uncharged. "Pos-neg charge ratio" is rounded to
        three decimals and is 0 when there are no negatively charged residues.
    """
    cleaned = normalize_seq(seq)
    total = len(cleaned)
    tally = count_residues(cleaned)

    def members(group) -> int:
        # Number of residues in the sequence that belong to `group`
        return sum(tally[aa] for aa in group)

    positive = members(CHARGED_POS)
    negative = members(CHARGED_NEG)
    all_charged = positive + negative

    if negative != 0:
        ratio = positive / negative
    else:
        ratio = 0  # avoid dividing by zero

    return {
        "Number of residues": total,
        "Ionisable groups": members(IONISABLE),
        "Polar": members(POLAR_UNCHARGED),
        "Non-polar": members(NON_POLAR),
        "Charged": all_charged,
        "Positive charge": positive,
        "Negative charge": negative,
        "Pos-neg charge ratio": round(ratio, 3),
        "Uncharged": total - all_charged,
        "Aromatic": members(AROMATIC),
        "Hydrophobic": members(HYDROPHOBIC),
        "Small": members(SMALL),
        "tiny": members(TINY),
    }

# Build sequence of unique residues from ATOM records using fixed PDB columns

def pdb_unique_res_sequence(pdb_path: Path) -> str:
    """Build a one-letter sequence from the distinct residues in the ATOM records.

    A residue is identified by (chain ID, residue number, insertion code), read
    from fixed columns of each line starting with "ATOM". Residues appear in
    order of first occurrence, and the residue name of that first record is
    used. Names missing from MAP_3TO1 become "X".
    """
    first_names: Dict[tuple, str] = {}  # insertion-ordered: residue id -> residue name
    for record in read_text_lines(pdb_path):
        if record[:4] != "ATOM":
            continue
        residue_id = (record[21:22], record[22:26].strip(), record[26:27])
        # setdefault keeps the name from the first record seen for this residue
        first_names.setdefault(residue_id, record[17:20].strip().upper())
    letters = [MAP_3TO1.get(name, 'X') for name in first_names.values()]
    return ''.join(letters)

# Extract last numeric token from header lines 6-20 (15 total)

def extract_last_fields_block(pdb_path: Path) -> List[str]:
    """Return the trailing value of each of lines 6-20 of *pdb_path*.

    For every line the trailing number is taken; when the line does not end
    with a number, its last whitespace-separated token is used ("" for a blank
    line). The result always has 15 entries: files with fewer than 20 lines
    are padded with "".
    """
    header_rows = read_text_lines(pdb_path)[5:20]  # at most 15 lines
    picked: List[str] = []
    for row in header_rows:
        hit = _last_number_re.search(row)
        if hit is not None:
            picked.append(hit.group(1))
            continue
        words = row.split()
        picked.append(words[-1] if words else "")
    # A non-positive multiplier yields an empty list, so this only pads short results
    return picked + [""] * (15 - len(picked))

# Create a tagged .txt file per .pdb

def stage1_make_txts():
    """Write one TAG-prefixed .txt file per *.pdb file in the current directory.

    Each .txt holds one value per line: the file stem, an empty placeholder
    line, the 15 header values, and the residue sequence.
    """
    pdb_files = sorted(Path('.').glob('*.pdb'))
    for pdb_file in pdb_files:
        name = pdb_file.stem
        rows = [
            name,
            "",
            *extract_last_fields_block(pdb_file),
            pdb_unique_res_sequence(pdb_file),
        ]
        write_lines(Path(f"{TAG}{name}.txt"), rows)

# Detect L/C tag from names. Returns "0" (linear), "1" (cyclic), or "" if unknown.

def _detect_cyc_from_names(txt_stem: str, protein_name: str) -> str:
    """Look for a standalone L/C marker, first in *protein_name*, then in *txt_stem*.

    Returns "0" for an L marker, "1" for a C marker, and "" when neither name
    contains one.
    """
    hit = _cyc_pat.search(protein_name) or _cyc_pat.search(txt_stem)
    if hit is None:
        return ""
    marker = hit.group(1).upper()
    if marker == "L":
        return "0"
    return "1"

# Combine tagged .txt files into all.csv and fill Cyclical

def stage2_combine_to_csv(all_csv: Path = Path('all.csv')):
    """Merge every TAG-prefixed .txt file in the current directory into *all_csv*.

    Each .txt is read positionally, one value per line:
    line 0 = protein name, line 1 = Cyclical placeholder, lines 2-16 = the 15
    header values (line 2 is the Pocket Score, which has no output column),
    line 17 = residue sequence. Blank lines are kept so that an empty value
    cannot shift the columns that follow it.

    The Cyclical column is filled from the L/C marker found in the protein
    name or the .txt stem ("0", "1", or "" when there is none).

    Args:
        all_csv: Destination CSV (overwritten). The header is HEADER_TITLE.
    """
    expected_lines = 18
    with all_csv.open('w', encoding='utf-8', newline='') as fh:
        # csv.writer quotes fields containing commas or quotes, so an unusual
        # file name cannot break the row; "\n" keeps the original line ending.
        writer = csv.writer(fh, lineterminator='\n')
        writer.writerow(HEADER_TITLE.split('\t'))
        for txt in sorted(Path('.').glob(f'{TAG}*.txt')):
            fields = [ln.strip() for ln in read_text_lines(txt)]
            # Pad short files so the positional slices below are always valid
            fields += [""] * (expected_lines - len(fields))

            protein_name = fields[0]
            metrics = fields[3:17]  # 14 values; fields[2] (Pocket Score) is skipped
            seq = fields[17]

            cyc_val = _detect_cyc_from_names(txt.stem, protein_name)
            writer.writerow([protein_name, cyc_val, *metrics, seq])

# Read all.csv, compute AA features and per-AA counts, write Dataset.csv

def stage3_append_features(in_csv: Path = Path('all.csv'), out_csv: Path = Path('Dataset.csv')):
    """Derive residue-composition columns from each row's sequence and write the result.

    Reads *in_csv*, drops the "Mean B-factor of pocket residues" and
    "AS Sequence" columns, then appends one count column per amino acid
    followed by the APPEND_LABELS feature columns. Columns that already exist
    in the input are not added a second time.

    Args:
        in_csv: CSV with a header row; its "AS Sequence" column supplies the
            residues. Rows without a sequence are treated as an empty sequence.
        out_csv: Destination CSV (overwritten).
    """
    # (output column title, one-letter code), in output column order
    AA_COUNT_LABELS = [
        ("Glycine - G", "G"),
        ("Alanine - A", "A"),
        ("Leucine - L", "L"),
        ("Methionine - M", "M"),
        ("Phenylalanine - F", "F"),
        ("Tryptophan - W", "W"),
        ("Lysine - K", "K"),
        ("Glutamine - Q", "Q"),
        ("Glutamic Acid - E", "E"),
        ("Serine - S", "S"),
        ("Proline - P", "P"),
        ("Valine - V", "V"),
        ("Isoleucine - I", "I"),
        ("Cysteine - C", "C"),
        ("Tyrosine - Y", "Y"),
        ("Histidine - H", "H"),
        ("Arginine - R", "R"),
        ("Asparagine - N", "N"),
        ("Aspartic Acid - D", "D"),
        ("Threonine - T", "T"),
    ]

    with in_csv.open('r', encoding='utf-8', newline='') as fh:
        reader = csv.DictReader(fh)
        fields_in = reader.fieldnames or []

        drop_cols = {"Mean B-factor of pocket residues", "AS Sequence"}
        fields_kept = [f for f in fields_in if f not in drop_cols]

        aa_count_fields = [lbl for (lbl, _) in AA_COUNT_LABELS if lbl not in fields_kept]
        append_feature_fields = [c for c in APPEND_LABELS if c not in fields_kept and c not in aa_count_fields]
        fields_out = fields_kept + aa_count_fields + append_feature_fields

        with out_csv.open('w', encoding='utf-8', newline='') as outfh:
            writer = csv.DictWriter(outfh, fieldnames=fields_out, extrasaction='ignore')
            writer.writeheader()
            for row in reader:
                # DictReader fills columns missing from a short row with None;
                # treat that like an empty sequence instead of crashing.
                seq = row.get('AS Sequence') or ''
                norm_seq = normalize_seq(seq)

                cdict = count_residues(norm_seq)
                aa_counts = {lbl: cdict.get(aa, 0) for (lbl, aa) in AA_COUNT_LABELS}

                feats = feature_counts(seq)

                out_row = {k: v for k, v in row.items() if k in fields_kept}
                out_row.update(aa_counts)
                for k in append_feature_fields:
                    out_row[k] = feats.get(k, 0)

                writer.writerow(out_row)

# Run stages
stage1_make_txts()
stage2_combine_to_csv()
stage3_append_features()

# Cleanup tagged .txt files and all.csv; keep Dataset.csv and .pdb files.
# Deletion stays best-effort, but failures are reported instead of being hidden.
all_csv = Path('all.csv')
not_deleted = []
for stale in [*Path('.').glob(f'{TAG}*.txt'), all_csv]:
    try:
        stale.unlink()
    except FileNotFoundError:
        pass  # already gone, nothing to clean up
    except OSError as exc:
        not_deleted.append(f"{stale} ({exc})")

if not_deleted:
    print("Could not delete: " + ", ".join(not_deleted))
else:
    print("Deleted tagged .txt files and all.csv, kept Dataset.csv and .pdb files.")


# Output: Deleted tagged .txt files and all.csv, kept Dataset.csv and .pdb files.
